From 47fe4e686212096ec96ef638b0389baece652c16 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 25 Feb 2026 21:29:05 -0500 Subject: [PATCH 1/9] Add an RFC for Execution V2 Signed-off-by: Nicholas Gates --- index.ts | 5 +- proposals/0001-execution-v2.md | 734 +++++++++++++++++++++++++++++++++ styles.css | 64 +++ 3 files changed, 802 insertions(+), 1 deletion(-) create mode 100644 proposals/0001-execution-v2.md diff --git a/index.ts b/index.ts index 0b1f5a9..980e60a 100644 --- a/index.ts +++ b/index.ts @@ -86,6 +86,8 @@ function baseHTML(title: string, content: string, cssPath: string = "styles.css" + +
@@ -106,7 +108,8 @@ ${content} Vortex RFC Archive
- ${liveReload ? `\n ` : ""} + + ${liveReload ? `\n ` : ""} `; } diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md new file mode 100644 index 0000000..170ccb8 --- /dev/null +++ b/proposals/0001-execution-v2.md @@ -0,0 +1,734 @@ +- Start Date: 2026-02-25 +- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000) +- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000) + +## Summary + +Replace the current iterative `execute`/`execute_parent` execution loop with a three-phase +pipeline -- **optimize**, **peel selections**, **canonicalize** -- that reduces the VTable +surface from five methods to three, unifies Filter/Slice/Take into a composable `Selection` +type fused with decompression, and replaces the executor loop with a single recursive descent +through a new `canonicalize_into` VTable method. + +## Motivation + +The current execution model has five VTable extension points: `reduce`, `reduce_parent`, +`execute`, `execute_parent`, and `append_to_builder`. Each encoding can implement all five, and +adaptor types (Filter, Slice, Take, Compare, Cast, FillNull, Between, Mask, Zip, Like, Not, +ListContains) each come in Reduce and Execute variants. This leads to a combinatorial explosion +of implementations -- Dict alone has 7 parent reduce rules and 3 parent execute kernels. + +**The distinction between extension points is unclear.** The stated rule is "reduce = +metadata-only, execute = may read buffers," but many operations blur this boundary. Some +`execute_parent` implementations (Dict + Compare, ALP + Compare, FSST + Compare) are +metadata-only tree rewrites that belong in `reduce_parent`. Others (Primitive + Compare, +Bool + FillNull) are just the scalar function's default implementation on canonical inputs -- +redundant with the function's own `execute` method. 
+ +**Selection operations are the same thing expressed differently.** Filter (boolean mask), Slice +(contiguous range), and Take (integer indices) are all row selection. The per-encoding +selection logic itself is inherent -- each encoding needs to know how to select rows -- but +today it is spread across three separate adaptor traits. Unifying them into a single +`Selection` parameter on `canonicalize_into` eliminates the trait boilerplate and makes it harder +for an encoding to implement Slice but forget Take. + +**The executor loop is hard to reason about.** The iterative reduce → reduce_parent → +execute_parent → execute loop runs up to 128 iterations. Each iteration may rewrite the tree +in surprising ways. Debugging requires tracing through multiple levels of indirection to +understand which vtable method fired and why. + +**Strict metadata-only reduce is required for GPU.** When buffers live on a GPU device, +host-side code *cannot* access them. The boundary between reduce (metadata-only) and execute +(buffer-accessing) is well-defined, but several existing implementations violate it -- some +`execute_parent` implementations that are actually metadata-only tree rewrites are registered +as execute rather than reduce. The new model eliminates execute entirely, making violations +impossible. + +Concrete use cases that are painful today: + +- **DuckDB dictionary export**: The DuckDB exporter wants to export dictionary-encoded columns + as DuckDB dictionary vectors. Today it must drive the executor loop, hoping it converges to a + DictArray, with no guarantee of success. With the new model, it runs optimize → peel → inspect + for DictArray in a predictable sequence. + +- **Selection-fused decompression**: BitPacked, FSST, and RunEnd can decompress only selected + rows, but today each must implement separate Filter, Slice, and Take adaptor traits. 
The
+ selection logic itself is the same, but with a unified `Selection` parameter it is harder
+ for an encoding to forget to implement one of the three selection forms, and the boilerplate
+ of three separate trait impls is eliminated.
+
+- **GPU execution**: The reduce/reduce_parent phase must be strictly metadata-only so it can run
+  on arrays whose buffers live on a GPU. The current model does not enforce this.
+
+## Design
+
+### VTable surface
+
+The new model has three VTable methods with clear, non-overlapping roles:
+
+```rust
+pub trait VTable {
+    /// Rewrite this array to a simpler form without touching data buffers.
+    /// Returns Ok(None) when no rewrite is possible.
+    fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>>;
+
+    /// Rewrite the parent array using knowledge of this child's encoding.
+    /// Used for expression push-down, cast push-down, constant folding, etc.
+    /// Must not access data buffers.
+    fn reduce_parent(
+        array: &Self::Array,
+        parent: &ArrayRef,
+        child_idx: usize,
+    ) -> VortexResult<Option<ArrayRef>>;
+
+    /// Materialize this array into a canonical builder, fusing the given row selection
+    /// with decompression. This is where all O(n) buffer-reading work happens.
+    fn canonicalize_into(
+        array: &Self::Array,
+        builder: &mut CanonicalBuilder,
+        selection: &Selection,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<()>;
+}
+```
+
+| Method             | Called by         | Purpose                                     | Buffer access |
+|--------------------|-------------------|---------------------------------------------|---------------|
+| `reduce`           | `optimize()` loop | Self-rewrite                                | Never         |
+| `reduce_parent`    | `optimize()` loop | Rewrite parent (expression push-down, etc.) | Never         |
+| `canonicalize_into`| `canonicalize()`  | Decompress with fused selection             | Yes           |
+
+`reduce` and `reduce_parent` are unchanged from the current model. `canonicalize_into` replaces
+`execute`, `execute_parent`, and `append_to_builder`. 
The top-level `canonicalize()` free
+function runs the full three-phase pipeline (optimize → peel → `canonicalize_into`).
+
+### Selection
+
+```rust
+pub enum Selection {
+    /// All rows are selected.
+    All,
+    /// A single row by index (replaces scalar_at).
+    Index(usize),
+    /// A contiguous range of rows.
+    Range(Range<usize>),
+    /// A boolean mask (true = selected).
+    Mask(Mask),
+    /// Arbitrary indices, possibly repeated, possibly nullable.
+    /// A null index produces a null row in the output.
+    Indices(PrimitiveArray),
+}
+
+impl Selection {
+    /// Compose two selections. self is applied first, then further restricted by other.
+    fn compose(&self, other: &Selection) -> Selection;
+    /// Number of selected rows.
+    fn count(&self) -> usize;
+
+    /// Apply selection in-place, compacting the selected elements to the front.
+    fn apply_in_place<T>(&self, data: &mut [T]);
+    /// Apply selection to a buffer, returning a new buffer with only selected elements.
+    /// Takes &Buffer<T> (not &[T]) so the output alignment is known.
+    fn apply<T>(&self, data: &Buffer<T>) -> BufferMut<T>;
+}
+```
+
+`Index` and `Range` avoid allocating a `Mask` for trivial cases (single element lookup,
+contiguous slice).
+
+The first four variants (`All`, `Index`, `Range`, `Mask`) are pure row subsetting -- the
+output is always a subset of the input, in input order, with no repetition. `Indices` is
+semantically different: it is a **gather** that can repeat positions, reorder rows, and insert
+nulls via null indices. This is not an ideal unification -- gather has different semantics from
+selection (output can be larger than input, row order is caller-defined, nullability is
+introduced). However, the alternative is a separate gather API that every encoding must
+implement alongside `canonicalize_into`, which recreates the combinatorial problem this RFC
+aims to eliminate. 
DictArray is used both as genuine dictionary encoding (small values, many
+codes) and as a general-purpose take over large arrays (e.g., defining a take mask over a full
+Vortex file, with null indices to insert null rows). The latter case requires `Indices` --
+materializing all values with `Selection::All` would be catastrophic when values is an entire
+file column.
+
+Encodings can handle `Indices` as a fallback: canonicalize with `Selection::All`, then gather
+from the canonical result. Performance-critical encodings (BitPacked, FSST) can optimize the
+gather path directly, just as they implement `TakeExecute` today.
+
+The `Index` variant subsumes `scalar_at` -- getting a single element is just materializing with
+a one-row selection:
+
+```rust
+pub fn scalar_at(array: &dyn Array, idx: usize, ctx: &mut ExecutionCtx) -> VortexResult<Scalar> {
+    let mut builder = CanonicalBuilder::new(array.dtype());
+    array.canonicalize_into(&mut builder, &Selection::Index(idx), ctx)?;
+    builder.finish().into_array().scalar_at(0) // trivial on canonical
+}
+```
+
+### CanonicalBuilder
+
+`CanonicalBuilder` replaces the current `dyn ArrayBuilder` with a closed enum. The current
+`ArrayBuilder` is a trait object -- callers get a `Box<dyn ArrayBuilder>` and can only interact
+with it through generic methods like `append_scalar()` and `extend_from_array()`. Encodings
+cannot know what concrete builder they are writing into, so they must produce intermediate
+arrays or scalars and let the builder copy the data in.
+
+`CanonicalBuilder` is an enum that mirrors the `Canonical` variants in mutable builder form:
+
+```rust
+pub enum CanonicalBuilder {
+    Null(NullBuilder),
+    Bool(BoolBuilder),
+    Primitive(PrimitiveBuilder),
+    Decimal(DecimalBuilder),
+    VarBinView(VarBinViewBuilder),
+    List(ListViewBuilder),
+    FixedSizeList(FixedSizeListBuilder),
+    Struct(StructBuilder),
+    Extension(ExtensionBuilder),
+}
+
+impl CanonicalBuilder {
+    pub fn new(dtype: &DType) -> Self { ... 
}
+    pub fn finish(self) -> Canonical { ... }
+}
+```
+
+Because it is an enum (not `dyn`), encodings can match on it to access the concrete builder
+type directly. This enables zero-copy decompression paths that are impossible with `dyn
+ArrayBuilder`. For example, FSST can match for the `VarBinView` variant and push decompressed
+buffers and views directly into the builder's internal storage, avoiding an intermediate string
+allocation:
+
+```rust
+fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, selection, ctx) {
+    let CanonicalBuilder::VarBinView(vbv_builder) = builder else {
+        unreachable!("FSST must canonicalize into VarBinView");
+    };
+    let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?;
+    vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...);
+}
+```
+
+`CanonicalBuilder` is only canonical at the root level. For Struct, field arrays are
+accumulated as `Vec<ArrayRef>` (potentially still compressed/chunked) rather than being
+recursively decompressed into nested builders. This generalizes the `pack_struct_chunks`
+pattern that already exists in the chunked canonicalization path.
+
+### The three-phase pipeline
+
+Canonicalization is a three-phase pipeline with no executor loop:
+
+```rust
+pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Canonical> {
+    // Phase 1: optimize tree (reduce rules, expression push-down)
+    let array = array.optimize_recursive()?;
+
+    // Phase 2: peel selection wrappers
+    let (array, selection) = array.peel_selection();
+
+    // Phase 3: canonicalize_into (single recursive descent, no loop)
+    let mut builder = CanonicalBuilder::new(array.dtype());
+    array.canonicalize_into(&mut builder, &selection, ctx)?;
+    Ok(builder.finish())
+}
+```
+
+**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. 
This handles all +metadata-only rewrites: expression push-down into Dict values, RunEnd values; constant folding; +comparison target transformation for ALP, FoR, FSST; structural simplification. No data buffers +are read. + +**Phase 2** calls `array.peel_selection()` to strip Filter, Slice, and Take wrappers from the +root, composing them into a single `Selection`. The remaining array tree has no selection +wrappers at its root. + +**Phase 3** is a single recursive descent through `canonicalize_into`. Each encoding decompresses +with the fused selection. There is no iteration -- each encoding knows how to decompress in one +call. Selection wrappers encountered deeper in the tree compose themselves into the selection +during the recursive descent. + +### Selection composition during materialization + +Selection wrappers (Slice, Filter) still exist as array types for lazy representation (e.g., a +file reader produces sliced arrays). During materialization, their `canonicalize_into` composes the +wrapper into the selection and delegates to the child: + +```rust +// SliceArray's canonicalize_into +fn canonicalize_into(slice: &SliceArray, builder, selection, ctx) { + let composed = selection.compose(Selection::Range(slice.range())); + slice.child().canonicalize_into(builder, &composed, ctx) +} + +// FilterArray's canonicalize_into +fn canonicalize_into(filter: &FilterArray, builder, selection, ctx) { + let composed = selection.compose(Selection::Mask(filter.mask())); + filter.child().canonicalize_into(builder, &composed, ctx) +} +``` + +Phase 2 (`peel_selection`) is only needed when the caller wants to inspect the array before +materializing (e.g., the DuckDB exporter checking for DictArray). During normal +materialization, selection composition happens naturally during the recursive descent. 
+
+### ScalarFnArray materialization
+
+ScalarFnArray's `canonicalize_into` materializes its children, then evaluates the function on
+the results:
+
+```rust
+fn canonicalize_into(
+    array: &ScalarFnArray,
+    builder: &mut CanonicalBuilder,
+    selection: &Selection,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let children: Vec<Columnar> = array.children().iter()
+        .map(|child| {
+            // Constants pass through without expansion
+            if let Some(c) = child.as_opt::<ConstantArray>() {
+                return Ok(Columnar::Constant(c.clone()));
+            }
+            let mut child_builder = CanonicalBuilder::new(child.dtype());
+            child.canonicalize_into(&mut child_builder, selection, ctx)?;
+            Ok(Columnar::Canonical(child_builder.finish()))
+        })
+        .try_collect()?;
+
+    let result = array.scalar_fn().execute(children, ctx)?;
+    builder.extend_from_array(&result);
+    Ok(())
+}
+```
+
+By the time `canonicalize_into` runs, `reduce_parent` has already pushed operations into compressed
+domains. Dict's `reduce_parent` rewrites `compare(dict(codes, values), x)` into
+`dict(codes, compare(values, x))`, so the comparison only runs on unique values, not every row.
+
+### Constants
+
+Constants are inherently special. "This value repeated N times without N copies" requires
+*someone* to know about broadcasting, and no representation trick eliminates that.
+
+`ConstantArray` continues to exist as an encoding. Its `canonicalize_into` expands the value
+into the builder, just like any other encoding decompresses. The only place constants get
+special treatment is ScalarFnArray's `canonicalize_into` -- one check to avoid expanding
+constants before
+passing to the scalar function. This preserves the `Columnar` enum:
+
+```rust
+pub enum Columnar {
+    Canonical(Canonical),
+    Constant(ConstantArray),
+}
+```
+
+`ScalarFn::execute` receives `Vec<Columnar>`. Most scalar function implementations handle
+constant inputs efficiently -- e.g., `add(primitive, constant(1))` broadcasts the scalar
+without allocating a repeated buffer. 
Reduce rules interact with constants via `as_opt::<ConstantArray>()` to read the scalar value
+(metadata access, not buffer access). Constant folding, constant propagation (`slice(constant)
+→ constant`, `filter(constant, mask) → constant`), and related rules are all metadata-only.
+
+### Per-encoding canonicalize_into
+
+Each encoding implements a single decompression method that handles selection:
+
+- **BitPacked**: unpack only selected positions into PrimitiveBuilder
+- **FSST**: decompress only selected strings into VarBinViewBuilder
+- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those
+- **Dict**: materialize codes with selection, take from values
+- **Chunked**: split selection across chunks, recurse per-chunk
+- **Struct**: pass selection to each field's materialization
+- **Constant**: create scalar repeated to selection count
+
+For fused decompression patterns (e.g., Dict-RLE), the parent encoding checks inline:
+
+```rust
+// Dict's canonicalize_into
+fn canonicalize_into(dict: &DictArray, builder, selection, ctx) {
+    // Check for fused Dict-RLE pattern
+    if let Some(runend) = dict.codes().as_opt::<RunEndArray>() {
+        return fused_dict_rle(dict.values(), runend, builder, selection, ctx);
+    }
+
+    // Generic path: materialize codes with selection, take from values
+    let mut code_builder = primitive_builder(dict.codes().dtype());
+    dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?;
+    let codes = code_builder.finish_into_primitive();
+    take_into_builder(builder, dict.values(), &codes, ctx)
+}
+```
+
+### Decompression cache and CSE
+
+`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped
+to a single `canonicalize()` call. This serves two purposes:
+
+- **Common sub-expression elimination**: when the same array node appears as a child of
+  multiple ScalarFnArray nodes (e.g., `x + x`), it is materialized once. 
+- **Shared block decompression**: when a ZStd-compressed array is sliced into many chunks, all
+  slices share the same underlying compressed block. The first slice decompresses the block and
+  caches the result; subsequent slices index into it.
+
+The cache key is the raw pointer to the array, which is stable because arrays are `Arc`-based.
+The cache is dropped after `canonicalize()` returns, so it does not leak memory across calls.
+
+### Worked example: DuckDB dictionary export
+
+A Vortex file scan produces a chunk with a dictionary column. The query has a filter predicate
+(`name = 'alice'`) evaluated to a mask, and a projection (`upper(name)`). The array tree:
+
+```
+filter(
+    scalar_fn(upper, [
+        dict(
+            codes: bitpacked([0,1,0,2,1,0,...]),
+            values: fsst(["alice","bob","charlie"])
+        )
+    ]),
+    mask
+)
+```
+
+**Phase 1: optimize.** `reduce_parent` pushes `upper` into dict values:
+
+```
+filter(
+    dict(
+        codes: bitpacked([0,1,0,2,1,0,...]),
+        values: scalar_fn(upper, [fsst(["alice","bob","charlie"])])
+    ),
+    mask
+)
+```
+
+**Phase 2: peel selections.** Filter is at the root, so it peels:
+
+```
+array = dict(codes: bitpacked(...), values: scalar_fn(upper, [fsst(...)]))
+selection = Selection::Mask(mask)
+```
+
+**Phase 3: export.** The DuckDB exporter inspects the tree:
+
+```rust
+fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<DuckDBVector> {
+    let array = array.optimize_recursive()?;
+    let (array, selection) = array.peel_selection();
+
+    if let Some(dict) = array.as_opt::<DictArray>() {
+        return export_dict(dict, &selection, ctx);
+    }
+
+    // Fallback: full materialization
+    let mut builder = CanonicalBuilder::new(array.dtype());
+    array.canonicalize_into(&mut builder, &selection, ctx)?;
+    export_canonical(builder.finish())
+}
+
+fn export_dict(
+    dict: &DictArray,
+    selection: &Selection,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<DuckDBVector> {
+    // Materialize ALL unique values (no selection -- values are shared)
+    let mut val_builder = 
CanonicalBuilder::new(dict.values().dtype()); + dict.values().canonicalize_into(&mut val_builder, &Selection::All, ctx)?; + let values = val_builder.finish(); + + // Materialize codes WITH selection (only rows that passed the filter) + let mut code_builder = primitive_builder(dict.codes().dtype()); + dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?; + let codes = code_builder.finish_into_primitive(); + + Ok(DuckDBVector::dictionary(values, codes)) +} +``` + +**Values path**: `scalar_fn(upper, [fsst(["alice","bob","charlie"])])` materializes via +ScalarFnArray's `canonicalize_into`. FSST bulk-decompresses 3 strings, `upper` runs on canonical +input, producing `["ALICE","BOB","CHARLIE"]`. + +**Codes path**: `bitpacked([0,1,0,2,1,0,...])` materializes with `Selection::Mask(mask)`. +BitPacked unpacks only the positions where the mask is true. + +**Result**: DuckDB gets a dictionary vector with 3 uppercase values and only the selected +codes. `upper()` was computed on 3 unique values instead of N rows. The filter was fused with +bit-unpacking. No intermediate arrays were materialized. + +### What changes + +**Removed:** + +- `execute` VTable method -- replaced by `canonicalize_into(builder, selection)`. +- `execute_parent` VTable method -- selection-related uses replaced by `Selection`; + compressed-domain computation uses replaced by `reduce_parent` rules. +- The executor loop -- no more iterative execute-until-canonical. Optimization is a reduce-only + loop; materialization is a single recursive descent. +- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, + `SliceReduceAdaptor`, `SliceExecuteAdaptor`, `TakeExecuteAdaptor`, and all their per-encoding + implementations (~40+ impls total). +- Canonical-type `execute_parent` kernels -- Primitive + Compare, Bool + FillNull, Decimal + + Between, etc. These move into the scalar function's `execute` method where they belong. 
+
+**Retained:**
+
+- `reduce` / `reduce_parent` -- unchanged. Still drive expression push-down, cast push-down,
+  constant folding, structural simplification. Strictly metadata-only, no buffer access.
+- `ScalarFnArray` -- kept as per-node lazy computation. Composes with `reduce_parent` for
+  expression push-down into compressed encodings.
+- Filter/Slice/Take array wrappers -- still exist for lazy representation. Their
+  `canonicalize_into` composes them into the Selection and delegates to the child. These could be
+  unified into a single `SelectionArray` wrapping a `Selection` in the future.
+
+**New:**
+
+- `Selection` type -- unifies Filter, Slice, and Take into a single row-selection
+  representation that composes and threads through materialization.
+- `CanonicalBuilder` -- one-level-deep builder enum. Struct children accumulate as
+  `Vec<ArrayRef>` rather than being decompressed into child builders.
+- `Array::peel_selection()` -- method to strip selection wrappers from the root, used by
+  exporters that want to inspect the tree before materializing. 
+ +**Summary comparison:** + +| Aspect | Current | Proposed | +|--------------------------------|---------------------------------------------------------------------------------|---------------------------------------------------| +| VTable methods | 5 (`reduce`, `reduce_parent`, `execute`, `execute_parent`, `append_to_builder`) | 3 (`reduce`, `reduce_parent`, `canonicalize_into`) | +| Adaptor types per operation | 2 (Reduce + Execute) | 1 (Reduce only, for expression push-down ops) | +| Selection handling | Per-encoding Filter, Slice, Take adaptors | Single `Selection` parameter | +| Materialization | Iterative loop calling execute until convergence | Single recursive descent through `canonicalize_into` | +| Executor loop | reduce → reduce_parent → execute_parent → execute, repeated | optimize (reduce loop) → peel selections → build | +| Builder model | Fully recursive (forces deep canonicalization) | One-level canonical (children stay compressed) | +| Fused decompression (Dict-RLE) | Child volunteers via execute_parent | Parent checks for pattern in its own `canonicalize_into` | +| Buffer access in reduce | Ambiguous | Never (strict, works on any device including GPU) | + +### Migration path + +The migration is designed so that every step leaves the codebase in a working state. The old +and new execution paths coexist until the new path covers all encodings, at which point the +old path is deleted. Each phase is independently shippable. + +**Phase 1: Foundation types.** Introduce `Selection` (with `compose`, `count`, `apply`, +`apply_in_place`), `CanonicalBuilder` enum, and `Array::peel_selection()`. All additive, no +behavior changes. 
+
+**Phase 2: Add `canonicalize_into` with fallback default.** Add `canonicalize_into` to `VTable` with a
+default implementation that bridges to the old execute path:
+
+```rust
+fn canonicalize_into(
+    array: &Self::Array,
+    builder: &mut CanonicalBuilder,
+    selection: &Selection,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Fallback: use existing execute-to-canonical, then apply selection
+    let canonical = Canonical::execute(array.as_array_ref().clone(), ctx)?;
+    let selected = selection.apply_to_array(canonical.into_array())?;
+    builder.extend_from_array(&selected)?;
+    Ok(())
+}
+```
+
+Wire up `canonicalize()` using the three-phase pipeline. Both old (`execute::<Canonical>`) and
+new (`canonicalize()`) paths work. Integration tests assert equivalence.
+
+**Phase 3: Migrate canonical encodings.** One encoding per PR: Null, Bool, Primitive, Decimal,
+VarBinView, Struct, List/FixedSizeList, Extension, Constant. Each writes directly into the
+builder with fused selection. Independent and parallelizable.
+
+**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into
+`Selection` instead of executing independently.
+
+**Phase 5: Migrate compressed encodings.** Selection-fused decompression for BitPacked,
+RunEnd, Dict, FSST, ALP, FoR, ZStd, Sparse, and others. Start with highest-value encodings.
+Each encoding is an independent PR.
+
+**Phase 6: Move `execute_parent` → `reduce_parent`.** Compressed-domain rewrites (Dict +
+Compare, ALP + Compare, FoR + Compare, FSST + Compare) become `reduce_parent` rules.
+Canonical-type kernels (Primitive + Compare, Bool + FillNull) move into `ScalarFn::execute`.
+Once empty, remove `execute_parent` from VTable.
+
+**Phase 7: Delete old execution path.** Delete all selection adaptor traits and their ~40+
+per-encoding implementations. Delete `execute` VTable method, the executor loop, and the
+`Executable` trait. 
All callers of `execute::<Canonical>()` migrate to `canonicalize()` (the
+free function that runs the full three-phase pipeline).
+
+**Phase 8: Replace `scalar_at`.** `scalar_at` becomes a free function using
+`Selection::Index`. Remove `OperationsVTable::scalar_at` and per-encoding implementations.
+
+**Phase 9: Wire up exporters.** DuckDB exporter uses optimize → peel → inspect → export.
+Add decompression cache to `ExecutionCtx`.
+
+**Ordering and parallelism:**
+
+```
+Phase 1: Foundation types
+        │
+Phase 2: canonicalize_into with fallback
+        │
+        ├── Phase 3: Canonical encodings (independent per encoding)
+        ├── Phase 4: Selection wrappers
+        ├── Phase 5: Compressed encodings (independent per encoding)
+        └── Phase 6: execute_parent → reduce_parent (independent per rule)
+                │
+          Phase 7: Delete old path (after 3-6 complete)
+                │
+                ├── Phase 8: scalar_at → Selection::Index
+                └── Phase 9: Exporter updates
+```
+
+Phases 3-6 can proceed in parallel across contributors. The fallback default in Phase 2 ensures
+nothing breaks while encodings are migrated at their own pace. Phase 7 is the cleanup gate.
+
+## Compatibility
+
+This RFC does not change the file format or wire format. All changes are internal to the
+execution engine.
+
+**Public API breakage:**
+
+- `Executable` trait and `execute::<Canonical>()` are removed. Callers migrate to `canonicalize()` or
+  call `canonicalize_into` directly.
+- `OperationsVTable::scalar_at` is removed. Callers use the free function `scalar_at()` which
+  delegates to `canonicalize_into` with `Selection::Index`.
+- Third-party encodings that implement `execute`, `execute_parent`, or selection adaptors must
+  migrate to `canonicalize_into`. The fallback default (Phase 2) provides a bridge during
+  migration.
+
+**Performance implications:**
+
+- Selection-fused decompression should improve performance for filtered/sliced reads by
+  avoiding intermediate materialization. 
+- Single-element access via `Selection::Index` may be marginally slower than today's + `scalar_at` for encodings that currently fast-path scalar access. These encodings can + fast-path `Selection::Index` in their `canonicalize_into` if needed. +- The decompression cache avoids redundant work for shared sub-expressions and ZStd blocks. + +## Drawbacks + +- **Migration effort.** The migration touches every encoding in the codebase (~33 total). The + phased approach with a fallback default mitigates this, but it is still significant work + spread across many PRs. + +- **Fused pattern detection moves to the parent.** In the current model, a child encoding can + volunteer to handle its parent's execution via `execute_parent` (e.g., RunEnd volunteers for + Dict-RLE). In the new model, the parent must check for the pattern in its own + `canonicalize_into`. This means the parent must know about the child encoding, which is a + slight inversion of responsibility. In practice, the known fused patterns (Dict-RLE) involve + encodings that are both Arrow-supported and live in the builtin `vortex-array` crate, so the + cross-encoding dependency is not a crate boundary issue. + +- **Selection composition complexity.** `Selection::compose` must handle all variant + combinations correctly (Mask + Range, Indices + Mask, etc.). This is concentrated complexity + in one well-tested type, but getting it wrong would cause subtle data corruption. + +- **`Indices` conflates selection with gather.** The first four `Selection` variants are pure + row subsetting (output <= input, preserves order, no nulls introduced). `Indices` is a + gather: it can repeat rows, reorder them, and introduce nulls. Bundling both concepts into + one enum is a pragmatic compromise -- the alternative (a separate gather API every encoding + must implement) recreates the combinatorial explosion this RFC aims to eliminate -- but it + means `Selection` is not a clean abstraction. 
+ +## Alternatives + +### Keep the iterative execution model + +We could keep the current `execute`/`execute_parent` loop and address individual pain points +(e.g., adding selection fusion as a new adaptor). This avoids the migration cost but perpetuates +the unclear distinction between reduce and execute, the combinatorial adaptor explosion, and the +difficulty of reasoning about the executor loop. The VTable surface stays at 5 methods. + +### Use a single ExpressionArray instead of per-node ScalarFnArray + +Instead of one ScalarFnArray per operation, a single ExpressionArray could hold an entire +expression tree. This would be simpler in some ways but prevents `reduce_parent` from +interacting with individual expression nodes. The per-node approach is what makes Dict's +`reduce_parent` able to push `compare` into dict values -- the key optimization for +dictionary-encoded predicates. + +### Use `dyn ArrayBuilder` instead of `CanonicalBuilder` enum + +The builder could be a trait object (`&mut dyn ArrayBuilder`) instead of an enum. This is more +extensible but prevents encodings from matching on the concrete builder type. FSST's ability to +push directly into `VarBinViewBuilder` (avoiding an intermediate string allocation) depends on +knowing the builder variant at compile time. + +### Separate `scalar_at` from `Selection` + +We could keep `scalar_at` as a separate VTable method for single-element access, rather than +subsuming it into `Selection::Index`. This avoids any performance regression for scalar access +but adds a fourth VTable method and requires every encoding to implement it separately. + +## Prior Art + +- **DataFusion** has `PhysicalExpr::evaluate_selection`, which takes an explicit selection + (equivalent to our `Selection`) and threads it through expression evaluation. This is the + closest analogue to threading `Selection` through `canonicalize_into`. 
+ +## Unresolved Questions + +- **Selection composition correctness**: Composing Mask + Range + Indices needs careful + implementation and thorough testing, especially for `Indices` which introduces repetition, + reordering, and nullability. This is a well-tested utility type vs N x 3 adaptor + implementations, so the tradeoff is favorable, but it needs to be right. + +- **CanonicalBuilder design**: The exact interface for one-level-deep building, especially for + List and Extension types, needs design work. + +- **Aggregate and window functions**: This RFC focuses on scalar functions. AggregateFnArray + and WindowFnArray will need their own materialization paths, which are explicitly out of scope + for this RFC. + +- **`Indices` semantics mismatch**: `Indices` is a gather (repetition, reordering, nullable) + while the other `Selection` variants are pure subsetting. This means `compose`, `count`, and + `apply` must handle two fundamentally different operations. Whether this tension causes + practical problems (e.g., confusing behavior in compose chains) needs to be evaluated during + implementation. + +## Future Possibilities + +### Selection pull-up + +In the new model, selection wrappers compose naturally into the `Selection` parameter during +the recursive descent. However, when a selection is nested inside an expression, it cannot be +peeled: + +``` +scalar_fn(upper, [filter(dict(...), mask)]) +``` + +`peel_selection()` sees ScalarFnArray at the root, not FilterArray. If `reduce_parent` has no +rule for this function + encoding combination, the tree stays as-is and the filter gets +composed in during descent. `upper` runs on N selected rows, producing a flat canonical result. +The dictionary structure is lost. + +A future optimization could **pull selections out of expressions** to the root: + +``` +filter(scalar_fn(upper, [dict(...)]), mask) +``` + +With the selection at the root, `peel_selection()` extracts it, and subsequent optimizations +can proceed. 
Better decisions can be made when the selection is available at the top and
+pushed down at execution time, rather than being trapped inside an expression.
+
+In practice, most scans apply filters at the top level of the array tree, so this edge case is
+uncommon. But for complex expression trees with embedded selections, pull-up would enable
+strictly better execution plans.
+
+### Aggregate and window functions
+
+The execution model is designed to support additional function types beyond scalar functions.
+`AggregateFnArray` (for sum, min, max, count) and `WindowFnArray` will follow a similar
+deferred pattern, using the same `reduce`/`reduce_parent` optimization strategy and
+`Selection`-based materialization.
+
+### GPU kernel fusion
+
+The strict metadata-only constraint on `reduce`/`reduce_parent` means the optimized tree can
+be shipped to a GPU context. A GPU-aware `canonicalize_into` could fuse the entire optimized tree
+into a single kernel launch, reducing memory traffic and kernel overhead.
diff --git a/styles.css b/styles.css index d74bf70..94495b3 100644 --- a/styles.css +++ b/styles.css @@ -8,6 +8,16 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; + --hl-keyword: #a626a4; + --hl-string: #50a14f; + --hl-comment: #a0a1a7; + --hl-function: #4078f2; + --hl-number: #986801; + --hl-type: #c18401; + --hl-built-in: #e45649; + --hl-attr: #986801; + --hl-symbol: #0184bc; + --hl-meta: #a0a1a7; } @media (prefers-color-scheme: dark) { @@ -21,6 +31,16 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; + --hl-keyword: #c678dd; + --hl-string: #98c379; + --hl-comment: #5c6370; + --hl-function: #61afef; + --hl-number: #d19a66; + --hl-type: #e5c07b; + --hl-built-in: #e06c75; + --hl-attr: #d19a66; + --hl-symbol: #56b6c2; + --hl-meta: #5c6370; } } @@ -34,6 +54,16 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; + --hl-keyword: #c678dd; + --hl-string: #98c379; + --hl-comment: #5c6370; + --hl-function: #61afef; + --hl-number: #d19a66; + --hl-type: #e5c07b; + --hl-built-in: #e06c75; + --hl-attr: #d19a66; + --hl-symbol: #56b6c2; + --hl-meta: #5c6370; } :root[data-theme="light"] { @@ -46,6 +76,16 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; + --hl-keyword: #a626a4; + --hl-string: #50a14f; + --hl-comment: #a0a1a7; + --hl-function: #4078f2; + --hl-number: #986801; + --hl-type: #c18401; + --hl-built-in: #e45649; + --hl-attr: #986801; + --hl-symbol: #0184bc; + --hl-meta: #a0a1a7; } * { @@ -459,3 +499,27 @@ footer { color: var(--fg-muted); font-size: 0.875rem; } + +/* Syntax highlighting (highlight.js token classes) */ +.hljs { background: var(--code-bg); color: var(--fg); } +.hljs-keyword, +.hljs-selector-tag { color: var(--hl-keyword); } +.hljs-string, +.hljs-addition { color: var(--hl-string); } +.hljs-comment, +.hljs-quote { color: var(--hl-comment); font-style: italic; } +.hljs-title, +.hljs-title.function_ { color: var(--hl-function); } +.hljs-number, +.hljs-literal { color: var(--hl-number); } +.hljs-type, 
+.hljs-title.class_ { color: var(--hl-type); } +.hljs-built_in { color: var(--hl-built-in); } +.hljs-attr, +.hljs-variable, +.hljs-template-variable { color: var(--hl-attr); } +.hljs-symbol, +.hljs-link { color: var(--hl-symbol); } +.hljs-meta, +.hljs-deletion { color: var(--hl-meta); } +.hljs-punctuation { color: var(--fg-muted); } From d0d3f69c9ba737f627303f0c178af5b39c0fbb33 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 26 Feb 2026 21:31:15 -0500 Subject: [PATCH 2/9] RFCs Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 1244 ++++++++++------- ...02-aggregate-functions-and-list-scalars.md | 858 ++++++++++++ 2 files changed, 1576 insertions(+), 526 deletions(-) create mode 100644 proposals/0002-aggregate-functions-and-list-scalars.md diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 170ccb8..7ad5811 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,192 +4,113 @@ ## Summary -Replace the current iterative `execute`/`execute_parent` execution loop with a three-phase -pipeline -- **optimize**, **peel selections**, **canonicalize** -- that reduces the VTable -surface from five methods to three, unifies Filter/Slice/Take into a composable `Selection` -type fused with decompression, and replaces the executor loop with a single recursive descent -through a new `canonicalize_into` VTable method. +Replace the current execution model with a cleaner architecture that: +- Enables decompress-into-buffer (caller-owned output) for zero-copy decompression +- Passes a `Mask` to the execution method, forcing encodings to handle row selection +- Cleans up the reduce/execute boundary and removes migration-era adaptor boilerplate -## Motivation - -The current execution model has five VTable extension points: `reduce`, `reduce_parent`, -`execute`, `execute_parent`, and `append_to_builder`. 
Each encoding can implement all five, and -adaptor types (Filter, Slice, Take, Compare, Cast, FillNull, Between, Mask, Zip, Like, Not, -ListContains) each come in Reduce and Execute variants. This leads to a combinatorial explosion -of implementations -- Dict alone has 7 parent reduce rules and 3 parent execute kernels. - -**The distinction between extension points is unclear.** The stated rule is "reduce = -metadata-only, execute = may read buffers," but many operations blur this boundary. Some -`execute_parent` implementations (Dict + Compare, ALP + Compare, FSST + Compare) are -metadata-only tree rewrites that belong in `reduce_parent`. Others (Primitive + Compare, -Bool + FillNull) are just the scalar function's default implementation on canonical inputs -- -redundant with the function's own `execute` method. - -**Selection operations are the same thing expressed differently.** Filter (boolean mask), Slice -(contiguous range), and Take (integer indices) are all row selection. The per-encoding -selection logic itself is inherent -- each encoding needs to know how to select rows -- but -today it is spread across three separate adaptor traits. Unifying them into a single -`Selection` parameter on `canonicalize_into` eliminates the trait boilerplate and makes it harder -for an encoding to implement Slice but forget Take. - -**The executor loop is hard to reason about.** The iterative reduce → reduce_parent → -execute_parent → execute loop runs up to 128 iterations. Each iteration may rewrite the tree -in surprising ways. Debugging requires tracing through multiple levels of indirection to -understand which vtable method fired and why. - -**Strict metadata-only reduce is required for GPU.** When buffers live on a GPU device, -host-side code *cannot* access them. 
The boundary between reduce (metadata-only) and execute -(buffer-accessing) is well-defined, but several existing implementations violate it -- some -`execute_parent` implementations that are actually metadata-only tree rewrites are registered -as execute rather than reduce. The new model eliminates execute entirely, making violations -impossible. +Two candidate designs are presented: **Option A** (three-phase recursive) and **Option B** +(cleaned-up iterative). Both share the `Mask`-based row selection and the reduce boundary cleanup. They +differ in how materialization is structured -- Option A passes a caller-owned builder through +recursive descent, Option B keeps an iterative loop with typed execution steps. -Concrete use cases that are painful today: +## Motivation -- **DuckDB dictionary export**: The DuckDB exporter wants to export dictionary-encoded columns - as DuckDB dictionary vectors. Today it must drive the executor loop, hoping it converges to a - DictArray, with no guarantee of success. With the new model, it runs optimize → peel → inspect - for DictArray in a predictable sequence. +**No decompress-into-buffer.** The current execution model has no way to decompress an array +into a caller-provided output buffer. Each encoding allocates its own output during execution. +This means ChunkedArray must decompress each chunk separately and concatenate the results -- +an extra copy of the entire column. Exporters (DuckDB, Arrow) cannot write directly into their +output format; they must receive Vortex's output and copy it. This is the primary pain point. -- **Selection-fused decompression**: BitPacked, FSST, and RunEnd can decompress only selected - rows, but today each must implement separate Filter, Slice, and Take adaptor traits. 
The - selection logic itself is the same, but with a unified `Selection` parameter it is harder - for an encoding to forget to implement one of the three selection forms, and the boilerplate - of three separate trait impls is eliminated. +**Stack overflows from recursive execution.** The current executor is recursive: each +`execute` call may recurse into children, which recurse into their children. On deep array +trees (deeply nested expressions, many-chunk arrays) this overflows the stack. The recursion +depth is bounded by a 128-iteration limit, but this limit applies to the outer loop -- inner +recursive calls have no bound. -- **GPU execution**: The reduce/reduce_parent phase must be strictly metadata-only so it can run - on arrays whose buffers live on a GPU. The current model does not enforce this. +**Easy to forget selection implementations.** Filter (boolean mask), Slice (contiguous range), +and Take (integer indices) are separate adaptor traits. Each encoding can implement one and +forget the others. Fusing selection with decompression is really just +`execute_parent(FilterArray)` -- which is fine -- but the current design makes it easy to miss. +Passing a `Mask` parameter to the execution method forces every encoding to handle +selection, eliminating the gap. -## Design +**Adaptor explosion is migration noise.** The adaptor types (FilterReduceAdaptor, +SliceExecuteAdaptor, TakeExecuteAdaptor, etc.) are a bridge from the old compute kernel +dispatch to the new VTable world. The reduce/execute distinction is already enforced -- we just +haven't deprecated the APIs that don't require an `ExecutionCtx`. The ~40+ adaptor impls are +migration artifacts, not a fundamental design problem, but they add noise and make the codebase +harder to navigate. -### VTable surface +**Strict metadata-only reduce is useful for GPU.** When buffers live on a GPU device, +host-side code cannot easily access them. 
A clean metadata-only reduce boundary is useful +(though not strictly required) for GPU execution paths. -The new model has three VTable methods with clear, non-overlapping roles: +## Shared design -```rust -pub trait VTable { - /// Rewrite this array to a simpler form without touching data buffers. - /// Returns Ok(None) when no rewrite is possible. - fn reduce(array: &Self::Array) -> VortexResult>; - - /// Rewrite the parent array using knowledge of this child's encoding. - /// Used for expression push-down, cast push-down, constant folding, etc. - /// Must not access data buffers. - fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, - ) -> VortexResult>; - - /// Materialize this array into a canonical builder, fusing the given row selection - /// with decompression. This is where all O(n) buffer-reading work happens. - fn canonicalize_into( - array: &Self::Array, - builder: &mut CanonicalBuilder, - selection: &Selection, - ctx: &mut ExecutionCtx, - ) -> VortexResult<()>; -} -``` +Both options share these design elements. -| Method | Called by | Purpose | Buffer access | -|--------------------|-------------------|---------------------------------------------|---------------| -| `reduce` | `optimize()` loop | Self-rewrite | Never | -| `reduce_parent` | `optimize()` loop | Rewrite parent (expression push-down, etc.) | Never | -| `canonicalize_into`| `canonicalize()` | Decompress with fused selection | Yes | +### Mask as row selection -`reduce` and `reduce_parent` are unchanged from the current model. `canonicalize_into` replaces -`execute`, `execute_parent`, and `append_to_builder`. The top-level `canonicalize()` free -function runs the full three-phase pipeline (optimize → peel → `canonicalize_into`). +The existing `Mask` type (in `vortex-mask`) already represents row selection with optimized +internal variants: `AllTrue(usize)`, `AllFalse(usize)`, and `Values(Arc)` for a +full bitmap. 
It already supports `slice()` for restricting to a range. No new type is needed -- +`Mask` is passed directly to the execution method as the row selection parameter. -### Selection +Composition is mask-over-mask: slicing a mask produces a narrower mask. The main addition is a +`compose` method (or equivalent) for restricting one mask by another, and `apply` for +compacting buffer elements. ```rust -pub enum Selection { - /// All rows are selected. - All, - /// A single row by index (replaces scalar_at). - Index(usize), - /// A contiguous range of rows. - Range(Range), - /// A boolean mask (true = selected). - Mask(Mask), - /// Arbitrary indices, possibly repeated, possibly nullable. - /// A null index produces a null row in the output. - Indices(PrimitiveArray), -} +// Existing type, extended with: +impl Mask { + /// Restrict self by other: keep only positions where both are true. + pub fn compose(&self, other: &Mask) -> Mask; -impl Selection { - /// Compose two selections. self is applied first, then further restricted by other. - fn compose(&self, other: &Selection) -> Selection; - /// Number of selected rows. - fn count(&self) -> usize; - - /// Apply selection in-place, compacting the selected elements to the front. - fn apply_in_place(&self, data: &mut [T]); - /// Apply selection to a buffer, returning a new buffer with only selected elements. - /// Takes &Buffer (not &[T]) so the output alignment is known. - fn apply(&self, data: &Buffer) -> BufferMut; + /// Apply mask to a buffer, returning a new buffer with only selected elements. + pub fn apply(&self, data: &Buffer) -> BufferMut; } ``` -`Index` and `Range` avoid allocating a `Mask` for trivial cases (single element lookup, -contiguous slice). - -The first four variants (`All`, `Index`, `Range`, `Mask`) are pure row subsetting -- the -output is always a subset of the input, in input order, with no repetition. 
`Indices` is -semantically different: it is a **gather** that can repeat positions, reorder rows, and insert -nulls via null indices. This is not an ideal unification -- gather has different semantics from -selection (output can be larger than input, row order is caller-defined, nullability is -introduced). However, the alternative is a separate gather API that every encoding must -implement alongside `canonicalize_into`, which recreates the combinatorial problem this RFC -aims to eliminate. DictArray is used both as genuine dictionary encoding (small values, many -codes) and as a general-purpose take over large arrays (e.g., defining a take mask over a full -Vortex file, with null indices to insert null rows). The latter case requires `Indices` -- -materializing all values with `Selection::All` would be catastrophic when values is an entire -file column. - -Encodings can handle `Indices` as a fallback: canonicalize with `Selection::All`, then gather -from the canonical result. Performance-critical encodings (BitPacked, FSST) can optimize the -gather path directly, just as they implement `TakeExecute` today. - -The `Index` variant subsumes `scalar_at` -- getting a single element is just materializing with -a one-row selection: +**Take is not selection.** Take (integer indices) can reorder rows, repeat rows, and introduce +nulls via out-of-bounds indices. These semantics are fundamentally different from subsetting. +Gather remains an encoding-internal concern -- DictArray handles codes → values lookup inside +its own materialization. 
-```rust -pub fn scalar_at(array: &dyn Array, idx: usize, ctx: &mut ExecutionCtx) -> VortexResult { - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &Selection::Index(idx), ctx)?; - builder.finish().into_array().scalar_at(0) // trivial on canonical -} -``` +**`scalar_at` remains a separate VTable method.** Single-element access via a builder +round-trip would regress hot paths like RunEnd binary search probing individual positions. ### CanonicalBuilder -`CanonicalBuilder` replaces the current `dyn ArrayBuilder` with a closed enum. The current -`ArrayBuilder` is a trait object -- callers get a `Box` and can only interact -with it through generic methods like `append_scalar()` and `extend_from_array()`. Encodings -cannot know what concrete builder they are writing into, so they must produce intermediate -arrays or scalars and let the builder copy the data in. +`CanonicalBuilder` is specifically for the decompress-into case: a caller creates a builder, +and the encoding writes directly into it. This is structurally incompatible with iterative +execution (where each step returns an owned result), so `CanonicalBuilder` only applies to +Option A's recursive model. + +It replaces the current `dyn ArrayBuilder` with a closed enum. The current `ArrayBuilder` is a +trait object -- callers get a `Box` and can only interact with it through +generic methods like `append_scalar()` and `extend_from_array()`. Encodings cannot know what +concrete builder they are writing into, so they must produce intermediate arrays or scalars and +let the builder copy the data in. 
`CanonicalBuilder` is an enum that mirrors the `Canonical` variants in mutable builder form: ```rust pub enum CanonicalBuilder { - Null(NullBuilder), - Bool(BoolBuilder), - Primitive(PrimitiveBuilder), - Decimal(DecimalBuilder), - VarBinView(VarBinViewBuilder), - List(ListViewBuilder), - FixedSizeList(FixedSizeListBuilder), - Struct(StructBuilder), - Extension(ExtensionBuilder), + Null(NullBuilder), + Bool(BoolBuilder), + Primitive(PrimitiveBuilder), + Decimal(DecimalBuilder), + VarBinView(VarBinViewBuilder), + List(ListViewBuilder), + FixedSizeList(FixedSizeListBuilder), + Struct(StructBuilder), + Extension(ExtensionBuilder), } impl CanonicalBuilder { - pub fn new(dtype: &DType) -> Self { ... } - pub fn finish(self) -> Canonical { ... } + pub fn new(dtype: &DType) -> Self { ... } + pub fn finish(self) -> Canonical { ... } } ``` @@ -200,182 +121,614 @@ buffers and views directly into the builder's internal storage, avoiding an inte allocation: ```rust -fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, selection, ctx) { - let CanonicalBuilder::VarBinView(vbv_builder) = builder else { - unreachable!("FSST must canonicalize into VarBinView"); - }; - let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?; - vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...); +fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, mask, ctx) { + let CanonicalBuilder::VarBinView(vbv_builder) = builder else { + unreachable!("FSST must canonicalize into VarBinView"); + }; + let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?; + vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...); } ``` `CanonicalBuilder` is only canonical at the root level. For Struct, field arrays are accumulated as `Vec` (potentially still compressed/chunked) rather than being -recursively decompressed into nested builders. 
This generalizes the `pack_struct_chunks` -pattern that already exists in the chunked canonicalization path. +recursively decompressed into nested builders. -### The three-phase pipeline +### Reduce boundary cleanup -Canonicalization is a three-phase pipeline with no executor loop: +Both options retain `reduce` and `reduce_parent` unchanged: ```rust -pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - // Phase 1: optimize tree (reduce rules, expression push-down) - let array = array.optimize_recursive()?; +fn reduce(array: &Self::Array) -> VortexResult>; +fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, +) -> VortexResult>; +``` + +Both are strictly metadata-only, never reading data buffers. This boundary is already enforced +today -- the remaining work is deprecating the APIs that don't require an `ExecutionCtx` and +migrating the misplaced implementations. Specifically: the metadata-only rewrites currently +registered as `execute_parent` (Dict + Compare, ALP + Compare, FoR + Compare, FSST + Compare) +move to `reduce_parent`. The canonical-type kernels currently in `execute_parent` (Primitive + +Compare, Bool + FillNull, Decimal + Between) move into the scalar function's `execute` method, +where they belong. - // Phase 2: peel selection wrappers - let (array, selection) = array.peel_selection(); +### Constants - // Phase 3: canonicalize_into (single recursive descent, no loop) - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &selection, ctx)?; - Ok(builder.finish()) +`ConstantArray` continues to exist as an encoding. It is special-cased only in ScalarFnArray +materialization -- one check to avoid expanding constants before passing to the scalar function. +This preserves the `Columnar` enum: + +```rust +pub enum Columnar { + Canonical(Canonical), + Constant(ConstantArray), } ``` -**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. 
This handles all -metadata-only rewrites: expression push-down into Dict values, RunEnd values; constant folding; -comparison target transformation for ALP, FoR, FSST; structural simplification. No data buffers -are read. +**Open question:** The current scalar representation uses heap-allocated value trees for nested +types (structs, lists). For deeply nested types, this is expensive. An alternative is to hold +constants as length-1 arrays instead of scalars, avoiding the scalar value tree entirely. This +is orthogonal to the execution model choice but may affect `ConstantArray`'s internal +representation. + +### Decompression cache and CSE -**Phase 2** calls `array.peel_selection()` to strip Filter, Slice, and Take wrappers from the -root, composing them into a single `Selection`. The remaining array tree has no selection -wrappers at its root. +`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped +to a single materialization call. When the same array node appears as a child of multiple +ScalarFnArray nodes (e.g., `x + x`), it is materialized once. When a ZStd-compressed array is +sliced into many chunks, the first slice decompresses the block and caches the result. The +cache is dropped after materialization returns. -**Phase 3** is a single recursive descent through `canonicalize_into`. Each encoding decompresses -with the fused selection. There is no iteration -- each encoding knows how to decompress in one -call. Selection wrappers encountered deeper in the tree compose themselves into the selection -during the recursive descent. +--- -### Selection composition during materialization +## Option A: Three-phase recursive model -Selection wrappers (Slice, Filter) still exist as array types for lazy representation (e.g., a -file reader produces sliced arrays). 
During materialization, their `canonicalize_into` composes the -wrapper into the selection and delegates to the child: +### VTable surface (3 methods) ```rust -// SliceArray's canonicalize_into -fn canonicalize_into(slice: &SliceArray, builder, selection, ctx) { - let composed = selection.compose(Selection::Range(slice.range())); - slice.child().canonicalize_into(builder, &composed, ctx) +pub trait VTable { + fn reduce(array: &Self::Array) -> VortexResult>; + fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult>; + + /// Materialize this array into a canonical builder, fusing the given row mask + /// with decompression. This is where all buffer-reading work happens. + fn canonicalize_into( + array: &Self::Array, + builder: &mut CanonicalBuilder, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()>; } +``` -// FilterArray's canonicalize_into -fn canonicalize_into(filter: &FilterArray, builder, selection, ctx) { - let composed = selection.compose(Selection::Mask(filter.mask())); - filter.child().canonicalize_into(builder, &composed, ctx) +| Method | Purpose | Buffer access | +|--------------------|---------------------------------------------|---------------| +| `reduce` | Self-rewrite (metadata only) | Never | +| `reduce_parent` | Rewrite parent (expression push-down, etc.) 
| Never | +| `canonicalize_into`| Decompress with fused mask | Yes | + +### Three-phase pipeline + +```rust +pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + // Phase 1: optimize tree (reduce rules, expression push-down) + let array = array.optimize_recursive()?; + + // Phase 2: peel selection wrappers + let (array, mask) = array.peel_selection(); + + // Phase 3: single recursive descent, no loop + let mut builder = CanonicalBuilder::new(array.dtype()); + array.vtable().canonicalize_into(&array, &mut builder, &mask, ctx)?; + Ok(builder.finish()) } ``` -Phase 2 (`peel_selection`) is only needed when the caller wants to inspect the array before -materializing (e.g., the DuckDB exporter checking for DictArray). During normal -materialization, selection composition happens naturally during the recursive descent. +**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. All +metadata-only rewrites happen here: expression push-down into Dict values, constant folding, +comparison target transformation for ALP/FoR/FSST. -### ScalarFnArray materialization +**Phase 2** strips Filter and Slice wrappers from the root, composing them into a single +`Mask`. *(Open question: does lifting selection to the root provide meaningful benefit +beyond what passing `Mask` through recursion already gives? It may be orthogonal to the +core proposal.)* + +**Phase 3** is a single recursive descent. Each encoding decompresses with the fused selection. +No iteration. 
Selection wrappers deeper in the tree compose themselves into the selection +during descent: + +```rust +// SliceArray's canonicalize_into: slicing composes as masking out the excluded rows +fn canonicalize_into(slice: &SliceArray, builder, mask, ctx) { + let composed = mask.slice(slice.range()); + slice.child().canonicalize_into(builder, &composed, ctx) +} +``` + +### Per-encoding canonicalize_into + +- **BitPacked**: unpack only selected positions into PrimitiveBuilder +- **FSST**: decompress only selected strings into VarBinViewBuilder +- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those +- **Dict**: materialize codes with selection, gather from values +- **Chunked**: split selection across chunks, recurse per-chunk +- **Struct**: pass selection to each field's materialization +- **Constant**: create scalar repeated to selection count + +For fused decompression patterns (e.g., Dict-RLE), the parent checks inline: -ScalarFnArray's `canonicalize_into` materializes its children, then evaluates the function on -the results: +```rust +fn canonicalize_into(dict: &DictArray, builder, mask, ctx) { + if let Some(runend) = dict.codes().as_opt::() { + return fused_dict_rle(dict.values(), runend, builder, mask, ctx); + } + + let mut code_builder = primitive_builder(dict.codes().dtype()); + dict.codes().canonicalize_into(&mut code_builder, mask, ctx)?; + let codes = code_builder.finish_into_primitive(); + take_into_builder(builder, dict.values(), &codes, ctx) +} +``` + +### ScalarFnArray materialization ```rust fn canonicalize_into( - array: &ScalarFnArray, - builder: &mut CanonicalBuilder, - selection: &Selection, - ctx: &mut ExecutionCtx, + array: &ScalarFnArray, + builder: &mut CanonicalBuilder, + mask: &Mask, + ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let children: Vec = array.children().iter() - .map(|child| { - // Constants pass through without expansion + let children: Vec = array.children().iter() + .map(|child| { if 
let Some(c) = child.as_opt::<ConstantArray>() { - return Ok(Columnar::Constant(c.clone())); + return Ok(Columnar::Constant(c.clone())); } let mut child_builder = CanonicalBuilder::new(child.dtype()); - child.canonicalize_into(&mut child_builder, selection, ctx)?; + child.canonicalize_into(&mut child_builder, mask, ctx)?; Ok(Columnar::Canonical(child_builder.finish())) - }) - .try_collect()?; + }) + .try_collect()?; - let result = array.scalar_fn().execute(children, ctx)?; - builder.extend_from_array(&result); - Ok(()) + let result = array.scalar_fn().execute(children, ctx)?; + builder.extend_from_array(&result); + Ok(()) } ``` 
"This value repeated N times without N copies" requires -*someone* to know about broadcasting, and no representation trick eliminates that. +- **Fused patterns require parent-knows-child.** Dict must check `as_opt::()` to + fuse Dict-RLE. An external encoding can't volunteer a fused path for a built-in parent + without modifying the parent. The known fused patterns (Dict-RLE) are all between encodings + in `vortex-array`, so this isn't a practical problem today, but it limits extensibility. -`ConstantArray` continues to exist as an encoding. Its `canonicalize_into` expands the value -into the builder, just like any other encoding decompresses. The only place constants get -special treatment is ScalarFnArray's `canonicalize_into` -- one check to avoid expanding -constants before -passing to the scalar function. This preserves the `Columnar` enum: +- **Exporters can't inspect intermediates.** After optimize + peel, the exporter sees the tree + as-is. If the root is an opaque encoding that would decode to Dict, the exporter can't + discover that -- Phase 3 goes straight to canonical. The DuckDB dictionary export use case + only works when DictArray is already visible after Phase 1, which is the common case (Vortex + files store Dict as DictArray) but not guaranteed. + +- **Still recursive.** `canonicalize_into` recurses into children. On deeply nested trees + (many layers of encoding, deeply nested structs/lists), this can still overflow the stack -- + the same class of problem the current executor has. An explicit work stack or trampoline + could mitigate this but adds implementation complexity. + +--- + +## Option B: Cleaned-up iterative model + +Keep the framework-driven iteration loop but clean up the VTable surface, enforce the +reduce/execute boundary, and thread `Mask` through execution. 
+ +### VTable surface (3 methods) ```rust -pub enum Columnar { - Canonical(Canonical), - Constant(ConstantArray), +pub trait VTable { + fn reduce(array: &Self::Array) -> VortexResult>; + fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult>; + + /// Take one execution step toward materialization. + /// Returns an ExecutionStep telling the framework what happened. + fn execute( + array: &Self::Array, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult; } ``` -`ScalarFn::execute` receives `Vec`. Most scalar function implementations handle -constant inputs efficiently -- e.g., `add(primitive, constant(1))` broadcasts the scalar -without allocating a repeated buffer. +```rust +pub enum ExecutionStep { + /// Fully materialized. + Done(Columnar), + /// Rewrote to a new array tree. Framework should optimize and execute again. + Rewrite(ArrayRef), + /// Composed the mask and delegated to child. Framework should continue + /// executing the returned array with the new mask. + Peel(ArrayRef, Mask), +} +``` -Reduce rules interact with constants via `as_opt::()` to read the scalar value -(metadata access, not buffer access). Constant folding, constant propagation (`slice(constant) -→ constant`, `filter(constant, mask) → constant`), and related rules are all metadata-only. 
+| Method | Purpose | Buffer access | +|-----------|---------------------------------------------|---------------| +| `reduce` | Self-rewrite (metadata only) | Never | +| `reduce_parent` | Rewrite parent (expression push-down) | Never | +| `execute` | One step toward materialization | Yes | -### Per-encoding canonicalize_into +### Framework loop -Each encoding implements a single decompression method that handles selection: +```rust +pub fn execute_to_columnar( + mut array: ArrayRef, + mut mask: Mask, + ctx: &mut ExecutionCtx, +) -> VortexResult<Columnar> { + for _ in 0..MAX_ITERATIONS { + // Check termination + if let Some(c) = array.as_opt::<ConstantArray>() { + return Ok(Columnar::Constant(c.apply_mask(&mask))); + } + if let Some(c) = array.as_opt::<Canonical>() { + return Ok(Columnar::Canonical(mask.apply_to(c.into())?)); + } -- **BitPacked**: unpack only selected positions into PrimitiveBuilder -- **FSST**: decompress only selected strings into VarBinViewBuilder -- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those -- **Dict**: materialize codes with selection, take from values -- **Chunked**: split selection across chunks, recurse per-chunk -- **Struct**: pass selection to each field's materialization -- **Constant**: create scalar repeated to selection count + // Optimize (reduce/reduce_parent to fixpoint) + array = array.optimize_recursive()?; + + // Execute one step + match array.vtable().execute(&array, &mask, ctx)? { + ExecutionStep::Done(c) => return Ok(c), + ExecutionStep::Rewrite(new_array) => { + array = new_array; + } + ExecutionStep::Peel(child, new_mask) => { + array = child; + mask = new_mask; + } + } + } + vortex_bail!("exceeded max iterations") +} +``` + +Note: `optimize_recursive()` runs inside the loop, after every step. This means reduce_parent +rules can fire on intermediate forms exposed by execution -- the key capability that Option A +lacks. 
+ +### Per-encoding execute + +Each encoding returns a typed step, making execution transparent: + +```rust +// FilterArray: compose masks, delegate +fn execute(filter: &FilterArray, mask, ctx) -> ExecutionStep { + let composed = mask.compose(&filter.mask()); + ExecutionStep::Peel(filter.child().clone(), composed) +} + +// SliceArray: slice the mask, delegate +fn execute(slice: &SliceArray, mask, ctx) -> ExecutionStep { + let composed = mask.slice(slice.range()); + ExecutionStep::Peel(slice.child().clone(), composed) +} + +// BitPacked: fully decompress with mask +fn execute(bp: &BitPackedArray, mask, ctx) -> ExecutionStep { + let mut builder = primitive_builder(bp.dtype()); + unpack_selected(bp, &mut builder, mask)?; + ExecutionStep::Done(Columnar::Canonical(builder.finish().into())) +} + +// Dict: decompress one step -- execute codes, then take +fn execute(dict: &DictArray, mask, ctx) -> ExecutionStep { + if !dict.codes().is_canonical() { + // Execute codes one step, rebuild dict + let stepped_codes = dict.codes().execute_one_step(mask, ctx)?; + let new_dict = DictArray::new(stepped_codes, dict.values().clone()); + return ExecutionStep::Rewrite(new_dict.into_array()); + } + // Codes are canonical -- do the take with mask + let result = take_canonical(dict.values(), dict.codes().as_primitive(), mask, ctx)?; + ExecutionStep::Done(Columnar::Canonical(result)) +} + +// ScalarFnArray: execute children, then apply function +fn execute(sfn: &ScalarFnArray, mask, ctx) -> ExecutionStep { + // Find first non-columnar child, execute it one step + for (i, child) in sfn.children().iter().enumerate() { + if !child.is_columnar() { + let stepped = child.execute_one_step(mask, ctx)?; + let new_sfn = sfn.replace_child(i, stepped); + return ExecutionStep::Rewrite(new_sfn.into_array()); + } + } + // All children are columnar -- apply function + let children = sfn.children_as_columnar(mask); + let result = sfn.scalar_fn().execute(children, ctx)?; + 
ExecutionStep::Done(Columnar::Canonical(result.into())) +} +``` + +### Fused patterns via reduce_parent -For fused decompression patterns (e.g., Dict-RLE), the parent encoding checks inline: +Because `optimize_recursive()` runs after every step, fused patterns work naturally. Dict +executes its codes one step (RunEnd → Primitive), then on the next iteration, +`optimize_recursive()` runs and reduce_parent rules can fire on the new tree. The Dict-RLE +fused path can be implemented as a reduce_parent rule on RunEnd that recognizes the Dict parent +and rewrites the tree, or Dict can check inline in its execute -- both work because the +framework loops. + +### Exporter tree inspection + +Exporters drive the loop directly, inspecting after each step: ```rust -// Dict's canonicalize_into -fn canonicalize_into(dict: &DictArray, builder, selection, ctx) { - // Check for fused Dict-RLE pattern - if let Some(runend) = dict.codes().as_opt::() { - return fused_dict_rle(dict.values(), runend, builder, selection, ctx); +fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let mut mask = Mask::new_true(array.len()); + + for _ in 0..MAX_ITERATIONS { + array = array.optimize_recursive()?; + let (peeled, m) = array.peel_selection(); + array = peeled; + mask = mask.compose(&m); + + if let Some(dict) = array.as_opt::() { + return export_dict(dict, &mask, ctx); + } + if array.is_canonical() { + return export_canonical(mask.apply_to(array)?, ctx); } - // Generic path: materialize codes with selection, take from values - let mut code_builder = primitive_builder(dict.codes().dtype()); - dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?; - let codes = code_builder.finish_into_primitive(); - take_into_builder(builder, dict.values(), &codes, ctx) + match array.vtable().execute(&array, &mask, ctx)? 
{ + ExecutionStep::Done(c) => return export_columnar(c, ctx), + ExecutionStep::Rewrite(new) => { array = new; } + ExecutionStep::Peel(child, m) => { array = child; mask = m; } + } + } + vortex_bail!("exceeded max iterations") } ``` -### Decompression cache and CSE +The exporter inspects the tree after every step. If an opaque encoding decodes to DictArray +during execution, the exporter sees it on the next iteration and can export it as a dictionary. + +### Trade-offs + +**Strengths:** + +- **Encoding-preserving mask application.** An encoding can implement `execute(mask)` to + absorb the mask and return `Rewrite(still_encoded)`. FSST can filter without decompressing, + and exporters see the result through the standard loop -- no encoding-specific logic needed + for the mask-application step. +- **Exporters inspect intermediates.** The DuckDB exporter can find DictArray even when it's + hidden behind an opaque encoding, because it inspects after each step. +- **Child-volunteers extensibility preserved.** A fused decompression pattern can be expressed + as a reduce_parent rule or an inline check, without closing the door on external encodings + discovering patterns through the iteration. +- **Same 3 VTable methods.** reduce, reduce_parent, execute. Clean. +- **No stack overflow risk.** The outer loop is iterative -- each step returns control to the + framework. Stack depth is bounded to one execution step, regardless of tree depth. + +**Weaknesses:** + +- **Iteration loop remains.** Execution is still iterative with a MAX_ITERATIONS bound. Each + step produces a new array tree, and the framework loops. This is more predictable than before + (no execute_parent, Mask threaded through, typed ExecutionStep) but still requires + tracing through iterations to understand execution. The recursive model's "one call, done" + is easier to reason about. +- **Intermediate allocations.** Each `Rewrite` step allocates a new array tree. 
The recursive + model writes directly into the builder with no intermediates. For hot paths (large batches, + tight loops), the allocation overhead may be measurable. +- **Mask and the loop interact awkwardly.** When an encoding returns `Peel(child, + new_mask)`, the mask changes mid-loop. The framework must track the current mask across + iterations. In the recursive model, the mask flows naturally through function arguments. +- **No decompress-into-buffer.** The iterative model returns `Columnar` from `Done` -- the + encoding owns its output. There is no way to pass a caller-owned `CanonicalBuilder` through + the loop, because the framework doesn't know whether this step is final or intermediate. + This is the primary pain point that motivated this RFC, and Option B does not address it. + ChunkedArray still can't share a builder across chunks. Exporters still can't write directly + into their output format. + +--- + +## Fundamental trade-offs + +The two options differ on three architectural axes. These are genuine trade-offs -- each option +wins on some axes and loses on others. They are not fixable with clever engineering; they follow +from the structural decision of recursion vs. iteration. + +### 1. Caller-owned output buffers (Option A wins) + +In Option A, the caller creates a `CanonicalBuilder` and passes it down through the recursive +descent. Every encoding writes directly into this caller-owned buffer. The buffer exists before +materialization begins and is filled during a single pass. + +In Option B, each encoding returns `ExecutionStep::Done(Columnar)` -- it allocates and owns its +output. The caller receives the finished result. There is no way to pass a builder through the +loop because the encoding doesn't know at call time whether this is the final step or an +intermediate rewrite that will be fed back into the loop. 
+ +**Why this matters:** + +- **ChunkedArray concat avoidance.** With a caller-owned builder, ChunkedArray iterates its + chunks and each chunk writes into the same builder. The output is a single contiguous + allocation. Without it, each chunk produces its own `Columnar`, and the framework must + concatenate them -- an extra copy of the entire column. + +- **FSST zero-copy push.** FSST can decompress string views and data buffers directly into a + `VarBinViewBuilder`'s internal storage, adjusting view offsets in-place. This avoids + materializing an intermediate string array. With `Done(Columnar)`, FSST must allocate the + `VarBinView` canonical array itself, losing the ability to share buffer space with other + chunks or avoid a final copy. + +- **Pre-allocated output.** When the output length is known (common case: the selection count), + the builder can pre-allocate once. The iterative model allocates per step. + +This trade-off is structural: passing a mutable builder down requires a call stack (recursion). +A loop that yields intermediate values cannot thread a mutable builder through iterations +without fundamentally changing the `ExecutionStep` API (e.g., adding a `Done`-with-builder +variant), which would complicate the common case where an encoding returns `Rewrite`. + +### 2. Encoding-preserving mask application (Option B wins) + +Some encodings can apply a row mask **without decompressing**. FSST can filter its compressed +data and produce a smaller FSST array. This matters when the downstream consumer natively +supports the encoding -- DuckDB supports FSST vectors, so exporting a filtered FSST directly +avoids decompression entirely. + +**Concrete example:** `filter(scalar_fn(upper, [fsst(data)]), mask)` + +Both options share the reduce phase. FSST's reduce_parent pushes `upper` into the symbol +table (metadata-only, analogous to Dict pushing scalar functions into values). After reduce + +peel, both options see: `fsst_upper(data)` + mask. 
+ +In **Option B**, FSST's standard `execute(mask)` method handles this: -`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped -to a single `canonicalize()` call. This serves two purposes: +```rust +fn execute(fsst: &FSSTArray, mask, ctx) -> ExecutionStep { + let filtered = fsst.filter_preserving(&mask)?; // still FSST + ExecutionStep::Rewrite(filtered.into_array()) +} +``` + +The DuckDB exporter drives the loop, sees FSST on the next iteration, and exports directly. +The standard path would execute again next iteration and decompress -- no harm. Any encoding +can implement this pattern through the general VTable. + +In **Option A**, the exporter must use encoding-specific logic outside the framework: + +```rust +fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> DuckDBVector { + let array = array.optimize_recursive()?; + let (array, mask) = array.peel_selection(); + + // Encoding-specific: exporter must know about FSST + if let Some(fsst) = array.as_opt::<FSSTArray>() { + let filtered = fsst.filter_preserving(&mask)?; + return export_fsst(filtered, ctx); + } + + let mut builder = CanonicalBuilder::new(array.dtype()); + array.canonicalize_into(&mut builder, &mask, ctx)?; + export_canonical(builder.finish()) +} +``` + +The general `canonicalize_into` path always decompresses -- there is no VTable mechanism for +an encoding to absorb a mask without decompressing. The exporter must know about each encoding +it wants to preserve and call encoding-specific methods. + +**The difference:** Option B handles encoding-preserving mask application through the standard +`execute` VTable method -- any encoding can participate. Option A requires the exporter to have +encoding-specific knowledge for each format it wants to preserve. 
In practice, exporters +already need encoding-specific logic to export to native formats (DuckDB FSST vectors, Arrow +dictionary arrays), so the additional burden is the mask-application call rather than just an +`as_opt` check. + +This extends to cross-crate encodings. An external encoding that can filter itself without +decompressing just implements `execute(mask) → Rewrite(filtered_self)` in Option B. In +Option A, every exporter that wants to preserve that encoding must be updated to call its +specific filter method. + +### 3. Exporter intermediate inspection (Option B wins) + +In Option B, exporters drive the loop directly and can inspect the array tree after each step. +After an encoding applies a mask via `Rewrite`, the exporter sees the result on the next +iteration and can export it in its native format. + +In Option A, exporters see the tree after Phase 1 (optimize) and Phase 2 (peel selection). +When the encoding the exporter cares about is already visible after peel (the common case -- +Dict is stored as DictArray, FSST is stored as FSSTArray), the exporter can intercept and +handle it with encoding-specific logic. This covers the typical Vortex file scan path. The +gap is when an encoding is hidden behind a layer that must be executed to reveal it, or when +the exporter wants the encoding to absorb the mask through the general VTable (see trade-off +#2 above). + +### Summary + +| Trade-off | Winner | Practical impact today | +|-----------|--------|----------------------| +| Caller-owned output buffers | **Option A** | **Concrete.** ChunkedArray concat avoidance and FSST zero-copy decompression are measurable. | +| Encoding-preserving mask application | **Option B** | **Concrete.** FSST filter-without-decompress, Dict filter-without-decompress. Exporters need encoding-specific logic in Option A. | +| Exporter intermediate inspection | **Option B** | **Moderate.** Exporters can discover encodings revealed by execution. 
Fixed by peel_selection when the filter is at the root. | + +Secondary differences: + +| Aspect | Option A | Option B | +|--------|----------|----------| +| Stack overflow risk | Still recursive (can overflow on deep trees) | Iterative loop, bounded stack depth | +| Intermediate allocations | None (direct to builder) | One array tree per `Rewrite` step | +| Debuggability | One recursive call, stack trace shows position | Loop iterations, requires tracing | +| Mask threading | Natural function argument | Mutable state across iterations | +| Fused patterns (Dict-RLE) | Parent checks child inline | reduce_parent on intermediates, or inline | +| GPU kernel fusion potential | Clean tree maps to single kernel | Step-by-step harder to fuse | +| Determinism | Always same call sequence | Iteration count varies with input | + +## What changes (shared) + +**Removed (both options):** + +- `execute_parent` VTable method -- metadata-only rewrites move to `reduce_parent`; + canonical-type kernels move into `ScalarFn::execute`. +- `append_to_builder` VTable method. +- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, + `SliceReduceAdaptor`, `SliceExecuteAdaptor`, and all their per-encoding implementations. + `TakeExecuteAdaptor` is also removed (Take/gather is not row masking; it remains an + encoding-internal concern). + +**Retained (both options):** + +- `reduce` / `reduce_parent` -- strictly metadata-only. +- `ScalarFnArray` -- per-node lazy computation with `reduce_parent` push-down. +- Filter/Slice array wrappers -- for lazy representation. Could be unified into a single + `MaskedSelectionArray` in the future. Take remains separate (gather semantics, not masking). +- `scalar_at` -- separate VTable method for single-element access. -- **Common sub-expression elimination**: when the same array node appears as a child of - multiple ScalarFnArray nodes (e.g., `x + x`), it is materialized once. 
-- **Shared block decompression**: when a ZStd-compressed array is sliced into many chunks, all - slices share the same underlying compressed block. The first slice decompresses the block and - caches the result; subsequent slices index into it. +**New (both options):** -The cache key is the raw pointer to the array, which is stable because arrays are `Arc`-based. -The cache is dropped after `canonicalize()` returns, so it does not leak memory across calls. +- `Mask::compose()` and `Mask::apply()` methods on the existing `Mask` type. +- `Array::peel_selection()` -- strip selection wrappers from root. -### Worked example: DuckDB dictionary export +**New (Option A only):** + +- `CanonicalBuilder` -- one-level-deep builder enum for decompress-into. +- `canonicalize_into` VTable method -- single recursive descent with fused mask. + +**New (Option B only):** + +- `ExecutionStep` enum -- typed return from execute, replacing bare `ArrayRef`. +- `execute` takes `&Mask` parameter -- row mask threaded through the loop. + +## Worked example: DuckDB dictionary export A Vortex file scan produces a chunk with a dictionary column. The query has a filter predicate (`name = 'alice'`) evaluated to a mask, and a projection (`upper(name)`). 
The array tree: @@ -392,7 +745,7 @@ filter( ) ``` -**Phase 1: optimize.** `reduce_parent` pushes `upper` into dict values: +**Optimize.** `reduce_parent` pushes `upper` into dict values: ``` filter( @@ -404,185 +757,88 @@ filter( ) ``` -**Phase 2: peel selections.** Filter is at the root, so it peels: +**Peel selections.** Filter is at the root, so it peels: ``` array = dict(codes: bitpacked(...), values: scalar_fn(upper, [fsst(...)])) -selection = Selection::Mask(mask) +mask = filter_mask ``` -**Phase 3: export.** The DuckDB exporter inspects the tree: +**Export (Option A).** The exporter inspects the tree after optimize + peel: ```rust fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let array = array.optimize_recursive()?; - let (array, selection) = array.peel_selection(); + let array = array.optimize_recursive()?; + let (array, mask) = array.peel_selection(); - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &selection, ctx); - } + if let Some(dict) = array.as_opt::() { + return export_dict(dict, &mask, ctx); + } - // Fallback: full materialization - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &selection, ctx)?; - export_canonical(builder.finish()) -} - -fn export_dict( - dict: &DictArray, - selection: &Selection, - ctx: &mut ExecutionCtx, -) -> VortexResult { - // Materialize ALL unique values (no selection -- values are shared) - let mut val_builder = CanonicalBuilder::new(dict.values().dtype()); - dict.values().canonicalize_into(&mut val_builder, &Selection::All, ctx)?; - let values = val_builder.finish(); - - // Materialize codes WITH selection (only rows that passed the filter) - let mut code_builder = primitive_builder(dict.codes().dtype()); - dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?; - let codes = code_builder.finish_into_primitive(); - - Ok(DuckDBVector::dictionary(values, codes)) + let mut builder = 
CanonicalBuilder::new(array.dtype()); + array.canonicalize_into(&mut builder, &mask, ctx)?; + export_canonical(builder.finish()) } ``` -**Values path**: `scalar_fn(upper, [fsst(["alice","bob","charlie"])])` materializes via -ScalarFnArray's `canonicalize_into`. FSST bulk-decompresses 3 strings, `upper` runs on canonical -input, producing `["ALICE","BOB","CHARLIE"]`. - -**Codes path**: `bitpacked([0,1,0,2,1,0,...])` materializes with `Selection::Mask(mask)`. -BitPacked unpacks only the positions where the mask is true. - -**Result**: DuckDB gets a dictionary vector with 3 uppercase values and only the selected -codes. `upper()` was computed on 3 unique values instead of N rows. The filter was fused with -bit-unpacking. No intermediate arrays were materialized. - -### What changes - -**Removed:** - -- `execute` VTable method -- replaced by `canonicalize_into(builder, selection)`. -- `execute_parent` VTable method -- selection-related uses replaced by `Selection`; - compressed-domain computation uses replaced by `reduce_parent` rules. -- The executor loop -- no more iterative execute-until-canonical. Optimization is a reduce-only - loop; materialization is a single recursive descent. -- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, - `SliceReduceAdaptor`, `SliceExecuteAdaptor`, `TakeExecuteAdaptor`, and all their per-encoding - implementations (~40+ impls total). -- Canonical-type `execute_parent` kernels -- Primitive + Compare, Bool + FillNull, Decimal + - Between, etc. These move into the scalar function's `execute` method where they belong. - -**Retained:** - -- `reduce` / `reduce_parent` -- unchanged. Still drive expression push-down, cast push-down, - constant folding, structural simplification. Strictly metadata-only, no buffer access. -- `ScalarFnArray` -- kept as per-node lazy computation. Composes with `reduce_parent` for - expression push-down into compressed encodings. 
-- Filter/Slice/Take array wrappers -- still exist for lazy representation. Their - `canonicalize_into` composes them into the Selection and delegates to the child. These could be - unified into a single `SelectionArray` wrapping a `Selection` in the future. - -**New:** - -- `Selection` type -- unifies Filter, Slice, and Take into a single row-selection - representation that composes and threads through materialization. -- `CanonicalBuilder` -- one-level-deep builder enum. Struct children accumulate as - `Vec` rather than being decompressed into child builders. -- `Array::peel_selection()` -- method to strip selection wrappers from the root, used by - exporters that want to inspect the tree before materializing. - -**Summary comparison:** - -| Aspect | Current | Proposed | -|--------------------------------|---------------------------------------------------------------------------------|---------------------------------------------------| -| VTable methods | 5 (`reduce`, `reduce_parent`, `execute`, `execute_parent`, `append_to_builder`) | 3 (`reduce`, `reduce_parent`, `canonicalize_into`) | -| Adaptor types per operation | 2 (Reduce + Execute) | 1 (Reduce only, for expression push-down ops) | -| Selection handling | Per-encoding Filter, Slice, Take adaptors | Single `Selection` parameter | -| Materialization | Iterative loop calling execute until convergence | Single recursive descent through `canonicalize_into` | -| Executor loop | reduce → reduce_parent → execute_parent → execute, repeated | optimize (reduce loop) → peel selections → build | -| Builder model | Fully recursive (forces deep canonicalization) | One-level canonical (children stay compressed) | -| Fused decompression (Dict-RLE) | Child volunteers via execute_parent | Parent checks for pattern in its own `canonicalize_into` | -| Buffer access in reduce | Ambiguous | Never (strict, works on any device including GPU) | - -### Migration path - -The migration is designed so that every step leaves the 
codebase in a working state. The old -and new execution paths coexist until the new path covers all encodings, at which point the -old path is deleted. Each phase is independently shippable. - -**Phase 1: Foundation types.** Introduce `Selection` (with `compose`, `count`, `apply`, -`apply_in_place`), `CanonicalBuilder` enum, and `Array::peel_selection()`. All additive, no -behavior changes. - -**Phase 2: Add `canonicalize_into` with fallback default.** Add `canonicalize_into` to `VTable` with a -default implementation that bridges to the old execute path: +**Export (Option B).** The exporter drives the loop, inspecting after each step: ```rust -fn canonicalize_into( - array: &Self::Array, - builder: &mut CanonicalBuilder, - selection: &Selection, - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - // Fallback: use existing execute-to-canonical, then apply selection - let canonical = Canonical::execute(array.as_array_ref().clone(), ctx)?; - let selected = selection.apply_to_array(canonical.into_array())?; - builder.extend_from_array(&selected)?; - Ok(()) +fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let (mut array, mut mask) = (array, Mask::new_true(array.len())); + for _ in 0..MAX_ITERATIONS { + array = array.optimize_recursive()?; + let (peeled, m) = array.peel_selection(); + array = peeled; + mask = mask.compose(&m); + + if let Some(dict) = array.as_opt::() { + return export_dict(dict, &mask, ctx); + } + if array.is_canonical() { + return export_canonical(mask.apply_to(array)?); + } + match array.vtable().execute(&array, &mask, ctx)? { + ExecutionStep::Done(c) => return export_columnar(c), + ExecutionStep::Rewrite(a) => { array = a; } + ExecutionStep::Peel(a, m) => { array = a; mask = m; } + } + } + vortex_bail!("export did not converge") } ``` -Wire up `canonicalize()` using the three-phase pipeline. Both old (`execute::`) and -new (`canonicalize()`) paths work. Integration tests assert equivalence. 
+Both options produce the same result: DuckDB gets a dictionary vector with 3 uppercase values +and only the selected codes. The difference is that Option B can discover DictArray even if it +was hidden behind an opaque encoding, while Option A only sees what's visible after Phase 1. -**Phase 3: Migrate canonical encodings.** One encoding per PR: Null, Bool, Primitive, Decimal, -VarBinView, Struct, List/FixedSizeList, Extension, Constant. Each writes directly into the -builder with fused selection. Independent and parallelizable. +## Migration path -**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into -`Selection` instead of executing independently. +The migration is the same for both options. The key difference is Phase 3/5 -- whether +encodings implement `canonicalize_into` (Option A) or return `ExecutionStep` (Option B). -**Phase 5: Migrate compressed encodings.** Selection-fused decompression for BitPacked, -RunEnd, Dict, FSST, ALP, FoR, ZStd, Sparse, and others. Start with highest-value encodings. -Each encoding is an independent PR. +**Phase 1: Foundation types.** Add `Mask::compose()`/`Mask::apply()`, introduce `CanonicalBuilder`, +`Array::peel_selection()`. All additive. -**Phase 6: Move `execute_parent` → `reduce_parent`.** Compressed-domain rewrites (Dict + -Compare, ALP + Compare, FoR + Compare, FSST + Compare) become `reduce_parent` rules. -Canonical-type kernels (Primitive + Compare, Bool + FillNull) move into `ScalarFn::execute`. -Once empty, remove `execute_parent` from VTable. +**Phase 2: Add new execution method with fallback.** Add `canonicalize_into` (Option A) or +typed `execute` (Option B) with a default that bridges to the old path. Both old and new paths +work. -**Phase 7: Delete old execution path.** Delete all selection adaptor traits and their ~40+ -per-encoding implementations. Delete `execute` VTable method, the executor loop, and the -`Executable` trait. 
All callers of `execute::` migrate to `canonicalize()` (the -free function that runs the full three-phase pipeline). +**Phase 3: Migrate canonical encodings.** One per PR. Independent, parallelizable. -**Phase 8: Replace `scalar_at`.** `scalar_at` becomes a free function using -`Selection::Index`. Remove `OperationsVTable::scalar_at` and per-encoding implementations. +**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into +`Mask`. -**Phase 9: Wire up exporters.** DuckDB exporter uses optimize → peel → inspect → export. -Add decompression cache to `ExecutionCtx`. +**Phase 5: Migrate compressed encodings.** Mask-fused decompression. Independent per +encoding. -**Ordering and parallelism:** +**Phase 6: Move execute_parent → reduce_parent.** Metadata-only rewrites become reduce rules. +Canonical-type kernels move into ScalarFn::execute. Remove execute_parent. -``` -Phase 1: Foundation types - │ -Phase 2: canonicalize_into with fallback - │ - ├── Phase 3: Canonical encodings (independent per encoding) - ├── Phase 4: Selection wrappers - ├── Phase 5: Compressed encodings (independent per encoding) - └── Phase 6: execute_parent → reduce_parent (independent per rule) - │ - Phase 7: Delete old path (after 3-6 complete) - │ - ├── Phase 8: scalar_at → Selection::Index - └── Phase 9: Exporter updates -``` +**Phase 7: Delete old path.** Remove old `execute` method, the `Executable` trait, and all +selection adaptors. -Phases 3-6 can proceed in parallel across contributors. The fallback default in Phase 2 ensures -nothing breaks while encodings are migrated at their own pace. Phase 7 is the cleanup gate. +**Phase 8: Wire up exporters.** DuckDB, Arrow exporters migrate to new API. ## Compatibility @@ -591,144 +847,80 @@ execution engine. **Public API breakage:** -- `Executable` trait and `execute::()` are removed. Callers migrate to `canonicalize()` or - call `canonicalize_into` directly. -- `OperationsVTable::scalar_at` is removed. 
Callers use the free function `scalar_at()` which - delegates to `canonicalize_into` with `Selection::Index`. -- Third-party encodings that implement `execute`, `execute_parent`, or selection adaptors must - migrate to `canonicalize_into`. The fallback default (Phase 2) provides a bridge during - migration. - -**Performance implications:** - -- Selection-fused decompression should improve performance for filtered/sliced reads by - avoiding intermediate materialization. -- Single-element access via `Selection::Index` may be marginally slower than today's - `scalar_at` for encodings that currently fast-path scalar access. These encodings can - fast-path `Selection::Index` in their `canonicalize_into` if needed. -- The decompression cache avoids redundant work for shared sub-expressions and ZStd blocks. +- `Executable` trait and `execute::()` are removed. Callers migrate to `canonicalize()` / + `execute_to_columnar()`. +- Third-party encodings must migrate to the new execution method. The fallback default + (Phase 2) provides a bridge during migration. ## Drawbacks -- **Migration effort.** The migration touches every encoding in the codebase (~33 total). The - phased approach with a fallback default mitigates this, but it is still significant work - spread across many PRs. - -- **Fused pattern detection moves to the parent.** In the current model, a child encoding can - volunteer to handle its parent's execution via `execute_parent` (e.g., RunEnd volunteers for - Dict-RLE). In the new model, the parent must check for the pattern in its own - `canonicalize_into`. This means the parent must know about the child encoding, which is a - slight inversion of responsibility. In practice, the known fused patterns (Dict-RLE) involve - encodings that are both Arrow-supported and live in the builtin `vortex-array` crate, so the - cross-encoding dependency is not a crate boundary issue. 
- -- **Selection composition complexity.** `Selection::compose` must handle all variant - combinations correctly (Mask + Range, Indices + Mask, etc.). This is concentrated complexity - in one well-tested type, but getting it wrong would cause subtle data corruption. - -- **`Indices` conflates selection with gather.** The first four `Selection` variants are pure - row subsetting (output <= input, preserves order, no nulls introduced). `Indices` is a - gather: it can repeat rows, reorder them, and introduce nulls. Bundling both concepts into - one enum is a pragmatic compromise -- the alternative (a separate gather API every encoding - must implement) recreates the combinatorial explosion this RFC aims to eliminate -- but it - means `Selection` is not a clean abstraction. +- **Migration effort.** ~33 encodings must migrate. The phased approach with a fallback default + mitigates this. + +- **Mask composition complexity.** `Mask::compose` must be correct for all combinations of + internal representations (`AllTrue`, `AllFalse`, `Values`). This is concentrated, well-tested + complexity, but getting it wrong causes data corruption. ## Alternatives -### Keep the iterative execution model +### Keep the iterative execution model as-is -We could keep the current `execute`/`execute_parent` loop and address individual pain points -(e.g., adding selection fusion as a new adaptor). This avoids the migration cost but perpetuates -the unclear distinction between reduce and execute, the combinatorial adaptor explosion, and the -difficulty of reasoning about the executor loop. The VTable surface stays at 5 methods. +Address individual pain points without restructuring. Avoids migration cost but does not +solve decompress-into-buffer and perpetuates the adaptor boilerplate. ### Use a single ExpressionArray instead of per-node ScalarFnArray -Instead of one ScalarFnArray per operation, a single ExpressionArray could hold an entire -expression tree. 
This would be simpler in some ways but prevents `reduce_parent` from -interacting with individual expression nodes. The per-node approach is what makes Dict's -`reduce_parent` able to push `compare` into dict values -- the key optimization for -dictionary-encoded predicates. +Prevents `reduce_parent` from interacting with individual expression nodes -- the key +optimization for dictionary-encoded predicates. ### Use `dyn ArrayBuilder` instead of `CanonicalBuilder` enum -The builder could be a trait object (`&mut dyn ArrayBuilder`) instead of an enum. This is more -extensible but prevents encodings from matching on the concrete builder type. FSST's ability to -push directly into `VarBinViewBuilder` (avoiding an intermediate string allocation) depends on -knowing the builder variant at compile time. +More extensible but prevents zero-copy builder paths. FSST pushing directly into +`VarBinViewBuilder` depends on knowing the builder variant at compile time. -### Separate `scalar_at` from `Selection` +### Subsume `scalar_at` into Mask -We could keep `scalar_at` as a separate VTable method for single-element access, rather than -subsuming it into `Selection::Index`. This avoids any performance regression for scalar access -but adds a fourth VTable method and requires every encoding to implement it separately. +Deferred. Single-element access via a builder round-trip would regress hot paths. Revisit +after profiling. ## Prior Art -- **DataFusion** has `PhysicalExpr::evaluate_selection`, which takes an explicit selection - (equivalent to our `Selection`) and threads it through expression evaluation. This is the - closest analogue to threading `Selection` through `canonicalize_into`. +- **DataFusion** has `PhysicalExpr::evaluate_selection`, threading an explicit selection + through expression evaluation. 
## Unresolved Questions -- **Selection composition correctness**: Composing Mask + Range + Indices needs careful - implementation and thorough testing, especially for `Indices` which introduces repetition, - reordering, and nullability. This is a well-tested utility type vs N x 3 adaptor - implementations, so the tradeoff is favorable, but it needs to be right. +- **Mask composition correctness**: `Mask::compose` and `Mask::slice` interactions need + careful implementation and thorough testing. - **CanonicalBuilder design**: The exact interface for one-level-deep building, especially for List and Extension types, needs design work. -- **Aggregate and window functions**: This RFC focuses on scalar functions. AggregateFnArray - and WindowFnArray will need their own materialization paths, which are explicitly out of scope - for this RFC. +- **Aggregate and window functions**: Out of scope. AggregateFnArray and WindowFnArray will + need their own materialization paths. -- **`Indices` semantics mismatch**: `Indices` is a gather (repetition, reordering, nullable) - while the other `Selection` variants are pure subsetting. This means `compose`, `count`, and - `apply` must handle two fundamentally different operations. Whether this tension causes - practical problems (e.g., confusing behavior in compose chains) needs to be evaluated during - implementation. +- **Option A vs Option B**: This RFC presents both. The decision reduces to: are caller-owned + output buffers (ChunkedArray concat avoidance, FSST zero-copy decompression) worth giving up + encoding-preserving mask application through the general VTable (FSST filter-without-decompress, + Dict filter-without-decompress for exporters)? Both trade-offs are concrete. In Option A, + exporters need encoding-specific filter methods; in Option B, ChunkedArray can't share a + builder across chunks. 
## Future Possibilities -### Selection pull-up - -In the new model, selection wrappers compose naturally into the `Selection` parameter during -the recursive descent. However, when a selection is nested inside an expression, it cannot be -peeled: - -``` -scalar_fn(upper, [filter(dict(...), mask)]) -``` - -`peel_selection()` sees ScalarFnArray at the root, not FilterArray. If `reduce_parent` has no -rule for this function + encoding combination, the tree stays as-is and the filter gets -composed in during descent. `upper` runs on N selected rows, producing a flat canonical result. -The dictionary structure is lost. - -A future optimization could **pull selections out of expressions** to the root: - -``` -filter(scalar_fn(upper, [dict(...)]), mask) -``` - -With the selection at the root, `peel_selection()` extracts it, and subsequent optimizations -can proceed. More optimal decisions can be made when the selection is available at the top and -pushed down at execution time, rather than being trapped inside an expression. +### Mask pull-up -In practice, most scans apply filters at the top level of the array tree, so this edge case is -uncommon. But for complex expression trees with embedded selections, pull-up would enable -strictly better execution plans. +When a filter mask is nested inside an expression (`scalar_fn(upper, [filter(dict(...), mask)])`), +`peel_selection()` can't extract it. A future optimization could pull masks out of expressions +to the root, enabling further optimizations. ### Aggregate and window functions -The execution model is designed to support additional function types beyond scalar functions. -`AggregateFnArray` (for sum, min, max, count) and `WindowFnArray` will follow a similar -deferred pattern, using the same `reduce`/`reduce_parent` optimization strategy and -`Selection`-based materialization. 
+`AggregateFnArray` and `WindowFnArray` will follow a similar deferred pattern with +`reduce`/`reduce_parent` optimization and `Mask`-based materialization. ### GPU kernel fusion -The strict metadata-only constraint on `reduce`/`reduce_parent` means the optimized tree can -be shipped to a GPU context. A GPU-aware `canonicalize_into` could fuse the entire optimized tree -into a single kernel launch, reducing memory traffic and kernel overhead. +The strict metadata-only reduce constraint means the optimized tree can be shipped to a GPU +context. A GPU-aware materialization path could fuse the entire optimized tree into a single +kernel launch. diff --git a/proposals/0002-aggregate-functions-and-list-scalars.md b/proposals/0002-aggregate-functions-and-list-scalars.md new file mode 100644 index 0000000..854de89 --- /dev/null +++ b/proposals/0002-aggregate-functions-and-list-scalars.md @@ -0,0 +1,858 @@ +- Start Date: 2026-02-26 +- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000) +- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000) + +## Summary + +Introduce `AggregateFnVTable` to Vortex as a trait for defining aggregate operations, and a +single `ListAggregate` scalar function that applies any aggregate to a list column. There is +no dedicated `AggregateFnArray` — `ListAggregate` is a `ScalarFnVTable` implementation whose +lazy evaluation is handled by the existing `ScalarFnArray` infrastructure. Child encodings +match on `ExactScalarFn` in their reduce_parent rules for aggregate-specific +optimizations. + +The key insight is the **aggregate/list-scalar duality**: a list column stored as +`(offsets, elements)` is a pre-materialized grouping. Computing `list_sum(list_column)` is +literally a grouped `sum` over the flat elements array partitioned by offsets. 
Rather than +implementing N separate list scalar functions (`list_sum`, `list_min`, `list_max`, ...), +we implement N aggregate functions and a single `ListAggregate` scalar function that applies +any aggregate to a list column. + +The `Accumulator` trait is the core primitive. It processes one group at a time via +`accumulate(batch)` / `flush()` / `finish()`, supporting streaming through chunked elements, +very large lists, and ungrouped column-level aggregation. `execute_grouped` is a convenience +built on top of the accumulator. + +Streaming ordered GROUP BY falls out naturally: the query engine constructs a `ListArray` from +`(flat_column, group_offsets)` per batch — a zero-copy operation — and wraps it in +`ScalarFnArray(ListAggregate(Sum), [list_array])`. + +This RFC also evaluates whether the canonical list encoding should switch from `ListViewArray` +(offsets+sizes) to `ListArray` (offsets-only), since the choice directly affects how cleanly +the duality works. + +## Motivation + +### Aggregate functions exist but in the old framework + +Vortex has two compute systems: + +- **`ScalarFnVTable`**: element-wise functions (1 input row → 1 output row) with lazy + `ScalarFnArray` evaluation, expression trees, and reduce/reduce_parent optimization. +- **`ComputeFn`**: kernel-based operations like `sum`, `min_max`, `nan_count`. Standalone, + not part of the expression tree, dispatched to per-encoding kernels via `inventory`. + +Aggregate operations like `sum` already exist, but as `ComputeFn` kernels. They cannot +participate in the expression tree or optimizer. This means: + +- No lazy evaluation that defers computation and can be optimized via reduce_parent. +- No way to push aggregate operations through Dict values, RunEnd runs, or other encodings + via the reduce rule framework. +- No unified framework for list scalar functions and grouped aggregates. 
+ +### List scalar functions are redundant with aggregate functions + +The only list-specific scalar function today is `list_contains`. Adding `list_sum`, +`list_min`, `list_max`, `list_count`, `list_mean`, etc. as individual `ScalarFnVTable` +implementations would duplicate the aggregation logic that already exists in `ComputeFn` +kernels. If Vortex had first-class aggregate functions, each list scalar function would +simply be "apply this aggregate to each list" — one `ListAggregate` scalar function +parameterized by the aggregate, rather than N separate implementations. + +### The aggregate/list-scalar duality + +Consider a list column `[[1,2,3], [4,5]]` stored as: + +``` +elements: [1, 2, 3, 4, 5] +offsets: [0, 3, 5] +``` + +Computing `list_sum` yields `[6, 9]`. This is identical to computing a grouped `sum` over +elements partitioned by offset ranges `[0..3, 3..5]`. Every aggregate function has a +corresponding list scalar function: + +| Aggregate | List scalar | Grouped operation | +|--------------|------------------------|-----------------------------------| +| `sum(col)` | `list_sum(list_col)` | Sum elements per group | +| `count(col)` | `list_count(list_col)` | Count non-null elements per group | +| `min(col)` | `list_min(list_col)` | Min element per group | +| `max(col)` | `list_max(list_col)` | Max element per group | +| `mean(col)` | `list_mean(list_col)` | Mean of elements per group | + +### Streaming ordered GROUP BY + +For databases with sorted storage, GROUP BY over a sorted key is a streaming operation: +groups arrive in order and don't cross batch boundaries. This maps directly to the +aggregate/list-scalar duality: + +1. For each batch, compute group boundaries from the sorted key (run boundaries → offsets). +2. Construct `ListArray::new(value_column, group_offsets, ...)` — zero-copy. +3. Wrap in `ScalarFnArray(ListAggregate(Sum), [list_array])`. +4. Execute → one result per group. + +No hash tables, no shuffling. 
The same `execute_grouped` path serves both list scalar
+functions and streaming ordered GROUP BY.
+
+### Expensive list interop
+
+The current canonical list encoding is `ListViewArray` (offsets+sizes, out-of-order allowed).
+Converting between `ListArray` and `ListViewArray` is expensive — see the rebuild
+infrastructure with modes `MakeZeroCopyToList`, `TrimElements`, `MakeExact`, and the
+`is_zero_copy_to_list` flag. This complexity exists primarily to support out-of-order
+offsets, which complicates the aggregate duality since `execute_grouped` needs monotonic
+offsets.
+
+## Design
+
+### `Accumulator` — the core primitive
+
+The `Accumulator` trait is the fundamental aggregation interface. It processes one group
+at a time: the caller feeds element batches, then flushes to finalize the group and start
+the next one. The accumulator owns its output buffer and returns all results at the end.
+
+```rust
+pub trait Accumulator: Send + Sync {
+    /// Feed a batch of element values for the current group.
+    /// Can be called multiple times per group (e.g., chunked elements).
+    fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()>;
+
+    /// Finalize the current group: push its result into the output buffer
+    /// and reset internal state for the next group.
+    fn flush(&mut self) -> VortexResult<()>;
+
+    /// Return all flushed results as a single array.
+    /// Length = number of flush() calls.
+    fn finish(self: Box<Self>) -> VortexResult<ArrayRef>;
+}
+```
+
+This handles all aggregation cases uniformly:
+
+**Case 1 — List scalar with contiguous elements:**
+```rust
+let mut acc = aggregate.accumulator(element_dtype)?;
+for i in 0..n_lists {
+    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
+    acc.accumulate(&group_elements)?;
+    acc.flush()?;
+}
+acc.finish() // → ArrayRef of length n_lists
+```
+
+**Case 2 — Large lists with chunked elements:**
+A list's elements may be a `ChunkedArray`. 
The accumulator handles this by feeding
+each chunk separately before flushing:
+```rust
+let mut acc = aggregate.accumulator(element_dtype)?;
+for i in 0..n_lists {
+    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
+    for chunk in iter_chunks(&group_elements) {
+        acc.accumulate(&chunk)?;
+    }
+    acc.flush()?;
+}
+acc.finish()
+```
+
+**Case 3 — Ungrouped full-column aggregation:**
+One group, fold across chunks:
+```rust
+let mut acc = aggregate.accumulator(dtype)?;
+for chunk in chunked_array.chunks() {
+    acc.accumulate(&chunk)?;
+}
+acc.flush()?;
+acc.finish() // → 1-element array
+```
+
+**Case 4 — ListView with scattered elements:**
+Canonicalize to ListArray first (rebuilding to sorted form), then use Case 1 or 2.
+The accumulator itself always sees contiguous element batches.
+
+### `AggregateFnVTable`
+
+A new trait parallel to `ScalarFnVTable`. The `accumulator()` method is the required
+core — `execute_grouped` and `execute_scalar` have default implementations built on it:
+
+```rust
+pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync {
+    type Options: 'static + Send + Sync + Clone + Debug + Display + PartialEq + Eq + Hash;
+
+    fn id(&self) -> AggregateFnId;
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>>;
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        session: &VortexSession,
+    ) -> VortexResult<Self::Options>;
+
+    /// Arity is always 1. For multi-column aggregates (e.g., covariance), the input
+    /// should be a struct array containing the columns.
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Exact(1)
+    }
+
+    /// Result dtype for each group (not a list — the scalar result type).
+    fn return_dtype(
+        &self,
+        options: &Self::Options,
+        input_dtypes: &[DType],
+    ) -> VortexResult<DType>;
+
+    /// Format in SQL style, e.g. `sum(col)`.
+    fn fmt_sql(
+        &self,
+        options: &Self::Options,
+        expr: &Expression,
+        f: &mut Formatter<'_>,
+    ) -> fmt::Result;
+
+    /// Create an accumulator for streaming aggregation.
+    ///
+    /// This is the core primitive. All other execution methods have default
+    /// implementations built on this.
+    fn accumulator(
+        &self,
+        options: &Self::Options,
+        input_dtype: &DType,
+    ) -> VortexResult<Box<dyn Accumulator>>;
+
+    /// One-shot grouped execution over contiguous elements + monotonic offsets.
+    ///
+    /// Default: loop over groups, slice elements, accumulate each, flush, finish.
+    /// Override for vectorized one-shot paths (e.g., SIMD segmented sum).
+    fn execute_grouped(
+        &self,
+        options: &Self::Options,
+        input: &ArrayRef,
+        offsets: &ArrayRef,
+    ) -> VortexResult<ArrayRef> {
+        let n_groups = offsets.len() - 1;
+        let mut acc = self.accumulator(options, input.dtype())?;
+        for i in 0..n_groups {
+            let start = scalar_at(offsets, i)?;
+            let end = scalar_at(offsets, i + 1)?;
+            acc.accumulate(&input.slice(start..end)?)?;
+            acc.flush()?;
+        }
+        acc.finish()
+    }
+
+    /// Ungrouped full-column aggregation returning a scalar.
+    ///
+    /// Default: single-group execute_grouped with offsets [0, n].
+    /// Replaces standalone `ComputeFn` kernels like `compute::sum()`.
    fn execute_scalar(
+        &self,
+        options: &Self::Options,
+        input: &ArrayRef,
+    ) -> VortexResult<Scalar> {
+        let n = input.len();
+        let offsets: ArrayRef = buffer![0u64, n as u64].into_array();
+        let result = self.execute_grouped(options, input, &offsets)?;
+        result.scalar_at(0)
+    }
+}
+```
+
+Key differences from `ScalarFnVTable`:
+
+| Property | `ScalarFnVTable` | `AggregateFnVTable` |
+|----------|-------------------|----------------------|
+| Row semantics | 1 input → 1 output | N inputs → 1 output per group |
+| Core primitive | `execute(args) -> ArrayRef` | `accumulator() -> Box<dyn Accumulator>` |
+| Input shape | All children same length | Flat elements + offsets defining variable-size groups |
+| Streaming | N/A | accumulate/flush/finish across chunks |
+
+### Streaming AVG example
+
+To illustrate how the accumulator works for a stateful aggregate, consider `Mean`:
+
+```rust
+struct MeanAccumulator {
+    running_sum: f64,
+    running_count: u64,
+    output: Vec<f64>,
+}
+
+impl Accumulator for MeanAccumulator {
+    fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()> {
+        self.running_sum += Sum.execute_scalar(&EmptyOptions, batch)?.as_f64()?;
+        self.running_count += Count.execute_scalar(&EmptyOptions, batch)?.as_u64()?;
+        Ok(())
+    }
+
+    fn flush(&mut self) -> VortexResult<()> {
+        let mean = if self.running_count > 0 {
+            self.running_sum / self.running_count as f64
+        } else {
+            f64::NAN
+        };
+        self.output.push(mean);
+        self.running_sum = 0.0;
+        self.running_count = 0;
+        Ok(())
+    }
+
+    fn finish(self: Box<Self>) -> VortexResult<ArrayRef> {
+        Ok(PrimitiveArray::from_vec(self.output).into_array())
+    }
+}
+```
+
+For a list column `[[1.0, 2.0, 3.0], [4.0, 5.0]]` where the first list's elements are
+chunked as `[1.0, 2.0]` and `[3.0]`:
+
+1. `accumulate([1.0, 2.0])` → running_sum=3.0, running_count=2
+2. `accumulate([3.0])` → running_sum=6.0, running_count=3
+3. `flush()` → push 2.0, reset
+4. `accumulate([4.0, 5.0])` → running_sum=9.0, running_count=2
+5. `flush()` → push 4.5, reset
+6. 
`finish()` → `[2.0, 4.5]` + +### Built-in aggregates + +The initial set, each implementing `AggregateFnVTable`: + +```rust +pub struct Sum; // sum of elements per group +pub struct Count; // count of non-null elements per group +pub struct Min; // minimum element per group +pub struct Max; // maximum element per group +pub struct Mean; // mean of elements per group (returns f64) +pub struct Any; // logical OR per group (bool input) +pub struct All; // logical AND per group (bool input) +pub struct CollectList; // identity: collect elements into list (returns list) +``` + +**Migration strategy**: `execute_scalar` replaces standalone `ComputeFn` kernel dispatch +(e.g., `Sum::execute_scalar` replaces `compute::sum()`). The `accumulator` implementations +can start simple (canonicalize + iterate) and be optimized over time with encoding-aware +fast paths. + +### `ListAggregate` scalar function + +A single `ScalarFnVTable` implementation that bridges list columns to the aggregate system. +Because `ListAggregate` is a scalar function, wrapping it in an expression produces a +`ScalarFnArray` — reusing the existing lazy evaluation, slicing, and reduce infrastructure +with no new array type needed. 
+
+```rust
+pub struct ListAggregate;
+
+pub struct ListAggregateOptions {
+    pub aggregate_fn: AggregateFnRef,
+}
+
+impl ScalarFnVTable for ListAggregate {
+    type Options = ListAggregateOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.list.aggregate")
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Exact(1)
+    }
+
+    fn return_dtype(
+        &self,
+        options: &Self::Options,
+        arg_dtypes: &[DType],
+    ) -> VortexResult<DType> {
+        let element_dtype = arg_dtypes[0].as_list_element()
+            .ok_or_else(|| vortex_err!("ListAggregate input must be a list"))?;
+        options.aggregate_fn.return_dtype(&[element_dtype.clone()])
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: ExecutionArgs,
+    ) -> VortexResult<ArrayRef> {
+        let list_input = &args.inputs[0];
+        // Execute the child into a ListArray, decompose, and aggregate.
+        let list = list_input.to_list()?;
+        let elements = list.elements();
+        let offsets = list.offsets();
+
+        // If elements are chunked, use the accumulator to stream through chunks.
+        // Otherwise, use one-shot execute_grouped.
+        if elements.is::<ChunkedArray>() {
+            let mut acc = options.aggregate_fn.accumulator(elements.dtype())?;
+            let n_groups = offsets.len() - 1;
+            for i in 0..n_groups {
+                let group_elements = elements.slice(
+                    offset_at(&offsets, i)..offset_at(&offsets, i+1)
+                )?;
+                for chunk in iter_chunks(&group_elements) {
+                    acc.accumulate(&chunk)?;
+                }
+                acc.flush()?;
+            }
+            acc.finish()
+        } else {
+            options.aggregate_fn.execute_grouped(&elements, &offsets)
+        }
+    }
+
+    // ... 
+} +``` + +Expression-level sugar: + +```rust +pub fn list_sum(list: Expression) -> Expression { + ListAggregate.new_expr( + ListAggregateOptions { aggregate_fn: Sum.bind(EmptyOptions) }, + [list], + ) +} + +pub fn list_count(list: Expression) -> Expression { + ListAggregate.new_expr( + ListAggregateOptions { aggregate_fn: Count.bind(EmptyOptions) }, + [list], + ) +} + +// list_min, list_max, list_mean, list_any, list_all analogously +``` + +This is one scalar function parameterized by the aggregate — not N separate functions. +Analogous to DuckDB's `list_aggregate(list, 'sum')`. + +**Why this works without a dedicated array type:** + +1. **Lazy evaluation.** `ScalarFnArray(ListAggregate(Sum), [list_col])` defers computation + until `execute()` is called, just as it does for any other scalar function. +2. **Slicing.** `ScalarFnArray` already slices its children. Slicing the list child + correctly adjusts both grouping and elements. +3. **Reduce rules.** `ListAggregate` can implement `ScalarFnVTable::reduce` for self-reduce + optimizations (constant folding, statistics). Encoding-specific reduce_parent rules + use `ExactScalarFn` to match on the parent (see below). +4. **Type safety.** The single child has `DType::List` — enforced by `return_dtype`. + +### Streaming ordered GROUP BY + +For databases with sorted storage, GROUP BY over a sorted key maps directly to the +aggregate/list-scalar duality. The query engine processes batches where: + +1. Groups are defined by a sorted key column (e.g., `customer_id`). +2. Groups are ordered and complete within each batch (no cross-batch groups). +3. No hash tables or shuffling needed. + +**Physical execution per batch:** + +```rust +fn execute_ordered_group_by( + value_column: &ArrayRef, + key_column: &ArrayRef, // sorted + aggregate: &AggregateFnRef, +) -> VortexResult { + // Step 1: Compute group offsets from run boundaries in the key column. 
+ // E.g., key = [A, A, A, B, B, C] → offsets = [0, 3, 5, 6] + let group_offsets = compute_run_offsets(key_column)?; + + // Step 2: Wrap as a list — zero-copy, just references. + let list = ListArray::new(value_column.clone(), group_offsets, Validity::NonNullable); + + // Step 3: Create lazy ScalarFnArray with ListAggregate. + let scalar_fn = ListAggregate.bind(ListAggregateOptions { + aggregate_fn: aggregate.clone(), + }); + let result = ScalarFnArray::try_new(scalar_fn, vec![list.into_array()])?; + + // Step 4: Execute (or let the optimizer reduce first). + result.into_array().execute(ctx) +} +``` + +The same `execute_grouped` serves both list scalar functions (where the list already exists +in the schema) and GROUP BY (where the list is constructed from a flat column + +sort-derived offsets). + +### Reduce rules + +Since `ListAggregate` is a `ScalarFnVTable`, its reduce rules use the existing +`ScalarFnArray` optimization infrastructure. + +**Self-reduce rules (via `ScalarFnVTable::reduce`):** + +`ListAggregate` implements `ScalarFnVTable::reduce` to handle aggregate-specific +optimizations when the list child's structure is known: + +- **Constant list folding**: If the list child is a `ConstantArray` of list scalars, + compute the aggregate directly. `ListAggregate(Sum, Constant([[1,2,3]], 1000))` + → `Constant(6, 1000)`. + +- **Count from metadata**: `ListAggregate(Count, list)` can use list sizes from metadata + without touching element data. + +- **Min/Max from statistics**: If element-level statistics are available, `Min`/`Max` can + short-circuit without decompression. + +- **Sum of constant elements**: `ListAggregate(Sum, list)` where elements are constant + → `constant_value * group_size` per group. + +**Parent-reduce rules (encoding-specific, via `ExactScalarFn`):** + +Child encodings register reduce_parent rules that match on `ExactScalarFn`. 
+The `ExactScalarFn` matcher provides typed access to `ListAggregateOptions`, from which the
+specific `AggregateFnRef` can be inspected:
+
+```rust
+impl ArrayParentReduceRule for DictAggregateRule {
+    type Parent = ExactScalarFn<ListAggregate>;
+
+    fn reduce_parent(
+        &self,
+        array: &DictArray,
+        parent: ScalarFnArrayView<'_, ListAggregate>,
+        child_idx: usize,
+    ) -> VortexResult<Option<ArrayRef>> {
+        let aggregate_fn = &parent.options.aggregate_fn;
+
+        // Min/Max over Dict elements → Min/Max(values)
+        if aggregate_fn.is::<Min>() || aggregate_fn.is::<Max>() {
+            // Min/max of dictionary values is the global min/max.
+            let values = array.values();
+            return Ok(Some(
+                ListAggregate.new_expr_with(
+                    parent.options.clone(),
+                    // Rebuild list with dict values as elements
+                    // ...
+                ).into_array()
+            ));
+        }
+
+        Ok(None)
+    }
+}
+```
+
+Example encoding-specific rules:
+
+- **Dict**: `ListAggregate(Min, List(Dict(codes, values)))` → `ListAggregate(Min, List(values))`.
+  `Max` similarly. `Sum` cannot push down without frequency weighting.
+
+- **RunEnd**: `ListAggregate(Sum, List(RunEnd(values, run_ends)))` → weighted sum.
+  `ListAggregate(Min/Max, List(RunEnd))` → `ListAggregate(Min/Max, List(values))`.
+
+These rules operate on the **elements** within the list child. The list's offsets define
+the grouping; the elements are what the encoding wraps. A list child like
+`ListArray(elements: DictArray, offsets: ...)` allows the Dict reduce_parent rule to fire
+on the elements when the `ScalarFnArray(ListAggregate)` is being optimized.
+
+### Interaction with `list_contains`
+
+The existing `list_contains` function is NOT an aggregate — it's a true scalar function
+(takes a list + needle, returns bool per row). It stays as-is. `ListAggregate` is
+specifically for operations that reduce list elements to a scalar per list.
+
+## Canonical list representation
+
+The choice of canonical list encoding directly affects how cleanly the aggregate duality
+works. Currently, `ListViewArray` is canonical for `DType::List`. 
This section evaluates +whether to switch to `ListArray`. + +### Option A: Switch canonical to `ListArray` (offsets-only) + +Change `Canonical::List` from `ListViewArray` to `ListArray`. + +`ListArray` stores elements + an (n+1)-length monotonically increasing offsets array. +List `i` spans `elements[offsets[i]..offsets[i+1]]`. + +**Advantages for the aggregate duality:** + +- `execute_grouped` receives offsets directly from the list — no conversion needed. +- Offsets are monotonic by construction, matching the `execute_grouped` contract. +- One array (n+1 integers) vs two arrays (2n integers) — less memory. +- Offset deltas (list sizes) compress extremely well with delta/FoR encoding. +- Direct match to Arrow `ListArray` and DuckDB's list format — zero-copy Arrow export. + +**Disadvantages:** + +- **Breaking change.** All code calling `to_canonical()` on list data gets `ListArray` + instead of `ListViewArray`. Migration required. +- **No out-of-order offsets.** Operations like filter/take that scramble row order must + rebuild the elements array (or produce a `ListViewArray` as a non-canonical intermediate). +- **Sequential dependency in offsets.** Reading `offsets[i+1]` depends on `offsets[i]` for + computing list size — slightly less SIMD-friendly than separate sizes. + +**Migration path:** + +1. Add `to_list()` on `ToCanonical` alongside `to_listview()`. +2. Change `Canonical::List` to hold `ListArray`. +3. `ListViewArray` becomes a non-canonical encoding (like `VarBinArray` for strings). +4. Update `list_contains` and all list compute paths. + +### Option B: Keep `ListViewArray` as canonical + +Keep the current design where `ListViewArray` (offsets + sizes, out-of-order allowed) +is canonical for `DType::List`. + +**Advantages:** + +- No breaking change. +- SIMD-friendly: sizes can be read independently without sequential offset dependency. +- Out-of-order offsets enable better compression when list order doesn't match element order. 
+- Filter/take produce valid `ListViewArray` directly without rebuilding. + +**Disadvantages for the aggregate duality:** + +- `execute_grouped` requires monotonic offsets. Out-of-order `ListViewArray` must be + rebuilt to sorted form first (the `MakeZeroCopyToList` path), which copies data. +- The ZCTL flag, rebuild modes, and validation code remain as ongoing complexity. +- Arrow/DuckDB export still requires conversion. +- Two arrays (2n integers) instead of one (n+1 integers). + +**Impact on `ListAggregate::execute`:** + +`ListAggregate::execute` would need to handle the non-sorted case: + +```rust +fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult { + let list_input = &args.inputs[0]; + let listview = list_input.to_listview()?; + if listview.is_zero_copy_to_list() { + // Fast path: offsets are sorted, use directly + let offsets = build_list_offsets_from_list_view(&listview); + options.aggregate_fn.execute_grouped(&listview.elements(), &offsets) + } else { + // Slow path: rebuild to sorted form first + let list = list_from_list_view(listview)?; + options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets()) + } +} +``` + +### ListView as direct aggregate input + +An alternative to canonicalizing to ListArray before executing is to support `ListView` +directly in `execute_grouped`. A ListView with out-of-order offsets represents +variable-length groups that may reference scattered regions of the elements array. + +**Advantages:** + +- More powerful — supports aggregation over non-contiguous element ranges. +- Avoids the expensive rebuild-to-sorted-form step for scrambled ListViews. + +**Disadvantages:** + +- **Complexity in aggregate implementations.** Each aggregate must handle potentially + scattered, non-contiguous element access patterns instead of simple contiguous slices. 
+- **Unpredictable decompression cost.** A ListView with sparse, scattered offsets may + trigger excessive random-access decompression of the elements array. In the worst case + this is more expensive than one-pass decompression of all elements into contiguous form. +- **SIMD unfriendly.** Contiguous offset ranges map to sequential memory access patterns + that vectorize well. Scattered access does not. + +For now, this RFC assumes `execute_grouped` takes monotonic (n+1) offsets. Aggregates +over ListView canonicalize to sorted form first. Supporting scattered access as an +optimization can be explored in the future. + +### Comparison + +| Criterion | ListArray (Option A) | ListViewArray (Option B) | +|-----------|---------------------|--------------------------| +| Aggregate duality | Direct — offsets define groups | Indirect — must ensure sorted or rebuild | +| Arrow/DuckDB export | Zero-copy | Requires conversion | +| Memory | n+1 integers | 2n integers | +| Filter/take | Must rebuild elements | Produces valid array directly | +| SIMD for size access | Sequential offset dependency | Independent size reads | +| Offset compression | Delta/FoR on monotonic data | May be non-monotonic | +| Breaking change | Yes | No | + +## Compatibility + +This RFC does not change the file format or wire format. `ListAggregate` produces a +`ScalarFnArray` at runtime (like any other scalar function). It is not persisted to disk. + +**Public API additions:** + +- `Accumulator` trait — core streaming aggregation primitive. +- `AggregateFnVTable` trait and built-in implementations (Sum, Count, Min, Max, Mean, + Any, All, CollectList). +- `ListAggregate` scalar function. +- Expression constructors: `list_sum()`, `list_count()`, `list_min()`, `list_max()`, + `list_mean()`, `list_any()`, `list_all()`. + +**If canonical list changes (Option A):** + +- `Canonical::List` changes from `ListViewArray` to `ListArray`. 
+- `to_canonical()` on list data returns `ListArray` instead of `ListViewArray`. +- `ListViewArray` remains available as a non-canonical encoding. +- Migration: callers using `to_listview()` should transition to `to_list()`. + +## Drawbacks + +- **New trait surface area.** `AggregateFnVTable` and `Accumulator` are new traits to + learn, though they closely mirror `ScalarFnVTable` and the existing accumulator pattern + in `compute::sum`. + +- **Reduce rule coverage.** Not all encoding × aggregate combinations will have optimized + reduce_parent rules initially. The fallback (canonicalize list, then accumulator loop) + is correct but may be slower. + +- **Canonical list change (if Option A).** Breaking change affecting all code that calls + `to_canonical()` on list data. + +## Alternatives + +### List scalar functions as separate `ScalarFnVTable` implementations + +Implement `ListSum`, `ListCount`, `ListMin`, etc. as individual scalar functions +(like `list_contains`), each operating directly on list arrays without an aggregate +abstraction. + +**Rejected because:** Duplicates logic across N functions. No shared optimization rules. +No path to GROUP BY. Each function must independently handle list decomposition. This is +exactly the redundancy that motivates this RFC. + +### Aggregates as `ComputeFn` only + +Keep aggregates in the kernel-based `ComputeFn` system. Implement list scalar functions +by dispatching to `ComputeFn` with offset ranges. + +**Rejected because:** No lazy evaluation, no expression tree participation, no +reduce_parent optimization. This is the status quo for `sum`/`min_max` and has the +limitations this RFC addresses. 
+
+### Dedicated `AggregateFnArray` array type
+
+Introduce a new array type `AggregateFnArray` parallel to `ScalarFnArray`, wrapping
+a single list child and an `AggregateFnRef`:
+
+```rust
+struct AggregateFnArray {
+    aggregate_fn: AggregateFnRef,
+    dtype: DType,
+    child: ArrayRef, // must have DType::List
+    stats: ArrayStats,
+}
+```
+
+A `ListAggregate` scalar function would return an `AggregateFnArray` from its
+`execute` method, creating two lazy wrappers in sequence:
+`ScalarFnArray(ListAggregate) → AggregateFnArray → actual computation`.
+
+**Rejected because:** `AggregateFnArray` is structurally identical to
+`ScalarFnArray` with one child — it duplicates the lazy evaluation, slicing, and reduce
+infrastructure. Child encodings can already match on specific scalar functions using
+`ExactScalarFn`, which provides typed access to `ListAggregateOptions`
+and the embedded `AggregateFnRef`. No new array type is needed.
+
+### No accumulator — one-shot `execute_grouped` only
+
+Make `execute_grouped` the only execution method, with no streaming accumulation.
+
+**Rejected because:** Doesn't handle chunked elements. A list column may have elements
+stored as a `ChunkedArray`. Without an accumulator, all elements must be materialized
+into a single contiguous array before aggregation, defeating the purpose of chunked
+storage. Additionally, aggregates like `Mean` need `running_sum` and `running_count`
+state across element batches, and numerically stable summation (Kahan) requires ordered
+accumulation. The existing `sum_with_accumulator` pattern in `ComputeFn` demonstrates
+this need.
+
+### Grouped accumulator tracking N groups simultaneously
+
+An accumulator that manages all groups at once: `accumulate(input, offsets)` where offsets
+has N+1 entries per call, and sub-offsets are split at chunk boundaries.
+
+**Rejected because:** Over-engineered for our use cases. 
For list scalar functions and +ordered GROUP BY, groups are processed sequentially — each group's elements are fully +available before the next group starts. The per-group flush model is simpler and handles +all cases. If vectorized all-groups-at-once processing proves necessary, it can be added +as a separate `GroupsAccumulator` trait (see Future Possibilities). + +## Prior Art + +- **Apache Arrow**: Separates `ListArray` (offsets-only) from `ListView` (offsets+sizes). + Arrow Compute has aggregate kernels separate from scalar kernels. ListView is a recent + addition, not widely adopted. + +- **DuckDB**: Uses offsets-only lists. Has `list_aggregate(list, 'func_name')` — a single + function parameterized by the aggregate name. Separates scalar and aggregate function + registries. + +- **Apache DataFusion**: Two-tier accumulator design. `AggregateUDFImpl` has a required + `accumulator()` factory method (like ours). The `Accumulator` trait handles one group at + a time with `update_batch()` → `evaluate()`, plus `state()` and `merge_batch()` for + partial aggregation (serializing intermediate state for distributed execution). The + optional `GroupsAccumulator` trait manages all groups simultaneously with + `group_indices: &[usize]` per row for vectorized hash-based grouping, plus + `evaluate(EmitTo::First(n))` for streaming emission. Our `accumulate/flush/finish` + corresponds to DataFusion's per-group `Accumulator` with built-in output buffering. + Their `state/merge` and `GroupsAccumulator` are future extensions for us (see below). + +- **Velox**: `AggregateFunction` with `addRawInput`, `addIntermediateResults`, + `extractValues` — full streaming accumulator model. Uses row-major (offset, size) + pairs for lists. + +- **Polars**: Separate scalar and aggregate expression systems. List namespace functions + like `list.sum()`, `list.min()` internally dispatch to grouped aggregation over the + list's flat values. 
+
+## Unresolved Questions
+
+- **`list_contains` integration**: Should `list_contains` be reimplemented as a
+  `ListAggregate` with a `Contains` aggregate, or does it remain a separate scalar
+  function? It doesn't fit the "reduce N to 1" pattern cleanly since it takes both a
+  list and a needle.
+
+- **Multi-column aggregates**: Some aggregates operate on multiple columns
+  (e.g., covariance, weighted sum). With arity fixed at 1, these require a struct array
+  as input. Is this ergonomic enough, or do we need a separate mechanism?
+
+- **Window functions**: Window functions (rolling sum, rank) operate on sliding windows
+  rather than fixed groups. They share the "operate within parent boundaries" property
+  but have different execution semantics. Should they be a separate trait or an
+  extension of `AggregateFnVTable`?
+
+- **Canonical list decision**: Option A (ListArray) vs Option B (ListViewArray). This
+  can be decided independently of the aggregate framework and implemented as a
+  preparatory or follow-up change.
+
+- **`CollectList` semantics**: `CollectList` is the identity aggregate — it returns
+  the elements as a list. For the list-scalar case, `list_collect(list_col)` is a no-op.
+  For GROUP BY, it materializes the grouped elements into a list column. Should this
+  be a built-in aggregate or handled separately?
+
+- **Null list handling**: When a list entry is null, should `flush()` push null into
+  the output? Or should the caller skip accumulation and use a separate mechanism to
+  mark null outputs? The accumulator currently has no way to signal "this group is null"
+  without receiving any elements.
+
+## Future Possibilities
+
+- **Partial aggregation with `state()` / `merge()`**: Adding `state()` and
+  `merge_batch()` methods to `Accumulator` (like DataFusion) for distributed aggregation
+  where partial results must be serialized and merged across nodes. Not needed for
+  the sequential processing model in this RFC.
+ +- **`GroupsAccumulator`**: A separate trait (like DataFusion's) that manages all groups + simultaneously with `group_indices` per row, enabling vectorized hash-based grouping. + Our per-group flush model handles ordered GROUP BY; this extension would handle + unordered GROUP BY with hash tables. + +- **Aggregate push-down in file scanning**: Using `ListAggregate` reduce rules to + push aggregates into `LayoutReader`, computing aggregates during file scan without + materializing full columns. + +- **`list_distinct`, `list_sort`, `list_reverse`**: List transformation functions that + don't reduce elements to a scalar. These are not aggregates and would remain as + `ScalarFnVTable` implementations. + +- **Nested aggregation**: `list_sum(list_of_lists)` producing a list of sums at the + inner level. The duality recurses naturally. + +- **GPU execution**: `execute_grouped` with monotonic offsets maps cleanly to GPU + segmented reduction primitives (e.g., `cub::DeviceSegmentedReduce`). + +- **ListView scatter optimization**: If profiling shows that rebuilding scrambled + ListViews to sorted form is a bottleneck, aggregate implementations could accept + `(offsets, sizes)` pairs for scattered access. This would be opt-in per aggregate. From c335c0d48f52afb3ad08377f2ad27bec8fb00278 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 07:59:21 -0500 Subject: [PATCH 3/9] RFCs Signed-off-by: Nicholas Gates --- ...02-aggregate-functions-and-list-scalars.md | 62 ++++++++++--------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/proposals/0002-aggregate-functions-and-list-scalars.md b/proposals/0002-aggregate-functions-and-list-scalars.md index 854de89..0cc053a 100644 --- a/proposals/0002-aggregate-functions-and-list-scalars.md +++ b/proposals/0002-aggregate-functions-and-list-scalars.md @@ -115,15 +115,15 @@ the next one. 
The accumulator owns its output buffer and returns all results at pub trait Accumulator: Send + Sync { /// Feed a batch of element values for the current group. /// Can be called multiple times per group (e.g., chunked elements). - fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()>; + fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()>; /// Finalize the current group: push its result into the output buffer /// and reset internal state for the next group. - fn flush(&mut self) -> VortexResult<()>; + fn flush(&mut self, ctx: &mut ExecutionCtx) -> VortexResult<()>; /// Return all flushed results as a single array. /// Length = number of flush() calls. - fn finish(self: Box) -> VortexResult; + fn finish(self: Box, ctx: &mut ExecutionCtx) -> VortexResult; } ``` @@ -134,10 +134,10 @@ This handles all aggregation cases uniformly: let mut acc = aggregate.accumulator(element_dtype)?; for i in 0..n_lists { let group_elements = elements.slice(offsets[i]..offsets[i+1])?; - acc.accumulate(&group_elements)?; - acc.flush()?; + acc.accumulate(&group_elements, ctx)?; + acc.flush(ctx)?; } -acc.finish() // → ArrayRef of length n_lists +acc.finish(ctx) // → ArrayRef of length n_lists ``` **Case 2 — Large lists with chunked elements:** @@ -148,11 +148,11 @@ let mut acc = aggregate.accumulator(element_dtype)?; for i in 0..n_lists { let group_elements = elements.slice(offsets[i]..offsets[i+1])?; for chunk in iter_chunks(&group_elements) { - acc.accumulate(&chunk)?; + acc.accumulate(&chunk, ctx)?; } - acc.flush()?; + acc.flush(ctx)?; } -acc.finish() +acc.finish(ctx) ``` **Case 3 — Ungrouped full-column aggregation:** @@ -160,10 +160,10 @@ One group, fold across chunks: ```rust let mut acc = aggregate.accumulator(dtype)?; for chunk in chunked_array.chunks() { - acc.accumulate(&chunk)?; + acc.accumulate(&chunk, ctx)?; } -acc.flush()?; -acc.finish() // → 1-element array +acc.flush(ctx)?; +acc.finish(ctx) // → 1-element array ``` **Case 4 — 
ListView with scattered elements:** @@ -228,16 +228,17 @@ pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync { options: &Self::Options, input: &ArrayRef, offsets: &ArrayRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { let n_groups = offsets.len() - 1; let mut acc = self.accumulator(options, input.dtype())?; for i in 0..n_groups { let start = scalar_at(offsets, i)?; let end = scalar_at(offsets, i + 1)?; - acc.accumulate(&input.slice(start..end)?)?; - acc.flush()?; + acc.accumulate(&input.slice(start..end)?, ctx)?; + acc.flush(ctx)?; } - acc.finish() + acc.finish(ctx) } /// Ungrouped full-column aggregation returning a scalar. @@ -248,10 +249,11 @@ pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync { &self, options: &Self::Options, input: &ArrayRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { let n = input.len(); let offsets: ArrayRef = buffer![0u64, n as u64].into_array(); - let result = self.execute_grouped(options, input, &offsets)?; + let result = self.execute_grouped(options, input, &offsets, ctx)?; result.scalar_at(0) } } @@ -278,13 +280,13 @@ struct MeanAccumulator { } impl Accumulator for MeanAccumulator { - fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()> { - self.running_sum += Sum.execute_scalar(&EmptyOptions, batch)?.as_f64()?; - self.running_count += Count.execute_scalar(&EmptyOptions, batch)?.as_u64()?; + fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()> { + self.running_sum += Sum.execute_scalar(&EmptyOptions, batch, ctx)?.as_f64()?; + self.running_count += Count.execute_scalar(&EmptyOptions, batch, ctx)?.as_u64()?; Ok(()) } - fn flush(&mut self) -> VortexResult<()> { + fn flush(&mut self, _ctx: &mut ExecutionCtx) -> VortexResult<()> { let mean = if self.running_count > 0 { self.running_sum / self.running_count as f64 } else { @@ -296,7 +298,7 @@ impl Accumulator for MeanAccumulator { Ok(()) } - fn finish(self: Box) -> VortexResult { + fn finish(self: Box, _ctx: &mut 
ExecutionCtx) -> VortexResult { Ok(PrimitiveArray::from_vec(self.output).into_array()) } } @@ -373,6 +375,7 @@ impl ScalarFnVTable for ListAggregate { args: ExecutionArgs, ) -> VortexResult { let list_input = &args.inputs[0]; + let ctx = args.ctx; // Execute the child into a ListArray, decompose, and aggregate. let list = list_input.to_list()?; let elements = list.elements(); @@ -388,13 +391,13 @@ impl ScalarFnVTable for ListAggregate { offset_at(&offsets, i)..offset_at(&offsets, i+1) )?; for chunk in iter_chunks(&group_elements) { - acc.accumulate(&chunk)?; + acc.accumulate(&chunk, ctx)?; } - acc.flush()?; + acc.flush(ctx)?; } - acc.finish() + acc.finish(ctx) } else { - options.aggregate_fn.execute_grouped(&elements, &offsets) + options.aggregate_fn.execute_grouped(&elements, &offsets, ctx) } } @@ -452,6 +455,7 @@ fn execute_ordered_group_by( value_column: &ArrayRef, key_column: &ArrayRef, // sorted aggregate: &AggregateFnRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { // Step 1: Compute group offsets from run boundaries in the key column. // E.g., key = [A, A, A, B, B, C] → offsets = [0, 3, 5, 6] @@ -617,15 +621,16 @@ is canonical for `DType::List`. ```rust fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult { let list_input = &args.inputs[0]; + let ctx = args.ctx; let listview = list_input.to_listview()?; if listview.is_zero_copy_to_list() { // Fast path: offsets are sorted, use directly let offsets = build_list_offsets_from_list_view(&listview); - options.aggregate_fn.execute_grouped(&listview.elements(), &offsets) + options.aggregate_fn.execute_grouped(&listview.elements(), &offsets, ctx) } else { // Slow path: rebuild to sorted form first let list = list_from_list_view(listview)?; - options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets()) + options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets(), ctx) } } ``` @@ -691,8 +696,7 @@ This RFC does not change the file format or wire format. 
`ListAggregate` produce ## Drawbacks - **New trait surface area.** `AggregateFnVTable` and `Accumulator` are new traits to - learn, though they closely mirror `ScalarFnVTable` and the existing accumulator pattern - in `compute::sum`. + learn, though they closely mirror `ScalarFnVTable`. - **Reduce rule coverage.** Not all encoding × aggregate combinations will have optimized reduce_parent rules initially. The fallback (canonicalize list, then accumulator loop) From 8b67c21d627c9fbd9ab1de66cc9e7dd06f3c577f Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 20:42:20 -0500 Subject: [PATCH 4/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 1050 +++++++++----------------------- 1 file changed, 275 insertions(+), 775 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 7ad5811..0500359 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,923 +4,423 @@ ## Summary -Replace the current execution model with a cleaner architecture that: -- Enables decompress-into-buffer (caller-owned output) for zero-copy decompression -- Passes a `Mask` to the execution method, forcing encodings to handle row selection -- Cleans up the reduce/execute boundary and removes migration-era adaptor boilerplate +Replace the current execution VTable with a cleaner model. `reduce` and `reduce_parent` remain +metadata-only in both proposals and are identical. The proposals differ in how `execute` and +`execute_parent` work: -Two candidate designs are presented: **Option A** (three-phase recursive) and **Option B** -(cleaned-up iterative). Both share the `Mask`-based row selection and the reduce boundary cleanup. They -differ in how materialization is structured -- Option A passes a caller-owned builder through -recursive descent, Option B keeps an iterative loop with typed execution steps. 
+- **Proposal A (Scheduler-driven)**: `execute` returns an `ExecutionStep` — either requesting + the scheduler to execute a specific child, or declaring that it is done. The scheduler drives + all iteration. `execute_parent` is retained and returns an `ArrayRef` (any encoding). + +- **Proposal B (Canonical builder)**: `execute` pushes its result into a caller-owned + `CanonicalBuilder`. The result is always canonical. `execute_parent` is retained and also + pushes into the builder. ## Motivation -**No decompress-into-buffer.** The current execution model has no way to decompress an array -into a caller-provided output buffer. Each encoding allocates its own output during execution. -This means ChunkedArray must decompress each chunk separately and concatenate the results -- -an extra copy of the entire column. Exporters (DuckDB, Arrow) cannot write directly into their -output format; they must receive Vortex's output and copy it. This is the primary pain point. - -**Stack overflows from recursive execution.** The current executor is recursive: each -`execute` call may recurse into children, which recurse into their children. On deep array -trees (deeply nested expressions, many-chunk arrays) this overflows the stack. The recursion -depth is bounded by a 128-iteration limit, but this limit applies to the outer loop -- inner -recursive calls have no bound. - -**Easy to forget selection implementations.** Filter (boolean mask), Slice (contiguous range), -and Take (integer indices) are separate adaptor traits. Each encoding can implement one and -forget the others. Fusing selection with decompression is really just -`execute_parent(FilterArray)` -- which is fine -- but the current design makes it easy to miss. -Passing a `Mask` parameter to the execution method forces every encoding to handle -selection, eliminating the gap. - -**Adaptor explosion is migration noise.** The adaptor types (FilterReduceAdaptor, -SliceExecuteAdaptor, TakeExecuteAdaptor, etc.) 
are a bridge from the old compute kernel -dispatch to the new VTable world. The reduce/execute distinction is already enforced -- we just -haven't deprecated the APIs that don't require an `ExecutionCtx`. The ~40+ adaptor impls are -migration artifacts, not a fundamental design problem, but they add noise and make the codebase -harder to navigate. - -**Strict metadata-only reduce is useful for GPU.** When buffers live on a GPU device, -host-side code cannot easily access them. A clean metadata-only reduce boundary is useful -(though not strictly required) for GPU execution paths. +**No decompress-into-buffer.** Encodings allocate their own output during execution. +ChunkedArray must decompress each chunk separately and concatenate — an extra copy of the +entire column. Exporters (DuckDB, Arrow) can't write directly into their output format. -## Shared design +**Stack overflow from recursion.** The current executor recurses into children. Deep encoding +trees overflow the stack. -Both options share these design elements. +**Unclear execute/reduce boundary.** Some `execute_parent` implementations are metadata-only +and belong in `reduce_parent`. The boundary isn't enforced. -### Mask as row selection +## Shared design -The existing `Mask` type (in `vortex-mask`) already represents row selection with optimized -internal variants: `AllTrue(usize)`, `AllFalse(usize)`, and `Values(Arc)` for a -full bitmap. It already supports `slice()` for restricting to a range. No new type is needed -- -`Mask` is passed directly to the execution method as the row selection parameter. +### reduce / reduce_parent (identical in both proposals) -Composition is mask-over-mask: slicing a mask produces a narrower mask. The main addition is a -`compose` method (or equivalent) for restricting one mask by another, and `apply` for -compacting buffer elements. +Both proposals keep these methods with unchanged signatures. They are **strictly metadata-only** +— they never read data buffers. 
```rust -// Existing type, extended with: -impl Mask { - /// Restrict self by other: keep only positions where both are true. - pub fn compose(&self, other: &Mask) -> Mask; - - /// Apply mask to a buffer, returning a new buffer with only selected elements. - pub fn apply(&self, data: &Buffer) -> BufferMut; -} +fn reduce(array: &Self::Array) -> VortexResult>; +fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, +) -> VortexResult>; ``` -**Take is not selection.** Take (integer indices) can reorder rows, repeat rows, and introduce -nulls via out-of-bounds indices. These semantics are fundamentally different from subsetting. -Gather remains an encoding-internal concern -- DictArray handles codes → values lookup inside -its own materialization. +`reduce` rewrites an array using only metadata. Filter(AllTrue(x)) → x. Constant folding. -**`scalar_at` remains a separate VTable method.** Single-element access via a builder -round-trip would regress hot paths like RunEnd binary search probing individual positions. +`reduce_parent` rewrites a parent from a child's perspective. Dict child pushes ScalarFn into +values. RunEnd child pulls ScalarFn through to its values. FSST child rewrites Compare by +compressing the RHS literal. -### CanonicalBuilder - -`CanonicalBuilder` is specifically for the decompress-into case: a caller creates a builder, -and the encoding writes directly into it. This is structurally incompatible with iterative -execution (where each step returns an owned result), so `CanonicalBuilder` only applies to -Option A's recursive model. +The framework runs these to a fixpoint before execution begins (and, in Proposal A, between +execution steps). -It replaces the current `dyn ArrayBuilder` with a closed enum. The current `ArrayBuilder` is a -trait object -- callers get a `Box` and can only interact with it through -generic methods like `append_scalar()` and `extend_from_array()`. 
Encodings cannot know what -concrete builder they are writing into, so they must produce intermediate arrays or scalars and -let the builder copy the data in. +Implementations currently misplaced in `execute_parent` that are metadata-only (Dict + Compare, +ALP + Compare, FoR + Compare, FSST + Compare) move to `reduce_parent`. Implementations that +are really canonical-type compute kernels (Primitive + Compare, Bool + FillNull, Decimal + +Between) move into `ScalarFn::execute`. -`CanonicalBuilder` is an enum that mirrors the `Canonical` variants in mutable builder form: +### FilterArray -```rust -pub enum CanonicalBuilder { - Null(NullBuilder), - Bool(BoolBuilder), - Primitive(PrimitiveBuilder), - Decimal(DecimalBuilder), - VarBinView(VarBinViewBuilder), - List(ListViewBuilder), - FixedSizeList(FixedSizeListBuilder), - Struct(StructBuilder), - Extension(ExtensionBuilder), -} +FilterArray continues to exist as a lazy wrapper in both models. It is not subsumed by the +execution method. Both models handle it — the difference is what `execute_parent` can return +when a child encoding wants to handle a FilterArray parent (see comparison section). -impl CanonicalBuilder { - pub fn new(dtype: &DType) -> Self { ... } - pub fn finish(self) -> Canonical { ... } -} -``` - -Because it is an enum (not `dyn`), encodings can match on it to access the concrete builder -type directly. This enables zero-copy decompression paths that are impossible with `dyn -ArrayBuilder`. 
For example, FSST can match for the `VarBinView` variant and push decompressed -buffers and views directly into the builder's internal storage, avoiding an intermediate string -allocation: - -```rust -fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, mask, ctx) { - let CanonicalBuilder::VarBinView(vbv_builder) = builder else { - unreachable!("FSST must canonicalize into VarBinView"); - }; - let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?; - vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...); -} -``` - -`CanonicalBuilder` is only canonical at the root level. For Struct, field arrays are -accumulated as `Vec` (potentially still compressed/chunked) rather than being -recursively decompressed into nested builders. +--- -### Reduce boundary cleanup +## Proposal A: Scheduler-driven iterative execution -Both options retain `reduce` and `reduce_parent` unchanged: +### VTable ```rust fn reduce(array: &Self::Array) -> VortexResult>; -fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, -) -> VortexResult>; -``` +fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize) + -> VortexResult>; -Both are strictly metadata-only, never reading data buffers. This boundary is already enforced -today -- the remaining work is deprecating the APIs that don't require an `ExecutionCtx` and -migrating the misplaced implementations. Specifically: the metadata-only rewrites currently -registered as `execute_parent` (Dict + Compare, ALP + Compare, FoR + Compare, FSST + Compare) -move to `reduce_parent`. The canonical-type kernels currently in `execute_parent` (Primitive + -Compare, Bool + FillNull, Decimal + Between) move into the scalar function's `execute` method, -where they belong. +fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) + -> VortexResult; -### Constants - -`ConstantArray` continues to exist as an encoding. 
It is special-cased only in ScalarFnArray -materialization -- one check to avoid expanding constants before passing to the scalar function. -This preserves the `Columnar` enum: - -```rust -pub enum Columnar { - Canonical(Canonical), - Constant(ConstantArray), -} +fn execute_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx) + -> VortexResult>; ``` -**Open question:** The current scalar representation uses heap-allocated value trees for nested -types (structs, lists). For deeply nested types, this is expensive. An alternative is to hold -constants as length-1 arrays instead of scalars, avoiding the scalar value tree entirely. This -is orthogonal to the execution model choice but may affect `ConstantArray`'s internal -representation. - -### Decompression cache and CSE - -`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped -to a single materialization call. When the same array node appears as a child of multiple -ScalarFnArray nodes (e.g., `x + x`), it is materialized once. When a ZStd-compressed array is -sliced into many chunks, the first slice decompresses the block and caches the result. The -cache is dropped after materialization returns. - ---- - -## Option A: Three-phase recursive model - -### VTable surface (3 methods) - ```rust -pub trait VTable { - fn reduce(array: &Self::Array) -> VortexResult>; - fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, - ) -> VortexResult>; +pub enum ExecutionStep { + /// Ask the scheduler to execute the child at this index to columnar, + /// replace it, then call execute on this array again. + ExecuteChild(usize), - /// Materialize this array into a canonical builder, fusing the given row mask - /// with decompression. This is where all buffer-reading work happens. 
-    fn canonicalize_into(
-        array: &Self::Array,
-        builder: &mut CanonicalBuilder,
-        mask: &Mask,
-        ctx: &mut ExecutionCtx,
-    ) -> VortexResult<()>;
+    /// Execution is complete.
+    Done(Columnar),
 }
 ```
 
-| Method | Purpose | Buffer access |
-|--------------------|---------------------------------------------|---------------|
-| `reduce` | Self-rewrite (metadata only) | Never |
-| `reduce_parent` | Rewrite parent (expression push-down, etc.) | Never |
-| `canonicalize_into`| Decompress with fused mask | Yes |
+The encoding never recurses into children. It yields control back to the scheduler, telling it
+what work is needed. The scheduler maintains the work stack.
 
-### Three-phase pipeline
+### Scheduler
 
 ```rust
-pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult {
-    // Phase 1: optimize tree (reduce rules, expression push-down)
-    let array = array.optimize_recursive()?;
-
-    // Phase 2: peel selection wrappers
-    let (array, mask) = array.peel_selection();
-
-    // Phase 3: single recursive descent, no loop
-    let mut builder = CanonicalBuilder::new(array.dtype());
-    array.vtable().canonicalize_into(&array, &mut builder, &mask, ctx)?;
-    Ok(builder.finish())
+fn execute_to_columnar(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Columnar> {
+    let mut array = optimize_recursive(array)?;
+
+    loop {
+        if let Some(c) = array.as_columnar() { return Ok(c); }
+
+        // Try execute_parent (child-driven specialized execution)
+        if let Some(rewritten) = try_execute_parent(&array, ctx)? {
+            array = optimize_recursive(rewritten)?;
+            continue;
+        }
+
+        match array.vtable().execute(&array, ctx)? {
+            ExecutionStep::ExecuteChild(i) => {
+                let child = array.child(i);
+                let executed = execute_to_columnar(child, ctx)?;
+                array = array.with_child(i, executed.into_array());
+                array = optimize_recursive(array)?;
+            }
+            ExecutionStep::Done(result) => return Ok(result),
+        }
+    }
 }
 ```
 
-**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. 
All
-metadata-only rewrites happen here: expression push-down into Dict values, constant folding,
-comparison target transformation for ALP/FoR/FSST.
+After each child execution, `optimize_recursive` runs again — reduce rules can fire on the
+new tree shape. For truly bounded stack depth, an explicit work stack replaces the recursive
+`execute_to_columnar(child)` call.
 
-**Phase 2** strips Filter and Slice wrappers from the root, composing them into a single
-`Mask`. *(Open question: does lifting selection to the root provide meaningful benefit
-beyond what passing `Mask` through recursion already gives? It may be orthogonal to the
-core proposal.)*
+### Per-encoding examples
 
-**Phase 3** is a single recursive descent. Each encoding decompresses with the fused selection.
-No iteration. Selection wrappers deeper in the tree compose themselves into the selection
-during descent:
+**DictArray** — requests codes first, then gathers:
 
 ```rust
-// SliceArray's canonicalize_into: slicing composes as masking out the excluded rows
-fn canonicalize_into(slice: &SliceArray, builder, mask, ctx) {
-    let composed = mask.slice(slice.range());
-    slice.child().canonicalize_into(builder, &composed, ctx)
+fn execute(dict: &DictArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    if !dict.codes().is_columnar() {
+        return Ok(ExecutionStep::ExecuteChild(0));
+    }
+    let codes = dict.codes().as_primitive();
+    let gathered = gather(dict.values(), &codes, ctx)?;
+    Ok(ExecutionStep::Done(gathered))
 }
 ```
 
-### Per-encoding canonicalize_into
-
-- **BitPacked**: unpack only selected positions into PrimitiveBuilder
-- **FSST**: decompress only selected strings into VarBinViewBuilder
-- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those
-- **Dict**: materialize codes with selection, gather from values
-- **Chunked**: split selection across chunks, recurse per-chunk
-- **Struct**: pass selection to each field's materialization
-- **Constant**: create scalar 
repeated to selection count
-
-For fused decompression patterns (e.g., Dict-RLE), the parent checks inline:
+**ScalarFnArray** — requests children left-to-right, then evaluates:
 
 ```rust
-fn canonicalize_into(dict: &DictArray, builder, mask, ctx) {
-    if let Some(runend) = dict.codes().as_opt::() {
-        return fused_dict_rle(dict.values(), runend, builder, mask, ctx);
-    }
-
-    let mut code_builder = primitive_builder(dict.codes().dtype());
-    dict.codes().canonicalize_into(&mut code_builder, mask, ctx)?;
-    let codes = code_builder.finish_into_primitive();
-    take_into_builder(builder, dict.values(), &codes, ctx)
+fn execute(sfn: &ScalarFnArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    for (i, child) in sfn.children().iter().enumerate() {
+        if !child.is_columnar() {
+            return Ok(ExecutionStep::ExecuteChild(i));
+        }
+    }
+    let result = sfn.scalar_fn().execute(sfn.columnar_children(), ctx)?;
+    Ok(ExecutionStep::Done(result))
 }
 ```
 
-### ScalarFnArray materialization
+**FilterArray** — requests child, then applies mask:
 
 ```rust
-fn canonicalize_into(
-    array: &ScalarFnArray,
-    builder: &mut CanonicalBuilder,
-    mask: &Mask,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let children: Vec = array.children().iter()
-        .map(|child| {
-            if let Some(c) = child.as_opt::() {
-                return Ok(Columnar::Constant(c.clone()));
-            }
-            let mut child_builder = CanonicalBuilder::new(child.dtype());
-            child.canonicalize_into(&mut child_builder, mask, ctx)?;
-            Ok(Columnar::Canonical(child_builder.finish()))
-        })
-        .try_collect()?;
-
-    let result = array.scalar_fn().execute(children, ctx)?;
-    builder.extend_from_array(&result);
-    Ok(())
+fn execute(filter: &FilterArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    if !filter.child().is_columnar() {
+        return Ok(ExecutionStep::ExecuteChild(0));
+    }
+    let filtered = filter.mask().apply_to(filter.child().as_canonical())?;
+    Ok(ExecutionStep::Done(Columnar::Canonical(filtered)))
 }
 ```
 
-### Trade-offs
-
-**Strengths:**
-
-- **Minimal VTable surface.** 3 
methods, down from 5. Each has a clear, non-overlapping role. -- **No iteration loop.** Materialization is a single recursive descent. Predictable, easy to - debug, easy to reason about performance. (Note: still recursive -- see weakness below.) -- **Mask fused with decompression.** Each encoding handles row selection once, in one place. -- **Strict reduce/canonicalize boundary.** reduce = metadata-only. canonicalize_into = - buffer-reading. No ambiguity, works on GPU. - -**Weaknesses:** - -- **No encoding-preserving mask application.** `canonicalize_into` always decompresses into a - `CanonicalBuilder`. There is no VTable mechanism for an encoding to absorb a mask without - decompressing. Exporters that want to preserve an encoding (DuckDB FSST vectors, Arrow - dictionary arrays) must call encoding-specific filter methods outside the framework. See - trade-off #2 for the concrete FSST example. +**BitPacked** — leaf encoding, decompresses directly: -- **Fused patterns require parent-knows-child.** Dict must check `as_opt::()` to - fuse Dict-RLE. An external encoding can't volunteer a fused path for a built-in parent - without modifying the parent. The known fused patterns (Dict-RLE) are all between encodings - in `vortex-array`, so this isn't a practical problem today, but it limits extensibility. - -- **Exporters can't inspect intermediates.** After optimize + peel, the exporter sees the tree - as-is. If the root is an opaque encoding that would decode to Dict, the exporter can't - discover that -- Phase 3 goes straight to canonical. The DuckDB dictionary export use case - only works when DictArray is already visible after Phase 1, which is the common case (Vortex - files store Dict as DictArray) but not guaranteed. - -- **Still recursive.** `canonicalize_into` recurses into children. On deeply nested trees - (many layers of encoding, deeply nested structs/lists), this can still overflow the stack -- - the same class of problem the current executor has. 
An explicit work stack or trampoline
-  could mitigate this but adds implementation complexity.
+```rust
+fn execute(bp: &BitPackedArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    let primitive = unpack(bp)?;
+    Ok(ExecutionStep::Done(Columnar::Canonical(Canonical::Primitive(primitive))))
+}
+```
 
 ---
 
-## Option B: Cleaned-up iterative model
+## Proposal B: Canonical builder execution
 
-Keep the framework-driven iteration loop but clean up the VTable surface, enforce the
-reduce/execute boundary, and thread `Mask` through execution.
-
-### VTable surface (3 methods)
+### VTable
 
 ```rust
-pub trait VTable {
-    fn reduce(array: &Self::Array) -> VortexResult>;
-    fn reduce_parent(
-        array: &Self::Array,
-        parent: &ArrayRef,
-        child_idx: usize,
-    ) -> VortexResult>;
+fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>>;
+fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize)
+    -> VortexResult<Option<ArrayRef>>;
 
-    /// Take one execution step toward materialization.
-    /// Returns an ExecutionStep telling the framework what happened.
-    fn execute(
+fn execute(
     array: &Self::Array,
-    mask: &Mask,
+    builder: &mut CanonicalBuilder,
     ctx: &mut ExecutionCtx,
-    ) -> VortexResult;
-}
-```
+) -> VortexResult<()>;
 
-```rust
-pub enum ExecutionStep {
-    /// Fully materialized.
-    Done(Columnar),
-    /// Rewrote to a new array tree. Framework should optimize and execute again.
-    Rewrite(ArrayRef),
-    /// Composed the mask and delegated to child. Framework should continue
-    /// executing the returned array with the new mask. 
-    Peel(ArrayRef, Mask),
-}
+fn execute_parent(
+    array: &Self::Array,
+    parent: &ArrayRef,
+    child_idx: usize,
+    builder: &mut CanonicalBuilder,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<bool>; // true if handled
 ```
 
-| Method | Purpose | Buffer access |
-|-----------|---------------------------------------------|---------------|
-| `reduce` | Self-rewrite (metadata only) | Never |
-| `reduce_parent` | Rewrite parent (expression push-down) | Never |
-| `execute` | One step toward materialization | Yes |
+### CanonicalBuilder
 
-### Framework loop
+A closed enum mirroring `Canonical` in mutable builder form:
 
 ```rust
-pub fn execute_to_columnar(
-    mut array: ArrayRef,
-    mut mask: Mask,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult {
-    for _ in 0..MAX_ITERATIONS {
-        // Check termination
-        if let Some(c) = array.as_opt::() {
-            return Ok(Columnar::Constant(c.apply_mask(&mask)));
-        }
-        if let Some(c) = array.as_opt::() {
-            return Ok(Columnar::Canonical(mask.apply_to(c.into())?));
-        }
-
-        // Optimize (reduce/reduce_parent to fixpoint)
-        array = array.optimize_recursive()?;
-
-        // Execute one step
-        match array.vtable().execute(&array, &mask, ctx)? {
-            ExecutionStep::Done(c) => return Ok(c),
-            ExecutionStep::Rewrite(new_array) => {
-                array = new_array;
-            }
-            ExecutionStep::Peel(child, new_mask) => {
-                array = child;
-                mask = new_mask;
-            }
-        }
-    }
-    vortex_bail!("exceeded max iterations")
+pub enum CanonicalBuilder {
+    Null(NullBuilder),
+    Bool(BoolBuilder),
+    Primitive(PrimitiveBuilder),
+    Decimal(DecimalBuilder),
+    VarBinView(VarBinViewBuilder),
+    List(ListViewBuilder),
+    FixedSizeList(FixedSizeListBuilder),
+    Struct(StructBuilder),
+    Extension(ExtensionBuilder),
 }
 ```
 
-Note: `optimize_recursive()` runs inside the loop, after every step. This means reduce_parent
-rules can fire on intermediate forms exposed by execution -- the key capability that Option A
-lacks.
-
-### Per-encoding execute
+Because it is an enum (not `dyn`), encodings can match on the concrete variant. 
This enables +zero-copy decompression paths impossible with the current `dyn ArrayBuilder`. -Each encoding returns a typed step, making execution transparent: +### Framework ```rust -// FilterArray: compose masks, delegate -fn execute(filter: &FilterArray, mask, ctx) -> ExecutionStep { - let composed = mask.compose(&filter.mask()); - ExecutionStep::Peel(filter.child().clone(), composed) -} - -// SliceArray: slice the mask, delegate -fn execute(slice: &SliceArray, mask, ctx) -> ExecutionStep { - let composed = mask.slice(slice.range()); - ExecutionStep::Peel(slice.child().clone(), composed) -} - -// BitPacked: fully decompress with mask -fn execute(bp: &BitPackedArray, mask, ctx) -> ExecutionStep { - let mut builder = primitive_builder(bp.dtype()); - unpack_selected(bp, &mut builder, mask)?; - ExecutionStep::Done(Columnar::Canonical(builder.finish().into())) -} - -// Dict: decompress one step -- execute codes, then take -fn execute(dict: &DictArray, mask, ctx) -> ExecutionStep { - if !dict.codes().is_canonical() { - // Execute codes one step, rebuild dict - let stepped_codes = dict.codes().execute_one_step(mask, ctx)?; - let new_dict = DictArray::new(stepped_codes, dict.values().clone()); - return ExecutionStep::Rewrite(new_dict.into_array()); - } - // Codes are canonical -- do the take with mask - let result = take_canonical(dict.values(), dict.codes().as_primitive(), mask, ctx)?; - ExecutionStep::Done(Columnar::Canonical(result)) -} - -// ScalarFnArray: execute children, then apply function -fn execute(sfn: &ScalarFnArray, mask, ctx) -> ExecutionStep { - // Find first non-columnar child, execute it one step - for (i, child) in sfn.children().iter().enumerate() { - if !child.is_columnar() { - let stepped = child.execute_one_step(mask, ctx)?; - let new_sfn = sfn.replace_child(i, stepped); - return ExecutionStep::Rewrite(new_sfn.into_array()); +fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let array = optimize_recursive(array)?; + let 
mut builder = CanonicalBuilder::new(array.dtype()); + + // Try execute_parent + for child_idx in 0..array.nchildren() { + let child = array.child(child_idx); + if child.vtable().execute_parent(&child, &array, child_idx, &mut builder, ctx)? { + return Ok(builder.finish()); + } } - } - // All children are columnar -- apply function - let children = sfn.children_as_columnar(mask); - let result = sfn.scalar_fn().execute(children, ctx)?; - ExecutionStep::Done(Columnar::Canonical(result.into())) + + array.vtable().execute(&array, &mut builder, ctx)?; + Ok(builder.finish()) } ``` -### Fused patterns via reduce_parent +Single call. No iteration loop. No MAX_ITERATIONS. Reduce runs once, then one recursive +descent fills the builder. -Because `optimize_recursive()` runs after every step, fused patterns work naturally. Dict -executes its codes one step (RunEnd → Primitive), then on the next iteration, -`optimize_recursive()` runs and reduce_parent rules can fire on the new tree. The Dict-RLE -fused path can be implemented as a reduce_parent rule on RunEnd that recognizes the Dict parent -and rewrites the tree, or Dict can check inline in its execute -- both work because the -framework loops. +### Per-encoding examples -### Exporter tree inspection - -Exporters drive the loop directly, inspecting after each step: +**FSST** — zero-copy push into VarBinViewBuilder: ```rust -fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let mut mask = Mask::new_true(array.len()); - - for _ in 0..MAX_ITERATIONS { - array = array.optimize_recursive()?; - let (peeled, m) = array.peel_selection(); - array = peeled; - mask = mask.compose(&m); - - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &mask, ctx); - } - if array.is_canonical() { - return export_canonical(mask.apply_to(array)?, ctx); - } - - match array.vtable().execute(&array, &mask, ctx)? 
{ - ExecutionStep::Done(c) => return export_columnar(c, ctx), - ExecutionStep::Rewrite(new) => { array = new; } - ExecutionStep::Peel(child, m) => { array = child; mask = m; } - } - } - vortex_bail!("exceeded max iterations") +fn execute(fsst: &FSSTArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let CanonicalBuilder::VarBinView(vbv) = builder else { unreachable!() }; + let (buffers, views) = fsst_decompress(fsst, vbv.completed_block_count())?; + vbv.push_buffers_and_views(&buffers, &views); + Ok(()) } ``` -The exporter inspects the tree after every step. If an opaque encoding decodes to DictArray -during execution, the exporter sees it on the next iteration and can export it as a dictionary. - -### Trade-offs - -**Strengths:** - -- **Encoding-preserving mask application.** An encoding can implement `execute(mask)` to - absorb the mask and return `Rewrite(still_encoded)`. FSST can filter without decompressing, - and exporters see the result through the standard loop -- no encoding-specific logic needed - for the mask-application step. -- **Exporters inspect intermediates.** The DuckDB exporter can find DictArray even when it's - hidden behind an opaque encoding, because it inspects after each step. -- **Child-volunteers extensibility preserved.** A fused decompression pattern can be expressed - as a reduce_parent rule or an inline check, without closing the door on external encodings - discovering patterns through the iteration. -- **Same 3 VTable methods.** reduce, reduce_parent, execute. Clean. -- **No stack overflow risk.** The outer loop is iterative -- each step returns control to the - framework. Stack depth is bounded to one execution step, regardless of tree depth. - -**Weaknesses:** - -- **Iteration loop remains.** Execution is still iterative with a MAX_ITERATIONS bound. Each - step produces a new array tree, and the framework loops. 
This is more predictable than before - (no execute_parent, Mask threaded through, typed ExecutionStep) but still requires - tracing through iterations to understand execution. The recursive model's "one call, done" - is easier to reason about. -- **Intermediate allocations.** Each `Rewrite` step allocates a new array tree. The recursive - model writes directly into the builder with no intermediates. For hot paths (large batches, - tight loops), the allocation overhead may be measurable. -- **Mask and the loop interact awkwardly.** When an encoding returns `Peel(child, - new_mask)`, the mask changes mid-loop. The framework must track the current mask across - iterations. In the recursive model, the mask flows naturally through function arguments. -- **No decompress-into-buffer.** The iterative model returns `Columnar` from `Done` -- the - encoding owns its output. There is no way to pass a caller-owned `CanonicalBuilder` through - the loop, because the framework doesn't know whether this step is final or intermediate. - This is the primary pain point that motivated this RFC, and Option B does not address it. - ChunkedArray still can't share a builder across chunks. Exporters still can't write directly - into their output format. - ---- - -## Fundamental trade-offs - -The two options differ on three architectural axes. These are genuine trade-offs -- each option -wins on some axes and loses on others. They are not fixable with clever engineering; they follow -from the structural decision of recursion vs. iteration. - -### 1. Caller-owned output buffers (Option A wins) - -In Option A, the caller creates a `CanonicalBuilder` and passes it down through the recursive -descent. Every encoding writes directly into this caller-owned buffer. The buffer exists before -materialization begins and is filled during a single pass. - -In Option B, each encoding returns `ExecutionStep::Done(Columnar)` -- it allocates and owns its -output. The caller receives the finished result. 
There is no way to pass a builder through the -loop because the encoding doesn't know at call time whether this is the final step or an -intermediate rewrite that will be fed back into the loop. - -**Why this matters:** - -- **ChunkedArray concat avoidance.** With a caller-owned builder, ChunkedArray iterates its - chunks and each chunk writes into the same builder. The output is a single contiguous - allocation. Without it, each chunk produces its own `Columnar`, and the framework must - concatenate them -- an extra copy of the entire column. - -- **FSST zero-copy push.** FSST can decompress string views and data buffers directly into a - `VarBinViewBuilder`'s internal storage, adjusting view offsets in-place. This avoids - materializing an intermediate string array. With `Done(Columnar)`, FSST must allocate the - `VarBinView` canonical array itself, losing the ability to share buffer space with other - chunks or avoid a final copy. - -- **Pre-allocated output.** When the output length is known (common case: the selection count), - the builder can pre-allocate once. The iterative model allocates per step. - -This trade-off is structural: passing a mutable builder down requires a call stack (recursion). -A loop that yields intermediate values cannot thread a mutable builder through iterations -without fundamentally changing the `ExecutionStep` API (e.g., adding a `Done`-with-builder -variant), which would complicate the common case where an encoding returns `Rewrite`. - -### 2. Encoding-preserving mask application (Option B wins) - -Some encodings can apply a row mask **without decompressing**. FSST can filter its compressed -data and produce a smaller FSST array. This matters when the downstream consumer natively -supports the encoding -- DuckDB supports FSST vectors, so exporting a filtered FSST directly -avoids decompression entirely. - -**Concrete example:** `filter(scalar_fn(upper, [fsst(data)]), mask)` - -Both options share the reduce phase. 
FSST's reduce_parent pushes `upper` into the symbol -table (metadata-only, analogous to Dict pushing scalar functions into values). After reduce + -peel, both options see: `fsst_upper(data)` + mask. - -In **Option B**, FSST's standard `execute(mask)` method handles this: +**ChunkedArray** — shared builder, no concat: ```rust -fn execute(fsst: &FSSTArray, mask, ctx) -> ExecutionStep { - let filtered = fsst.filter_preserving(&mask)?; // still FSST - ExecutionStep::Rewrite(filtered.into_array()) +fn execute(chunked: &ChunkedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + for chunk in chunked.chunks() { + canonicalize_into(chunk, builder, ctx)?; + } + // All chunks wrote into the same builder. No concatenation needed. + Ok(()) } ``` -The DuckDB exporter drives the loop, sees FSST on the next iteration, and exports directly. -The standard path would execute again next iteration and decompress -- no harm. Any encoding -can implement this pattern through the general VTable. 
- -In **Option A**, the exporter must use encoding-specific logic outside the framework: +**DictArray** — executes codes into sub-builder, then gathers: ```rust -fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> DuckDBVector { - let array = array.optimize_recursive()?; - let (array, mask) = array.peel_selection(); - - // Encoding-specific: exporter must know about FSST - if let Some(fsst) = array.as_opt::() { - let filtered = fsst.filter_preserving(&mask)?; - return export_fsst(filtered, ctx); - } - - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &mask, ctx)?; - export_canonical(builder.finish()) +fn execute(dict: &DictArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let mut code_builder = CanonicalBuilder::new(dict.codes().dtype()); + canonicalize_into(dict.codes(), &mut code_builder, ctx)?; + let codes = code_builder.finish().into_primitive(); + gather_into_builder(dict.values(), &codes, builder, ctx) } ``` -The general `canonicalize_into` path always decompresses -- there is no VTable mechanism for -an encoding to absorb a mask without decompressing. The exporter must know about each encoding -it wants to preserve and call encoding-specific methods. - -**The difference:** Option B handles encoding-preserving mask application through the standard -`execute` VTable method -- any encoding can participate. Option A requires the exporter to have -encoding-specific knowledge for each format it wants to preserve. In practice, exporters -already need encoding-specific logic to export to native formats (DuckDB FSST vectors, Arrow -dictionary arrays), so the additional burden is the mask-application call rather than just an -`as_opt` check. - -This extends to cross-crate encodings. An external encoding that can filter itself without -decompressing just implements `execute(mask) → Rewrite(filtered_self)` in Option B. 
In -Option A, every exporter that wants to preserve that encoding must be updated to call its -specific filter method. - -### 3. Exporter intermediate inspection (Option B wins) - -In Option B, exporters drive the loop directly and can inspect the array tree after each step. -After an encoding applies a mask via `Rewrite`, the exporter sees the result on the next -iteration and can export it in its native format. - -In Option A, exporters see the tree after Phase 1 (optimize) and Phase 2 (peel selection). -When the encoding the exporter cares about is already visible after peel (the common case -- -Dict is stored as DictArray, FSST is stored as FSSTArray), the exporter can intercept and -handle it with encoding-specific logic. This covers the typical Vortex file scan path. The -gap is when an encoding is hidden behind a layer that must be executed to reveal it, or when -the exporter wants the encoding to absorb the mask through the general VTable (see trade-off -#2 above). - -### Summary - -| Trade-off | Winner | Practical impact today | -|-----------|--------|----------------------| -| Caller-owned output buffers | **Option A** | **Concrete.** ChunkedArray concat avoidance and FSST zero-copy decompression are measurable. | -| Encoding-preserving mask application | **Option B** | **Concrete.** FSST filter-without-decompress, Dict filter-without-decompress. Exporters need encoding-specific logic in Option A. | -| Exporter intermediate inspection | **Option B** | **Moderate.** Exporters can discover encodings revealed by execution. Fixed by peel_selection when the filter is at the root. 
| - -Secondary differences: - -| Aspect | Option A | Option B | -|--------|----------|----------| -| Stack overflow risk | Still recursive (can overflow on deep trees) | Iterative loop, bounded stack depth | -| Intermediate allocations | None (direct to builder) | One array tree per `Rewrite` step | -| Debuggability | One recursive call, stack trace shows position | Loop iterations, requires tracing | -| Mask threading | Natural function argument | Mutable state across iterations | -| Fused patterns (Dict-RLE) | Parent checks child inline | reduce_parent on intermediates, or inline | -| GPU kernel fusion potential | Clean tree maps to single kernel | Step-by-step harder to fuse | -| Determinism | Always same call sequence | Iteration count varies with input | - -## What changes (shared) - -**Removed (both options):** - -- `execute_parent` VTable method -- metadata-only rewrites move to `reduce_parent`; - canonical-type kernels move into `ScalarFn::execute`. -- `append_to_builder` VTable method. -- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, - `SliceReduceAdaptor`, `SliceExecuteAdaptor`, and all their per-encoding implementations. - `TakeExecuteAdaptor` is also removed (Take/gather is not row masking; it remains an - encoding-internal concern). - -**Retained (both options):** - -- `reduce` / `reduce_parent` -- strictly metadata-only. -- `ScalarFnArray` -- per-node lazy computation with `reduce_parent` push-down. -- Filter/Slice array wrappers -- for lazy representation. Could be unified into a single - `MaskedSelectionArray` in the future. Take remains separate (gather semantics, not masking). -- `scalar_at` -- separate VTable method for single-element access. - -**New (both options):** - -- `Mask::compose()` and `Mask::apply()` methods on the existing `Mask` type. -- `Array::peel_selection()` -- strip selection wrappers from root. - -**New (Option A only):** - -- `CanonicalBuilder` -- one-level-deep builder enum for decompress-into. 
-- `canonicalize_into` VTable method -- single recursive descent with fused mask. - -**New (Option B only):** - -- `ExecutionStep` enum -- typed return from execute, replacing bare `ArrayRef`. -- `execute` takes `&Mask` parameter -- row mask threaded through the loop. - -## Worked example: DuckDB dictionary export - -A Vortex file scan produces a chunk with a dictionary column. The query has a filter predicate -(`name = 'alice'`) evaluated to a mask, and a projection (`upper(name)`). The array tree: - -``` -filter( - scalar_fn(upper, [ - dict( - codes: bitpacked([0,1,0,2,1,0,...]), - values: fsst(["alice","bob","charlie"]) - ) - ]), - mask -) -``` - -**Optimize.** `reduce_parent` pushes `upper` into dict values: - -``` -filter( - dict( - codes: bitpacked([0,1,0,2,1,0,...]), - values: scalar_fn(upper, [fsst(["alice","bob","charlie"])]) - ), - mask -) -``` - -**Peel selections.** Filter is at the root, so it peels: - -``` -array = dict(codes: bitpacked(...), values: scalar_fn(upper, [fsst(...)])) -mask = filter_mask -``` - -**Export (Option A).** The exporter inspects the tree after optimize + peel: +**FilterArray** — execute child, apply mask: ```rust -fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let array = array.optimize_recursive()?; - let (array, mask) = array.peel_selection(); - - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &mask, ctx); - } - - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &mask, ctx)?; - export_canonical(builder.finish()) +fn execute(filter: &FilterArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let mut sub = CanonicalBuilder::new(filter.child().dtype()); + canonicalize_into(filter.child(), &mut sub, ctx)?; + let canonical = sub.finish(); + let filtered = filter.mask().apply_to(canonical)?; + builder.extend_from_canonical(&filtered); + Ok(()) } ``` -**Export (Option B).** The exporter drives the loop, 
inspecting after each step: +**BitPacked** — unpack directly into PrimitiveBuilder: ```rust -fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let (mut array, mut mask) = (array, Mask::new_true(array.len())); - for _ in 0..MAX_ITERATIONS { - array = array.optimize_recursive()?; - let (peeled, m) = array.peel_selection(); - array = peeled; - mask = mask.compose(&m); - - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &mask, ctx); - } - if array.is_canonical() { - return export_canonical(mask.apply_to(array)?); - } - match array.vtable().execute(&array, &mask, ctx)? { - ExecutionStep::Done(c) => return export_columnar(c), - ExecutionStep::Rewrite(a) => { array = a; } - ExecutionStep::Peel(a, m) => { array = a; mask = m; } - } - } - vortex_bail!("export did not converge") +fn execute(bp: &BitPackedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let CanonicalBuilder::Primitive(pb) = builder else { unreachable!() }; + unpack_into(bp, pb)?; + Ok(()) } ``` -Both options produce the same result: DuckDB gets a dictionary vector with 3 uppercase values -and only the selected codes. The difference is that Option B can discover DictArray even if it -was hidden behind an opaque encoding, while Option A only sees what's visible after Phase 1. - -## Migration path - -The migration is the same for both options. The key difference is Phase 3/5 -- whether -encodings implement `canonicalize_into` (Option A) or return `ExecutionStep` (Option B). - -**Phase 1: Foundation types.** Add `Mask::compose()`/`Mask::apply()`, introduce `CanonicalBuilder`, -`Array::peel_selection()`. All additive. - -**Phase 2: Add new execution method with fallback.** Add `canonicalize_into` (Option A) or -typed `execute` (Option B) with a default that bridges to the old path. Both old and new paths -work. +--- -**Phase 3: Migrate canonical encodings.** One per PR. Independent, parallelizable. 
+## Comparison -**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into -`Mask`. +### What Proposal A can do that Proposal B cannot -**Phase 5: Migrate compressed encodings.** Mask-fused decompression. Independent per -encoding. +**1. Encoding-preserving execute_parent.** -**Phase 6: Move execute_parent → reduce_parent.** Metadata-only rewrites become reduce rules. -Canonical-type kernels move into ScalarFn::execute. Remove execute_parent. +In A, `execute_parent` returns `Option` — the result can be in *any* encoding. In B, +`execute_parent` pushes into a `CanonicalBuilder` — the result is always canonical. -**Phase 7: Delete old path.** Remove old `execute` method, the `Executable` trait, and all -selection adaptors. +Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray +parent. -**Phase 8: Wire up exporters.** DuckDB, Arrow exporters migrate to new API. +- In A, it can return a filtered FSST array (still FSST-encoded). An exporter driving the + scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no + decompression. +- In B, it must push decompressed VarBinView into the builder. The FSST encoding is gone. -## Compatibility +Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the +downstream consumer natively supports the compressed form. -This RFC does not change the file format or wire format. All changes are internal to the -execution engine. +**2. Cross-step optimization.** -**Public API breakage:** +In A, the scheduler runs `optimize_recursive` after each child execution. Reduce rules fire on +the new tree shape. Patterns only visible after partial execution can still be optimized. -- `Executable` trait and `execute::()` are removed. Callers migrate to `canonicalize()` / - `execute_to_columnar()`. -- Third-party encodings must migrate to the new execution method. The fallback default - (Phase 2) provides a bridge during migration. 
+Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes +BitPacked codes to PrimitiveArray, the tree becomes +`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes +`upper` into values — this optimization fires between steps. -## Drawbacks +In B, reduce runs once before the single recursive descent. If a pattern only becomes visible +after partial execution, it's missed. -- **Migration effort.** ~33 encodings must migrate. The phased approach with a fallback default - mitigates this. +**3. Bounded stack depth.** -- **Mask composition complexity.** `Mask::compose` must be correct for all combinations of - internal representations (`AllTrue`, `AllFalse`, `Values`). This is concentrated, well-tested - complexity, but getting it wrong causes data corruption. +In A, the scheduler can use an explicit work stack instead of recursion. Stack depth is O(1) +regardless of encoding depth. In B, `execute` recurses into children — stack depth equals +encoding tree depth. -## Alternatives +### What Proposal B can do that Proposal A cannot -### Keep the iterative execution model as-is +**1. Caller-owned output buffers (decompress-into).** -Address individual pain points without restructuring. Avoids migration cost but does not -solve decompress-into-buffer and perpetuates the adaptor boilerplate. +The builder is created by the caller and passed down. This enables three concrete optimizations: -### Use a single ExpressionArray instead of per-node ScalarFnArray +- **ChunkedArray without concat.** One builder, each chunk writes into it. Single contiguous + allocation. In A, each chunk returns its own Columnar; the framework must concat them — an + extra copy of the entire column. -Prevents `reduce_parent` from interacting with individual expression nodes -- the key -optimization for dictionary-encoded predicates. 
+- **FSST zero-copy push.** FSST decompresses views and data buffers directly into + VarBinViewBuilder's internal storage, adjusting view offsets to account for already-completed + blocks. In A, FSST must allocate its own VarBinViewArray. -### Use `dyn ArrayBuilder` instead of `CanonicalBuilder` enum +- **Pre-allocated output.** When output length is known, the builder pre-allocates once. In A, + each Done allocates independently. -More extensible but prevents zero-copy builder paths. FSST pushing directly into -`VarBinViewBuilder` depends on knowing the builder variant at compile time. +**2. No intermediate allocations.** -### Subsume `scalar_at` into Mask +Every encoding writes directly into the final output buffer. No temporary array trees between +steps. For hot paths on large batches, this is measurable. -Deferred. Single-element access via a builder round-trip would regress hot paths. Revisit -after profiling. +**3. No iteration bound.** -## Prior Art +Single recursive descent. No MAX_ITERATIONS. No risk of non-convergence. -- **DataFusion** has `PhysicalExpr::evaluate_selection`, threading an explicit selection - through expression evaluation. +### Summary table -## Unresolved Questions +| Capability | Proposal A | Proposal B | +|-------------------------------------|------------|------------| +| Caller-owned output buffers | No | **Yes** | +| ChunkedArray without concat | No | **Yes** | +| FSST zero-copy into builder | No | **Yes** | +| Pre-allocated output | No | **Yes** | +| No intermediate allocations | No | **Yes** | +| Encoding-preserving execute_parent | **Yes** | No | +| Exporter intercepts intermediates | **Yes** | No | +| Cross-step optimization | **Yes** | No | +| Bounded stack depth (explicit stack)| **Yes** | No | +| No iteration bound / convergence | No | **Yes** | -- **Mask composition correctness**: `Mask::compose` and `Mask::slice` interactions need - careful implementation and thorough testing. 
+### The core trade-off -- **CanonicalBuilder design**: The exact interface for one-level-deep building, especially for - List and Extension types, needs design work. +Proposal A optimizes for **flexibility**: intermediate encodings are visible, exporters can +intercept, reduce rules fire between steps. The cost is that every encoding allocates its own +output and ChunkedArray must concat. -- **Aggregate and window functions**: Out of scope. AggregateFnArray and WindowFnArray will - need their own materialization paths. +Proposal B optimizes for **allocation efficiency**: caller-owned buffers, zero-copy +decompression, no intermediates. The cost is that execution always produces canonical — encodings +can't preserve themselves through execution, and exporters must use encoding-specific logic +outside the framework. -- **Option A vs Option B**: This RFC presents both. The decision reduces to: are caller-owned - output buffers (ChunkedArray concat avoidance, FSST zero-copy decompression) worth giving up - encoding-preserving mask application through the general VTable (FSST filter-without-decompress, - Dict filter-without-decompress for exporters)? Both trade-offs are concrete. In Option A, - exporters need encoding-specific filter methods; in Option B, ChunkedArray can't share a - builder across chunks. +## Compatibility -## Future Possibilities +No file format or wire format changes. All changes are internal to the execution engine. -### Mask pull-up +Public API breakage: the `Executable` trait and `execute::()` method are replaced. +Third-party encodings must migrate. A default implementation bridging to the old path eases +migration. -When a filter mask is nested inside an expression (`scalar_fn(upper, [filter(dict(...), mask)])`), -`peel_selection()` can't extract it. A future optimization could pull masks out of expressions -to the root, enabling further optimizations. 
+## Unresolved Questions -### Aggregate and window functions +- **CanonicalBuilder design (Proposal B):** The exact interface for one-level-deep building, + especially for List and Extension types, needs design work. Struct fields are accumulated as + `Vec` (potentially compressed), not recursively built. -`AggregateFnArray` and `WindowFnArray` will follow a similar deferred pattern with -`reduce`/`reduce_parent` optimization and `Mask`-based materialization. +- **Explicit work stack (Proposal A):** The scheduler can be recursive (simple, but same stack + overflow risk as today) or use an explicit work stack (bounded depth, but more complex). + The RFC assumes an explicit stack is feasible but doesn't specify the implementation. -### GPU kernel fusion +- **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid + expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. This is + orthogonal to the proposal choice. -The strict metadata-only reduce constraint means the optimized tree can be shipped to a GPU -context. A GPU-aware materialization path could fuse the entire optimized tree into a single -kernel launch. +- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that shared + sub-arrays (e.g., `x + x`) are only executed once. Orthogonal to the proposal choice. From ce38bb9841cff9c876d1d3f208fbadf299804953 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 20:59:49 -0500 Subject: [PATCH 5/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 351 +++++++++++---------------------- 1 file changed, 115 insertions(+), 236 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 0500359..1e84171 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,17 +4,11 @@ ## Summary -Replace the current execution VTable with a cleaner model. 
`reduce` and `reduce_parent` remain -metadata-only in both proposals and are identical. The proposals differ in how `execute` and -`execute_parent` work: - -- **Proposal A (Scheduler-driven)**: `execute` returns an `ExecutionStep` — either requesting - the scheduler to execute a specific child, or declaring that it is done. The scheduler drives - all iteration. `execute_parent` is retained and returns an `ArrayRef` (any encoding). - -- **Proposal B (Canonical builder)**: `execute` pushes its result into a caller-owned - `CanonicalBuilder`. The result is always canonical. `execute_parent` is retained and also - pushes into the builder. +Replace the current execution VTable with a scheduler-driven model. `reduce` and +`reduce_parent` remain metadata-only. `execute` returns an `ExecutionStep` telling the scheduler +which child to execute next, or that execution is done. `execute_parent` is retained and returns +an `ArrayRef` in any encoding. The scheduler drives all iteration and runs reduce rules between +steps. ## Motivation @@ -28,12 +22,12 @@ trees overflow the stack. **Unclear execute/reduce boundary.** Some `execute_parent` implementations are metadata-only and belong in `reduce_parent`. The boundary isn't enforced. -## Shared design +## Design -### reduce / reduce_parent (identical in both proposals) +### reduce / reduce_parent (unchanged) -Both proposals keep these methods with unchanged signatures. They are **strictly metadata-only** -— they never read data buffers. +These methods keep their current signatures. They are **strictly metadata-only** — they never +read data buffers. ```rust fn reduce(array: &Self::Array) -> VortexResult>; @@ -50,8 +44,7 @@ fn reduce_parent( values. RunEnd child pulls ScalarFn through to its values. FSST child rewrites Compare by compressing the RHS literal. -The framework runs these to a fixpoint before execution begins (and, in Proposal A, between -execution steps). 
+The framework runs these to a fixpoint before execution begins and between execution steps. Implementations currently misplaced in `execute_parent` that are metadata-only (Dict + Compare, ALP + Compare, FoR + Compare, FSST + Compare) move to `reduce_parent`. Implementations that @@ -60,26 +53,21 @@ Between) move into `ScalarFn::execute`. ### FilterArray -FilterArray continues to exist as a lazy wrapper in both models. It is not subsumed by the -execution method. Both models handle it — the difference is what `execute_parent` can return -when a child encoding wants to handle a FilterArray parent (see comparison section). - ---- - -## Proposal A: Scheduler-driven iterative execution +FilterArray continues to exist as a lazy wrapper. It is not subsumed by the execution method +signature. -### VTable +### execute / execute_parent ```rust -fn reduce(array: &Self::Array) -> VortexResult>; -fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize) - -> VortexResult>; - fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult; -fn execute_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx) - -> VortexResult>; +fn execute_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, +) -> VortexResult>; ``` ```rust @@ -96,6 +84,9 @@ pub enum ExecutionStep { The encoding never recurses into children. It yields control back to the scheduler, telling it what work is needed. The scheduler maintains the work stack. +`execute_parent` returns `Option` — the result can be in **any encoding**, not just +canonical. This is critical for encoding-preserving execution (see examples below). 
+ ### Scheduler ```rust @@ -178,249 +169,137 @@ fn execute(bp: &BitPackedArray, ctx: &mut ExecutionCtx) -> VortexResult VortexResult>; -fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize) - -> VortexResult>; - -fn execute( - array: &Self::Array, - builder: &mut CanonicalBuilder, - ctx: &mut ExecutionCtx, -) -> VortexResult<()>; +Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes +BitPacked codes to PrimitiveArray, the tree becomes +`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes +`upper` into values — this optimization fires between steps. -fn execute_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, - builder: &mut CanonicalBuilder, - ctx: &mut ExecutionCtx, -) -> VortexResult; // true if handled -``` +### Encoding-preserving execute_parent -### CanonicalBuilder +`execute_parent` returns `Option` in any encoding. This enables exporters to +intercept intermediate forms without decompressing. -A closed enum mirroring `Canonical` in mutable builder form: +Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray +parent and returns a filtered FSST array (still FSST-encoded). An exporter driving the +scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no +decompression needed. -```rust -pub enum CanonicalBuilder { - Null(NullBuilder), - Bool(BoolBuilder), - Primitive(PrimitiveBuilder), - Decimal(DecimalBuilder), - VarBinView(VarBinViewBuilder), - List(ListViewBuilder), - FixedSizeList(FixedSizeListBuilder), - Struct(StructBuilder), - Extension(ExtensionBuilder), -} -``` +Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the +downstream consumer natively supports the compressed form. -Because it is an enum (not `dyn`), encodings can match on the concrete variant. 
This enables -zero-copy decompression paths impossible with the current `dyn ArrayBuilder`. +### Exporter integration -### Framework +Exporters drive the scheduler loop directly and inspect the array after each step: ```rust -fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let array = optimize_recursive(array)?; - let mut builder = CanonicalBuilder::new(array.dtype()); - - // Try execute_parent - for child_idx in 0..array.nchildren() { - let child = array.child(child_idx); - if child.vtable().execute_parent(&child, &array, child_idx, &mut builder, ctx)? { - return Ok(builder.finish()); - } - } - - array.vtable().execute(&array, &mut builder, ctx)?; - Ok(builder.finish()) -} -``` - -Single call. No iteration loop. No MAX_ITERATIONS. Reduce runs once, then one recursive -descent fills the builder. - -### Per-encoding examples - -**FSST** — zero-copy push into VarBinViewBuilder: +fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let mut array = optimize_recursive(array)?; -```rust -fn execute(fsst: &FSSTArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let CanonicalBuilder::VarBinView(vbv) = builder else { unreachable!() }; - let (buffers, views) = fsst_decompress(fsst, vbv.completed_block_count())?; - vbv.push_buffers_and_views(&buffers, &views); - Ok(()) -} -``` + loop { + if let Some(dict) = array.as_opt::() { + return export_dict(dict, ctx); + } + if let Some(fsst) = array.as_opt::() { + return export_fsst(fsst, ctx); + } + if let Some(c) = array.as_columnar() { + return export_columnar(c, ctx); + } -**ChunkedArray** — shared builder, no concat: + if let Some(rewritten) = try_execute_parent(&array, ctx)? 
{ + array = optimize_recursive(rewritten)?; + continue; + } -```rust -fn execute(chunked: &ChunkedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - for chunk in chunked.chunks() { - canonicalize_into(chunk, builder, ctx)?; + match array.vtable().execute(&array, ctx)? { + ExecutionStep::ExecuteChild(i) => { + let child = array.child(i); + let executed = execute_to_columnar(child, ctx)?; + array = array.with_child(i, executed.into_array()); + array = optimize_recursive(array)?; + } + ExecutionStep::Done(c) => return export_columnar(c, ctx), + } } - // All chunks wrote into the same builder. No concatenation needed. - Ok(()) -} -``` - -**DictArray** — executes codes into sub-builder, then gathers: - -```rust -fn execute(dict: &DictArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let mut code_builder = CanonicalBuilder::new(dict.codes().dtype()); - canonicalize_into(dict.codes(), &mut code_builder, ctx)?; - let codes = code_builder.finish().into_primitive(); - gather_into_builder(dict.values(), &codes, builder, ctx) } ``` -**FilterArray** — execute child, apply mask: +If an opaque encoding decodes to DictArray during execution, the exporter discovers it on the +next iteration and exports it natively. This is impossible in a model that always produces +canonical output. 
-```rust -fn execute(filter: &FilterArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let mut sub = CanonicalBuilder::new(filter.child().dtype()); - canonicalize_into(filter.child(), &mut sub, ctx)?; - let canonical = sub.finish(); - let filtered = filter.mask().apply_to(canonical)?; - builder.extend_from_canonical(&filtered); - Ok(()) -} -``` - -**BitPacked** — unpack directly into PrimitiveBuilder: - -```rust -fn execute(bp: &BitPackedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let CanonicalBuilder::Primitive(pb) = builder else { unreachable!() }; - unpack_into(bp, pb)?; - Ok(()) -} -``` - ---- +### Decompress-into-buffer -## Comparison +The scheduler-driven model does not natively support caller-owned output buffers (each +`Done(Columnar)` allocates its own output). The two concrete cases where this matters — +ChunkedArray concat and FSST zero-copy push — can be addressed at the framework level: -### What Proposal A can do that Proposal B cannot +- **ChunkedArray**: The scheduler special-cases ChunkedArray by pre-allocating a single output + buffer and copying each chunk's `Columnar` result into it. One allocation + N memcpys rather + than N allocations + one concat. Not zero-copy, but eliminates the extra full-column copy. -**1. Encoding-preserving execute_parent.** +- **FSST**: The extra allocation is the views array (16 bytes per string). The actual string + data buffers are shared via `Arc` — they are not copied. The overhead is bounded and small + relative to decompression cost. -In A, `execute_parent` returns `Option` — the result can be in *any* encoding. In B, -`execute_parent` pushes into a `CanonicalBuilder` — the result is always canonical. - -Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray -parent. +These are framework-level optimizations, not VTable concerns. They don't require encoding +authors to think about builders. 
-- In A, it can return a filtered FSST array (still FSST-encoded). An exporter driving the - scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no - decompression. -- In B, it must push decompressed VarBinView into the builder. The FSST encoding is gone. - -Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the -downstream consumer natively supports the compressed form. - -**2. Cross-step optimization.** - -In A, the scheduler runs `optimize_recursive` after each child execution. Reduce rules fire on -the new tree shape. Patterns only visible after partial execution can still be optimized. - -Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes -BitPacked codes to PrimitiveArray, the tree becomes -`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes -`upper` into values — this optimization fires between steps. - -In B, reduce runs once before the single recursive descent. If a pattern only becomes visible -after partial execution, it's missed. - -**3. Bounded stack depth.** - -In A, the scheduler can use an explicit work stack instead of recursion. Stack depth is O(1) -regardless of encoding depth. In B, `execute` recurses into children — stack depth equals -encoding tree depth. - -### What Proposal B can do that Proposal A cannot - -**1. Caller-owned output buffers (decompress-into).** - -The builder is created by the caller and passed down. This enables three concrete optimizations: - -- **ChunkedArray without concat.** One builder, each chunk writes into it. Single contiguous - allocation. In A, each chunk returns its own Columnar; the framework must concat them — an - extra copy of the entire column. +## Compatibility -- **FSST zero-copy push.** FSST decompresses views and data buffers directly into - VarBinViewBuilder's internal storage, adjusting view offsets to account for already-completed - blocks. 
In A, FSST must allocate its own VarBinViewArray. +No file format or wire format changes. All changes are internal to the execution engine. -- **Pre-allocated output.** When output length is known, the builder pre-allocates once. In A, - each Done allocates independently. +Public API breakage: the `Executable` trait and `execute::()` method are replaced. +Third-party encodings must migrate. A default implementation bridging to the old path eases +migration. -**2. No intermediate allocations.** +## Alternatives -Every encoding writes directly into the final output buffer. No temporary array trees between -steps. For hot paths on large batches, this is measurable. +### Canonical builder model -**3. No iteration bound.** +Instead of returning `ExecutionStep`, `execute` could push results into a caller-owned +`CanonicalBuilder` (a closed enum mirroring `Canonical` in mutable builder form). Each encoding +decompresses directly into the builder in a single recursive descent. `execute_parent` would +also push into the builder. -Single recursive descent. No MAX_ITERATIONS. No risk of non-convergence. +This model natively supports decompress-into-buffer: ChunkedArray writes all chunks into one +builder (no concat), FSST pushes views directly into VarBinViewBuilder (zero-copy). No +iteration loop, no MAX_ITERATIONS. 
-### Summary table +However, requiring canonical output from `execute` is a structural limitation that cannot be +worked around: -| Capability | Proposal A | Proposal B | -|-------------------------------------|------------|------------| -| Caller-owned output buffers | No | **Yes** | -| ChunkedArray without concat | No | **Yes** | -| FSST zero-copy into builder | No | **Yes** | -| Pre-allocated output | No | **Yes** | -| No intermediate allocations | No | **Yes** | -| Encoding-preserving execute_parent | **Yes** | No | -| Exporter intercepts intermediates | **Yes** | No | -| Cross-step optimization | **Yes** | No | -| Bounded stack depth (explicit stack)| **Yes** | No | -| No iteration bound / convergence | No | **Yes** | +- **No encoding-preserving execute_parent.** `execute_parent` must push canonical into the + builder. FSST can't return a filtered FSST array — the encoding is always lost. Exporters + must use encoding-specific logic outside the framework for every format they want to preserve. -### The core trade-off +- **No cross-step optimization.** Reduce runs once before the single descent. Patterns visible + only after partial execution are missed. -Proposal A optimizes for **flexibility**: intermediate encodings are visible, exporters can -intercept, reduce rules fire between steps. The cost is that every encoding allocates its own -output and ChunkedArray must concat. +- **No exporter intermediate inspection.** Exporters see the tree after reduce, before + execution. Encodings hidden behind opaque wrappers are never visible. -Proposal B optimizes for **allocation efficiency**: caller-owned buffers, zero-copy -decompression, no intermediates. The cost is that execution always produces canonical — encodings -can't preserve themselves through execution, and exporters must use encoding-specific logic -outside the framework. +- **Stack overflow.** `execute` recurses into children. Stack depth equals encoding tree depth. 
 
-## Compatibility
-
-No file format or wire format changes. All changes are internal to the execution engine.
-
-Public API breakage: the `Executable` trait and `execute::<T>()` method are replaced.
-Third-party encodings must migrate. A default implementation bridging to the old path eases
-migration.
+The scheduler-driven model's flexibility wins (encoding-preserving intermediates, cross-step
+optimization, exporter interception, bounded stack) are structurally unrecoverable in the
+builder model. The builder model's allocation wins (shared builder, zero-copy push) are
+recoverable in the scheduler model via framework-level optimizations. This asymmetry makes the
+scheduler-driven model strictly preferable.
 
 ## Unresolved Questions
 
-- **CanonicalBuilder design (Proposal B):** The exact interface for one-level-deep building,
-  especially for List and Extension types, needs design work. Struct fields are accumulated as
-  `Vec<ArrayRef>` (potentially compressed), not recursively built.
-
-- **Explicit work stack (Proposal A):** The scheduler can be recursive (simple, but same stack
-  overflow risk as today) or use an explicit work stack (bounded depth, but more complex).
-  The RFC assumes an explicit stack is feasible but doesn't specify the implementation.
+- **Explicit work stack:** The scheduler can be recursive (simple, but same stack overflow risk
+  as today) or use an explicit work stack (bounded depth, but more complex). The RFC assumes an
+  explicit stack is feasible but doesn't specify the implementation.
 
 - **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid
-  expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. This is
-  orthogonal to the proposal choice.
+  expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved.
 
-- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that shared
-  sub-arrays (e.g., `x + x`) are only executed once. 
Orthogonal to the proposal choice. +- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that + shared sub-arrays (e.g., `x + x`) are only executed once. From 287fe87897a2eca6126002c245c9aca1204a148c Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 21:23:14 -0500 Subject: [PATCH 6/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 147 ++++++++++++++++++++++++++------- 1 file changed, 118 insertions(+), 29 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 1e84171..e5eb448 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -67,13 +67,13 @@ fn execute_parent( parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx, -) -> VortexResult>; +) -> VortexResult>; ``` ```rust pub enum ExecutionStep { - /// Ask the scheduler to execute the child at this index to columnar, - /// replace it, then call execute on this array again. + /// Ask the scheduler to execute the child at this index one step, + /// replace it, then call execute (or execute_parent) on this array again. ExecuteChild(usize), /// Execution is complete. @@ -81,43 +81,95 @@ pub enum ExecutionStep { } ``` -The encoding never recurses into children. It yields control back to the scheduler, telling it -what work is needed. The scheduler maintains the work stack. +Both `execute` and `execute_parent` return `ExecutionStep`. The encoding never recurses into +children — it yields control back to the scheduler, telling it what work is needed. The +scheduler maintains the work stack. + +`execute_parent` returns `Option`: +- `None` — the child cannot handle this parent, fall through to the parent's own `execute`. +- `Some(ExecuteChild(i))` — the child needs its own child at index `i` executed one step before + it can handle the parent. The scheduler does this, then retries `execute_parent`. 
+- `Some(Done(result))` — the child handled the parent, here is the result in **any encoding** + (not just canonical). This is critical for encoding-preserving execution. + +Making `execute_parent` iterative is necessary because some parent-handling logic requires +data access to the child's own children. For example, `Slice(RunEnd(ends=PCodec(...)))`: +RunEnd's `execute_parent` sees the Slice parent and wants to binary-search its `ends` to find +the physical indices for the slice boundaries. But `ends` is PCodec-compressed — each +`scalar_at` probe during binary search would decompress the full array. Instead, RunEnd returns +`ExecuteChild(0)` to request its ends be executed first. The scheduler decompresses PCodec to +PrimitiveArray, then retries `execute_parent`. RunEnd now binary-searches canonical ends and +returns an efficiently sliced RunEnd array. -`execute_parent` returns `Option` — the result can be in **any encoding**, not just -canonical. This is critical for encoding-preserving execution (see examples below). 
+```rust +// RunEnd's execute_parent for Slice +fn execute_parent( + re: &RunEndArray, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx, +) -> VortexResult> { + let Some(slice) = parent.as_opt::() else { return Ok(None) }; + + if !re.ends().is_columnar() { + // Need canonical ends for binary search + return Ok(Some(ExecutionStep::ExecuteChild(0))); + } + + // Ends are canonical — binary search is cheap + let physical_start = re.find_physical_index(slice.start())?; + let physical_end = re.find_physical_index(slice.end())?; + let sliced = RunEndArray::new( + re.ends().slice(physical_start..physical_end)?, + re.values().slice(physical_start..physical_end)?, + slice.start() + re.offset(), + slice.len(), + )?; + Ok(Some(ExecutionStep::Done(Columnar::from(sliced)))) +} +``` ### Scheduler ```rust -fn execute_to_columnar(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { +fn execute_one_step(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let mut array = optimize_recursive(array)?; - loop { - if let Some(c) = array.as_columnar() { return Ok(c); } - - // Try execute_parent (child-driven specialized execution) - if let Some(rewritten) = try_execute_parent(&array, ctx)? { - array = optimize_recursive(rewritten)?; - continue; + if array.as_columnar().is_some() { return Ok(array); } + + // Try execute_parent (child-driven specialized execution) + for child_idx in 0..array.nchildren() { + let child = array.child(child_idx); + match child.vtable().execute_parent(&child, &array, child_idx, ctx)? 
{ + None => continue, + Some(ExecutionStep::ExecuteChild(i)) => { + // Child needs its own child executed first — do one step, retry + let grandchild = child.child(i); + let stepped = execute_one_step(grandchild, ctx)?; + let new_child = child.with_child(i, stepped); + array = array.with_child(child_idx, new_child); + return Ok(array); + } + Some(ExecutionStep::Done(result)) => { + return Ok(result.into_array()); + } } + } - match array.vtable().execute(&array, ctx)? { - ExecutionStep::ExecuteChild(i) => { - let child = array.child(i); - let executed = execute_to_columnar(child, ctx)?; - array = array.with_child(i, executed.into_array()); - array = optimize_recursive(array)?; - } - ExecutionStep::Done(result) => return Ok(result), + // Fall through to the array's own execute + match array.vtable().execute(&array, ctx)? { + ExecutionStep::ExecuteChild(i) => { + let child = array.child(i); + let stepped = execute_one_step(child, ctx)?; + array = array.with_child(i, stepped); + Ok(array) } + ExecutionStep::Done(result) => Ok(result.into_array()), } } ``` -After each child execution, `optimize_recursive` runs again — reduce rules can fire on the -new tree shape. For truly bounded stack depth, an explicit work stack replaces the recursive -`execute_to_columnar(child)` call. +Each call to `execute_one_step` makes one unit of progress. The top-level +`execute_to_columnar` simply loops until the result is columnar. After each step, +`optimize_recursive` runs again — reduce rules can fire on the new tree shape. For truly +bounded stack depth, an explicit work stack replaces the recursive calls. ### Per-encoding examples @@ -250,6 +302,46 @@ ChunkedArray concat and FSST zero-copy push — can be addressed at the framewor These are framework-level optimizations, not VTable concerns. They don't require encoding authors to think about builders. +### Execution cache in ExecutionCtx + +The scheduler executes each child one step at a time, replacing it in the parent tree. 
This +means the result of executing an array is only visible to its immediate parent — if the same +`Arc` appears as a child of multiple parents (e.g., `a < 10 & a > 5` where `a` is +the same ArrayRef), it will be executed independently each time. + +`ExecutionCtx` holds a cache keyed by the raw pointer of the source array (`Arc::as_ptr()`). +The cache entry stores a clone of the source `Arc` (to pin the pointer address — preventing +deallocation and reuse) alongside the one-step execution result. + +```rust +pub struct ExecutionCtx { + // ... + cache: HashMap<*const dyn Array, CacheEntry>, +} + +struct CacheEntry { + /// Holds a strong reference to the source array, preventing the Arc from + /// being deallocated and its pointer reused for a different array. + _source: ArrayRef, + /// The result of executing the source array one step. + result: ArrayRef, +} +``` + +**When to cache.** Not every array should be cached — most are executed exactly once, and +caching adds overhead (HashMap insert + Arc clone). The `Arc::strong_count` is used as a +heuristic: if `strong_count > 1`, the array is referenced from multiple places and its +execution result is likely to be reused. This is an imperfect approximation — strong_count is a +global reference count, not local to the array tree being executed. An array may have +`strong_count > 1` because the layout reader or scan builder holds a reference, not because it +appears twice in the expression tree. But it never misses a genuinely shared sub-expression (if +shared, the count is > 1), and false positives only cost memory, not correctness. + +**Scope.** The cache lives in `ExecutionCtx`, not in individual arrays. Users can share an +`ExecutionCtx` across multiple independent array executions (e.g., multiple columns in a scan), +so the cache naturally deduplicates work across columns that share sub-arrays (shared +dictionaries, common filter masks). The cache is dropped when the `ExecutionCtx` is dropped. 
+ ## Compatibility No file format or wire format changes. All changes are internal to the execution engine. @@ -300,6 +392,3 @@ scheduler-driven model strictly preferable. - **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. - -- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that - shared sub-arrays (e.g., `x + x`) are only executed once. From 16cac214d9dac00099b63589780a9d710a98fff5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 22:03:32 -0500 Subject: [PATCH 7/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 494 ++++++++++++++------------------- 1 file changed, 210 insertions(+), 284 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index e5eb448..e06eaba 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,76 +4,51 @@ ## Summary -Replace the current execution VTable with a scheduler-driven model. `reduce` and -`reduce_parent` remain metadata-only. `execute` returns an `ExecutionStep` telling the scheduler -which child to execute next, or that execution is done. `execute_parent` is retained and returns -an `ArrayRef` in any encoding. The scheduler drives all iteration and runs reduce rules between -steps. +Replace the current execution model with a scheduler-driven design. `reduce` and +`reduce_parent` remain metadata-only. `execute` returns an `ExecutionStep` — requesting a child +be executed or declaring completion. The scheduler drives all iteration using an explicit work +stack, runs reduce rules between steps, and caches results for shared sub-expressions. ## Motivation -**No decompress-into-buffer.** Encodings allocate their own output during execution. -ChunkedArray must decompress each chunk separately and concatenate — an extra copy of the -entire column. 
Exporters (DuckDB, Arrow) can't write directly into their output format. - **Stack overflow from recursion.** The current executor recurses into children. Deep encoding -trees overflow the stack. +trees overflow the stack. The 128-iteration limit applies to the outer loop, not inner +recursive calls. + +**Repeated decompression.** Some operations access the same compressed child multiple times. +Each access decompresses independently. For example, binary search on RunEnd's PCodec-compressed +ends calls `scalar_at` per probe, each decompressing the full array. + +**Shared sub-expressions.** `a < 10 & a > 5` references the same array `a` twice. The current +model executes it independently for each reference. **Unclear execute/reduce boundary.** Some `execute_parent` implementations are metadata-only and belong in `reduce_parent`. The boundary isn't enforced. ## Design -### reduce / reduce_parent (unchanged) - -These methods keep their current signatures. They are **strictly metadata-only** — they never -read data buffers. +### VTable methods ```rust +// Metadata-only rewrites. Never read data buffers. fn reduce(array: &Self::Array) -> VortexResult>; fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, + array: &Self::Array, parent: &ArrayRef, child_idx: usize, ) -> VortexResult>; -``` - -`reduce` rewrites an array using only metadata. Filter(AllTrue(x)) → x. Constant folding. - -`reduce_parent` rewrites a parent from a child's perspective. Dict child pushes ScalarFn into -values. RunEnd child pulls ScalarFn through to its values. FSST child rewrites Compare by -compressing the RHS literal. - -The framework runs these to a fixpoint before execution begins and between execution steps. - -Implementations currently misplaced in `execute_parent` that are metadata-only (Dict + Compare, -ALP + Compare, FoR + Compare, FSST + Compare) move to `reduce_parent`. 
Implementations that
-are really canonical-type compute kernels (Primitive + Compare, Bool + FillNull, Decimal +
-Between) move into `ScalarFn::execute`.
-
-### FilterArray
-
-FilterArray continues to exist as a lazy wrapper. It is not subsumed by the execution method
-signature.
-
-### execute / execute_parent
-
-```rust
-fn execute(array: &Self::Array, ctx: &mut ExecutionCtx)
-    -> VortexResult<ExecutionStep>;
+// Execution. May read data buffers.
+fn execute(
+    array: &Self::Array, ctx: &mut ExecutionCtx,
+) -> VortexResult<ExecutionStep>;
 
 fn execute_parent(
-    array: &Self::Array,
-    parent: &ArrayRef,
-    child_idx: usize,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<Option<ExecutionStep>>;
+    array: &Self::Array, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<ArrayRef>>;
 ```
 
 ```rust
 pub enum ExecutionStep {
-    /// Ask the scheduler to execute the child at this index one step,
-    /// replace it, then call execute (or execute_parent) on this array again.
+    /// Execute the child at this index to columnar, replace it,
+    /// then call execute on this array again.
     ExecuteChild(usize),
 
     /// Execution is complete.
@@ -81,117 +56,149 @@ pub enum ExecutionStep {
 }
 
-Both `execute` and `execute_parent` return `ExecutionStep`. The encoding never recurses into
-children — it yields control back to the scheduler, telling it what work is needed. The
-scheduler maintains the work stack.
-
-`execute_parent` returns `Option<ExecutionStep>`:
-- `None` — the child cannot handle this parent, fall through to the parent's own `execute`.
-- `Some(ExecuteChild(i))` — the child needs its own child at index `i` executed one step before
-  it can handle the parent. The scheduler does this, then retries `execute_parent`.
-- `Some(Done(result))` — the child handled the parent, here is the result in **any encoding**
-  (not just canonical). This is critical for encoding-preserving execution.
-
-Making `execute_parent` iterative is necessary because some parent-handling logic requires
-data access to the child's own children. 
For example, `Slice(RunEnd(ends=PCodec(...)))`: -RunEnd's `execute_parent` sees the Slice parent and wants to binary-search its `ends` to find -the physical indices for the slice boundaries. But `ends` is PCodec-compressed — each -`scalar_at` probe during binary search would decompress the full array. Instead, RunEnd returns -`ExecuteChild(0)` to request its ends be executed first. The scheduler decompresses PCodec to -PrimitiveArray, then retries `execute_parent`. RunEnd now binary-searches canonical ends and -returns an efficiently sliced RunEnd array. +**reduce / reduce_parent** are unchanged. Strictly metadata-only. The framework runs them to a +fixpoint before execution and between execution steps. Implementations currently misplaced in +`execute_parent` that are metadata-only (Dict + Compare, ALP + Compare, FoR + Compare, FSST + +Compare) move to `reduce_parent`. -```rust -// RunEnd's execute_parent for Slice -fn execute_parent( - re: &RunEndArray, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx, -) -> VortexResult> { - let Some(slice) = parent.as_opt::() else { return Ok(None) }; +**execute** returns `ExecutionStep`. The encoding never recurses into children — it yields +control to the scheduler. - if !re.ends().is_columnar() { - // Need canonical ends for binary search - return Ok(Some(ExecutionStep::ExecuteChild(0))); - } +- `ExecuteChild(i)` asks the scheduler to execute child `i` to columnar, replace it, and call + `execute` again. +- `Done(columnar)` returns the final columnar result. - // Ends are canonical — binary search is cheap - let physical_start = re.find_physical_index(slice.start())?; - let physical_end = re.find_physical_index(slice.end())?; - let sliced = RunEndArray::new( - re.ends().slice(physical_start..physical_end)?, - re.values().slice(physical_start..physical_end)?, - slice.start() + re.offset(), - slice.len(), - )?; - Ok(Some(ExecutionStep::Done(Columnar::from(sliced)))) -} -``` +**execute_parent** returns `Option`. 
`None` means the child can't handle this parent.
+`Some(result)` means it handled the parent — the result can be in **any encoding**, not just
+canonical, enabling encoding-preserving execution (e.g., FSST returning a filtered FSST array
+to a DuckDB exporter).
+
+### Constant handling
+
+When an encoding matches on a specific child type (e.g., `as_opt::<PrimitiveArray>()`), a
+`ConstantArray` child won't match — it's columnar but not Primitive. Encodings should either
+check for constants explicitly, or — preferably — register a reduce rule that handles the
+constant case before execution runs. For example:
+`Dict(Constant(code), values)` → `Constant(values.scalar_at(code))`.
+
+Note: an encoding that blindly returns `ExecuteChild(i)` when its child is already columnar
+(but doesn't match the expected concrete type) will loop forever. This is trivial for the
+scheduler to detect — if `ExecuteChild(i)` is returned and child `i` is already columnar, the
+scheduler can abort with an error.

### Scheduler

+The scheduler uses an explicit work stack, bounding stack depth regardless of encoding tree
+depth.
+
```rust
-fn execute_one_step(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
-    let mut array = optimize_recursive(array)?;
-
-    if array.as_columnar().is_some() { return Ok(array); }
-
-    // Try execute_parent (child-driven specialized execution)
-    for child_idx in 0..array.nchildren() {
-        let child = array.child(child_idx);
-        match child.vtable().execute_parent(&child, &array, child_idx, ctx)? 
{ - None => continue, - Some(ExecutionStep::ExecuteChild(i)) => { - // Child needs its own child executed first — do one step, retry - let grandchild = child.child(i); - let stepped = execute_one_step(grandchild, ctx)?; - let new_child = child.with_child(i, stepped); - array = array.with_child(child_idx, new_child); - return Ok(array); +fn execute_to_columnar(root: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let mut current = optimize_recursive(root, ctx)?; + let mut stack: Vec<(ArrayRef, usize)> = vec![]; // (parent, child_idx) + + loop { + // Check if current is columnar — return to parent or finish + if let Some(c) = current.as_columnar() { + match stack.pop() { + None => return Ok(c), + Some((parent, child_idx)) => { + current = parent.with_child(child_idx, c.into_array()); + current = optimize_recursive(current, ctx)?; + continue; + } + } + } + + // Try execute_parent + if let Some(rewritten) = try_execute_parent(¤t, ctx)? { + current = optimize_recursive(rewritten, ctx)?; + continue; + } + + // Execute + match current.vtable().execute(¤t, ctx)? { + ExecutionStep::ExecuteChild(i) => { + let child = current.child(i); + stack.push((current, i)); + current = optimize_recursive(child, ctx)?; } - Some(ExecutionStep::Done(result)) => { - return Ok(result.into_array()); + ExecutionStep::Done(result) => { + current = result.into_array(); } } } +} +``` - // Fall through to the array's own execute - match array.vtable().execute(&array, ctx)? { - ExecutionStep::ExecuteChild(i) => { - let child = array.child(i); - let stepped = execute_one_step(child, ctx)?; - array = array.with_child(i, stepped); - Ok(array) - } - ExecutionStep::Done(result) => Ok(result.into_array()), +### Execution cache + +If the same `Arc` appears as a child of multiple parents (e.g., `a < 10 & a > 5`), +the scheduler executes it independently each time. The execution cache deduplicates this work. 
+
+The cache lives in `ExecutionCtx`, keyed by `ByPtr` — a newtype that implements
+`Hash` and `Eq` via `Arc::as_ptr()`. Because the key holds a clone of the `Arc`, the source
+array cannot be deallocated while cached, so the pointer cannot be reused for a different array.
+
+```rust
+struct ByPtr(ArrayRef);
+
+impl Hash for ByPtr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Arc::as_ptr(&self.0).hash(state);
+    }
+}
+
+impl PartialEq for ByPtr {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
    }
}
+
+pub struct ExecutionCtx {
+    cache: HashMap<ByPtr, ArrayRef>, // source → one-step result
+    // ...
+}
```
-Each call to `execute_one_step` makes one unit of progress. The top-level
-`execute_to_columnar` simply loops until the result is columnar. After each step,
-`optimize_recursive` runs again — reduce rules can fire on the new tree shape. For truly
-bounded stack depth, an explicit work stack replaces the recursive calls.
+**When to cache.** Three options:
+
+1. **Always cache.** Memory explodes — most arrays are executed once and never revisited.
-### Per-encoding examples
+2. **Pre-scan the tree.** Walk the tree before execution, count pointer occurrences, cache only
+   shared nodes. Accurate for a single tree, but doesn't work across multiple trees executed
+   with the same `ExecutionCtx` (e.g., scan columns sharing a dictionary or filter mask).
-**DictArray** — requests codes first, then gathers:
+3. **`Arc::strong_count > 1` heuristic.** O(1) to check. Over-caches when external references
+   exist (layout reader, scan builder) but never under-caches for a genuinely shared
+   sub-expression. Works across independent tree executions within the same `ExecutionCtx`.
+   False positives cost memory, not correctness.
+
+We use option 3. The cache is dropped when the `ExecutionCtx` is dropped. 
+ +### Examples + +**DictArray** — execute codes into Primitive, then gather: ```rust fn execute(dict: &DictArray, ctx: &mut ExecutionCtx) -> VortexResult { - if !dict.codes().is_columnar() { + let Some(codes) = dict.codes().as_opt::() else { return Ok(ExecutionStep::ExecuteChild(0)); - } - let codes = dict.codes().as_primitive(); - let gathered = gather(dict.values(), &codes, ctx)?; + }; + let gathered = gather(dict.values(), codes, ctx)?; Ok(ExecutionStep::Done(gathered)) } ``` -**ScalarFnArray** — requests children left-to-right, then evaluates: +Note: if codes is a `ConstantArray`, the scheduler returns it as columnar. A reduce rule +`Dict(Constant(code), values) → Constant(values.scalar_at(code))` handles this before execute +runs. + +**ScalarFnArray** — columnarize children left-to-right, then evaluate: ```rust fn execute(sfn: &ScalarFnArray, ctx: &mut ExecutionCtx) -> VortexResult { for (i, child) in sfn.children().iter().enumerate() { - if !child.is_columnar() { + if child.as_opt::().is_none() { return Ok(ExecutionStep::ExecuteChild(i)); } } @@ -200,195 +207,114 @@ fn execute(sfn: &ScalarFnArray, ctx: &mut ExecutionCtx) -> VortexResult VortexResult { - if !filter.child().is_columnar() { + let Some(child) = filter.child().as_opt::() else { return Ok(ExecutionStep::ExecuteChild(0)); - } - let filtered = filter.mask().apply_to(filter.child().as_canonical())?; + }; + let filtered = filter.mask().apply_to(child.into())?; Ok(ExecutionStep::Done(Columnar::Canonical(filtered))) } ``` -**BitPacked** — leaf encoding, decompresses directly: +**BitPacked** — leaf, decompresses directly: ```rust fn execute(bp: &BitPackedArray, ctx: &mut ExecutionCtx) -> VortexResult { - let primitive = unpack(bp)?; - Ok(ExecutionStep::Done(Columnar::Canonical(Canonical::Primitive(primitive)))) + Ok(ExecutionStep::Done(Columnar::Canonical(Canonical::Primitive(unpack(bp)?)))) } ``` ### Cross-step optimization -Because `optimize_recursive` runs after each child execution, patterns that 
only become visible -after partial execution can still be optimized. - -Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes -BitPacked codes to PrimitiveArray, the tree becomes -`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes -`upper` into values — this optimization fires between steps. - -### Encoding-preserving execute_parent +`optimize_recursive` runs when the scheduler pops from the stack (after a child reaches +columnar and is replaced into its parent). Reduce rules fire on the new tree shape. -`execute_parent` returns `Option` in any encoding. This enables exporters to -intercept intermediate forms without decompressing. +Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. Pre-execution optimization fires +a reduce_parent rule on Dict, pushing `upper` into values. The tree becomes +`Dict(BitPacked(codes), upper(values))`. Dict then executes its codes into Primitive and +gathers. -Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray -parent and returns a filtered FSST array (still FSST-encoded). An exporter driving the -scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no -decompression needed. +### Encoding-preserving execution -Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the -downstream consumer natively supports the compressed form. +`execute_parent` returns results in any encoding. Exporters drive the scheduler loop and +inspect the tree after each step. If an encoding the exporter cares about becomes visible +(DictArray for DuckDB dictionary vectors, FSST for DuckDB FSST vectors), the exporter +intercepts it without decompressing. 
-### Exporter integration - -Exporters drive the scheduler loop directly and inspect the array after each step: - -```rust -fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let mut array = optimize_recursive(array)?; +### Removing ExecutionCtx from VTable methods - loop { - if let Some(dict) = array.as_opt::() { - return export_dict(dict, ctx); - } - if let Some(fsst) = array.as_opt::() { - return export_fsst(fsst, ctx); - } - if let Some(c) = array.as_columnar() { - return export_columnar(c, ctx); - } +The `execute` and `execute_parent` signatures shown above accept `&mut ExecutionCtx`. This gives +encodings the ability to recursively execute children, bypassing the scheduler's caching and +cross-step optimization. Nothing in the type system prevents it. - if let Some(rewritten) = try_execute_parent(&array, ctx)? { - array = optimize_recursive(rewritten)?; - continue; - } +A stronger design: remove `ExecutionCtx` from the VTable method signatures entirely. The +scheduler owns the execution state (cache, tracing). `execute` receives no context. The method signature itself +communicates "return a step, don't execute anything." - match array.vtable().execute(&array, ctx)? { - ExecutionStep::ExecuteChild(i) => { - let child = array.child(i); - let executed = execute_to_columnar(child, ctx)?; - array = array.with_child(i, executed.into_array()); - array = optimize_recursive(array)?; - } - ExecutionStep::Done(c) => return export_columnar(c, ctx), - } - } -} -``` +This also eliminates the current ergonomic friction of +`let ctx = session.create_execution_ctx(); array.execute(&mut ctx)` — callers just call the +scheduler directly. -If an opaque encoding decodes to DictArray during execution, the exporter discovers it on the -next iteration and exports it natively. This is impossible in a model that always produces -canonical output. 
+If `execute_parent` also yields `ExecutionStep` (see unresolved questions), the same argument +applies: it gets resource access but not execution power. The scheduler is the only code that +drives execution. ### Decompress-into-buffer -The scheduler-driven model does not natively support caller-owned output buffers (each -`Done(Columnar)` allocates its own output). The two concrete cases where this matters — -ChunkedArray concat and FSST zero-copy push — can be addressed at the framework level: - -- **ChunkedArray**: The scheduler special-cases ChunkedArray by pre-allocating a single output - buffer and copying each chunk's `Columnar` result into it. One allocation + N memcpys rather - than N allocations + one concat. Not zero-copy, but eliminates the extra full-column copy. - -- **FSST**: The extra allocation is the views array (16 bytes per string). The actual string - data buffers are shared via `Arc` — they are not copied. The overhead is bounded and small - relative to decompression cost. - -These are framework-level optimizations, not VTable concerns. They don't require encoding -authors to think about builders. - -### Execution cache in ExecutionCtx - -The scheduler executes each child one step at a time, replacing it in the parent tree. This -means the result of executing an array is only visible to its immediate parent — if the same -`Arc` appears as a child of multiple parents (e.g., `a < 10 & a > 5` where `a` is -the same ArrayRef), it will be executed independently each time. - -`ExecutionCtx` holds a cache keyed by the raw pointer of the source array (`Arc::as_ptr()`). -The cache entry stores a clone of the source `Arc` (to pin the pointer address — preventing -deallocation and reuse) alongside the one-step execution result. - -```rust -pub struct ExecutionCtx { - // ... 
- cache: HashMap<*const dyn Array, CacheEntry>, -} - -struct CacheEntry { - /// Holds a strong reference to the source array, preventing the Arc from - /// being deallocated and its pointer reused for a different array. - _source: ArrayRef, - /// The result of executing the source array one step. - result: ArrayRef, -} -``` - -**When to cache.** Not every array should be cached — most are executed exactly once, and -caching adds overhead (HashMap insert + Arc clone). The `Arc::strong_count` is used as a -heuristic: if `strong_count > 1`, the array is referenced from multiple places and its -execution result is likely to be reused. This is an imperfect approximation — strong_count is a -global reference count, not local to the array tree being executed. An array may have -`strong_count > 1` because the layout reader or scan builder holds a reference, not because it -appears twice in the expression tree. But it never misses a genuinely shared sub-expression (if -shared, the count is > 1), and false positives only cost memory, not correctness. - -**Scope.** The cache lives in `ExecutionCtx`, not in individual arrays. Users can share an -`ExecutionCtx` across multiple independent array executions (e.g., multiple columns in a scan), -so the cache naturally deduplicates work across columns that share sub-arrays (shared -dictionaries, common filter masks). The cache is dropped when the `ExecutionCtx` is dropped. - -## Compatibility - -No file format or wire format changes. All changes are internal to the execution engine. - -Public API breakage: the `Executable` trait and `execute::()` method are replaced. -Third-party encodings must migrate. A default implementation bridging to the old path eases -migration. +This model does not support caller-owned output buffers. Each `Done(Columnar)` allocates its +own output. ChunkedArray cannot share a builder across chunks, and FSST cannot push views +directly into a caller's VarBinViewBuilder. 
This is a trade-off we accept in exchange for +encoding-preserving execution, cross-step optimization, and bounded stack depth. ## Alternatives ### Canonical builder model -Instead of returning `ExecutionStep`, `execute` could push results into a caller-owned -`CanonicalBuilder` (a closed enum mirroring `Canonical` in mutable builder form). Each encoding -decompresses directly into the builder in a single recursive descent. `execute_parent` would -also push into the builder. - -This model natively supports decompress-into-buffer: ChunkedArray writes all chunks into one -builder (no concat), FSST pushes views directly into VarBinViewBuilder (zero-copy). No -iteration loop, no MAX_ITERATIONS. - -However, requiring canonical output from `execute` is a structural limitation that cannot be -worked around: +`execute` pushes results into a caller-owned `CanonicalBuilder` (closed enum mirroring +`Canonical` in mutable builder form). Single recursive descent, no iteration loop. Natively +supports decompress-into-buffer: ChunkedArray writes all chunks into one builder, FSST pushes +views directly into VarBinViewBuilder. -- **No encoding-preserving execute_parent.** `execute_parent` must push canonical into the - builder. FSST can't return a filtered FSST array — the encoding is always lost. Exporters - must use encoding-specific logic outside the framework for every format they want to preserve. +However, always producing canonical output has structural limitations: -- **No cross-step optimization.** Reduce runs once before the single descent. Patterns visible - only after partial execution are missed. +- **No encoding-preserving execution.** `execute_parent` pushes into a builder, so results are + always canonical. FSST can handle a FilterArray parent (fused filter + decompress into the + builder), but it can't return a filtered FSST array for a DuckDB exporter. Exporters that + want to preserve an encoding must use encoding-specific logic outside the framework. 
+- **No cross-step optimization.** Reduce runs once before the single descent. +- **Stack overflow.** Recurses into children; stack depth equals encoding tree depth. -- **No exporter intermediate inspection.** Exporters see the tree after reduce, before - execution. Encodings hidden behind opaque wrappers are never visible. - -- **Stack overflow.** `execute` recurses into children. Stack depth equals encoding tree depth. - -The scheduler-driven model's allocation wins (encoding-preserving intermediates, cross-step -optimization, exporter interception, bounded stack) are structurally unrecoverable in the -builder model. The builder model's allocation wins (shared builder, zero-copy push) are -recoverable in the scheduler model via framework-level optimizations. This asymmetry makes the -scheduler-driven model strictly preferable. +The scheduler model's wins (encoding preservation, cross-step optimization, bounded stack) are +structurally unrecoverable in the builder model. The builder model's win (zero-copy +decompress-into-buffer) is a real cost of the scheduler model, but one we accept given the +asymmetry. ## Unresolved Questions -- **Explicit work stack:** The scheduler can be recursive (simple, but same stack overflow risk - as today) or use an explicit work stack (bounded depth, but more complex). The RFC assumes an - explicit stack is feasible but doesn't specify the implementation. - -- **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid - expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. +- **Explicit work stack details.** The scheduler sketch shows the concept. The exact data + structure (e.g., handling multiple `ExecuteChild` calls from the same parent for different + children) needs design work. + +- **Iterative execute_parent.** The current design has `execute_parent` return + `Option`. 
An alternative is to return `Option`, allowing it to + request child execution before handling the parent. The execution cache may be sufficient for + cases where `execute_parent` needs data access to children (e.g., RunEnd binary-searching + compressed ends — the cache ensures decompression happens once). We have no compelling + example for or against iterative `execute_parent` yet. + +- **Targeted child execution.** `ExecuteChild(i)` currently executes the child to columnar. + An alternative is to allow the encoding to specify a matcher for when it should be re-entered + — e.g., `ExecuteChildInto(i, ArrayId)` would execute child `i` until it matches a specific + encoding or reaches columnar, whichever comes first. This enables early re-entry before full + columnarization — for example, an encoding could request its child be executed until it becomes + a DictArray, then operate on the dictionary directly. The trade-off is additional complexity in + the scheduler and ExecutionStep enum. + +- **FilterArray.** FilterArray continues to exist as a lazy wrapper. It is not subsumed by the + execution method. Whether to unify Filter/Slice/Take wrappers is orthogonal. 
From 531c24c5e43463ec81952d50365b781c0d423927 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 22:04:52 -0500 Subject: [PATCH 8/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- ...02-aggregate-functions-and-list-scalars.md | 862 ------------------ ...xecution-v2.md => 0019-array-execution.md} | 3 +- 2 files changed, 1 insertion(+), 864 deletions(-) delete mode 100644 proposals/0002-aggregate-functions-and-list-scalars.md rename proposals/{0001-execution-v2.md => 0019-array-execution.md} (99%) diff --git a/proposals/0002-aggregate-functions-and-list-scalars.md b/proposals/0002-aggregate-functions-and-list-scalars.md deleted file mode 100644 index 0cc053a..0000000 --- a/proposals/0002-aggregate-functions-and-list-scalars.md +++ /dev/null @@ -1,862 +0,0 @@ -- Start Date: 2026-02-26 -- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000) -- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000) - -## Summary - -Introduce `AggregateFnVTable` to Vortex as a trait for defining aggregate operations, and a -single `ListAggregate` scalar function that applies any aggregate to a list column. There is -no dedicated `AggregateFnArray` — `ListAggregate` is a `ScalarFnVTable` implementation whose -lazy evaluation is handled by the existing `ScalarFnArray` infrastructure. Child encodings -match on `ExactScalarFn` in their reduce_parent rules for aggregate-specific -optimizations. - -The key insight is the **aggregate/list-scalar duality**: a list column stored as -`(offsets, elements)` is a pre-materialized grouping. Computing `list_sum(list_column)` is -literally a grouped `sum` over the flat elements array partitioned by offsets. Rather than -implementing N separate list scalar functions (`list_sum`, `list_min`, `list_max`, ...), -we implement N aggregate functions and a single `ListAggregate` scalar function that applies -any aggregate to a list column. 
- -The `Accumulator` trait is the core primitive. It processes one group at a time via -`accumulate(batch)` / `flush()` / `finish()`, supporting streaming through chunked elements, -very large lists, and ungrouped column-level aggregation. `execute_grouped` is a convenience -built on top of the accumulator. - -Streaming ordered GROUP BY falls out naturally: the query engine constructs a `ListArray` from -`(flat_column, group_offsets)` per batch — a zero-copy operation — and wraps it in -`ScalarFnArray(ListAggregate(Sum), [list_array])`. - -This RFC also evaluates whether the canonical list encoding should switch from `ListViewArray` -(offsets+sizes) to `ListArray` (offsets-only), since the choice directly affects how cleanly -the duality works. - -## Motivation - -### Aggregate functions exist but in the old framework - -Vortex has two compute systems: - -- **`ScalarFnVTable`**: element-wise functions (1 input row → 1 output row) with lazy - `ScalarFnArray` evaluation, expression trees, and reduce/reduce_parent optimization. -- **`ComputeFn`**: kernel-based operations like `sum`, `min_max`, `nan_count`. Standalone, - not part of the expression tree, dispatched to per-encoding kernels via `inventory`. - -Aggregate operations like `sum` already exist, but as `ComputeFn` kernels. They cannot -participate in the expression tree or optimizer. This means: - -- No lazy evaluation that defers computation and can be optimized via reduce_parent. -- No way to push aggregate operations through Dict values, RunEnd runs, or other encodings - via the reduce rule framework. -- No unified framework for list scalar functions and grouped aggregates. - -### List scalar functions are redundant with aggregate functions - -The only list-specific scalar function today is `list_contains`. Adding `list_sum`, -`list_min`, `list_max`, `list_count`, `list_mean`, etc. 
as individual `ScalarFnVTable` -implementations would duplicate the aggregation logic that already exists in `ComputeFn` -kernels. If Vortex had first-class aggregate functions, each list scalar function would -simply be "apply this aggregate to each list" — one `ListAggregate` scalar function -parameterized by the aggregate, rather than N separate implementations. - -### The aggregate/list-scalar duality - -Consider a list column `[[1,2,3], [4,5]]` stored as: - -``` -elements: [1, 2, 3, 4, 5] -offsets: [0, 3, 5] -``` - -Computing `list_sum` yields `[6, 9]`. This is identical to computing a grouped `sum` over -elements partitioned by offset ranges `[0..3, 3..5]`. Every aggregate function has a -corresponding list scalar function: - -| Aggregate | List scalar | Grouped operation | -|--------------|------------------------|-----------------------------------| -| `sum(col)` | `list_sum(list_col)` | Sum elements per group | -| `count(col)` | `list_count(list_col)` | Count non-null elements per group | -| `min(col)` | `list_min(list_col)` | Min element per group | -| `max(col)` | `list_max(list_col)` | Max element per group | -| `mean(col)` | `list_mean(list_col)` | Mean of elements per group | - -### Streaming ordered GROUP BY - -For databases with sorted storage, GROUP BY over a sorted key is a streaming operation: -groups arrive in order and don't cross batch boundaries. This maps directly to the -aggregate/list-scalar duality: - -1. For each batch, compute group boundaries from the sorted key (run boundaries → offsets). -2. Construct `ListArray::new(value_column, group_offsets, ...)` — zero-copy. -3. Wrap in `ScalarFnArray(ListAggregate(Sum), [list_array])`. -4. Execute → one result per group. - -No hash tables, no shuffling. The same `execute_grouped` path serves both list scalar -functions and streaming ordered GROUP BY. - -### Expensive list interop - -The current canonical list encoding is `ListViewArray` (offsets+sizes, out-of-order allowed). 
-Converting between `ListArray` and `ListViewArray` is expensive — see the rebuild
-infrastructure with modes `MakeZeroCopyToList`, `TrimElements`, `MakeExact`, and the
-`is_zero_copy_to_list` flag. This complexity exists primarily to support out-of-order
-offsets, which complicates the aggregate duality since `execute_grouped` needs monotonic
-offsets.
-
-## Design
-
-### `Accumulator` — the core primitive
-
-The `Accumulator` trait is the fundamental aggregation interface. It processes one group
-at a time: the caller feeds element batches, then flushes to finalize the group and start
-the next one. The accumulator owns its output buffer and returns all results at the end.
-
-```rust
-pub trait Accumulator: Send + Sync {
-    /// Feed a batch of element values for the current group.
-    /// Can be called multiple times per group (e.g., chunked elements).
-    fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()>;
-
-    /// Finalize the current group: push its result into the output buffer
-    /// and reset internal state for the next group.
-    fn flush(&mut self, ctx: &mut ExecutionCtx) -> VortexResult<()>;
-
-    /// Return all flushed results as a single array.
-    /// Length = number of flush() calls.
-    fn finish(self: Box<Self>, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef>;
-}
-```
-
-This handles all aggregation cases uniformly:
-
-**Case 1 — List scalar with contiguous elements:**
-```rust
-let mut acc = aggregate.accumulator(element_dtype)?;
-for i in 0..n_lists {
-    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
-    acc.accumulate(&group_elements, ctx)?;
-    acc.flush(ctx)?;
-}
-acc.finish(ctx) // → ArrayRef of length n_lists
-```
-
-**Case 2 — Large lists with chunked elements:**
-A list's elements may be a `ChunkedArray`. 
The accumulator handles this by feeding
-each chunk separately before flushing:
-```rust
-let mut acc = aggregate.accumulator(element_dtype)?;
-for i in 0..n_lists {
-    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
-    for chunk in iter_chunks(&group_elements) {
-        acc.accumulate(&chunk, ctx)?;
-    }
-    acc.flush(ctx)?;
-}
-acc.finish(ctx)
-```
-
-**Case 3 — Ungrouped full-column aggregation:**
-One group, fold across chunks:
-```rust
-let mut acc = aggregate.accumulator(dtype)?;
-for chunk in chunked_array.chunks() {
-    acc.accumulate(&chunk, ctx)?;
-}
-acc.flush(ctx)?;
-acc.finish(ctx) // → 1-element array
-```
-
-**Case 4 — ListView with scattered elements:**
-Canonicalize to ListArray first (rebuilding to sorted form), then use Case 1 or 2.
-The accumulator itself always sees contiguous element batches.
-
-### `AggregateFnVTable`
-
-A new trait parallel to `ScalarFnVTable`. The `accumulator()` method is the required
-core — `execute_grouped` and `execute_scalar` have default implementations built on it:
-
-```rust
-pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync {
-    type Options: 'static + Send + Sync + Clone + Debug + Display + PartialEq + Eq + Hash;
-
-    fn id(&self) -> AggregateFnId;
-
-    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>>;
-    fn deserialize(
-        &self,
-        metadata: &[u8],
-        session: &VortexSession,
-    ) -> VortexResult<Self::Options>;
-
-    /// Arity is always 1. For multi-column aggregates (e.g., covariance), the input
-    /// should be a struct array containing the columns.
-    fn arity(&self, _options: &Self::Options) -> Arity {
-        Arity::Exact(1)
-    }
-
-    /// Result dtype for each group (not a list — the scalar result type).
-    fn return_dtype(
-        &self,
-        options: &Self::Options,
-        input_dtypes: &[DType],
-    ) -> VortexResult<DType>;
-
-    /// Format in SQL style, e.g. `sum(col)`. 
- fn fmt_sql( - &self, - options: &Self::Options, - expr: &Expression, - f: &mut Formatter<'_>, - ) -> fmt::Result; - - /// Create an accumulator for streaming aggregation. - /// - /// This is the core primitive. All other execution methods have default - /// implementations built on this. - fn accumulator( - &self, - options: &Self::Options, - input_dtype: &DType, - ) -> VortexResult>; - - /// One-shot grouped execution over contiguous elements + monotonic offsets. - /// - /// Default: loop over groups, slice elements, accumulate each, flush, finish. - /// Override for vectorized one-shot paths (e.g., SIMD segmented sum). - fn execute_grouped( - &self, - options: &Self::Options, - input: &ArrayRef, - offsets: &ArrayRef, - ctx: &mut ExecutionCtx, - ) -> VortexResult { - let n_groups = offsets.len() - 1; - let mut acc = self.accumulator(options, input.dtype())?; - for i in 0..n_groups { - let start = scalar_at(offsets, i)?; - let end = scalar_at(offsets, i + 1)?; - acc.accumulate(&input.slice(start..end)?, ctx)?; - acc.flush(ctx)?; - } - acc.finish(ctx) - } - - /// Ungrouped full-column aggregation returning a scalar. - /// - /// Default: single-group execute_grouped with offsets [0, n]. - /// Replaces standalone `ComputeFn` kernels like `compute::sum()`. 
- fn execute_scalar( - &self, - options: &Self::Options, - input: &ArrayRef, - ctx: &mut ExecutionCtx, - ) -> VortexResult { - let n = input.len(); - let offsets: ArrayRef = buffer![0u64, n as u64].into_array(); - let result = self.execute_grouped(options, input, &offsets, ctx)?; - result.scalar_at(0) - } -} -``` - -Key differences from `ScalarFnVTable`: - -| Property | `ScalarFnVTable` | `AggregateFnVTable` | -|----------|-------------------|----------------------| -| Row semantics | 1 input → 1 output | N inputs → 1 output per group | -| Core primitive | `execute(args) -> ArrayRef` | `accumulator() -> Box` | -| Input shape | All children same length | Flat elements + offsets defining variable-size groups | -| Streaming | N/A | accumulate/flush/finish across chunks | - -### Streaming AVG example - -To illustrate how the accumulator works for a stateful aggregate, consider `Mean`: - -```rust -struct MeanAccumulator { - running_sum: f64, - running_count: u64, - output: Vec, -} - -impl Accumulator for MeanAccumulator { - fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()> { - self.running_sum += Sum.execute_scalar(&EmptyOptions, batch, ctx)?.as_f64()?; - self.running_count += Count.execute_scalar(&EmptyOptions, batch, ctx)?.as_u64()?; - Ok(()) - } - - fn flush(&mut self, _ctx: &mut ExecutionCtx) -> VortexResult<()> { - let mean = if self.running_count > 0 { - self.running_sum / self.running_count as f64 - } else { - f64::NAN - }; - self.output.push(mean); - self.running_sum = 0.0; - self.running_count = 0; - Ok(()) - } - - fn finish(self: Box, _ctx: &mut ExecutionCtx) -> VortexResult { - Ok(PrimitiveArray::from_vec(self.output).into_array()) - } -} -``` - -For a list column `[[1.0, 2.0, 3.0], [4.0, 5.0]]` where the first list's elements are -chunked as `[1.0, 2.0]` and `[3.0]`: - -1. `accumulate([1.0, 2.0])` → running_sum=3.0, running_count=2 -2. `accumulate([3.0])` → running_sum=6.0, running_count=3 -3. 
`flush()` → push 2.0, reset -4. `accumulate([4.0, 5.0])` → running_sum=9.0, running_count=2 -5. `flush()` → push 4.5, reset -6. `finish()` → `[2.0, 4.5]` - -### Built-in aggregates - -The initial set, each implementing `AggregateFnVTable`: - -```rust -pub struct Sum; // sum of elements per group -pub struct Count; // count of non-null elements per group -pub struct Min; // minimum element per group -pub struct Max; // maximum element per group -pub struct Mean; // mean of elements per group (returns f64) -pub struct Any; // logical OR per group (bool input) -pub struct All; // logical AND per group (bool input) -pub struct CollectList; // identity: collect elements into list (returns list) -``` - -**Migration strategy**: `execute_scalar` replaces standalone `ComputeFn` kernel dispatch -(e.g., `Sum::execute_scalar` replaces `compute::sum()`). The `accumulator` implementations -can start simple (canonicalize + iterate) and be optimized over time with encoding-aware -fast paths. - -### `ListAggregate` scalar function - -A single `ScalarFnVTable` implementation that bridges list columns to the aggregate system. -Because `ListAggregate` is a scalar function, wrapping it in an expression produces a -`ScalarFnArray` — reusing the existing lazy evaluation, slicing, and reduce infrastructure -with no new array type needed. 
- -```rust -pub struct ListAggregate; - -pub struct ListAggregateOptions { - pub aggregate_fn: AggregateFnRef, -} - -impl ScalarFnVTable for ListAggregate { - type Options = ListAggregateOptions; - - fn id(&self) -> ScalarFnId { - ScalarFnId::from("vortex.list.aggregate") - } - - fn arity(&self, _options: &Self::Options) -> Arity { - Arity::Exact(1) - } - - fn return_dtype( - &self, - options: &Self::Options, - arg_dtypes: &[DType], - ) -> VortexResult { - let element_dtype = arg_dtypes[0].as_list_element() - .ok_or_else(|| vortex_err!("ListAggregate input must be a list"))?; - options.aggregate_fn.return_dtype(&[element_dtype.clone()]) - } - - fn execute( - &self, - options: &Self::Options, - args: ExecutionArgs, - ) -> VortexResult { - let list_input = &args.inputs[0]; - let ctx = args.ctx; - // Execute the child into a ListArray, decompose, and aggregate. - let list = list_input.to_list()?; - let elements = list.elements(); - let offsets = list.offsets(); - - // If elements are chunked, use the accumulator to stream through chunks. - // Otherwise, use one-shot execute_grouped. - if elements.is::() { - let mut acc = options.aggregate_fn.accumulator(elements.dtype())?; - let n_groups = offsets.len() - 1; - for i in 0..n_groups { - let group_elements = elements.slice( - offset_at(&offsets, i)..offset_at(&offsets, i+1) - )?; - for chunk in iter_chunks(&group_elements) { - acc.accumulate(&chunk, ctx)?; - } - acc.flush(ctx)?; - } - acc.finish(ctx) - } else { - options.aggregate_fn.execute_grouped(&elements, &offsets, ctx) - } - } - - // ... 
-} -``` - -Expression-level sugar: - -```rust -pub fn list_sum(list: Expression) -> Expression { - ListAggregate.new_expr( - ListAggregateOptions { aggregate_fn: Sum.bind(EmptyOptions) }, - [list], - ) -} - -pub fn list_count(list: Expression) -> Expression { - ListAggregate.new_expr( - ListAggregateOptions { aggregate_fn: Count.bind(EmptyOptions) }, - [list], - ) -} - -// list_min, list_max, list_mean, list_any, list_all analogously -``` - -This is one scalar function parameterized by the aggregate — not N separate functions. -Analogous to DuckDB's `list_aggregate(list, 'sum')`. - -**Why this works without a dedicated array type:** - -1. **Lazy evaluation.** `ScalarFnArray(ListAggregate(Sum), [list_col])` defers computation - until `execute()` is called, just as it does for any other scalar function. -2. **Slicing.** `ScalarFnArray` already slices its children. Slicing the list child - correctly adjusts both grouping and elements. -3. **Reduce rules.** `ListAggregate` can implement `ScalarFnVTable::reduce` for self-reduce - optimizations (constant folding, statistics). Encoding-specific reduce_parent rules - use `ExactScalarFn` to match on the parent (see below). -4. **Type safety.** The single child has `DType::List` — enforced by `return_dtype`. - -### Streaming ordered GROUP BY - -For databases with sorted storage, GROUP BY over a sorted key maps directly to the -aggregate/list-scalar duality. The query engine processes batches where: - -1. Groups are defined by a sorted key column (e.g., `customer_id`). -2. Groups are ordered and complete within each batch (no cross-batch groups). -3. No hash tables or shuffling needed. - -**Physical execution per batch:** - -```rust -fn execute_ordered_group_by( - value_column: &ArrayRef, - key_column: &ArrayRef, // sorted - aggregate: &AggregateFnRef, - ctx: &mut ExecutionCtx, -) -> VortexResult { - // Step 1: Compute group offsets from run boundaries in the key column. 
- // E.g., key = [A, A, A, B, B, C] → offsets = [0, 3, 5, 6] - let group_offsets = compute_run_offsets(key_column)?; - - // Step 2: Wrap as a list — zero-copy, just references. - let list = ListArray::new(value_column.clone(), group_offsets, Validity::NonNullable); - - // Step 3: Create lazy ScalarFnArray with ListAggregate. - let scalar_fn = ListAggregate.bind(ListAggregateOptions { - aggregate_fn: aggregate.clone(), - }); - let result = ScalarFnArray::try_new(scalar_fn, vec![list.into_array()])?; - - // Step 4: Execute (or let the optimizer reduce first). - result.into_array().execute(ctx) -} -``` - -The same `execute_grouped` serves both list scalar functions (where the list already exists -in the schema) and GROUP BY (where the list is constructed from a flat column + -sort-derived offsets). - -### Reduce rules - -Since `ListAggregate` is a `ScalarFnVTable`, its reduce rules use the existing -`ScalarFnArray` optimization infrastructure. - -**Self-reduce rules (via `ScalarFnVTable::reduce`):** - -`ListAggregate` implements `ScalarFnVTable::reduce` to handle aggregate-specific -optimizations when the list child's structure is known: - -- **Constant list folding**: If the list child is a `ConstantArray` of list scalars, - compute the aggregate directly. `ListAggregate(Sum, Constant([[1,2,3]], 1000))` - → `Constant(6, 1000)`. - -- **Count from metadata**: `ListAggregate(Count, list)` can use list sizes from metadata - without touching element data. - -- **Min/Max from statistics**: If element-level statistics are available, `Min`/`Max` can - short-circuit without decompression. - -- **Sum of constant elements**: `ListAggregate(Sum, list)` where elements are constant - → `constant_value * group_size` per group. - -**Parent-reduce rules (encoding-specific, via `ExactScalarFn`):** - -Child encodings register reduce_parent rules that match on `ExactScalarFn`. 
-The `ExactScalarFn` matcher provides typed access to `ListAggregateOptions`, from which the -specific `AggregateFnRef` can be inspected: - -```rust -impl ArrayParentReduceRule for DictAggregateRule { - type Parent = ExactScalarFn; - - fn reduce_parent( - &self, - array: &DictArray, - parent: ScalarFnArrayView<'_, ListAggregate>, - child_idx: usize, - ) -> VortexResult> { - let aggregate_fn = &parent.options.aggregate_fn; - - // Min/Max over Dict elements → Min/Max(values) - if aggregate_fn.is::() || aggregate_fn.is::() { - // Min/max of dictionary values is the global min/max. - let values = array.values(); - return Ok(Some( - ListAggregate.new_expr_with( - parent.options.clone(), - // Rebuild list with dict values as elements - // ... - ).into_array() - )); - } - - Ok(None) - } -} -``` - -Example encoding-specific rules: - -- **Dict**: `ListAggregate(Min, List(Dict(codes, values)))` → `ListAggregate(Min, List(values))`. - `Max` similarly. `Sum` cannot push down without frequency weighting. - -- **RunEnd**: `ListAggregate(Sum, List(RunEnd(values, run_ends)))` → weighted sum. - `ListAggregate(Min/Max, List(RunEnd))` → `ListAggregate(Min/Max, List(values))`. - -These rules operate on the **elements** within the list child. The list's offsets define -the grouping; the elements are what the encoding wraps. A list child like -`ListArray(elements: DictArray, offsets: ...)` allows the Dict reduce_parent rule to fire -on the elements when the `ScalarFnArray(ListAggregate)` is being optimized. - -### Interaction with `list_contains` - -The existing `list_contains` function is NOT an aggregate — it's a true scalar function -(takes a list + needle, returns bool per row). It stays as-is. `ListAggregate` is -specifically for operations that reduce list elements to a scalar per list. - -## Canonical list representation - -The choice of canonical list encoding directly affects how cleanly the aggregate duality -works. Currently, `ListViewArray` is canonical for `DType::List`. 
This section evaluates -whether to switch to `ListArray`. - -### Option A: Switch canonical to `ListArray` (offsets-only) - -Change `Canonical::List` from `ListViewArray` to `ListArray`. - -`ListArray` stores elements + an (n+1)-length monotonically increasing offsets array. -List `i` spans `elements[offsets[i]..offsets[i+1]]`. - -**Advantages for the aggregate duality:** - -- `execute_grouped` receives offsets directly from the list — no conversion needed. -- Offsets are monotonic by construction, matching the `execute_grouped` contract. -- One array (n+1 integers) vs two arrays (2n integers) — less memory. -- Offset deltas (list sizes) compress extremely well with delta/FoR encoding. -- Direct match to Arrow `ListArray` and DuckDB's list format — zero-copy Arrow export. - -**Disadvantages:** - -- **Breaking change.** All code calling `to_canonical()` on list data gets `ListArray` - instead of `ListViewArray`. Migration required. -- **No out-of-order offsets.** Operations like filter/take that scramble row order must - rebuild the elements array (or produce a `ListViewArray` as a non-canonical intermediate). -- **Sequential dependency in offsets.** Reading `offsets[i+1]` depends on `offsets[i]` for - computing list size — slightly less SIMD-friendly than separate sizes. - -**Migration path:** - -1. Add `to_list()` on `ToCanonical` alongside `to_listview()`. -2. Change `Canonical::List` to hold `ListArray`. -3. `ListViewArray` becomes a non-canonical encoding (like `VarBinArray` for strings). -4. Update `list_contains` and all list compute paths. - -### Option B: Keep `ListViewArray` as canonical - -Keep the current design where `ListViewArray` (offsets + sizes, out-of-order allowed) -is canonical for `DType::List`. - -**Advantages:** - -- No breaking change. -- SIMD-friendly: sizes can be read independently without sequential offset dependency. -- Out-of-order offsets enable better compression when list order doesn't match element order. 
-- Filter/take produce valid `ListViewArray` directly without rebuilding. - -**Disadvantages for the aggregate duality:** - -- `execute_grouped` requires monotonic offsets. Out-of-order `ListViewArray` must be - rebuilt to sorted form first (the `MakeZeroCopyToList` path), which copies data. -- The ZCTL flag, rebuild modes, and validation code remain as ongoing complexity. -- Arrow/DuckDB export still requires conversion. -- Two arrays (2n integers) instead of one (n+1 integers). - -**Impact on `ListAggregate::execute`:** - -`ListAggregate::execute` would need to handle the non-sorted case: - -```rust -fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult { - let list_input = &args.inputs[0]; - let ctx = args.ctx; - let listview = list_input.to_listview()?; - if listview.is_zero_copy_to_list() { - // Fast path: offsets are sorted, use directly - let offsets = build_list_offsets_from_list_view(&listview); - options.aggregate_fn.execute_grouped(&listview.elements(), &offsets, ctx) - } else { - // Slow path: rebuild to sorted form first - let list = list_from_list_view(listview)?; - options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets(), ctx) - } -} -``` - -### ListView as direct aggregate input - -An alternative to canonicalizing to ListArray before executing is to support `ListView` -directly in `execute_grouped`. A ListView with out-of-order offsets represents -variable-length groups that may reference scattered regions of the elements array. - -**Advantages:** - -- More powerful — supports aggregation over non-contiguous element ranges. -- Avoids the expensive rebuild-to-sorted-form step for scrambled ListViews. - -**Disadvantages:** - -- **Complexity in aggregate implementations.** Each aggregate must handle potentially - scattered, non-contiguous element access patterns instead of simple contiguous slices. 
-- **Unpredictable decompression cost.** A ListView with sparse, scattered offsets may - trigger excessive random-access decompression of the elements array. In the worst case - this is more expensive than one-pass decompression of all elements into contiguous form. -- **SIMD unfriendly.** Contiguous offset ranges map to sequential memory access patterns - that vectorize well. Scattered access does not. - -For now, this RFC assumes `execute_grouped` takes monotonic (n+1) offsets. Aggregates -over ListView canonicalize to sorted form first. Supporting scattered access as an -optimization can be explored in the future. - -### Comparison - -| Criterion | ListArray (Option A) | ListViewArray (Option B) | -|-----------|---------------------|--------------------------| -| Aggregate duality | Direct — offsets define groups | Indirect — must ensure sorted or rebuild | -| Arrow/DuckDB export | Zero-copy | Requires conversion | -| Memory | n+1 integers | 2n integers | -| Filter/take | Must rebuild elements | Produces valid array directly | -| SIMD for size access | Sequential offset dependency | Independent size reads | -| Offset compression | Delta/FoR on monotonic data | May be non-monotonic | -| Breaking change | Yes | No | - -## Compatibility - -This RFC does not change the file format or wire format. `ListAggregate` produces a -`ScalarFnArray` at runtime (like any other scalar function). It is not persisted to disk. - -**Public API additions:** - -- `Accumulator` trait — core streaming aggregation primitive. -- `AggregateFnVTable` trait and built-in implementations (Sum, Count, Min, Max, Mean, - Any, All, CollectList). -- `ListAggregate` scalar function. -- Expression constructors: `list_sum()`, `list_count()`, `list_min()`, `list_max()`, - `list_mean()`, `list_any()`, `list_all()`. - -**If canonical list changes (Option A):** - -- `Canonical::List` changes from `ListViewArray` to `ListArray`. 
-- `to_canonical()` on list data returns `ListArray` instead of `ListViewArray`. -- `ListViewArray` remains available as a non-canonical encoding. -- Migration: callers using `to_listview()` should transition to `to_list()`. - -## Drawbacks - -- **New trait surface area.** `AggregateFnVTable` and `Accumulator` are new traits to - learn, though they closely mirror `ScalarFnVTable`. - -- **Reduce rule coverage.** Not all encoding × aggregate combinations will have optimized - reduce_parent rules initially. The fallback (canonicalize list, then accumulator loop) - is correct but may be slower. - -- **Canonical list change (if Option A).** Breaking change affecting all code that calls - `to_canonical()` on list data. - -## Alternatives - -### List scalar functions as separate `ScalarFnVTable` implementations - -Implement `ListSum`, `ListCount`, `ListMin`, etc. as individual scalar functions -(like `list_contains`), each operating directly on list arrays without an aggregate -abstraction. - -**Rejected because:** Duplicates logic across N functions. No shared optimization rules. -No path to GROUP BY. Each function must independently handle list decomposition. This is -exactly the redundancy that motivates this RFC. - -### Aggregates as `ComputeFn` only - -Keep aggregates in the kernel-based `ComputeFn` system. Implement list scalar functions -by dispatching to `ComputeFn` with offset ranges. - -**Rejected because:** No lazy evaluation, no expression tree participation, no -reduce_parent optimization. This is the status quo for `sum`/`min_max` and has the -limitations this RFC addresses. 
- -### Dedicated `AggregateFnArray` array type - -Introduce a new array type `AggregateFnArray` parallel to `ScalarFnArray`, wrapping -a single list child and an `AggregateFnRef`: - -```rust -struct AggregateFnArray { - aggregate_fn: AggregateFnRef, - dtype: DType, - child: ArrayRef, // must have DType::List - stats: ArrayStats, -} -``` - -With a `ListAggregate` scalar function that returns an `AggregateFnArray` from its -`execute` method, creating two lazy wrappers in sequence: -`ScalarFnArray(ListAggregate) → AggregateFnArray → actual computation`. - -**Rejected because:** `AggregateFnArray` is structurally identical to -`ScalarFnArray` with one child — it duplicates the lazy evaluation, slicing, and reduce -infrastructure. Child encodings can already match on specific scalar functions using -`ExactScalarFn`, which provides typed access to `ListAggregateOptions` -and the embedded `AggregateFnRef`. No new array type is needed. - -### No accumulator — one-shot `execute_grouped` only - -Make `execute_grouped` the only execution method, with no streaming accumulation. - -**Rejected because:** Doesn't handle chunked elements. A list column may have elements -stored as a `ChunkedArray`. Without an accumulator, all elements must be materialized -into a single contiguous array before aggregation, defeating the purpose of chunked -storage. Additionally, aggregates like `Mean` need `running_sum` and `running_count` -state across element batches, and numerically stable summation (Kahan) requires ordered -accumulation. The existing `sum_with_accumulator` pattern in `ComputeFn` demonstrates -this need. - -### Grouped accumulator tracking N groups simultaneously - -An accumulator that manages all groups at once: `accumulate(input, offsets)` where offsets -has N+1 entries per call, and sub-offsets are split at chunk boundaries. - -**Rejected because:** Over-engineered for our use cases. 
For list scalar functions and -ordered GROUP BY, groups are processed sequentially — each group's elements are fully -available before the next group starts. The per-group flush model is simpler and handles -all cases. If vectorized all-groups-at-once processing proves necessary, it can be added -as a separate `GroupsAccumulator` trait (see Future Possibilities). - -## Prior Art - -- **Apache Arrow**: Separates `ListArray` (offsets-only) from `ListView` (offsets+sizes). - Arrow Compute has aggregate kernels separate from scalar kernels. ListView is a recent - addition, not widely adopted. - -- **DuckDB**: Uses offsets-only lists. Has `list_aggregate(list, 'func_name')` — a single - function parameterized by the aggregate name. Separates scalar and aggregate function - registries. - -- **Apache DataFusion**: Two-tier accumulator design. `AggregateUDFImpl` has a required - `accumulator()` factory method (like ours). The `Accumulator` trait handles one group at - a time with `update_batch()` → `evaluate()`, plus `state()` and `merge_batch()` for - partial aggregation (serializing intermediate state for distributed execution). The - optional `GroupsAccumulator` trait manages all groups simultaneously with - `group_indices: &[usize]` per row for vectorized hash-based grouping, plus - `evaluate(EmitTo::First(n))` for streaming emission. Our `accumulate/flush/finish` - corresponds to DataFusion's per-group `Accumulator` with built-in output buffering. - Their `state/merge` and `GroupsAccumulator` are future extensions for us (see below). - -- **Velox**: `AggregateFunction` with `addRawInput`, `addIntermediateResults`, - `extractValues` — full streaming accumulator model. Uses row-major (offset, size) - pairs for lists. - -- **Polars**: Separate scalar and aggregate expression systems. List namespace functions - like `list.sum()`, `list.min()` internally dispatch to grouped aggregation over the - list's flat values. 
- -## Unresolved Questions - -- **`list_contains` integration**: Should `list_contains` be reimplemented as a - `ListAggregate` with a `Contains` aggregate, or does it remain a separate scalar - function? It doesn't fit the "reduce N to 1" pattern cleanly since it takes both a - list and a needle. - -- **Multi-column aggregates**: Some aggregates operate on multiple columns - (e.g., covariance, weighted sum). With arity fixed at 1, these require a struct array - as input. Is this ergonomic enough, or do we need a separate mechanism? - -- **Window functions**: Window functions (rolling sum, rank) operate on sliding windows - rather than fixed groups. They share the "operate within parent boundaries" property - but have different execution semantics. Should they be a separate trait or an - extension of `AggregateFnVTable`? - -- **Canonical list decision**: Option A (ListArray) vs Option B (ListViewArray). This - can be decided independently of the aggregate framework and implemented as a - preparatory or follow-up change. - -- **`CollectList` semantics**: `CollectList` is the identity aggregate — it returns - the elements as a list. For the list-scalar case, `list_collect(list_col)` is a no-op. - For GROUP BY, it materializes the grouped elements into a list column. Should this - be a built-in aggregate or handled separately? - -- **Null list handling**: When a list entry is null, should `flush()` push null into - the output? Or should the caller skip accumulation and use a separate mechanism to - mark null outputs? The accumulator currently has no way to signal "this group is null - without receiving any elements." - -## Future Possibilities - -- **Partial aggregation with `state()` / `merge()`**: Adding `state()` and - `merge_batch()` methods to `Accumulator` (like DataFusion) for distributed aggregation - where partial results must be serialized and merged across nodes. Not needed for - the sequential processing model in this RFC. 
- -- **`GroupsAccumulator`**: A separate trait (like DataFusion's) that manages all groups - simultaneously with `group_indices` per row, enabling vectorized hash-based grouping. - Our per-group flush model handles ordered GROUP BY; this extension would handle - unordered GROUP BY with hash tables. - -- **Aggregate push-down in file scanning**: Using `ListAggregate` reduce rules to - push aggregates into `LayoutReader`, computing aggregates during file scan without - materializing full columns. - -- **`list_distinct`, `list_sort`, `list_reverse`**: List transformation functions that - don't reduce elements to a scalar. These are not aggregates and would remain as - `ScalarFnVTable` implementations. - -- **Nested aggregation**: `list_sum(list_of_lists)` producing a list of sums at the - inner level. The duality recurses naturally. - -- **GPU execution**: `execute_grouped` with monotonic offsets maps cleanly to GPU - segmented reduction primitives (e.g., `cub::DeviceSegmentedReduce`). - -- **ListView scatter optimization**: If profiling shows that rebuilding scrambled - ListViews to sorted form is a bottleneck, aggregate implementations could accept - `(offsets, sizes)` pairs for scattered access. This would be opt-in per aggregate. 
diff --git a/proposals/0001-execution-v2.md b/proposals/0019-array-execution.md
similarity index 99%
rename from proposals/0001-execution-v2.md
rename to proposals/0019-array-execution.md
index e06eaba..1123572 100644
--- a/proposals/0001-execution-v2.md
+++ b/proposals/0019-array-execution.md
@@ -1,6 +1,5 @@
 - Start Date: 2026-02-25
-- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000)
-- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000)
+- RFC PR: [vortex-data/rfcs#19](https://github.com/vortex-data/rfcs/pull/19)
 
 ## Summary
 
From 69485dcfc1d187186bb34151bbc20120970d3c82 Mon Sep 17 00:00:00 2001
From: Nicholas Gates
Date: Fri, 27 Feb 2026 22:05:40 -0500
Subject: [PATCH 9/9] Make ExecutionArgs a dyn trait

Signed-off-by: Nicholas Gates
---
 index.ts   |  5 +----
 styles.css | 64 ------------------------------------------------------
 2 files changed, 1 insertion(+), 68 deletions(-)

diff --git a/index.ts b/index.ts
index 980e60a..0b1f5a9 100644
--- a/index.ts
+++ b/index.ts
@@ -86,8 +86,6 @@ function baseHTML(title: string, content: string, cssPath: string = "styles.css"
-
-
     <div class="container">
       <div class="site-header">
@@ -108,8 +106,7 @@ ${content} Vortex RFC Archive
- - ${liveReload ? `\n ` : ""} + ${liveReload ? `\n ` : ""} `; } diff --git a/styles.css b/styles.css index 94495b3..d74bf70 100644 --- a/styles.css +++ b/styles.css @@ -8,16 +8,6 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; - --hl-keyword: #a626a4; - --hl-string: #50a14f; - --hl-comment: #a0a1a7; - --hl-function: #4078f2; - --hl-number: #986801; - --hl-type: #c18401; - --hl-built-in: #e45649; - --hl-attr: #986801; - --hl-symbol: #0184bc; - --hl-meta: #a0a1a7; } @media (prefers-color-scheme: dark) { @@ -31,16 +21,6 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; - --hl-keyword: #c678dd; - --hl-string: #98c379; - --hl-comment: #5c6370; - --hl-function: #61afef; - --hl-number: #d19a66; - --hl-type: #e5c07b; - --hl-built-in: #e06c75; - --hl-attr: #d19a66; - --hl-symbol: #56b6c2; - --hl-meta: #5c6370; } } @@ -54,16 +34,6 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; - --hl-keyword: #c678dd; - --hl-string: #98c379; - --hl-comment: #5c6370; - --hl-function: #61afef; - --hl-number: #d19a66; - --hl-type: #e5c07b; - --hl-built-in: #e06c75; - --hl-attr: #d19a66; - --hl-symbol: #56b6c2; - --hl-meta: #5c6370; } :root[data-theme="light"] { @@ -76,16 +46,6 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; - --hl-keyword: #a626a4; - --hl-string: #50a14f; - --hl-comment: #a0a1a7; - --hl-function: #4078f2; - --hl-number: #986801; - --hl-type: #c18401; - --hl-built-in: #e45649; - --hl-attr: #986801; - --hl-symbol: #0184bc; - --hl-meta: #a0a1a7; } * { @@ -499,27 +459,3 @@ footer { color: var(--fg-muted); font-size: 0.875rem; } - -/* Syntax highlighting (highlight.js token classes) */ -.hljs { background: var(--code-bg); color: var(--fg); } -.hljs-keyword, -.hljs-selector-tag { color: var(--hl-keyword); } -.hljs-string, -.hljs-addition { color: var(--hl-string); } -.hljs-comment, -.hljs-quote { color: var(--hl-comment); font-style: italic; } -.hljs-title, -.hljs-title.function_ { color: var(--hl-function); } -.hljs-number, 
-.hljs-literal { color: var(--hl-number); } -.hljs-type, -.hljs-title.class_ { color: var(--hl-type); } -.hljs-built_in { color: var(--hl-built-in); } -.hljs-attr, -.hljs-variable, -.hljs-template-variable { color: var(--hl-attr); } -.hljs-symbol, -.hljs-link { color: var(--hl-symbol); } -.hljs-meta, -.hljs-deletion { color: var(--hl-meta); } -.hljs-punctuation { color: var(--fg-muted); }