From 47fe4e686212096ec96ef638b0389baece652c16 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 25 Feb 2026 21:29:05 -0500 Subject: [PATCH 1/9] Add an RFC for Execution V2 Signed-off-by: Nicholas Gates --- index.ts | 5 +- proposals/0001-execution-v2.md | 734 +++++++++++++++++++++++++++++++++ styles.css | 64 +++ 3 files changed, 802 insertions(+), 1 deletion(-) create mode 100644 proposals/0001-execution-v2.md diff --git a/index.ts b/index.ts index 0b1f5a9..980e60a 100644 --- a/index.ts +++ b/index.ts @@ -86,6 +86,8 @@ function baseHTML(title: string, content: string, cssPath: string = "styles.css" + +
@@ -106,7 +108,8 @@ ${content} Vortex RFC Archive
- ${liveReload ? `\n ` : ""} + + ${liveReload ? `\n ` : ""} `; } diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md new file mode 100644 index 0000000..170ccb8 --- /dev/null +++ b/proposals/0001-execution-v2.md @@ -0,0 +1,734 @@ +- Start Date: 2026-02-25 +- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000) +- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000) + +## Summary + +Replace the current iterative `execute`/`execute_parent` execution loop with a three-phase +pipeline -- **optimize**, **peel selections**, **canonicalize** -- that reduces the VTable +surface from five methods to three, unifies Filter/Slice/Take into a composable `Selection` +type fused with decompression, and replaces the executor loop with a single recursive descent +through a new `canonicalize_into` VTable method. + +## Motivation + +The current execution model has five VTable extension points: `reduce`, `reduce_parent`, +`execute`, `execute_parent`, and `append_to_builder`. Each encoding can implement all five, and +adaptor types (Filter, Slice, Take, Compare, Cast, FillNull, Between, Mask, Zip, Like, Not, +ListContains) each come in Reduce and Execute variants. This leads to a combinatorial explosion +of implementations -- Dict alone has 7 parent reduce rules and 3 parent execute kernels. + +**The distinction between extension points is unclear.** The stated rule is "reduce = +metadata-only, execute = may read buffers," but many operations blur this boundary. Some +`execute_parent` implementations (Dict + Compare, ALP + Compare, FSST + Compare) are +metadata-only tree rewrites that belong in `reduce_parent`. Others (Primitive + Compare, +Bool + FillNull) are just the scalar function's default implementation on canonical inputs -- +redundant with the function's own `execute` method. 
+ +**Selection operations are the same thing expressed differently.** Filter (boolean mask), Slice +(contiguous range), and Take (integer indices) are all row selection. The per-encoding +selection logic itself is inherent -- each encoding needs to know how to select rows -- but +today it is spread across three separate adaptor traits. Unifying them into a single +`Selection` parameter on `canonicalize_into` eliminates the trait boilerplate and makes it harder +for an encoding to implement Slice but forget Take. + +**The executor loop is hard to reason about.** The iterative reduce → reduce_parent → +execute_parent → execute loop runs up to 128 iterations. Each iteration may rewrite the tree +in surprising ways. Debugging requires tracing through multiple levels of indirection to +understand which vtable method fired and why. + +**Strict metadata-only reduce is required for GPU.** When buffers live on a GPU device, +host-side code *cannot* access them. The boundary between reduce (metadata-only) and execute +(buffer-accessing) is well-defined, but several existing implementations violate it -- some +`execute_parent` implementations that are actually metadata-only tree rewrites are registered +as execute rather than reduce. The new model eliminates execute entirely, making violations +impossible. + +Concrete use cases that are painful today: + +- **DuckDB dictionary export**: The DuckDB exporter wants to export dictionary-encoded columns + as DuckDB dictionary vectors. Today it must drive the executor loop, hoping it converges to a + DictArray, with no guarantee of success. With the new model, it runs optimize → peel → inspect + for DictArray in a predictable sequence. + +- **Selection-fused decompression**: BitPacked, FSST, and RunEnd can decompress only selected + rows, but today each must implement separate Filter, Slice, and Take adaptor traits. 
The
+ selection logic itself is the same, but with a unified `Selection` parameter it is harder
+ for an encoding to forget to implement one of the three selection forms, and the boilerplate
+ of three separate trait impls is eliminated.
+
+- **GPU execution**: The reduce/reduce_parent phase must be strictly metadata-only so it can run
+  on arrays whose buffers live on a GPU. The current model does not enforce this.
+
+## Design
+
+### VTable surface
+
+The new model has three VTable methods with clear, non-overlapping roles:
+
+```rust
+pub trait VTable {
+    /// Rewrite this array to a simpler form without touching data buffers.
+    /// Returns Ok(None) when no rewrite is possible.
+    fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>>;
+
+    /// Rewrite the parent array using knowledge of this child's encoding.
+    /// Used for expression push-down, cast push-down, constant folding, etc.
+    /// Must not access data buffers.
+    fn reduce_parent(
+        array: &Self::Array,
+        parent: &ArrayRef,
+        child_idx: usize,
+    ) -> VortexResult<Option<ArrayRef>>;
+
+    /// Materialize this array into a canonical builder, fusing the given row selection
+    /// with decompression. This is where all O(n) buffer-reading work happens.
+    fn canonicalize_into(
+        array: &Self::Array,
+        builder: &mut CanonicalBuilder,
+        selection: &Selection,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<()>;
+}
+```
+
+| Method             | Called by         | Purpose                                     | Buffer access |
+|--------------------|-------------------|---------------------------------------------|---------------|
+| `reduce`           | `optimize()` loop | Self-rewrite                                | Never         |
+| `reduce_parent`    | `optimize()` loop | Rewrite parent (expression push-down, etc.) | Never         |
+| `canonicalize_into`| `canonicalize()`  | Decompress with fused selection             | Yes           |
+
+`reduce` and `reduce_parent` are unchanged from the current model. `canonicalize_into` replaces
+`execute`, `execute_parent`, and `append_to_builder`. 
The top-level `canonicalize()` free
+function runs the full three-phase pipeline (optimize → peel → `canonicalize_into`).
+
+### Selection
+
+```rust
+pub enum Selection {
+    /// All rows are selected.
+    All,
+    /// A single row by index (replaces scalar_at).
+    Index(usize),
+    /// A contiguous range of rows.
+    Range(Range<usize>),
+    /// A boolean mask (true = selected).
+    Mask(Mask),
+    /// Arbitrary indices, possibly repeated, possibly nullable.
+    /// A null index produces a null row in the output.
+    Indices(PrimitiveArray),
+}
+
+impl Selection {
+    /// Compose two selections. self is applied first, then further restricted by other.
+    fn compose(&self, other: &Selection) -> Selection;
+    /// Number of selected rows.
+    fn count(&self) -> usize;
+
+    /// Apply selection in-place, compacting the selected elements to the front.
+    fn apply_in_place<T>(&self, data: &mut [T]);
+    /// Apply selection to a buffer, returning a new buffer with only selected elements.
+    /// Takes &Buffer<T> (not &[T]) so the output alignment is known.
+    fn apply<T>(&self, data: &Buffer<T>) -> BufferMut<T>;
+}
+```
+
+`Index` and `Range` avoid allocating a `Mask` for trivial cases (single element lookup,
+contiguous slice).
+
+The first four variants (`All`, `Index`, `Range`, `Mask`) are pure row subsetting -- the
+output is always a subset of the input, in input order, with no repetition. `Indices` is
+semantically different: it is a **gather** that can repeat positions, reorder rows, and insert
+nulls via null indices. This is not an ideal unification -- gather has different semantics from
+selection (output can be larger than input, row order is caller-defined, nullability is
+introduced). However, the alternative is a separate gather API that every encoding must
+implement alongside `canonicalize_into`, which recreates the combinatorial problem this RFC
+aims to eliminate. 
DictArray is used both as genuine dictionary encoding (small values, many
+codes) and as a general-purpose take over large arrays (e.g., defining a take mask over a full
+Vortex file, with null indices to insert null rows). The latter case requires `Indices` --
+materializing all values with `Selection::All` would be catastrophic when values is an entire
+file column.
+
+Encodings can handle `Indices` as a fallback: canonicalize with `Selection::All`, then gather
+from the canonical result. Performance-critical encodings (BitPacked, FSST) can optimize the
+gather path directly, just as they implement `TakeExecute` today.
+
+The `Index` variant subsumes `scalar_at` -- getting a single element is just materializing with
+a one-row selection:
+
+```rust
+pub fn scalar_at(array: &dyn Array, idx: usize, ctx: &mut ExecutionCtx) -> VortexResult<Scalar> {
+    let mut builder = CanonicalBuilder::new(array.dtype());
+    array.canonicalize_into(&mut builder, &Selection::Index(idx), ctx)?;
+    builder.finish().into_array().scalar_at(0) // trivial on canonical
+}
+```
+
+### CanonicalBuilder
+
+`CanonicalBuilder` replaces the current `dyn ArrayBuilder` with a closed enum. The current
+`ArrayBuilder` is a trait object -- callers get a `Box<dyn ArrayBuilder>` and can only interact
+with it through generic methods like `append_scalar()` and `extend_from_array()`. Encodings
+cannot know what concrete builder they are writing into, so they must produce intermediate
+arrays or scalars and let the builder copy the data in.
+
+`CanonicalBuilder` is an enum that mirrors the `Canonical` variants in mutable builder form:
+
+```rust
+pub enum CanonicalBuilder {
+    Null(NullBuilder),
+    Bool(BoolBuilder),
+    Primitive(PrimitiveBuilder),
+    Decimal(DecimalBuilder),
+    VarBinView(VarBinViewBuilder),
+    List(ListViewBuilder),
+    FixedSizeList(FixedSizeListBuilder),
+    Struct(StructBuilder),
+    Extension(ExtensionBuilder),
+}
+
+impl CanonicalBuilder {
+    pub fn new(dtype: &DType) -> Self { ... 
}
+    pub fn finish(self) -> Canonical { ... }
+}
+```
+
+Because it is an enum (not `dyn`), encodings can match on it to access the concrete builder
+type directly. This enables zero-copy decompression paths that are impossible with `dyn
+ArrayBuilder`. For example, FSST can match for the `VarBinView` variant and push decompressed
+buffers and views directly into the builder's internal storage, avoiding an intermediate string
+allocation:
+
+```rust
+fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, selection, ctx) {
+    let CanonicalBuilder::VarBinView(vbv_builder) = builder else {
+        unreachable!("FSST must canonicalize into VarBinView");
+    };
+    let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?;
+    vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...);
+}
+```
+
+`CanonicalBuilder` is only canonical at the root level. For Struct, field arrays are
+accumulated as `Vec<ArrayRef>` (potentially still compressed/chunked) rather than being
+recursively decompressed into nested builders. This generalizes the `pack_struct_chunks`
+pattern that already exists in the chunked canonicalization path.
+
+### The three-phase pipeline
+
+Canonicalization is a three-phase pipeline with no executor loop:
+
+```rust
+pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Canonical> {
+    // Phase 1: optimize tree (reduce rules, expression push-down)
+    let array = array.optimize_recursive()?;
+
+    // Phase 2: peel selection wrappers
+    let (array, selection) = array.peel_selection();
+
+    // Phase 3: canonicalize_into (single recursive descent, no loop)
+    let mut builder = CanonicalBuilder::new(array.dtype());
+    array.canonicalize_into(&mut builder, &selection, ctx)?;
+    Ok(builder.finish())
+}
+```
+
+**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. 
This handles all +metadata-only rewrites: expression push-down into Dict values, RunEnd values; constant folding; +comparison target transformation for ALP, FoR, FSST; structural simplification. No data buffers +are read. + +**Phase 2** calls `array.peel_selection()` to strip Filter, Slice, and Take wrappers from the +root, composing them into a single `Selection`. The remaining array tree has no selection +wrappers at its root. + +**Phase 3** is a single recursive descent through `canonicalize_into`. Each encoding decompresses +with the fused selection. There is no iteration -- each encoding knows how to decompress in one +call. Selection wrappers encountered deeper in the tree compose themselves into the selection +during the recursive descent. + +### Selection composition during materialization + +Selection wrappers (Slice, Filter) still exist as array types for lazy representation (e.g., a +file reader produces sliced arrays). During materialization, their `canonicalize_into` composes the +wrapper into the selection and delegates to the child: + +```rust +// SliceArray's canonicalize_into +fn canonicalize_into(slice: &SliceArray, builder, selection, ctx) { + let composed = selection.compose(Selection::Range(slice.range())); + slice.child().canonicalize_into(builder, &composed, ctx) +} + +// FilterArray's canonicalize_into +fn canonicalize_into(filter: &FilterArray, builder, selection, ctx) { + let composed = selection.compose(Selection::Mask(filter.mask())); + filter.child().canonicalize_into(builder, &composed, ctx) +} +``` + +Phase 2 (`peel_selection`) is only needed when the caller wants to inspect the array before +materializing (e.g., the DuckDB exporter checking for DictArray). During normal +materialization, selection composition happens naturally during the recursive descent. 
+
+### ScalarFnArray materialization
+
+ScalarFnArray's `canonicalize_into` materializes its children, then evaluates the function on
+the results:
+
+```rust
+fn canonicalize_into(
+    array: &ScalarFnArray,
+    builder: &mut CanonicalBuilder,
+    selection: &Selection,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let children: Vec<Columnar> = array.children().iter()
+        .map(|child| {
+            // Constants pass through without expansion
+            if let Some(c) = child.as_opt::<ConstantArray>() {
+                return Ok(Columnar::Constant(c.clone()));
+            }
+            let mut child_builder = CanonicalBuilder::new(child.dtype());
+            child.canonicalize_into(&mut child_builder, selection, ctx)?;
+            Ok(Columnar::Canonical(child_builder.finish()))
+        })
+        .try_collect()?;
+
+    let result = array.scalar_fn().execute(children, ctx)?;
+    builder.extend_from_array(&result);
+    Ok(())
+}
+```
+
+By the time `canonicalize_into` runs, `reduce_parent` has already pushed operations into compressed
+domains. Dict's `reduce_parent` rewrites `compare(dict(codes, values), x)` into
+`dict(codes, compare(values, x))`, so the comparison only runs on unique values, not every row.
+
+### Constants
+
+Constants are inherently special. "This value repeated N times without N copies" requires
+*someone* to know about broadcasting, and no representation trick eliminates that.
+
+`ConstantArray` continues to exist as an encoding. Its `canonicalize_into` expands the value
+into the builder, just like any other encoding decompresses. The only place constants get
+special treatment is ScalarFnArray's `canonicalize_into` -- one check to avoid expanding
+constants before
+passing to the scalar function. This preserves the `Columnar` enum:
+
+```rust
+pub enum Columnar {
+    Canonical(Canonical),
+    Constant(ConstantArray),
+}
+```
+
+`ScalarFn::execute` receives `Vec<Columnar>`. Most scalar function implementations handle
+constant inputs efficiently -- e.g., `add(primitive, constant(1))` broadcasts the scalar
+without allocating a repeated buffer. 
Reduce rules interact with constants via `as_opt::<ConstantArray>()` to read the scalar value
+(metadata access, not buffer access). Constant folding, constant propagation (`slice(constant)
+→ constant`, `filter(constant, mask) → constant`), and related rules are all metadata-only.
+
+### Per-encoding canonicalize_into
+
+Each encoding implements a single decompression method that handles selection:
+
+- **BitPacked**: unpack only selected positions into PrimitiveBuilder
+- **FSST**: decompress only selected strings into VarBinViewBuilder
+- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those
+- **Dict**: materialize codes with selection, take from values
+- **Chunked**: split selection across chunks, recurse per-chunk
+- **Struct**: pass selection to each field's materialization
+- **Constant**: create scalar repeated to selection count
+
+For fused decompression patterns (e.g., Dict-RLE), the parent encoding checks inline:
+
+```rust
+// Dict's canonicalize_into
+fn canonicalize_into(dict: &DictArray, builder, selection, ctx) {
+    // Check for fused Dict-RLE pattern
+    if let Some(runend) = dict.codes().as_opt::<RunEndArray>() {
+        return fused_dict_rle(dict.values(), runend, builder, selection, ctx);
+    }
+
+    // Generic path: materialize codes with selection, take from values
+    let mut code_builder = primitive_builder(dict.codes().dtype());
+    dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?;
+    let codes = code_builder.finish_into_primitive();
+    take_into_builder(builder, dict.values(), &codes, ctx)
+}
+```
+
+### Decompression cache and CSE
+
+`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped
+to a single `canonicalize()` call. This serves two purposes:
+
+- **Common sub-expression elimination**: when the same array node appears as a child of
+  multiple ScalarFnArray nodes (e.g., `x + x`), it is materialized once. 
+- **Shared block decompression**: when a ZStd-compressed array is sliced into many chunks, all
+  slices share the same underlying compressed block. The first slice decompresses the block and
+  caches the result; subsequent slices index into it.
+
+The cache key is the raw pointer to the array, which is stable because arrays are `Arc`-based.
+The cache is dropped after `canonicalize()` returns, so it does not leak memory across calls.
+
+### Worked example: DuckDB dictionary export
+
+A Vortex file scan produces a chunk with a dictionary column. The query has a filter predicate
+(`name = 'alice'`) evaluated to a mask, and a projection (`upper(name)`). The array tree:
+
+```
+filter(
+    scalar_fn(upper, [
+        dict(
+            codes: bitpacked([0,1,0,2,1,0,...]),
+            values: fsst(["alice","bob","charlie"])
+        )
+    ]),
+    mask
+)
+```
+
+**Phase 1: optimize.** `reduce_parent` pushes `upper` into dict values:
+
+```
+filter(
+    dict(
+        codes: bitpacked([0,1,0,2,1,0,...]),
+        values: scalar_fn(upper, [fsst(["alice","bob","charlie"])])
+    ),
+    mask
+)
+```
+
+**Phase 2: peel selections.** Filter is at the root, so it peels:
+
+```
+array = dict(codes: bitpacked(...), values: scalar_fn(upper, [fsst(...)]))
+selection = Selection::Mask(mask)
+```
+
+**Phase 3: export.** The DuckDB exporter inspects the tree:
+
+```rust
+fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<DuckDBVector> {
+    let array = array.optimize_recursive()?;
+    let (array, selection) = array.peel_selection();
+
+    if let Some(dict) = array.as_opt::<DictArray>() {
+        return export_dict(dict, &selection, ctx);
+    }
+
+    // Fallback: full materialization
+    let mut builder = CanonicalBuilder::new(array.dtype());
+    array.canonicalize_into(&mut builder, &selection, ctx)?;
+    export_canonical(builder.finish())
+}
+
+fn export_dict(
+    dict: &DictArray,
+    selection: &Selection,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<DuckDBVector> {
+    // Materialize ALL unique values (no selection -- values are shared)
+    let mut val_builder = 
CanonicalBuilder::new(dict.values().dtype()); + dict.values().canonicalize_into(&mut val_builder, &Selection::All, ctx)?; + let values = val_builder.finish(); + + // Materialize codes WITH selection (only rows that passed the filter) + let mut code_builder = primitive_builder(dict.codes().dtype()); + dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?; + let codes = code_builder.finish_into_primitive(); + + Ok(DuckDBVector::dictionary(values, codes)) +} +``` + +**Values path**: `scalar_fn(upper, [fsst(["alice","bob","charlie"])])` materializes via +ScalarFnArray's `canonicalize_into`. FSST bulk-decompresses 3 strings, `upper` runs on canonical +input, producing `["ALICE","BOB","CHARLIE"]`. + +**Codes path**: `bitpacked([0,1,0,2,1,0,...])` materializes with `Selection::Mask(mask)`. +BitPacked unpacks only the positions where the mask is true. + +**Result**: DuckDB gets a dictionary vector with 3 uppercase values and only the selected +codes. `upper()` was computed on 3 unique values instead of N rows. The filter was fused with +bit-unpacking. No intermediate arrays were materialized. + +### What changes + +**Removed:** + +- `execute` VTable method -- replaced by `canonicalize_into(builder, selection)`. +- `execute_parent` VTable method -- selection-related uses replaced by `Selection`; + compressed-domain computation uses replaced by `reduce_parent` rules. +- The executor loop -- no more iterative execute-until-canonical. Optimization is a reduce-only + loop; materialization is a single recursive descent. +- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, + `SliceReduceAdaptor`, `SliceExecuteAdaptor`, `TakeExecuteAdaptor`, and all their per-encoding + implementations (~40+ impls total). +- Canonical-type `execute_parent` kernels -- Primitive + Compare, Bool + FillNull, Decimal + + Between, etc. These move into the scalar function's `execute` method where they belong. 
+
+**Retained:**
+
+- `reduce` / `reduce_parent` -- unchanged. Still drive expression push-down, cast push-down,
+  constant folding, structural simplification. Strictly metadata-only, no buffer access.
+- `ScalarFnArray` -- kept as per-node lazy computation. Composes with `reduce_parent` for
+  expression push-down into compressed encodings.
+- Filter/Slice/Take array wrappers -- still exist for lazy representation. Their
+  `canonicalize_into` composes them into the Selection and delegates to the child. These could be
+  unified into a single `SelectionArray` wrapping a `Selection` in the future.
+
+**New:**
+
+- `Selection` type -- unifies Filter, Slice, and Take into a single row-selection
+  representation that composes and threads through materialization.
+- `CanonicalBuilder` -- one-level-deep builder enum. Struct children accumulate as
+  `Vec<ArrayRef>` rather than being decompressed into child builders.
+- `Array::peel_selection()` -- method to strip selection wrappers from the root, used by
+  exporters that want to inspect the tree before materializing. 
+ +**Summary comparison:** + +| Aspect | Current | Proposed | +|--------------------------------|---------------------------------------------------------------------------------|---------------------------------------------------| +| VTable methods | 5 (`reduce`, `reduce_parent`, `execute`, `execute_parent`, `append_to_builder`) | 3 (`reduce`, `reduce_parent`, `canonicalize_into`) | +| Adaptor types per operation | 2 (Reduce + Execute) | 1 (Reduce only, for expression push-down ops) | +| Selection handling | Per-encoding Filter, Slice, Take adaptors | Single `Selection` parameter | +| Materialization | Iterative loop calling execute until convergence | Single recursive descent through `canonicalize_into` | +| Executor loop | reduce → reduce_parent → execute_parent → execute, repeated | optimize (reduce loop) → peel selections → build | +| Builder model | Fully recursive (forces deep canonicalization) | One-level canonical (children stay compressed) | +| Fused decompression (Dict-RLE) | Child volunteers via execute_parent | Parent checks for pattern in its own `canonicalize_into` | +| Buffer access in reduce | Ambiguous | Never (strict, works on any device including GPU) | + +### Migration path + +The migration is designed so that every step leaves the codebase in a working state. The old +and new execution paths coexist until the new path covers all encodings, at which point the +old path is deleted. Each phase is independently shippable. + +**Phase 1: Foundation types.** Introduce `Selection` (with `compose`, `count`, `apply`, +`apply_in_place`), `CanonicalBuilder` enum, and `Array::peel_selection()`. All additive, no +behavior changes. 
+
+**Phase 2: Add `canonicalize_into` with fallback default.** Add `canonicalize_into` to `VTable` with a
+default implementation that bridges to the old execute path:
+
+```rust
+fn canonicalize_into(
+    array: &Self::Array,
+    builder: &mut CanonicalBuilder,
+    selection: &Selection,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Fallback: use existing execute-to-canonical, then apply selection
+    let canonical = Canonical::execute(array.as_array_ref().clone(), ctx)?;
+    let selected = selection.apply_to_array(canonical.into_array())?;
+    builder.extend_from_array(&selected)?;
+    Ok(())
+}
+```
+
+Wire up `canonicalize()` using the three-phase pipeline. Both old (`execute::<Canonical>`) and
+new (`canonicalize()`) paths work. Integration tests assert equivalence.
+
+**Phase 3: Migrate canonical encodings.** One encoding per PR: Null, Bool, Primitive, Decimal,
+VarBinView, Struct, List/FixedSizeList, Extension, Constant. Each writes directly into the
+builder with fused selection. Independent and parallelizable.
+
+**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into
+`Selection` instead of executing independently.
+
+**Phase 5: Migrate compressed encodings.** Selection-fused decompression for BitPacked,
+RunEnd, Dict, FSST, ALP, FoR, ZStd, Sparse, and others. Start with highest-value encodings.
+Each encoding is an independent PR.
+
+**Phase 6: Move `execute_parent` → `reduce_parent`.** Compressed-domain rewrites (Dict +
+Compare, ALP + Compare, FoR + Compare, FSST + Compare) become `reduce_parent` rules.
+Canonical-type kernels (Primitive + Compare, Bool + FillNull) move into `ScalarFn::execute`.
+Once empty, remove `execute_parent` from VTable.
+
+**Phase 7: Delete old execution path.** Delete all selection adaptor traits and their ~40+
+per-encoding implementations. Delete `execute` VTable method, the executor loop, and the
+`Executable` trait. 
All callers of `execute::<Canonical>()` migrate to `canonicalize()` (the
+free function that runs the full three-phase pipeline).
+
+**Phase 8: Replace `scalar_at`.** `scalar_at` becomes a free function using
+`Selection::Index`. Remove `OperationsVTable::scalar_at` and per-encoding implementations.
+
+**Phase 9: Wire up exporters.** DuckDB exporter uses optimize → peel → inspect → export.
+Add decompression cache to `ExecutionCtx`.
+
+**Ordering and parallelism:**
+
+```
+Phase 1: Foundation types
+        │
+Phase 2: canonicalize_into with fallback
+        │
+        ├── Phase 3: Canonical encodings (independent per encoding)
+        ├── Phase 4: Selection wrappers
+        ├── Phase 5: Compressed encodings (independent per encoding)
+        └── Phase 6: execute_parent → reduce_parent (independent per rule)
+                │
+          Phase 7: Delete old path (after 3-6 complete)
+                │
+                ├── Phase 8: scalar_at → Selection::Index
+                └── Phase 9: Exporter updates
+```
+
+Phases 3-6 can proceed in parallel across contributors. The fallback default in Phase 2 ensures
+nothing breaks while encodings are migrated at their own pace. Phase 7 is the cleanup gate.
+
+## Compatibility
+
+This RFC does not change the file format or wire format. All changes are internal to the
+execution engine.
+
+**Public API breakage:**
+
+- `Executable` trait and `execute::<Canonical>()` are removed. Callers migrate to `canonicalize()` or
+  call `canonicalize_into` directly.
+- `OperationsVTable::scalar_at` is removed. Callers use the free function `scalar_at()` which
+  delegates to `canonicalize_into` with `Selection::Index`.
+- Third-party encodings that implement `execute`, `execute_parent`, or selection adaptors must
+  migrate to `canonicalize_into`. The fallback default (Phase 2) provides a bridge during
+  migration.
+
+**Performance implications:**
+
+- Selection-fused decompression should improve performance for filtered/sliced reads by
+  avoiding intermediate materialization. 
+- Single-element access via `Selection::Index` may be marginally slower than today's + `scalar_at` for encodings that currently fast-path scalar access. These encodings can + fast-path `Selection::Index` in their `canonicalize_into` if needed. +- The decompression cache avoids redundant work for shared sub-expressions and ZStd blocks. + +## Drawbacks + +- **Migration effort.** The migration touches every encoding in the codebase (~33 total). The + phased approach with a fallback default mitigates this, but it is still significant work + spread across many PRs. + +- **Fused pattern detection moves to the parent.** In the current model, a child encoding can + volunteer to handle its parent's execution via `execute_parent` (e.g., RunEnd volunteers for + Dict-RLE). In the new model, the parent must check for the pattern in its own + `canonicalize_into`. This means the parent must know about the child encoding, which is a + slight inversion of responsibility. In practice, the known fused patterns (Dict-RLE) involve + encodings that are both Arrow-supported and live in the builtin `vortex-array` crate, so the + cross-encoding dependency is not a crate boundary issue. + +- **Selection composition complexity.** `Selection::compose` must handle all variant + combinations correctly (Mask + Range, Indices + Mask, etc.). This is concentrated complexity + in one well-tested type, but getting it wrong would cause subtle data corruption. + +- **`Indices` conflates selection with gather.** The first four `Selection` variants are pure + row subsetting (output <= input, preserves order, no nulls introduced). `Indices` is a + gather: it can repeat rows, reorder them, and introduce nulls. Bundling both concepts into + one enum is a pragmatic compromise -- the alternative (a separate gather API every encoding + must implement) recreates the combinatorial explosion this RFC aims to eliminate -- but it + means `Selection` is not a clean abstraction. 
+ +## Alternatives + +### Keep the iterative execution model + +We could keep the current `execute`/`execute_parent` loop and address individual pain points +(e.g., adding selection fusion as a new adaptor). This avoids the migration cost but perpetuates +the unclear distinction between reduce and execute, the combinatorial adaptor explosion, and the +difficulty of reasoning about the executor loop. The VTable surface stays at 5 methods. + +### Use a single ExpressionArray instead of per-node ScalarFnArray + +Instead of one ScalarFnArray per operation, a single ExpressionArray could hold an entire +expression tree. This would be simpler in some ways but prevents `reduce_parent` from +interacting with individual expression nodes. The per-node approach is what makes Dict's +`reduce_parent` able to push `compare` into dict values -- the key optimization for +dictionary-encoded predicates. + +### Use `dyn ArrayBuilder` instead of `CanonicalBuilder` enum + +The builder could be a trait object (`&mut dyn ArrayBuilder`) instead of an enum. This is more +extensible but prevents encodings from matching on the concrete builder type. FSST's ability to +push directly into `VarBinViewBuilder` (avoiding an intermediate string allocation) depends on +knowing the builder variant at compile time. + +### Separate `scalar_at` from `Selection` + +We could keep `scalar_at` as a separate VTable method for single-element access, rather than +subsuming it into `Selection::Index`. This avoids any performance regression for scalar access +but adds a fourth VTable method and requires every encoding to implement it separately. + +## Prior Art + +- **DataFusion** has `PhysicalExpr::evaluate_selection`, which takes an explicit selection + (equivalent to our `Selection`) and threads it through expression evaluation. This is the + closest analogue to threading `Selection` through `canonicalize_into`. 
+ +## Unresolved Questions + +- **Selection composition correctness**: Composing Mask + Range + Indices needs careful + implementation and thorough testing, especially for `Indices` which introduces repetition, + reordering, and nullability. This is a well-tested utility type vs N x 3 adaptor + implementations, so the tradeoff is favorable, but it needs to be right. + +- **CanonicalBuilder design**: The exact interface for one-level-deep building, especially for + List and Extension types, needs design work. + +- **Aggregate and window functions**: This RFC focuses on scalar functions. AggregateFnArray + and WindowFnArray will need their own materialization paths, which are explicitly out of scope + for this RFC. + +- **`Indices` semantics mismatch**: `Indices` is a gather (repetition, reordering, nullable) + while the other `Selection` variants are pure subsetting. This means `compose`, `count`, and + `apply` must handle two fundamentally different operations. Whether this tension causes + practical problems (e.g., confusing behavior in compose chains) needs to be evaluated during + implementation. + +## Future Possibilities + +### Selection pull-up + +In the new model, selection wrappers compose naturally into the `Selection` parameter during +the recursive descent. However, when a selection is nested inside an expression, it cannot be +peeled: + +``` +scalar_fn(upper, [filter(dict(...), mask)]) +``` + +`peel_selection()` sees ScalarFnArray at the root, not FilterArray. If `reduce_parent` has no +rule for this function + encoding combination, the tree stays as-is and the filter gets +composed in during descent. `upper` runs on N selected rows, producing a flat canonical result. +The dictionary structure is lost. + +A future optimization could **pull selections out of expressions** to the root: + +``` +filter(scalar_fn(upper, [dict(...)]), mask) +``` + +With the selection at the root, `peel_selection()` extracts it, and subsequent optimizations +can proceed. 
Better decisions can be made when the selection is available at the top and
+pushed down at execution time, rather than being trapped inside an expression.
+
+In practice, most scans apply filters at the top level of the array tree, so this edge case is
+uncommon. But for complex expression trees with embedded selections, pull-up would enable
+strictly better execution plans.
+
+### Aggregate and window functions
+
+The execution model is designed to support additional function types beyond scalar functions.
+`AggregateFnArray` (for sum, min, max, count) and `WindowFnArray` will follow a similar
+deferred pattern, using the same `reduce`/`reduce_parent` optimization strategy and
+`Selection`-based materialization.
+
+### GPU kernel fusion
+
+The strict metadata-only constraint on `reduce`/`reduce_parent` means the optimized tree can
+be shipped to a GPU context. A GPU-aware `canonicalize_into` could fuse the entire optimized tree
+into a single kernel launch, reducing memory traffic and kernel overhead.
diff --git a/styles.css b/styles.css index d74bf70..94495b3 100644 --- a/styles.css +++ b/styles.css @@ -8,6 +8,16 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; + --hl-keyword: #a626a4; + --hl-string: #50a14f; + --hl-comment: #a0a1a7; + --hl-function: #4078f2; + --hl-number: #986801; + --hl-type: #c18401; + --hl-built-in: #e45649; + --hl-attr: #986801; + --hl-symbol: #0184bc; + --hl-meta: #a0a1a7; } @media (prefers-color-scheme: dark) { @@ -21,6 +31,16 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; + --hl-keyword: #c678dd; + --hl-string: #98c379; + --hl-comment: #5c6370; + --hl-function: #61afef; + --hl-number: #d19a66; + --hl-type: #e5c07b; + --hl-built-in: #e06c75; + --hl-attr: #d19a66; + --hl-symbol: #56b6c2; + --hl-meta: #5c6370; } } @@ -34,6 +54,16 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; + --hl-keyword: #c678dd; + --hl-string: #98c379; + --hl-comment: #5c6370; + --hl-function: #61afef; + --hl-number: #d19a66; + --hl-type: #e5c07b; + --hl-built-in: #e06c75; + --hl-attr: #d19a66; + --hl-symbol: #56b6c2; + --hl-meta: #5c6370; } :root[data-theme="light"] { @@ -46,6 +76,16 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; + --hl-keyword: #a626a4; + --hl-string: #50a14f; + --hl-comment: #a0a1a7; + --hl-function: #4078f2; + --hl-number: #986801; + --hl-type: #c18401; + --hl-built-in: #e45649; + --hl-attr: #986801; + --hl-symbol: #0184bc; + --hl-meta: #a0a1a7; } * { @@ -459,3 +499,27 @@ footer { color: var(--fg-muted); font-size: 0.875rem; } + +/* Syntax highlighting (highlight.js token classes) */ +.hljs { background: var(--code-bg); color: var(--fg); } +.hljs-keyword, +.hljs-selector-tag { color: var(--hl-keyword); } +.hljs-string, +.hljs-addition { color: var(--hl-string); } +.hljs-comment, +.hljs-quote { color: var(--hl-comment); font-style: italic; } +.hljs-title, +.hljs-title.function_ { color: var(--hl-function); } +.hljs-number, +.hljs-literal { color: var(--hl-number); } +.hljs-type, 
+.hljs-title.class_ { color: var(--hl-type); } +.hljs-built_in { color: var(--hl-built-in); } +.hljs-attr, +.hljs-variable, +.hljs-template-variable { color: var(--hl-attr); } +.hljs-symbol, +.hljs-link { color: var(--hl-symbol); } +.hljs-meta, +.hljs-deletion { color: var(--hl-meta); } +.hljs-punctuation { color: var(--fg-muted); } From d0d3f69c9ba737f627303f0c178af5b39c0fbb33 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 26 Feb 2026 21:31:15 -0500 Subject: [PATCH 2/9] RFCs Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 1244 ++++++++++------- ...02-aggregate-functions-and-list-scalars.md | 858 ++++++++++++ 2 files changed, 1576 insertions(+), 526 deletions(-) create mode 100644 proposals/0002-aggregate-functions-and-list-scalars.md diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 170ccb8..7ad5811 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,192 +4,113 @@ ## Summary -Replace the current iterative `execute`/`execute_parent` execution loop with a three-phase -pipeline -- **optimize**, **peel selections**, **canonicalize** -- that reduces the VTable -surface from five methods to three, unifies Filter/Slice/Take into a composable `Selection` -type fused with decompression, and replaces the executor loop with a single recursive descent -through a new `canonicalize_into` VTable method. +Replace the current execution model with a cleaner architecture that: +- Enables decompress-into-buffer (caller-owned output) for zero-copy decompression +- Passes a `Mask` to the execution method, forcing encodings to handle row selection +- Cleans up the reduce/execute boundary and removes migration-era adaptor boilerplate -## Motivation - -The current execution model has five VTable extension points: `reduce`, `reduce_parent`, -`execute`, `execute_parent`, and `append_to_builder`. 
Each encoding can implement all five, and -adaptor types (Filter, Slice, Take, Compare, Cast, FillNull, Between, Mask, Zip, Like, Not, -ListContains) each come in Reduce and Execute variants. This leads to a combinatorial explosion -of implementations -- Dict alone has 7 parent reduce rules and 3 parent execute kernels. - -**The distinction between extension points is unclear.** The stated rule is "reduce = -metadata-only, execute = may read buffers," but many operations blur this boundary. Some -`execute_parent` implementations (Dict + Compare, ALP + Compare, FSST + Compare) are -metadata-only tree rewrites that belong in `reduce_parent`. Others (Primitive + Compare, -Bool + FillNull) are just the scalar function's default implementation on canonical inputs -- -redundant with the function's own `execute` method. - -**Selection operations are the same thing expressed differently.** Filter (boolean mask), Slice -(contiguous range), and Take (integer indices) are all row selection. The per-encoding -selection logic itself is inherent -- each encoding needs to know how to select rows -- but -today it is spread across three separate adaptor traits. Unifying them into a single -`Selection` parameter on `canonicalize_into` eliminates the trait boilerplate and makes it harder -for an encoding to implement Slice but forget Take. - -**The executor loop is hard to reason about.** The iterative reduce → reduce_parent → -execute_parent → execute loop runs up to 128 iterations. Each iteration may rewrite the tree -in surprising ways. Debugging requires tracing through multiple levels of indirection to -understand which vtable method fired and why. - -**Strict metadata-only reduce is required for GPU.** When buffers live on a GPU device, -host-side code *cannot* access them. 
The boundary between reduce (metadata-only) and execute -(buffer-accessing) is well-defined, but several existing implementations violate it -- some -`execute_parent` implementations that are actually metadata-only tree rewrites are registered -as execute rather than reduce. The new model eliminates execute entirely, making violations -impossible. +Two candidate designs are presented: **Option A** (three-phase recursive) and **Option B** +(cleaned-up iterative). Both share the `Mask`-based row selection and the reduce boundary cleanup. They +differ in how materialization is structured -- Option A passes a caller-owned builder through +recursive descent, Option B keeps an iterative loop with typed execution steps. -Concrete use cases that are painful today: +## Motivation -- **DuckDB dictionary export**: The DuckDB exporter wants to export dictionary-encoded columns - as DuckDB dictionary vectors. Today it must drive the executor loop, hoping it converges to a - DictArray, with no guarantee of success. With the new model, it runs optimize → peel → inspect - for DictArray in a predictable sequence. +**No decompress-into-buffer.** The current execution model has no way to decompress an array +into a caller-provided output buffer. Each encoding allocates its own output during execution. +This means ChunkedArray must decompress each chunk separately and concatenate the results -- +an extra copy of the entire column. Exporters (DuckDB, Arrow) cannot write directly into their +output format; they must receive Vortex's output and copy it. This is the primary pain point. -- **Selection-fused decompression**: BitPacked, FSST, and RunEnd can decompress only selected - rows, but today each must implement separate Filter, Slice, and Take adaptor traits. 
The - selection logic itself is the same, but with a unified `Selection` parameter it is harder - for an encoding to forget to implement one of the three selection forms, and the boilerplate - of three separate trait impls is eliminated. +**Stack overflows from recursive execution.** The current executor is recursive: each +`execute` call may recurse into children, which recurse into their children. On deep array +trees (deeply nested expressions, many-chunk arrays) this overflows the stack. The recursion +depth is bounded by a 128-iteration limit, but this limit applies to the outer loop -- inner +recursive calls have no bound. -- **GPU execution**: The reduce/reduce_parent phase must be strictly metadata-only so it can run - on arrays whose buffers live on a GPU. The current model does not enforce this. +**Easy to forget selection implementations.** Filter (boolean mask), Slice (contiguous range), +and Take (integer indices) are separate adaptor traits. Each encoding can implement one and +forget the others. Fusing selection with decompression is really just +`execute_parent(FilterArray)` -- which is fine -- but the current design makes it easy to miss. +Passing a `Mask` parameter to the execution method forces every encoding to handle +selection, eliminating the gap. -## Design +**Adaptor explosion is migration noise.** The adaptor types (FilterReduceAdaptor, +SliceExecuteAdaptor, TakeExecuteAdaptor, etc.) are a bridge from the old compute kernel +dispatch to the new VTable world. The reduce/execute distinction is already enforced -- we just +haven't deprecated the APIs that don't require an `ExecutionCtx`. The ~40+ adaptor impls are +migration artifacts, not a fundamental design problem, but they add noise and make the codebase +harder to navigate. -### VTable surface +**Strict metadata-only reduce is useful for GPU.** When buffers live on a GPU device, +host-side code cannot easily access them. 
A clean metadata-only reduce boundary is useful +(though not strictly required) for GPU execution paths. -The new model has three VTable methods with clear, non-overlapping roles: +## Shared design -```rust -pub trait VTable { - /// Rewrite this array to a simpler form without touching data buffers. - /// Returns Ok(None) when no rewrite is possible. - fn reduce(array: &Self::Array) -> VortexResult>; - - /// Rewrite the parent array using knowledge of this child's encoding. - /// Used for expression push-down, cast push-down, constant folding, etc. - /// Must not access data buffers. - fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, - ) -> VortexResult>; - - /// Materialize this array into a canonical builder, fusing the given row selection - /// with decompression. This is where all O(n) buffer-reading work happens. - fn canonicalize_into( - array: &Self::Array, - builder: &mut CanonicalBuilder, - selection: &Selection, - ctx: &mut ExecutionCtx, - ) -> VortexResult<()>; -} -``` +Both options share these design elements. -| Method | Called by | Purpose | Buffer access | -|--------------------|-------------------|---------------------------------------------|---------------| -| `reduce` | `optimize()` loop | Self-rewrite | Never | -| `reduce_parent` | `optimize()` loop | Rewrite parent (expression push-down, etc.) | Never | -| `canonicalize_into`| `canonicalize()` | Decompress with fused selection | Yes | +### Mask as row selection -`reduce` and `reduce_parent` are unchanged from the current model. `canonicalize_into` replaces -`execute`, `execute_parent`, and `append_to_builder`. The top-level `canonicalize()` free -function runs the full three-phase pipeline (optimize → peel → `canonicalize_into`). +The existing `Mask` type (in `vortex-mask`) already represents row selection with optimized +internal variants: `AllTrue(usize)`, `AllFalse(usize)`, and `Values(Arc)` for a +full bitmap. 
It already supports `slice()` for restricting to a range. No new type is needed -- +`Mask` is passed directly to the execution method as the row selection parameter. -### Selection +Composition is mask-over-mask: slicing a mask produces a narrower mask. The main addition is a +`compose` method (or equivalent) for restricting one mask by another, and `apply` for +compacting buffer elements. ```rust -pub enum Selection { - /// All rows are selected. - All, - /// A single row by index (replaces scalar_at). - Index(usize), - /// A contiguous range of rows. - Range(Range), - /// A boolean mask (true = selected). - Mask(Mask), - /// Arbitrary indices, possibly repeated, possibly nullable. - /// A null index produces a null row in the output. - Indices(PrimitiveArray), -} +// Existing type, extended with: +impl Mask { + /// Restrict self by other: keep only positions where both are true. + pub fn compose(&self, other: &Mask) -> Mask; -impl Selection { - /// Compose two selections. self is applied first, then further restricted by other. - fn compose(&self, other: &Selection) -> Selection; - /// Number of selected rows. - fn count(&self) -> usize; - - /// Apply selection in-place, compacting the selected elements to the front. - fn apply_in_place(&self, data: &mut [T]); - /// Apply selection to a buffer, returning a new buffer with only selected elements. - /// Takes &Buffer (not &[T]) so the output alignment is known. - fn apply(&self, data: &Buffer) -> BufferMut; + /// Apply mask to a buffer, returning a new buffer with only selected elements. + pub fn apply(&self, data: &Buffer) -> BufferMut; } ``` -`Index` and `Range` avoid allocating a `Mask` for trivial cases (single element lookup, -contiguous slice). - -The first four variants (`All`, `Index`, `Range`, `Mask`) are pure row subsetting -- the -output is always a subset of the input, in input order, with no repetition. 
`Indices` is -semantically different: it is a **gather** that can repeat positions, reorder rows, and insert -nulls via null indices. This is not an ideal unification -- gather has different semantics from -selection (output can be larger than input, row order is caller-defined, nullability is -introduced). However, the alternative is a separate gather API that every encoding must -implement alongside `canonicalize_into`, which recreates the combinatorial problem this RFC -aims to eliminate. DictArray is used both as genuine dictionary encoding (small values, many -codes) and as a general-purpose take over large arrays (e.g., defining a take mask over a full -Vortex file, with null indices to insert null rows). The latter case requires `Indices` -- -materializing all values with `Selection::All` would be catastrophic when values is an entire -file column. - -Encodings can handle `Indices` as a fallback: canonicalize with `Selection::All`, then gather -from the canonical result. Performance-critical encodings (BitPacked, FSST) can optimize the -gather path directly, just as they implement `TakeExecute` today. - -The `Index` variant subsumes `scalar_at` -- getting a single element is just materializing with -a one-row selection: +**Take is not selection.** Take (integer indices) can reorder rows, repeat rows, and introduce +nulls via out-of-bounds indices. These semantics are fundamentally different from subsetting. +Gather remains an encoding-internal concern -- DictArray handles codes → values lookup inside +its own materialization. 
-```rust -pub fn scalar_at(array: &dyn Array, idx: usize, ctx: &mut ExecutionCtx) -> VortexResult { - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &Selection::Index(idx), ctx)?; - builder.finish().into_array().scalar_at(0) // trivial on canonical -} -``` +**`scalar_at` remains a separate VTable method.** Single-element access via a builder +round-trip would regress hot paths like RunEnd binary search probing individual positions. ### CanonicalBuilder -`CanonicalBuilder` replaces the current `dyn ArrayBuilder` with a closed enum. The current -`ArrayBuilder` is a trait object -- callers get a `Box` and can only interact -with it through generic methods like `append_scalar()` and `extend_from_array()`. Encodings -cannot know what concrete builder they are writing into, so they must produce intermediate -arrays or scalars and let the builder copy the data in. +`CanonicalBuilder` is specifically for the decompress-into case: a caller creates a builder, +and the encoding writes directly into it. This is structurally incompatible with iterative +execution (where each step returns an owned result), so `CanonicalBuilder` only applies to +Option A's recursive model. + +It replaces the current `dyn ArrayBuilder` with a closed enum. The current `ArrayBuilder` is a +trait object -- callers get a `Box` and can only interact with it through +generic methods like `append_scalar()` and `extend_from_array()`. Encodings cannot know what +concrete builder they are writing into, so they must produce intermediate arrays or scalars and +let the builder copy the data in. 
`CanonicalBuilder` is an enum that mirrors the `Canonical` variants in mutable builder form: ```rust pub enum CanonicalBuilder { - Null(NullBuilder), - Bool(BoolBuilder), - Primitive(PrimitiveBuilder), - Decimal(DecimalBuilder), - VarBinView(VarBinViewBuilder), - List(ListViewBuilder), - FixedSizeList(FixedSizeListBuilder), - Struct(StructBuilder), - Extension(ExtensionBuilder), + Null(NullBuilder), + Bool(BoolBuilder), + Primitive(PrimitiveBuilder), + Decimal(DecimalBuilder), + VarBinView(VarBinViewBuilder), + List(ListViewBuilder), + FixedSizeList(FixedSizeListBuilder), + Struct(StructBuilder), + Extension(ExtensionBuilder), } impl CanonicalBuilder { - pub fn new(dtype: &DType) -> Self { ... } - pub fn finish(self) -> Canonical { ... } + pub fn new(dtype: &DType) -> Self { ... } + pub fn finish(self) -> Canonical { ... } } ``` @@ -200,182 +121,614 @@ buffers and views directly into the builder's internal storage, avoiding an inte allocation: ```rust -fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, selection, ctx) { - let CanonicalBuilder::VarBinView(vbv_builder) = builder else { - unreachable!("FSST must canonicalize into VarBinView"); - }; - let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?; - vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...); +fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, mask, ctx) { + let CanonicalBuilder::VarBinView(vbv_builder) = builder else { + unreachable!("FSST must canonicalize into VarBinView"); + }; + let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?; + vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...); } ``` `CanonicalBuilder` is only canonical at the root level. For Struct, field arrays are accumulated as `Vec` (potentially still compressed/chunked) rather than being -recursively decompressed into nested builders. 
This generalizes the `pack_struct_chunks` -pattern that already exists in the chunked canonicalization path. +recursively decompressed into nested builders. -### The three-phase pipeline +### Reduce boundary cleanup -Canonicalization is a three-phase pipeline with no executor loop: +Both options retain `reduce` and `reduce_parent` unchanged: ```rust -pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - // Phase 1: optimize tree (reduce rules, expression push-down) - let array = array.optimize_recursive()?; +fn reduce(array: &Self::Array) -> VortexResult>; +fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, +) -> VortexResult>; +``` + +Both are strictly metadata-only, never reading data buffers. This boundary is already enforced +today -- the remaining work is deprecating the APIs that don't require an `ExecutionCtx` and +migrating the misplaced implementations. Specifically: the metadata-only rewrites currently +registered as `execute_parent` (Dict + Compare, ALP + Compare, FoR + Compare, FSST + Compare) +move to `reduce_parent`. The canonical-type kernels currently in `execute_parent` (Primitive + +Compare, Bool + FillNull, Decimal + Between) move into the scalar function's `execute` method, +where they belong. - // Phase 2: peel selection wrappers - let (array, selection) = array.peel_selection(); +### Constants - // Phase 3: canonicalize_into (single recursive descent, no loop) - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &selection, ctx)?; - Ok(builder.finish()) +`ConstantArray` continues to exist as an encoding. It is special-cased only in ScalarFnArray +materialization -- one check to avoid expanding constants before passing to the scalar function. +This preserves the `Columnar` enum: + +```rust +pub enum Columnar { + Canonical(Canonical), + Constant(ConstantArray), } ``` -**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. 
This handles all -metadata-only rewrites: expression push-down into Dict values, RunEnd values; constant folding; -comparison target transformation for ALP, FoR, FSST; structural simplification. No data buffers -are read. +**Open question:** The current scalar representation uses heap-allocated value trees for nested +types (structs, lists). For deeply nested types, this is expensive. An alternative is to hold +constants as length-1 arrays instead of scalars, avoiding the scalar value tree entirely. This +is orthogonal to the execution model choice but may affect `ConstantArray`'s internal +representation. + +### Decompression cache and CSE -**Phase 2** calls `array.peel_selection()` to strip Filter, Slice, and Take wrappers from the -root, composing them into a single `Selection`. The remaining array tree has no selection -wrappers at its root. +`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped +to a single materialization call. When the same array node appears as a child of multiple +ScalarFnArray nodes (e.g., `x + x`), it is materialized once. When a ZStd-compressed array is +sliced into many chunks, the first slice decompresses the block and caches the result. The +cache is dropped after materialization returns. -**Phase 3** is a single recursive descent through `canonicalize_into`. Each encoding decompresses -with the fused selection. There is no iteration -- each encoding knows how to decompress in one -call. Selection wrappers encountered deeper in the tree compose themselves into the selection -during the recursive descent. +--- -### Selection composition during materialization +## Option A: Three-phase recursive model -Selection wrappers (Slice, Filter) still exist as array types for lazy representation (e.g., a -file reader produces sliced arrays). 
During materialization, their `canonicalize_into` composes the -wrapper into the selection and delegates to the child: +### VTable surface (3 methods) ```rust -// SliceArray's canonicalize_into -fn canonicalize_into(slice: &SliceArray, builder, selection, ctx) { - let composed = selection.compose(Selection::Range(slice.range())); - slice.child().canonicalize_into(builder, &composed, ctx) +pub trait VTable { + fn reduce(array: &Self::Array) -> VortexResult>; + fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult>; + + /// Materialize this array into a canonical builder, fusing the given row mask + /// with decompression. This is where all buffer-reading work happens. + fn canonicalize_into( + array: &Self::Array, + builder: &mut CanonicalBuilder, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()>; } +``` -// FilterArray's canonicalize_into -fn canonicalize_into(filter: &FilterArray, builder, selection, ctx) { - let composed = selection.compose(Selection::Mask(filter.mask())); - filter.child().canonicalize_into(builder, &composed, ctx) +| Method | Purpose | Buffer access | +|--------------------|---------------------------------------------|---------------| +| `reduce` | Self-rewrite (metadata only) | Never | +| `reduce_parent` | Rewrite parent (expression push-down, etc.) 
| Never | +| `canonicalize_into`| Decompress with fused mask | Yes | + +### Three-phase pipeline + +```rust +pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + // Phase 1: optimize tree (reduce rules, expression push-down) + let array = array.optimize_recursive()?; + + // Phase 2: peel selection wrappers + let (array, mask) = array.peel_selection(); + + // Phase 3: single recursive descent, no loop + let mut builder = CanonicalBuilder::new(array.dtype()); + array.vtable().canonicalize_into(&array, &mut builder, &mask, ctx)?; + Ok(builder.finish()) } ``` -Phase 2 (`peel_selection`) is only needed when the caller wants to inspect the array before -materializing (e.g., the DuckDB exporter checking for DictArray). During normal -materialization, selection composition happens naturally during the recursive descent. +**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. All +metadata-only rewrites happen here: expression push-down into Dict values, constant folding, +comparison target transformation for ALP/FoR/FSST. -### ScalarFnArray materialization +**Phase 2** strips Filter and Slice wrappers from the root, composing them into a single +`Mask`. *(Open question: does lifting selection to the root provide meaningful benefit +beyond what passing `Mask` through recursion already gives? It may be orthogonal to the +core proposal.)* + +**Phase 3** is a single recursive descent. Each encoding decompresses with the fused selection. +No iteration. 
Selection wrappers deeper in the tree compose themselves into the selection +during descent: + +```rust +// SliceArray's canonicalize_into: slicing composes as masking out the excluded rows +fn canonicalize_into(slice: &SliceArray, builder, mask, ctx) { + let composed = mask.slice(slice.range()); + slice.child().canonicalize_into(builder, &composed, ctx) +} +``` + +### Per-encoding canonicalize_into + +- **BitPacked**: unpack only selected positions into PrimitiveBuilder +- **FSST**: decompress only selected strings into VarBinViewBuilder +- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those +- **Dict**: materialize codes with selection, gather from values +- **Chunked**: split selection across chunks, recurse per-chunk +- **Struct**: pass selection to each field's materialization +- **Constant**: create scalar repeated to selection count + +For fused decompression patterns (e.g., Dict-RLE), the parent checks inline: -ScalarFnArray's `canonicalize_into` materializes its children, then evaluates the function on -the results: +```rust +fn canonicalize_into(dict: &DictArray, builder, mask, ctx) { + if let Some(runend) = dict.codes().as_opt::() { + return fused_dict_rle(dict.values(), runend, builder, mask, ctx); + } + + let mut code_builder = primitive_builder(dict.codes().dtype()); + dict.codes().canonicalize_into(&mut code_builder, mask, ctx)?; + let codes = code_builder.finish_into_primitive(); + take_into_builder(builder, dict.values(), &codes, ctx) +} +``` + +### ScalarFnArray materialization ```rust fn canonicalize_into( - array: &ScalarFnArray, - builder: &mut CanonicalBuilder, - selection: &Selection, - ctx: &mut ExecutionCtx, + array: &ScalarFnArray, + builder: &mut CanonicalBuilder, + mask: &Mask, + ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - let children: Vec = array.children().iter() - .map(|child| { - // Constants pass through without expansion + let children: Vec = array.children().iter() + .map(|child| { if 
let Some(c) = child.as_opt::<ConstantArray>() { - return Ok(Columnar::Constant(c.clone())); + return Ok(Columnar::Constant(c.clone())); } let mut child_builder = CanonicalBuilder::new(child.dtype()); - child.canonicalize_into(&mut child_builder, selection, ctx)?; + child.canonicalize_into(&mut child_builder, mask, ctx)?; Ok(Columnar::Canonical(child_builder.finish())) - }) - .try_collect()?; + }) + .try_collect()?; - let result = array.scalar_fn().execute(children, ctx)?; - builder.extend_from_array(&result); - Ok(()) + let result = array.scalar_fn().execute(children, ctx)?; + builder.extend_from_array(&result); + Ok(()) } ``` 
"This value repeated N times without N copies" requires -*someone* to know about broadcasting, and no representation trick eliminates that. +- **Fused patterns require parent-knows-child.** Dict must check `as_opt::()` to + fuse Dict-RLE. An external encoding can't volunteer a fused path for a built-in parent + without modifying the parent. The known fused patterns (Dict-RLE) are all between encodings + in `vortex-array`, so this isn't a practical problem today, but it limits extensibility. -`ConstantArray` continues to exist as an encoding. Its `canonicalize_into` expands the value -into the builder, just like any other encoding decompresses. The only place constants get -special treatment is ScalarFnArray's `canonicalize_into` -- one check to avoid expanding -constants before -passing to the scalar function. This preserves the `Columnar` enum: +- **Exporters can't inspect intermediates.** After optimize + peel, the exporter sees the tree + as-is. If the root is an opaque encoding that would decode to Dict, the exporter can't + discover that -- Phase 3 goes straight to canonical. The DuckDB dictionary export use case + only works when DictArray is already visible after Phase 1, which is the common case (Vortex + files store Dict as DictArray) but not guaranteed. + +- **Still recursive.** `canonicalize_into` recurses into children. On deeply nested trees + (many layers of encoding, deeply nested structs/lists), this can still overflow the stack -- + the same class of problem the current executor has. An explicit work stack or trampoline + could mitigate this but adds implementation complexity. + +--- + +## Option B: Cleaned-up iterative model + +Keep the framework-driven iteration loop but clean up the VTable surface, enforce the +reduce/execute boundary, and thread `Mask` through execution. 
+ +### VTable surface (3 methods) ```rust -pub enum Columnar { - Canonical(Canonical), - Constant(ConstantArray), +pub trait VTable { + fn reduce(array: &Self::Array) -> VortexResult>; + fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult>; + + /// Take one execution step toward materialization. + /// Returns an ExecutionStep telling the framework what happened. + fn execute( + array: &Self::Array, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult; } ``` -`ScalarFn::execute` receives `Vec`. Most scalar function implementations handle -constant inputs efficiently -- e.g., `add(primitive, constant(1))` broadcasts the scalar -without allocating a repeated buffer. +```rust +pub enum ExecutionStep { + /// Fully materialized. + Done(Columnar), + /// Rewrote to a new array tree. Framework should optimize and execute again. + Rewrite(ArrayRef), + /// Composed the mask and delegated to child. Framework should continue + /// executing the returned array with the new mask. + Peel(ArrayRef, Mask), +} +``` -Reduce rules interact with constants via `as_opt::()` to read the scalar value -(metadata access, not buffer access). Constant folding, constant propagation (`slice(constant) -→ constant`, `filter(constant, mask) → constant`), and related rules are all metadata-only. 
+| Method | Purpose | Buffer access | +|-----------|---------------------------------------------|---------------| +| `reduce` | Self-rewrite (metadata only) | Never | +| `reduce_parent` | Rewrite parent (expression push-down) | Never | +| `execute` | One step toward materialization | Yes | -### Per-encoding canonicalize_into +### Framework loop -Each encoding implements a single decompression method that handles selection: +```rust +pub fn execute_to_columnar( + mut array: ArrayRef, + mut mask: Mask, + ctx: &mut ExecutionCtx, +) -> VortexResult<Columnar> { + for _ in 0..MAX_ITERATIONS { + // Check termination + if let Some(c) = array.as_opt::<ConstantArray>() { + return Ok(Columnar::Constant(c.apply_mask(&mask))); + } + if let Some(c) = array.as_opt::<Canonical>() { + return Ok(Columnar::Canonical(mask.apply_to(c.into())?)); + } -- **BitPacked**: unpack only selected positions into PrimitiveBuilder -- **FSST**: decompress only selected strings into VarBinViewBuilder -- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those -- **Dict**: materialize codes with selection, take from values -- **Chunked**: split selection across chunks, recurse per-chunk -- **Struct**: pass selection to each field's materialization -- **Constant**: create scalar repeated to selection count + // Optimize (reduce/reduce_parent to fixpoint) + array = array.optimize_recursive()?; + + // Execute one step + match array.vtable().execute(&array, &mask, ctx)? { + ExecutionStep::Done(c) => return Ok(c), + ExecutionStep::Rewrite(new_array) => { + array = new_array; + } + ExecutionStep::Peel(child, new_mask) => { + array = child; + mask = new_mask; + } + } + } + vortex_bail!("exceeded max iterations") +} +``` + +Note: `optimize_recursive()` runs inside the loop, after every step. This means reduce_parent +rules can fire on intermediate forms exposed by execution -- the key capability that Option A +lacks. 
+ +### Per-encoding execute + +Each encoding returns a typed step, making execution transparent: + +```rust +// FilterArray: compose masks, delegate +fn execute(filter: &FilterArray, mask, ctx) -> ExecutionStep { + let composed = mask.compose(&filter.mask()); + ExecutionStep::Peel(filter.child().clone(), composed) +} + +// SliceArray: slice the mask, delegate +fn execute(slice: &SliceArray, mask, ctx) -> ExecutionStep { + let composed = mask.slice(slice.range()); + ExecutionStep::Peel(slice.child().clone(), composed) +} + +// BitPacked: fully decompress with mask +fn execute(bp: &BitPackedArray, mask, ctx) -> ExecutionStep { + let mut builder = primitive_builder(bp.dtype()); + unpack_selected(bp, &mut builder, mask)?; + ExecutionStep::Done(Columnar::Canonical(builder.finish().into())) +} + +// Dict: decompress one step -- execute codes, then take +fn execute(dict: &DictArray, mask, ctx) -> ExecutionStep { + if !dict.codes().is_canonical() { + // Execute codes one step, rebuild dict + let stepped_codes = dict.codes().execute_one_step(mask, ctx)?; + let new_dict = DictArray::new(stepped_codes, dict.values().clone()); + return ExecutionStep::Rewrite(new_dict.into_array()); + } + // Codes are canonical -- do the take with mask + let result = take_canonical(dict.values(), dict.codes().as_primitive(), mask, ctx)?; + ExecutionStep::Done(Columnar::Canonical(result)) +} + +// ScalarFnArray: execute children, then apply function +fn execute(sfn: &ScalarFnArray, mask, ctx) -> ExecutionStep { + // Find first non-columnar child, execute it one step + for (i, child) in sfn.children().iter().enumerate() { + if !child.is_columnar() { + let stepped = child.execute_one_step(mask, ctx)?; + let new_sfn = sfn.replace_child(i, stepped); + return ExecutionStep::Rewrite(new_sfn.into_array()); + } + } + // All children are columnar -- apply function + let children = sfn.children_as_columnar(mask); + let result = sfn.scalar_fn().execute(children, ctx)?; + 
ExecutionStep::Done(Columnar::Canonical(result.into())) +} +``` + +### Fused patterns via reduce_parent -For fused decompression patterns (e.g., Dict-RLE), the parent encoding checks inline: +Because `optimize_recursive()` runs after every step, fused patterns work naturally. Dict +executes its codes one step (RunEnd → Primitive), then on the next iteration, +`optimize_recursive()` runs and reduce_parent rules can fire on the new tree. The Dict-RLE +fused path can be implemented as a reduce_parent rule on RunEnd that recognizes the Dict parent +and rewrites the tree, or Dict can check inline in its execute -- both work because the +framework loops. + +### Exporter tree inspection + +Exporters drive the loop directly, inspecting after each step: ```rust -// Dict's canonicalize_into -fn canonicalize_into(dict: &DictArray, builder, selection, ctx) { - // Check for fused Dict-RLE pattern - if let Some(runend) = dict.codes().as_opt::() { - return fused_dict_rle(dict.values(), runend, builder, selection, ctx); +fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let mut mask = Mask::new_true(array.len()); + + for _ in 0..MAX_ITERATIONS { + array = array.optimize_recursive()?; + let (peeled, m) = array.peel_selection(); + array = peeled; + mask = mask.compose(&m); + + if let Some(dict) = array.as_opt::() { + return export_dict(dict, &mask, ctx); + } + if array.is_canonical() { + return export_canonical(mask.apply_to(array)?, ctx); } - // Generic path: materialize codes with selection, take from values - let mut code_builder = primitive_builder(dict.codes().dtype()); - dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?; - let codes = code_builder.finish_into_primitive(); - take_into_builder(builder, dict.values(), &codes, ctx) + match array.vtable().execute(&array, &mask, ctx)? 
{ + ExecutionStep::Done(c) => return export_columnar(c, ctx), + ExecutionStep::Rewrite(new) => { array = new; } + ExecutionStep::Peel(child, m) => { array = child; mask = m; } + } + } + vortex_bail!("exceeded max iterations") } ``` -### Decompression cache and CSE +The exporter inspects the tree after every step. If an opaque encoding decodes to DictArray +during execution, the exporter sees it on the next iteration and can export it as a dictionary. + +### Trade-offs + +**Strengths:** + +- **Encoding-preserving mask application.** An encoding can implement `execute(mask)` to + absorb the mask and return `Rewrite(still_encoded)`. FSST can filter without decompressing, + and exporters see the result through the standard loop -- no encoding-specific logic needed + for the mask-application step. +- **Exporters inspect intermediates.** The DuckDB exporter can find DictArray even when it's + hidden behind an opaque encoding, because it inspects after each step. +- **Child-volunteers extensibility preserved.** A fused decompression pattern can be expressed + as a reduce_parent rule or an inline check, without closing the door on external encodings + discovering patterns through the iteration. +- **Same 3 VTable methods.** reduce, reduce_parent, execute. Clean. +- **No stack overflow risk.** The outer loop is iterative -- each step returns control to the + framework. Stack depth is bounded to one execution step, regardless of tree depth. + +**Weaknesses:** + +- **Iteration loop remains.** Execution is still iterative with a MAX_ITERATIONS bound. Each + step produces a new array tree, and the framework loops. This is more predictable than before + (no execute_parent, Mask threaded through, typed ExecutionStep) but still requires + tracing through iterations to understand execution. The recursive model's "one call, done" + is easier to reason about. +- **Intermediate allocations.** Each `Rewrite` step allocates a new array tree. 
The recursive + model writes directly into the builder with no intermediates. For hot paths (large batches, + tight loops), the allocation overhead may be measurable. +- **Mask and the loop interact awkwardly.** When an encoding returns `Peel(child, + new_mask)`, the mask changes mid-loop. The framework must track the current mask across + iterations. In the recursive model, the mask flows naturally through function arguments. +- **No decompress-into-buffer.** The iterative model returns `Columnar` from `Done` -- the + encoding owns its output. There is no way to pass a caller-owned `CanonicalBuilder` through + the loop, because the framework doesn't know whether this step is final or intermediate. + This is the primary pain point that motivated this RFC, and Option B does not address it. + ChunkedArray still can't share a builder across chunks. Exporters still can't write directly + into their output format. + +--- + +## Fundamental trade-offs + +The two options differ on three architectural axes. These are genuine trade-offs -- each option +wins on some axes and loses on others. They are not fixable with clever engineering; they follow +from the structural decision of recursion vs. iteration. + +### 1. Caller-owned output buffers (Option A wins) + +In Option A, the caller creates a `CanonicalBuilder` and passes it down through the recursive +descent. Every encoding writes directly into this caller-owned buffer. The buffer exists before +materialization begins and is filled during a single pass. + +In Option B, each encoding returns `ExecutionStep::Done(Columnar)` -- it allocates and owns its +output. The caller receives the finished result. There is no way to pass a builder through the +loop because the encoding doesn't know at call time whether this is the final step or an +intermediate rewrite that will be fed back into the loop. 
+ +**Why this matters:** + +- **ChunkedArray concat avoidance.** With a caller-owned builder, ChunkedArray iterates its + chunks and each chunk writes into the same builder. The output is a single contiguous + allocation. Without it, each chunk produces its own `Columnar`, and the framework must + concatenate them -- an extra copy of the entire column. + +- **FSST zero-copy push.** FSST can decompress string views and data buffers directly into a + `VarBinViewBuilder`'s internal storage, adjusting view offsets in-place. This avoids + materializing an intermediate string array. With `Done(Columnar)`, FSST must allocate the + `VarBinView` canonical array itself, losing the ability to share buffer space with other + chunks or avoid a final copy. + +- **Pre-allocated output.** When the output length is known (common case: the selection count), + the builder can pre-allocate once. The iterative model allocates per step. + +This trade-off is structural: passing a mutable builder down requires a call stack (recursion). +A loop that yields intermediate values cannot thread a mutable builder through iterations +without fundamentally changing the `ExecutionStep` API (e.g., adding a `Done`-with-builder +variant), which would complicate the common case where an encoding returns `Rewrite`. + +### 2. Encoding-preserving mask application (Option B wins) + +Some encodings can apply a row mask **without decompressing**. FSST can filter its compressed +data and produce a smaller FSST array. This matters when the downstream consumer natively +supports the encoding -- DuckDB supports FSST vectors, so exporting a filtered FSST directly +avoids decompression entirely. + +**Concrete example:** `filter(scalar_fn(upper, [fsst(data)]), mask)` + +Both options share the reduce phase. FSST's reduce_parent pushes `upper` into the symbol +table (metadata-only, analogous to Dict pushing scalar functions into values). After reduce + +peel, both options see: `fsst_upper(data)` + mask. 
+ +In **Option B**, FSST's standard `execute(mask)` method handles this: -`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped -to a single `canonicalize()` call. This serves two purposes: +```rust +fn execute(fsst: &FSSTArray, mask, ctx) -> ExecutionStep { + let filtered = fsst.filter_preserving(&mask)?; // still FSST + ExecutionStep::Rewrite(filtered.into_array()) +} +``` + +The DuckDB exporter drives the loop, sees FSST on the next iteration, and exports directly. +The standard path would execute again next iteration and decompress -- no harm. Any encoding +can implement this pattern through the general VTable. + +In **Option A**, the exporter must use encoding-specific logic outside the framework: + +```rust +fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> DuckDBVector { + let array = array.optimize_recursive()?; + let (array, mask) = array.peel_selection(); + + // Encoding-specific: exporter must know about FSST + if let Some(fsst) = array.as_opt::<FSSTArray>() { + let filtered = fsst.filter_preserving(&mask)?; + return export_fsst(filtered, ctx); + } + + let mut builder = CanonicalBuilder::new(array.dtype()); + array.canonicalize_into(&mut builder, &mask, ctx)?; + export_canonical(builder.finish()) +} +``` + +The general `canonicalize_into` path always decompresses -- there is no VTable mechanism for +an encoding to absorb a mask without decompressing. The exporter must know about each encoding +it wants to preserve and call encoding-specific methods. + +**The difference:** Option B handles encoding-preserving mask application through the standard +`execute` VTable method -- any encoding can participate. Option A requires the exporter to have +encoding-specific knowledge for each format it wants to preserve. 
In practice, exporters +already need encoding-specific logic to export to native formats (DuckDB FSST vectors, Arrow +dictionary arrays), so the additional burden is the mask-application call rather than just an +`as_opt` check. + +This extends to cross-crate encodings. An external encoding that can filter itself without +decompressing just implements `execute(mask) → Rewrite(filtered_self)` in Option B. In +Option A, every exporter that wants to preserve that encoding must be updated to call its +specific filter method. + +### 3. Exporter intermediate inspection (Option B wins) + +In Option B, exporters drive the loop directly and can inspect the array tree after each step. +After an encoding applies a mask via `Rewrite`, the exporter sees the result on the next +iteration and can export it in its native format. + +In Option A, exporters see the tree after Phase 1 (optimize) and Phase 2 (peel selection). +When the encoding the exporter cares about is already visible after peel (the common case -- +Dict is stored as DictArray, FSST is stored as FSSTArray), the exporter can intercept and +handle it with encoding-specific logic. This covers the typical Vortex file scan path. The +gap is when an encoding is hidden behind a layer that must be executed to reveal it, or when +the exporter wants the encoding to absorb the mask through the general VTable (see trade-off +#2 above). + +### Summary + +| Trade-off | Winner | Practical impact today | +|-----------|--------|----------------------| +| Caller-owned output buffers | **Option A** | **Concrete.** ChunkedArray concat avoidance and FSST zero-copy decompression are measurable. | +| Encoding-preserving mask application | **Option B** | **Concrete.** FSST filter-without-decompress, Dict filter-without-decompress. Exporters need encoding-specific logic in Option A. | +| Exporter intermediate inspection | **Option B** | **Moderate.** Exporters can discover encodings revealed by execution. 
Fixed by peel_selection when the filter is at the root. | + +Secondary differences: + +| Aspect | Option A | Option B | +|--------|----------|----------| +| Stack overflow risk | Still recursive (can overflow on deep trees) | Iterative loop, bounded stack depth | +| Intermediate allocations | None (direct to builder) | One array tree per `Rewrite` step | +| Debuggability | One recursive call, stack trace shows position | Loop iterations, requires tracing | +| Mask threading | Natural function argument | Mutable state across iterations | +| Fused patterns (Dict-RLE) | Parent checks child inline | reduce_parent on intermediates, or inline | +| GPU kernel fusion potential | Clean tree maps to single kernel | Step-by-step harder to fuse | +| Determinism | Always same call sequence | Iteration count varies with input | + +## What changes (shared) + +**Removed (both options):** + +- `execute_parent` VTable method -- metadata-only rewrites move to `reduce_parent`; + canonical-type kernels move into `ScalarFn::execute`. +- `append_to_builder` VTable method. +- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, + `SliceReduceAdaptor`, `SliceExecuteAdaptor`, and all their per-encoding implementations. + `TakeExecuteAdaptor` is also removed (Take/gather is not row masking; it remains an + encoding-internal concern). + +**Retained (both options):** + +- `reduce` / `reduce_parent` -- strictly metadata-only. +- `ScalarFnArray` -- per-node lazy computation with `reduce_parent` push-down. +- Filter/Slice array wrappers -- for lazy representation. Could be unified into a single + `MaskedSelectionArray` in the future. Take remains separate (gather semantics, not masking). +- `scalar_at` -- separate VTable method for single-element access. -- **Common sub-expression elimination**: when the same array node appears as a child of - multiple ScalarFnArray nodes (e.g., `x + x`), it is materialized once. 
-- **Shared block decompression**: when a ZStd-compressed array is sliced into many chunks, all - slices share the same underlying compressed block. The first slice decompresses the block and - caches the result; subsequent slices index into it. +**New (both options):** -The cache key is the raw pointer to the array, which is stable because arrays are `Arc`-based. -The cache is dropped after `canonicalize()` returns, so it does not leak memory across calls. +- `Mask::compose()` and `Mask::apply()` methods on the existing `Mask` type. +- `Array::peel_selection()` -- strip selection wrappers from root. -### Worked example: DuckDB dictionary export +**New (Option A only):** + +- `CanonicalBuilder` -- one-level-deep builder enum for decompress-into. +- `canonicalize_into` VTable method -- single recursive descent with fused mask. + +**New (Option B only):** + +- `ExecutionStep` enum -- typed return from execute, replacing bare `ArrayRef`. +- `execute` takes `&Mask` parameter -- row mask threaded through the loop. + +## Worked example: DuckDB dictionary export A Vortex file scan produces a chunk with a dictionary column. The query has a filter predicate (`name = 'alice'`) evaluated to a mask, and a projection (`upper(name)`). 
The array tree: @@ -392,7 +745,7 @@ filter( ) ``` -**Phase 1: optimize.** `reduce_parent` pushes `upper` into dict values: +**Optimize.** `reduce_parent` pushes `upper` into dict values: ``` filter( @@ -404,185 +757,88 @@ filter( ) ``` -**Phase 2: peel selections.** Filter is at the root, so it peels: +**Peel selections.** Filter is at the root, so it peels: ``` array = dict(codes: bitpacked(...), values: scalar_fn(upper, [fsst(...)])) -selection = Selection::Mask(mask) +mask = filter_mask ``` -**Phase 3: export.** The DuckDB exporter inspects the tree: +**Export (Option A).** The exporter inspects the tree after optimize + peel: ```rust fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let array = array.optimize_recursive()?; - let (array, selection) = array.peel_selection(); + let array = array.optimize_recursive()?; + let (array, mask) = array.peel_selection(); - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &selection, ctx); - } + if let Some(dict) = array.as_opt::() { + return export_dict(dict, &mask, ctx); + } - // Fallback: full materialization - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &selection, ctx)?; - export_canonical(builder.finish()) -} - -fn export_dict( - dict: &DictArray, - selection: &Selection, - ctx: &mut ExecutionCtx, -) -> VortexResult { - // Materialize ALL unique values (no selection -- values are shared) - let mut val_builder = CanonicalBuilder::new(dict.values().dtype()); - dict.values().canonicalize_into(&mut val_builder, &Selection::All, ctx)?; - let values = val_builder.finish(); - - // Materialize codes WITH selection (only rows that passed the filter) - let mut code_builder = primitive_builder(dict.codes().dtype()); - dict.codes().canonicalize_into(&mut code_builder, selection, ctx)?; - let codes = code_builder.finish_into_primitive(); - - Ok(DuckDBVector::dictionary(values, codes)) + let mut builder = 
CanonicalBuilder::new(array.dtype()); + array.canonicalize_into(&mut builder, &mask, ctx)?; + export_canonical(builder.finish()) } ``` -**Values path**: `scalar_fn(upper, [fsst(["alice","bob","charlie"])])` materializes via -ScalarFnArray's `canonicalize_into`. FSST bulk-decompresses 3 strings, `upper` runs on canonical -input, producing `["ALICE","BOB","CHARLIE"]`. - -**Codes path**: `bitpacked([0,1,0,2,1,0,...])` materializes with `Selection::Mask(mask)`. -BitPacked unpacks only the positions where the mask is true. - -**Result**: DuckDB gets a dictionary vector with 3 uppercase values and only the selected -codes. `upper()` was computed on 3 unique values instead of N rows. The filter was fused with -bit-unpacking. No intermediate arrays were materialized. - -### What changes - -**Removed:** - -- `execute` VTable method -- replaced by `canonicalize_into(builder, selection)`. -- `execute_parent` VTable method -- selection-related uses replaced by `Selection`; - compressed-domain computation uses replaced by `reduce_parent` rules. -- The executor loop -- no more iterative execute-until-canonical. Optimization is a reduce-only - loop; materialization is a single recursive descent. -- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, - `SliceReduceAdaptor`, `SliceExecuteAdaptor`, `TakeExecuteAdaptor`, and all their per-encoding - implementations (~40+ impls total). -- Canonical-type `execute_parent` kernels -- Primitive + Compare, Bool + FillNull, Decimal + - Between, etc. These move into the scalar function's `execute` method where they belong. - -**Retained:** - -- `reduce` / `reduce_parent` -- unchanged. Still drive expression push-down, cast push-down, - constant folding, structural simplification. Strictly metadata-only, no buffer access. -- `ScalarFnArray` -- kept as per-node lazy computation. Composes with `reduce_parent` for - expression push-down into compressed encodings. 
-- Filter/Slice/Take array wrappers -- still exist for lazy representation. Their - `canonicalize_into` composes them into the Selection and delegates to the child. These could be - unified into a single `SelectionArray` wrapping a `Selection` in the future. - -**New:** - -- `Selection` type -- unifies Filter, Slice, and Take into a single row-selection - representation that composes and threads through materialization. -- `CanonicalBuilder` -- one-level-deep builder enum. Struct children accumulate as - `Vec` rather than being decompressed into child builders. -- `Array::peel_selection()` -- method to strip selection wrappers from the root, used by - exporters that want to inspect the tree before materializing. - -**Summary comparison:** - -| Aspect | Current | Proposed | -|--------------------------------|---------------------------------------------------------------------------------|---------------------------------------------------| -| VTable methods | 5 (`reduce`, `reduce_parent`, `execute`, `execute_parent`, `append_to_builder`) | 3 (`reduce`, `reduce_parent`, `canonicalize_into`) | -| Adaptor types per operation | 2 (Reduce + Execute) | 1 (Reduce only, for expression push-down ops) | -| Selection handling | Per-encoding Filter, Slice, Take adaptors | Single `Selection` parameter | -| Materialization | Iterative loop calling execute until convergence | Single recursive descent through `canonicalize_into` | -| Executor loop | reduce → reduce_parent → execute_parent → execute, repeated | optimize (reduce loop) → peel selections → build | -| Builder model | Fully recursive (forces deep canonicalization) | One-level canonical (children stay compressed) | -| Fused decompression (Dict-RLE) | Child volunteers via execute_parent | Parent checks for pattern in its own `canonicalize_into` | -| Buffer access in reduce | Ambiguous | Never (strict, works on any device including GPU) | - -### Migration path - -The migration is designed so that every step leaves the 
codebase in a working state. The old -and new execution paths coexist until the new path covers all encodings, at which point the -old path is deleted. Each phase is independently shippable. - -**Phase 1: Foundation types.** Introduce `Selection` (with `compose`, `count`, `apply`, -`apply_in_place`), `CanonicalBuilder` enum, and `Array::peel_selection()`. All additive, no -behavior changes. - -**Phase 2: Add `canonicalize_into` with fallback default.** Add `canonicalize_into` to `VTable` with a -default implementation that bridges to the old execute path: +**Export (Option B).** The exporter drives the loop, inspecting after each step: ```rust -fn canonicalize_into( - array: &Self::Array, - builder: &mut CanonicalBuilder, - selection: &Selection, - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - // Fallback: use existing execute-to-canonical, then apply selection - let canonical = Canonical::execute(array.as_array_ref().clone(), ctx)?; - let selected = selection.apply_to_array(canonical.into_array())?; - builder.extend_from_array(&selected)?; - Ok(()) +fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let (mut array, mut mask) = (array, Mask::new_true(array.len())); + for _ in 0..MAX_ITERATIONS { + array = array.optimize_recursive()?; + let (peeled, m) = array.peel_selection(); + array = peeled; + mask = mask.compose(&m); + + if let Some(dict) = array.as_opt::() { + return export_dict(dict, &mask, ctx); + } + if array.is_canonical() { + return export_canonical(mask.apply_to(array)?); + } + match array.vtable().execute(&array, &mask, ctx)? { + ExecutionStep::Done(c) => return export_columnar(c), + ExecutionStep::Rewrite(a) => { array = a; } + ExecutionStep::Peel(a, m) => { array = a; mask = m; } + } + } + vortex_bail!("export did not converge") } ``` -Wire up `canonicalize()` using the three-phase pipeline. Both old (`execute::`) and -new (`canonicalize()`) paths work. Integration tests assert equivalence. 
+Both options produce the same result: DuckDB gets a dictionary vector with 3 uppercase values +and only the selected codes. The difference is that Option B can discover DictArray even if it +was hidden behind an opaque encoding, while Option A only sees what's visible after Phase 1. -**Phase 3: Migrate canonical encodings.** One encoding per PR: Null, Bool, Primitive, Decimal, -VarBinView, Struct, List/FixedSizeList, Extension, Constant. Each writes directly into the -builder with fused selection. Independent and parallelizable. +## Migration path -**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into -`Selection` instead of executing independently. +The migration is the same for both options. The key difference is Phase 3/5 -- whether +encodings implement `canonicalize_into` (Option A) or return `ExecutionStep` (Option B). -**Phase 5: Migrate compressed encodings.** Selection-fused decompression for BitPacked, -RunEnd, Dict, FSST, ALP, FoR, ZStd, Sparse, and others. Start with highest-value encodings. -Each encoding is an independent PR. +**Phase 1: Foundation types.** Add `Mask::compose()`/`Mask::apply()`, introduce `CanonicalBuilder`, +`Array::peel_selection()`. All additive. -**Phase 6: Move `execute_parent` → `reduce_parent`.** Compressed-domain rewrites (Dict + -Compare, ALP + Compare, FoR + Compare, FSST + Compare) become `reduce_parent` rules. -Canonical-type kernels (Primitive + Compare, Bool + FillNull) move into `ScalarFn::execute`. -Once empty, remove `execute_parent` from VTable. +**Phase 2: Add new execution method with fallback.** Add `canonicalize_into` (Option A) or +typed `execute` (Option B) with a default that bridges to the old path. Both old and new paths +work. -**Phase 7: Delete old execution path.** Delete all selection adaptor traits and their ~40+ -per-encoding implementations. Delete `execute` VTable method, the executor loop, and the -`Executable` trait. 
All callers of `execute::` migrate to `canonicalize()` (the -free function that runs the full three-phase pipeline). +**Phase 3: Migrate canonical encodings.** One per PR. Independent, parallelizable. -**Phase 8: Replace `scalar_at`.** `scalar_at` becomes a free function using -`Selection::Index`. Remove `OperationsVTable::scalar_at` and per-encoding implementations. +**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into +`Mask`. -**Phase 9: Wire up exporters.** DuckDB exporter uses optimize → peel → inspect → export. -Add decompression cache to `ExecutionCtx`. +**Phase 5: Migrate compressed encodings.** Mask-fused decompression. Independent per +encoding. -**Ordering and parallelism:** +**Phase 6: Move execute_parent → reduce_parent.** Metadata-only rewrites become reduce rules. +Canonical-type kernels move into ScalarFn::execute. Remove execute_parent. -``` -Phase 1: Foundation types - │ -Phase 2: canonicalize_into with fallback - │ - ├── Phase 3: Canonical encodings (independent per encoding) - ├── Phase 4: Selection wrappers - ├── Phase 5: Compressed encodings (independent per encoding) - └── Phase 6: execute_parent → reduce_parent (independent per rule) - │ - Phase 7: Delete old path (after 3-6 complete) - │ - ├── Phase 8: scalar_at → Selection::Index - └── Phase 9: Exporter updates -``` +**Phase 7: Delete old path.** Remove old `execute` method, the `Executable` trait, and all +selection adaptors. -Phases 3-6 can proceed in parallel across contributors. The fallback default in Phase 2 ensures -nothing breaks while encodings are migrated at their own pace. Phase 7 is the cleanup gate. +**Phase 8: Wire up exporters.** DuckDB, Arrow exporters migrate to new API. ## Compatibility @@ -591,144 +847,80 @@ execution engine. **Public API breakage:** -- `Executable` trait and `execute::()` are removed. Callers migrate to `canonicalize()` or - call `canonicalize_into` directly. -- `OperationsVTable::scalar_at` is removed. 
Callers use the free function `scalar_at()` which - delegates to `canonicalize_into` with `Selection::Index`. -- Third-party encodings that implement `execute`, `execute_parent`, or selection adaptors must - migrate to `canonicalize_into`. The fallback default (Phase 2) provides a bridge during - migration. - -**Performance implications:** - -- Selection-fused decompression should improve performance for filtered/sliced reads by - avoiding intermediate materialization. -- Single-element access via `Selection::Index` may be marginally slower than today's - `scalar_at` for encodings that currently fast-path scalar access. These encodings can - fast-path `Selection::Index` in their `canonicalize_into` if needed. -- The decompression cache avoids redundant work for shared sub-expressions and ZStd blocks. +- `Executable` trait and `execute::()` are removed. Callers migrate to `canonicalize()` / + `execute_to_columnar()`. +- Third-party encodings must migrate to the new execution method. The fallback default + (Phase 2) provides a bridge during migration. ## Drawbacks -- **Migration effort.** The migration touches every encoding in the codebase (~33 total). The - phased approach with a fallback default mitigates this, but it is still significant work - spread across many PRs. - -- **Fused pattern detection moves to the parent.** In the current model, a child encoding can - volunteer to handle its parent's execution via `execute_parent` (e.g., RunEnd volunteers for - Dict-RLE). In the new model, the parent must check for the pattern in its own - `canonicalize_into`. This means the parent must know about the child encoding, which is a - slight inversion of responsibility. In practice, the known fused patterns (Dict-RLE) involve - encodings that are both Arrow-supported and live in the builtin `vortex-array` crate, so the - cross-encoding dependency is not a crate boundary issue. 
- -- **Selection composition complexity.** `Selection::compose` must handle all variant - combinations correctly (Mask + Range, Indices + Mask, etc.). This is concentrated complexity - in one well-tested type, but getting it wrong would cause subtle data corruption. - -- **`Indices` conflates selection with gather.** The first four `Selection` variants are pure - row subsetting (output <= input, preserves order, no nulls introduced). `Indices` is a - gather: it can repeat rows, reorder them, and introduce nulls. Bundling both concepts into - one enum is a pragmatic compromise -- the alternative (a separate gather API every encoding - must implement) recreates the combinatorial explosion this RFC aims to eliminate -- but it - means `Selection` is not a clean abstraction. +- **Migration effort.** ~33 encodings must migrate. The phased approach with a fallback default + mitigates this. + +- **Mask composition complexity.** `Mask::compose` must be correct for all combinations of + internal representations (`AllTrue`, `AllFalse`, `Values`). This is concentrated, well-tested + complexity, but getting it wrong causes data corruption. ## Alternatives -### Keep the iterative execution model +### Keep the iterative execution model as-is -We could keep the current `execute`/`execute_parent` loop and address individual pain points -(e.g., adding selection fusion as a new adaptor). This avoids the migration cost but perpetuates -the unclear distinction between reduce and execute, the combinatorial adaptor explosion, and the -difficulty of reasoning about the executor loop. The VTable surface stays at 5 methods. +Address individual pain points without restructuring. Avoids migration cost but does not +solve decompress-into-buffer and perpetuates the adaptor boilerplate. ### Use a single ExpressionArray instead of per-node ScalarFnArray -Instead of one ScalarFnArray per operation, a single ExpressionArray could hold an entire -expression tree. 
This would be simpler in some ways but prevents `reduce_parent` from -interacting with individual expression nodes. The per-node approach is what makes Dict's -`reduce_parent` able to push `compare` into dict values -- the key optimization for -dictionary-encoded predicates. +Prevents `reduce_parent` from interacting with individual expression nodes -- the key +optimization for dictionary-encoded predicates. ### Use `dyn ArrayBuilder` instead of `CanonicalBuilder` enum -The builder could be a trait object (`&mut dyn ArrayBuilder`) instead of an enum. This is more -extensible but prevents encodings from matching on the concrete builder type. FSST's ability to -push directly into `VarBinViewBuilder` (avoiding an intermediate string allocation) depends on -knowing the builder variant at compile time. +More extensible but prevents zero-copy builder paths. FSST pushing directly into +`VarBinViewBuilder` depends on knowing the builder variant at compile time. -### Separate `scalar_at` from `Selection` +### Subsume `scalar_at` into Mask -We could keep `scalar_at` as a separate VTable method for single-element access, rather than -subsuming it into `Selection::Index`. This avoids any performance regression for scalar access -but adds a fourth VTable method and requires every encoding to implement it separately. +Deferred. Single-element access via a builder round-trip would regress hot paths. Revisit +after profiling. ## Prior Art -- **DataFusion** has `PhysicalExpr::evaluate_selection`, which takes an explicit selection - (equivalent to our `Selection`) and threads it through expression evaluation. This is the - closest analogue to threading `Selection` through `canonicalize_into`. +- **DataFusion** has `PhysicalExpr::evaluate_selection`, threading an explicit selection + through expression evaluation. 
## Unresolved Questions -- **Selection composition correctness**: Composing Mask + Range + Indices needs careful - implementation and thorough testing, especially for `Indices` which introduces repetition, - reordering, and nullability. This is a well-tested utility type vs N x 3 adaptor - implementations, so the tradeoff is favorable, but it needs to be right. +- **Mask composition correctness**: `Mask::compose` and `Mask::slice` interactions need + careful implementation and thorough testing. - **CanonicalBuilder design**: The exact interface for one-level-deep building, especially for List and Extension types, needs design work. -- **Aggregate and window functions**: This RFC focuses on scalar functions. AggregateFnArray - and WindowFnArray will need their own materialization paths, which are explicitly out of scope - for this RFC. +- **Aggregate and window functions**: Out of scope. AggregateFnArray and WindowFnArray will + need their own materialization paths. -- **`Indices` semantics mismatch**: `Indices` is a gather (repetition, reordering, nullable) - while the other `Selection` variants are pure subsetting. This means `compose`, `count`, and - `apply` must handle two fundamentally different operations. Whether this tension causes - practical problems (e.g., confusing behavior in compose chains) needs to be evaluated during - implementation. +- **Option A vs Option B**: This RFC presents both. The decision reduces to: are caller-owned + output buffers (ChunkedArray concat avoidance, FSST zero-copy decompression) worth giving up + encoding-preserving mask application through the general VTable (FSST filter-without-decompress, + Dict filter-without-decompress for exporters)? Both trade-offs are concrete. In Option A, + exporters need encoding-specific filter methods; in Option B, ChunkedArray can't share a + builder across chunks. 
## Future Possibilities -### Selection pull-up - -In the new model, selection wrappers compose naturally into the `Selection` parameter during -the recursive descent. However, when a selection is nested inside an expression, it cannot be -peeled: - -``` -scalar_fn(upper, [filter(dict(...), mask)]) -``` - -`peel_selection()` sees ScalarFnArray at the root, not FilterArray. If `reduce_parent` has no -rule for this function + encoding combination, the tree stays as-is and the filter gets -composed in during descent. `upper` runs on N selected rows, producing a flat canonical result. -The dictionary structure is lost. - -A future optimization could **pull selections out of expressions** to the root: - -``` -filter(scalar_fn(upper, [dict(...)]), mask) -``` - -With the selection at the root, `peel_selection()` extracts it, and subsequent optimizations -can proceed. More optimal decisions can be made when the selection is available at the top and -pushed down at execution time, rather than being trapped inside an expression. +### Mask pull-up -In practice, most scans apply filters at the top level of the array tree, so this edge case is -uncommon. But for complex expression trees with embedded selections, pull-up would enable -strictly better execution plans. +When a filter mask is nested inside an expression (`scalar_fn(upper, [filter(dict(...), mask)])`), +`peel_selection()` can't extract it. A future optimization could pull masks out of expressions +to the root, enabling further optimizations. ### Aggregate and window functions -The execution model is designed to support additional function types beyond scalar functions. -`AggregateFnArray` (for sum, min, max, count) and `WindowFnArray` will follow a similar -deferred pattern, using the same `reduce`/`reduce_parent` optimization strategy and -`Selection`-based materialization. 
+`AggregateFnArray` and `WindowFnArray` will follow a similar deferred pattern with +`reduce`/`reduce_parent` optimization and `Mask`-based materialization. ### GPU kernel fusion -The strict metadata-only constraint on `reduce`/`reduce_parent` means the optimized tree can -be shipped to a GPU context. A GPU-aware `canonicalize_into` could fuse the entire optimized tree -into a single kernel launch, reducing memory traffic and kernel overhead. +The strict metadata-only reduce constraint means the optimized tree can be shipped to a GPU +context. A GPU-aware materialization path could fuse the entire optimized tree into a single +kernel launch. diff --git a/proposals/0002-aggregate-functions-and-list-scalars.md b/proposals/0002-aggregate-functions-and-list-scalars.md new file mode 100644 index 0000000..854de89 --- /dev/null +++ b/proposals/0002-aggregate-functions-and-list-scalars.md @@ -0,0 +1,858 @@ +- Start Date: 2026-02-26 +- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000) +- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000) + +## Summary + +Introduce `AggregateFnVTable` to Vortex as a trait for defining aggregate operations, and a +single `ListAggregate` scalar function that applies any aggregate to a list column. There is +no dedicated `AggregateFnArray` — `ListAggregate` is a `ScalarFnVTable` implementation whose +lazy evaluation is handled by the existing `ScalarFnArray` infrastructure. Child encodings +match on `ExactScalarFn` in their reduce_parent rules for aggregate-specific +optimizations. + +The key insight is the **aggregate/list-scalar duality**: a list column stored as +`(offsets, elements)` is a pre-materialized grouping. Computing `list_sum(list_column)` is +literally a grouped `sum` over the flat elements array partitioned by offsets. 
Rather than +implementing N separate list scalar functions (`list_sum`, `list_min`, `list_max`, ...), +we implement N aggregate functions and a single `ListAggregate` scalar function that applies +any aggregate to a list column. + +The `Accumulator` trait is the core primitive. It processes one group at a time via +`accumulate(batch)` / `flush()` / `finish()`, supporting streaming through chunked elements, +very large lists, and ungrouped column-level aggregation. `execute_grouped` is a convenience +built on top of the accumulator. + +Streaming ordered GROUP BY falls out naturally: the query engine constructs a `ListArray` from +`(flat_column, group_offsets)` per batch — a zero-copy operation — and wraps it in +`ScalarFnArray(ListAggregate(Sum), [list_array])`. + +This RFC also evaluates whether the canonical list encoding should switch from `ListViewArray` +(offsets+sizes) to `ListArray` (offsets-only), since the choice directly affects how cleanly +the duality works. + +## Motivation + +### Aggregate functions exist but in the old framework + +Vortex has two compute systems: + +- **`ScalarFnVTable`**: element-wise functions (1 input row → 1 output row) with lazy + `ScalarFnArray` evaluation, expression trees, and reduce/reduce_parent optimization. +- **`ComputeFn`**: kernel-based operations like `sum`, `min_max`, `nan_count`. Standalone, + not part of the expression tree, dispatched to per-encoding kernels via `inventory`. + +Aggregate operations like `sum` already exist, but as `ComputeFn` kernels. They cannot +participate in the expression tree or optimizer. This means: + +- No lazy evaluation that defers computation and can be optimized via reduce_parent. +- No way to push aggregate operations through Dict values, RunEnd runs, or other encodings + via the reduce rule framework. +- No unified framework for list scalar functions and grouped aggregates. 
+ +### List scalar functions are redundant with aggregate functions + +The only list-specific scalar function today is `list_contains`. Adding `list_sum`, +`list_min`, `list_max`, `list_count`, `list_mean`, etc. as individual `ScalarFnVTable` +implementations would duplicate the aggregation logic that already exists in `ComputeFn` +kernels. If Vortex had first-class aggregate functions, each list scalar function would +simply be "apply this aggregate to each list" — one `ListAggregate` scalar function +parameterized by the aggregate, rather than N separate implementations. + +### The aggregate/list-scalar duality + +Consider a list column `[[1,2,3], [4,5]]` stored as: + +``` +elements: [1, 2, 3, 4, 5] +offsets: [0, 3, 5] +``` + +Computing `list_sum` yields `[6, 9]`. This is identical to computing a grouped `sum` over +elements partitioned by offset ranges `[0..3, 3..5]`. Every aggregate function has a +corresponding list scalar function: + +| Aggregate | List scalar | Grouped operation | +|--------------|------------------------|-----------------------------------| +| `sum(col)` | `list_sum(list_col)` | Sum elements per group | +| `count(col)` | `list_count(list_col)` | Count non-null elements per group | +| `min(col)` | `list_min(list_col)` | Min element per group | +| `max(col)` | `list_max(list_col)` | Max element per group | +| `mean(col)` | `list_mean(list_col)` | Mean of elements per group | + +### Streaming ordered GROUP BY + +For databases with sorted storage, GROUP BY over a sorted key is a streaming operation: +groups arrive in order and don't cross batch boundaries. This maps directly to the +aggregate/list-scalar duality: + +1. For each batch, compute group boundaries from the sorted key (run boundaries → offsets). +2. Construct `ListArray::new(value_column, group_offsets, ...)` — zero-copy. +3. Wrap in `ScalarFnArray(ListAggregate(Sum), [list_array])`. +4. Execute → one result per group. + +No hash tables, no shuffling. 
The same `execute_grouped` path serves both list scalar
+functions and streaming ordered GROUP BY.
+
+### Expensive list interop
+
+The current canonical list encoding is `ListViewArray` (offsets+sizes, out-of-order allowed).
+Converting between `ListArray` and `ListViewArray` is expensive — see the rebuild
+infrastructure with modes `MakeZeroCopyToList`, `TrimElements`, `MakeExact`, and the
+`is_zero_copy_to_list` flag. This complexity exists primarily to support out-of-order
+offsets, which complicates the aggregate duality since `execute_grouped` needs monotonic
+offsets.
+
+## Design
+
+### `Accumulator` — the core primitive
+
+The `Accumulator` trait is the fundamental aggregation interface. It processes one group
+at a time: the caller feeds element batches, then flushes to finalize the group and start
+the next one. The accumulator owns its output buffer and returns all results at the end.
+
+```rust
+pub trait Accumulator: Send + Sync {
+    /// Feed a batch of element values for the current group.
+    /// Can be called multiple times per group (e.g., chunked elements).
+    fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()>;
+
+    /// Finalize the current group: push its result into the output buffer
+    /// and reset internal state for the next group.
+    fn flush(&mut self) -> VortexResult<()>;
+
+    /// Return all flushed results as a single array.
+    /// Length = number of flush() calls.
+    fn finish(self: Box<Self>) -> VortexResult<ArrayRef>;
+}
+```
+
+This handles all aggregation cases uniformly:
+
+**Case 1 — List scalar with contiguous elements:**
+```rust
+let mut acc = aggregate.accumulator(element_dtype)?;
+for i in 0..n_lists {
+    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
+    acc.accumulate(&group_elements)?;
+    acc.flush()?;
+}
+acc.finish() // → ArrayRef of length n_lists
+```
+
+**Case 2 — Large lists with chunked elements:**
+A list's elements may be a `ChunkedArray`. 
The accumulator handles this by feeding
+each chunk separately before flushing:
+```rust
+let mut acc = aggregate.accumulator(element_dtype)?;
+for i in 0..n_lists {
+    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
+    for chunk in iter_chunks(&group_elements) {
+        acc.accumulate(&chunk)?;
+    }
+    acc.flush()?;
+}
+acc.finish()
+```
+
+**Case 3 — Ungrouped full-column aggregation:**
+One group, fold across chunks:
+```rust
+let mut acc = aggregate.accumulator(dtype)?;
+for chunk in chunked_array.chunks() {
+    acc.accumulate(&chunk)?;
+}
+acc.flush()?;
+acc.finish() // → 1-element array
+```
+
+**Case 4 — ListView with scattered elements:**
+Canonicalize to ListArray first (rebuilding to sorted form), then use Case 1 or 2.
+The accumulator itself always sees contiguous element batches.
+
+### `AggregateFnVTable`
+
+A new trait parallel to `ScalarFnVTable`. The `accumulator()` method is the required
+core — `execute_grouped` and `execute_scalar` have default implementations built on it:
+
+```rust
+pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync {
+    type Options: 'static + Send + Sync + Clone + Debug + Display + PartialEq + Eq + Hash;
+
+    fn id(&self) -> AggregateFnId;
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>>;
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        session: &VortexSession,
+    ) -> VortexResult<Self::Options>;
+
+    /// Arity is always 1. For multi-column aggregates (e.g., covariance), the input
+    /// should be a struct array containing the columns.
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Exact(1)
+    }
+
+    /// Result dtype for each group (not a list — the scalar result type).
+    fn return_dtype(
+        &self,
+        options: &Self::Options,
+        input_dtypes: &[DType],
+    ) -> VortexResult<DType>;
+
+    /// Format in SQL style, e.g. `sum(col)`.
+    fn fmt_sql(
+        &self,
+        options: &Self::Options,
+        expr: &Expression,
+        f: &mut Formatter<'_>,
+    ) -> fmt::Result;
+
+    /// Create an accumulator for streaming aggregation.
+    ///
+    /// This is the core primitive. All other execution methods have default
+    /// implementations built on this.
+    fn accumulator(
+        &self,
+        options: &Self::Options,
+        input_dtype: &DType,
+    ) -> VortexResult<Box<dyn Accumulator>>;
+
+    /// One-shot grouped execution over contiguous elements + monotonic offsets.
+    ///
+    /// Default: loop over groups, slice elements, accumulate each, flush, finish.
+    /// Override for vectorized one-shot paths (e.g., SIMD segmented sum).
+    fn execute_grouped(
+        &self,
+        options: &Self::Options,
+        input: &ArrayRef,
+        offsets: &ArrayRef,
+    ) -> VortexResult<ArrayRef> {
+        let n_groups = offsets.len() - 1;
+        let mut acc = self.accumulator(options, input.dtype())?;
+        for i in 0..n_groups {
+            let start = scalar_at(offsets, i)?;
+            let end = scalar_at(offsets, i + 1)?;
+            acc.accumulate(&input.slice(start..end)?)?;
+            acc.flush()?;
+        }
+        acc.finish()
+    }
+
+    /// Ungrouped full-column aggregation returning a scalar.
+    ///
+    /// Default: single-group execute_grouped with offsets [0, n].
+    /// Replaces standalone `ComputeFn` kernels like `compute::sum()`.
    fn execute_scalar(
+        &self,
+        options: &Self::Options,
+        input: &ArrayRef,
+    ) -> VortexResult<Scalar> {
+        let n = input.len();
+        let offsets: ArrayRef = buffer![0u64, n as u64].into_array();
+        let result = self.execute_grouped(options, input, &offsets)?;
+        result.scalar_at(0)
+    }
+}
+```
+
+Key differences from `ScalarFnVTable`:
+
+| Property | `ScalarFnVTable` | `AggregateFnVTable` |
+|----------|-------------------|----------------------|
+| Row semantics | 1 input → 1 output | N inputs → 1 output per group |
+| Core primitive | `execute(args) -> ArrayRef` | `accumulator() -> Box<dyn Accumulator>` |
+| Input shape | All children same length | Flat elements + offsets defining variable-size groups |
+| Streaming | N/A | accumulate/flush/finish across chunks |
+
+### Streaming AVG example
+
+To illustrate how the accumulator works for a stateful aggregate, consider `Mean`:
+
+```rust
+struct MeanAccumulator {
+    running_sum: f64,
+    running_count: u64,
+    output: Vec<f64>,
+}
+
+impl Accumulator for MeanAccumulator {
+    fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()> {
+        self.running_sum += Sum.execute_scalar(&EmptyOptions, batch)?.as_f64()?;
+        self.running_count += Count.execute_scalar(&EmptyOptions, batch)?.as_u64()?;
+        Ok(())
+    }
+
+    fn flush(&mut self) -> VortexResult<()> {
+        let mean = if self.running_count > 0 {
+            self.running_sum / self.running_count as f64
+        } else {
+            f64::NAN
+        };
+        self.output.push(mean);
+        self.running_sum = 0.0;
+        self.running_count = 0;
+        Ok(())
+    }
+
+    fn finish(self: Box<Self>) -> VortexResult<ArrayRef> {
+        Ok(PrimitiveArray::from_vec(self.output).into_array())
+    }
+}
+```
+
+For a list column `[[1.0, 2.0, 3.0], [4.0, 5.0]]` where the first list's elements are
+chunked as `[1.0, 2.0]` and `[3.0]`:
+
+1. `accumulate([1.0, 2.0])` → running_sum=3.0, running_count=2
+2. `accumulate([3.0])` → running_sum=6.0, running_count=3
+3. `flush()` → push 2.0, reset
+4. `accumulate([4.0, 5.0])` → running_sum=9.0, running_count=2
+5. `flush()` → push 4.5, reset
+6. 
`finish()` → `[2.0, 4.5]` + +### Built-in aggregates + +The initial set, each implementing `AggregateFnVTable`: + +```rust +pub struct Sum; // sum of elements per group +pub struct Count; // count of non-null elements per group +pub struct Min; // minimum element per group +pub struct Max; // maximum element per group +pub struct Mean; // mean of elements per group (returns f64) +pub struct Any; // logical OR per group (bool input) +pub struct All; // logical AND per group (bool input) +pub struct CollectList; // identity: collect elements into list (returns list) +``` + +**Migration strategy**: `execute_scalar` replaces standalone `ComputeFn` kernel dispatch +(e.g., `Sum::execute_scalar` replaces `compute::sum()`). The `accumulator` implementations +can start simple (canonicalize + iterate) and be optimized over time with encoding-aware +fast paths. + +### `ListAggregate` scalar function + +A single `ScalarFnVTable` implementation that bridges list columns to the aggregate system. +Because `ListAggregate` is a scalar function, wrapping it in an expression produces a +`ScalarFnArray` — reusing the existing lazy evaluation, slicing, and reduce infrastructure +with no new array type needed. 
+
+```rust
+pub struct ListAggregate;
+
+pub struct ListAggregateOptions {
+    pub aggregate_fn: AggregateFnRef,
+}
+
+impl ScalarFnVTable for ListAggregate {
+    type Options = ListAggregateOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.list.aggregate")
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Exact(1)
+    }
+
+    fn return_dtype(
+        &self,
+        options: &Self::Options,
+        arg_dtypes: &[DType],
+    ) -> VortexResult<DType> {
+        let element_dtype = arg_dtypes[0].as_list_element()
+            .ok_or_else(|| vortex_err!("ListAggregate input must be a list"))?;
+        options.aggregate_fn.return_dtype(&[element_dtype.clone()])
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: ExecutionArgs,
+    ) -> VortexResult<ArrayRef> {
+        let list_input = &args.inputs[0];
+        // Execute the child into a ListArray, decompose, and aggregate.
+        let list = list_input.to_list()?;
+        let elements = list.elements();
+        let offsets = list.offsets();
+
+        // If elements are chunked, use the accumulator to stream through chunks.
+        // Otherwise, use one-shot execute_grouped.
+        if elements.is::<ChunkedArray>() {
+            let mut acc = options.aggregate_fn.accumulator(elements.dtype())?;
+            let n_groups = offsets.len() - 1;
+            for i in 0..n_groups {
+                let group_elements = elements.slice(
+                    offset_at(&offsets, i)..offset_at(&offsets, i+1)
+                )?;
+                for chunk in iter_chunks(&group_elements) {
+                    acc.accumulate(&chunk)?;
+                }
+                acc.flush()?;
+            }
+            acc.finish()
+        } else {
+            options.aggregate_fn.execute_grouped(&elements, &offsets)
+        }
+    }
+
+    // ... 
+} +``` + +Expression-level sugar: + +```rust +pub fn list_sum(list: Expression) -> Expression { + ListAggregate.new_expr( + ListAggregateOptions { aggregate_fn: Sum.bind(EmptyOptions) }, + [list], + ) +} + +pub fn list_count(list: Expression) -> Expression { + ListAggregate.new_expr( + ListAggregateOptions { aggregate_fn: Count.bind(EmptyOptions) }, + [list], + ) +} + +// list_min, list_max, list_mean, list_any, list_all analogously +``` + +This is one scalar function parameterized by the aggregate — not N separate functions. +Analogous to DuckDB's `list_aggregate(list, 'sum')`. + +**Why this works without a dedicated array type:** + +1. **Lazy evaluation.** `ScalarFnArray(ListAggregate(Sum), [list_col])` defers computation + until `execute()` is called, just as it does for any other scalar function. +2. **Slicing.** `ScalarFnArray` already slices its children. Slicing the list child + correctly adjusts both grouping and elements. +3. **Reduce rules.** `ListAggregate` can implement `ScalarFnVTable::reduce` for self-reduce + optimizations (constant folding, statistics). Encoding-specific reduce_parent rules + use `ExactScalarFn` to match on the parent (see below). +4. **Type safety.** The single child has `DType::List` — enforced by `return_dtype`. + +### Streaming ordered GROUP BY + +For databases with sorted storage, GROUP BY over a sorted key maps directly to the +aggregate/list-scalar duality. The query engine processes batches where: + +1. Groups are defined by a sorted key column (e.g., `customer_id`). +2. Groups are ordered and complete within each batch (no cross-batch groups). +3. No hash tables or shuffling needed. + +**Physical execution per batch:** + +```rust +fn execute_ordered_group_by( + value_column: &ArrayRef, + key_column: &ArrayRef, // sorted + aggregate: &AggregateFnRef, +) -> VortexResult { + // Step 1: Compute group offsets from run boundaries in the key column. 
+ // E.g., key = [A, A, A, B, B, C] → offsets = [0, 3, 5, 6] + let group_offsets = compute_run_offsets(key_column)?; + + // Step 2: Wrap as a list — zero-copy, just references. + let list = ListArray::new(value_column.clone(), group_offsets, Validity::NonNullable); + + // Step 3: Create lazy ScalarFnArray with ListAggregate. + let scalar_fn = ListAggregate.bind(ListAggregateOptions { + aggregate_fn: aggregate.clone(), + }); + let result = ScalarFnArray::try_new(scalar_fn, vec![list.into_array()])?; + + // Step 4: Execute (or let the optimizer reduce first). + result.into_array().execute(ctx) +} +``` + +The same `execute_grouped` serves both list scalar functions (where the list already exists +in the schema) and GROUP BY (where the list is constructed from a flat column + +sort-derived offsets). + +### Reduce rules + +Since `ListAggregate` is a `ScalarFnVTable`, its reduce rules use the existing +`ScalarFnArray` optimization infrastructure. + +**Self-reduce rules (via `ScalarFnVTable::reduce`):** + +`ListAggregate` implements `ScalarFnVTable::reduce` to handle aggregate-specific +optimizations when the list child's structure is known: + +- **Constant list folding**: If the list child is a `ConstantArray` of list scalars, + compute the aggregate directly. `ListAggregate(Sum, Constant([[1,2,3]], 1000))` + → `Constant(6, 1000)`. + +- **Count from metadata**: `ListAggregate(Count, list)` can use list sizes from metadata + without touching element data. + +- **Min/Max from statistics**: If element-level statistics are available, `Min`/`Max` can + short-circuit without decompression. + +- **Sum of constant elements**: `ListAggregate(Sum, list)` where elements are constant + → `constant_value * group_size` per group. + +**Parent-reduce rules (encoding-specific, via `ExactScalarFn`):** + +Child encodings register reduce_parent rules that match on `ExactScalarFn`. 
+The `ExactScalarFn` matcher provides typed access to `ListAggregateOptions`, from which the
+specific `AggregateFnRef` can be inspected:
+
+```rust
+impl ArrayParentReduceRule for DictAggregateRule {
+    type Parent = ExactScalarFn<ListAggregate>;
+
+    fn reduce_parent(
+        &self,
+        array: &DictArray,
+        parent: ScalarFnArrayView<'_, ListAggregate>,
+        child_idx: usize,
+    ) -> VortexResult<Option<ArrayRef>> {
+        let aggregate_fn = &parent.options.aggregate_fn;
+
+        // Min/Max over Dict elements → Min/Max(values)
+        if aggregate_fn.is::<Min>() || aggregate_fn.is::<Max>() {
+            // Min/max of dictionary values is the global min/max.
+            let values = array.values();
+            return Ok(Some(
+                ListAggregate.new_expr_with(
+                    parent.options.clone(),
+                    // Rebuild list with dict values as elements
+                    // ...
+                ).into_array()
+            ));
+        }
+
+        Ok(None)
+    }
+}
+```
+
+Example encoding-specific rules:
+
+- **Dict**: `ListAggregate(Min, List(Dict(codes, values)))` → `ListAggregate(Min, List(values))`.
+  `Max` similarly. `Sum` cannot push down without frequency weighting.
+
+- **RunEnd**: `ListAggregate(Sum, List(RunEnd(values, run_ends)))` → weighted sum.
+  `ListAggregate(Min/Max, List(RunEnd))` → `ListAggregate(Min/Max, List(values))`.
+
+These rules operate on the **elements** within the list child. The list's offsets define
+the grouping; the elements are what the encoding wraps. A list child like
+`ListArray(elements: DictArray, offsets: ...)` allows the Dict reduce_parent rule to fire
+on the elements when the `ScalarFnArray(ListAggregate)` is being optimized.
+
+### Interaction with `list_contains`
+
+The existing `list_contains` function is NOT an aggregate — it's a true scalar function
+(takes a list + needle, returns bool per row). It stays as-is. `ListAggregate` is
+specifically for operations that reduce list elements to a scalar per list.
+
+## Canonical list representation
+
+The choice of canonical list encoding directly affects how cleanly the aggregate duality
+works. Currently, `ListViewArray` is canonical for `DType::List`. 
This section evaluates +whether to switch to `ListArray`. + +### Option A: Switch canonical to `ListArray` (offsets-only) + +Change `Canonical::List` from `ListViewArray` to `ListArray`. + +`ListArray` stores elements + an (n+1)-length monotonically increasing offsets array. +List `i` spans `elements[offsets[i]..offsets[i+1]]`. + +**Advantages for the aggregate duality:** + +- `execute_grouped` receives offsets directly from the list — no conversion needed. +- Offsets are monotonic by construction, matching the `execute_grouped` contract. +- One array (n+1 integers) vs two arrays (2n integers) — less memory. +- Offset deltas (list sizes) compress extremely well with delta/FoR encoding. +- Direct match to Arrow `ListArray` and DuckDB's list format — zero-copy Arrow export. + +**Disadvantages:** + +- **Breaking change.** All code calling `to_canonical()` on list data gets `ListArray` + instead of `ListViewArray`. Migration required. +- **No out-of-order offsets.** Operations like filter/take that scramble row order must + rebuild the elements array (or produce a `ListViewArray` as a non-canonical intermediate). +- **Sequential dependency in offsets.** Reading `offsets[i+1]` depends on `offsets[i]` for + computing list size — slightly less SIMD-friendly than separate sizes. + +**Migration path:** + +1. Add `to_list()` on `ToCanonical` alongside `to_listview()`. +2. Change `Canonical::List` to hold `ListArray`. +3. `ListViewArray` becomes a non-canonical encoding (like `VarBinArray` for strings). +4. Update `list_contains` and all list compute paths. + +### Option B: Keep `ListViewArray` as canonical + +Keep the current design where `ListViewArray` (offsets + sizes, out-of-order allowed) +is canonical for `DType::List`. + +**Advantages:** + +- No breaking change. +- SIMD-friendly: sizes can be read independently without sequential offset dependency. +- Out-of-order offsets enable better compression when list order doesn't match element order. 
+- Filter/take produce valid `ListViewArray` directly without rebuilding. + +**Disadvantages for the aggregate duality:** + +- `execute_grouped` requires monotonic offsets. Out-of-order `ListViewArray` must be + rebuilt to sorted form first (the `MakeZeroCopyToList` path), which copies data. +- The ZCTL flag, rebuild modes, and validation code remain as ongoing complexity. +- Arrow/DuckDB export still requires conversion. +- Two arrays (2n integers) instead of one (n+1 integers). + +**Impact on `ListAggregate::execute`:** + +`ListAggregate::execute` would need to handle the non-sorted case: + +```rust +fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult { + let list_input = &args.inputs[0]; + let listview = list_input.to_listview()?; + if listview.is_zero_copy_to_list() { + // Fast path: offsets are sorted, use directly + let offsets = build_list_offsets_from_list_view(&listview); + options.aggregate_fn.execute_grouped(&listview.elements(), &offsets) + } else { + // Slow path: rebuild to sorted form first + let list = list_from_list_view(listview)?; + options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets()) + } +} +``` + +### ListView as direct aggregate input + +An alternative to canonicalizing to ListArray before executing is to support `ListView` +directly in `execute_grouped`. A ListView with out-of-order offsets represents +variable-length groups that may reference scattered regions of the elements array. + +**Advantages:** + +- More powerful — supports aggregation over non-contiguous element ranges. +- Avoids the expensive rebuild-to-sorted-form step for scrambled ListViews. + +**Disadvantages:** + +- **Complexity in aggregate implementations.** Each aggregate must handle potentially + scattered, non-contiguous element access patterns instead of simple contiguous slices. 
+- **Unpredictable decompression cost.** A ListView with sparse, scattered offsets may + trigger excessive random-access decompression of the elements array. In the worst case + this is more expensive than one-pass decompression of all elements into contiguous form. +- **SIMD unfriendly.** Contiguous offset ranges map to sequential memory access patterns + that vectorize well. Scattered access does not. + +For now, this RFC assumes `execute_grouped` takes monotonic (n+1) offsets. Aggregates +over ListView canonicalize to sorted form first. Supporting scattered access as an +optimization can be explored in the future. + +### Comparison + +| Criterion | ListArray (Option A) | ListViewArray (Option B) | +|-----------|---------------------|--------------------------| +| Aggregate duality | Direct — offsets define groups | Indirect — must ensure sorted or rebuild | +| Arrow/DuckDB export | Zero-copy | Requires conversion | +| Memory | n+1 integers | 2n integers | +| Filter/take | Must rebuild elements | Produces valid array directly | +| SIMD for size access | Sequential offset dependency | Independent size reads | +| Offset compression | Delta/FoR on monotonic data | May be non-monotonic | +| Breaking change | Yes | No | + +## Compatibility + +This RFC does not change the file format or wire format. `ListAggregate` produces a +`ScalarFnArray` at runtime (like any other scalar function). It is not persisted to disk. + +**Public API additions:** + +- `Accumulator` trait — core streaming aggregation primitive. +- `AggregateFnVTable` trait and built-in implementations (Sum, Count, Min, Max, Mean, + Any, All, CollectList). +- `ListAggregate` scalar function. +- Expression constructors: `list_sum()`, `list_count()`, `list_min()`, `list_max()`, + `list_mean()`, `list_any()`, `list_all()`. + +**If canonical list changes (Option A):** + +- `Canonical::List` changes from `ListViewArray` to `ListArray`. 
+- `to_canonical()` on list data returns `ListArray` instead of `ListViewArray`. +- `ListViewArray` remains available as a non-canonical encoding. +- Migration: callers using `to_listview()` should transition to `to_list()`. + +## Drawbacks + +- **New trait surface area.** `AggregateFnVTable` and `Accumulator` are new traits to + learn, though they closely mirror `ScalarFnVTable` and the existing accumulator pattern + in `compute::sum`. + +- **Reduce rule coverage.** Not all encoding × aggregate combinations will have optimized + reduce_parent rules initially. The fallback (canonicalize list, then accumulator loop) + is correct but may be slower. + +- **Canonical list change (if Option A).** Breaking change affecting all code that calls + `to_canonical()` on list data. + +## Alternatives + +### List scalar functions as separate `ScalarFnVTable` implementations + +Implement `ListSum`, `ListCount`, `ListMin`, etc. as individual scalar functions +(like `list_contains`), each operating directly on list arrays without an aggregate +abstraction. + +**Rejected because:** Duplicates logic across N functions. No shared optimization rules. +No path to GROUP BY. Each function must independently handle list decomposition. This is +exactly the redundancy that motivates this RFC. + +### Aggregates as `ComputeFn` only + +Keep aggregates in the kernel-based `ComputeFn` system. Implement list scalar functions +by dispatching to `ComputeFn` with offset ranges. + +**Rejected because:** No lazy evaluation, no expression tree participation, no +reduce_parent optimization. This is the status quo for `sum`/`min_max` and has the +limitations this RFC addresses. 
+
+### Dedicated `AggregateFnArray` array type
+
+Introduce a new array type `AggregateFnArray` parallel to `ScalarFnArray`, wrapping
+a single list child and an `AggregateFnRef`:
+
+```rust
+struct AggregateFnArray {
+    aggregate_fn: AggregateFnRef,
+    dtype: DType,
+    child: ArrayRef, // must have DType::List
+    stats: ArrayStats,
+}
+```
+
+A `ListAggregate` scalar function would return an `AggregateFnArray` from its
+`execute` method, creating two lazy wrappers in sequence:
+`ScalarFnArray(ListAggregate) → AggregateFnArray → actual computation`.
+
+**Rejected because:** `AggregateFnArray` is structurally identical to
+`ScalarFnArray` with one child — it duplicates the lazy evaluation, slicing, and reduce
+infrastructure. Child encodings can already match on specific scalar functions using
+`ExactScalarFn`, which provides typed access to `ListAggregateOptions`
+and the embedded `AggregateFnRef`. No new array type is needed.
+
+### No accumulator — one-shot `execute_grouped` only
+
+Make `execute_grouped` the only execution method, with no streaming accumulation.
+
+**Rejected because:** Doesn't handle chunked elements. A list column may have elements
+stored as a `ChunkedArray`. Without an accumulator, all elements must be materialized
+into a single contiguous array before aggregation, defeating the purpose of chunked
+storage. Additionally, aggregates like `Mean` need `running_sum` and `running_count`
+state across element batches, and numerically stable summation (Kahan) requires ordered
+accumulation. The existing `sum_with_accumulator` pattern in `ComputeFn` demonstrates
+this need.
+
+### Grouped accumulator tracking N groups simultaneously
+
+An accumulator that manages all groups at once: `accumulate(input, offsets)` where offsets
+has N+1 entries per call, and sub-offsets are split at chunk boundaries.
+
+**Rejected because:** Over-engineered for our use cases. 
For list scalar functions and +ordered GROUP BY, groups are processed sequentially — each group's elements are fully +available before the next group starts. The per-group flush model is simpler and handles +all cases. If vectorized all-groups-at-once processing proves necessary, it can be added +as a separate `GroupsAccumulator` trait (see Future Possibilities). + +## Prior Art + +- **Apache Arrow**: Separates `ListArray` (offsets-only) from `ListView` (offsets+sizes). + Arrow Compute has aggregate kernels separate from scalar kernels. ListView is a recent + addition, not widely adopted. + +- **DuckDB**: Uses offsets-only lists. Has `list_aggregate(list, 'func_name')` — a single + function parameterized by the aggregate name. Separates scalar and aggregate function + registries. + +- **Apache DataFusion**: Two-tier accumulator design. `AggregateUDFImpl` has a required + `accumulator()` factory method (like ours). The `Accumulator` trait handles one group at + a time with `update_batch()` → `evaluate()`, plus `state()` and `merge_batch()` for + partial aggregation (serializing intermediate state for distributed execution). The + optional `GroupsAccumulator` trait manages all groups simultaneously with + `group_indices: &[usize]` per row for vectorized hash-based grouping, plus + `evaluate(EmitTo::First(n))` for streaming emission. Our `accumulate/flush/finish` + corresponds to DataFusion's per-group `Accumulator` with built-in output buffering. + Their `state/merge` and `GroupsAccumulator` are future extensions for us (see below). + +- **Velox**: `AggregateFunction` with `addRawInput`, `addIntermediateResults`, + `extractValues` — full streaming accumulator model. Uses row-major (offset, size) + pairs for lists. + +- **Polars**: Separate scalar and aggregate expression systems. List namespace functions + like `list.sum()`, `list.min()` internally dispatch to grouped aggregation over the + list's flat values. 
+
+## Unresolved Questions
+
+- **`list_contains` integration**: Should `list_contains` be reimplemented as a
+  `ListAggregate` with a `Contains` aggregate, or does it remain a separate scalar
+  function? It doesn't fit the "reduce N to 1" pattern cleanly since it takes both a
+  list and a needle.
+
+- **Multi-column aggregates**: Some aggregates operate on multiple columns
+  (e.g., covariance, weighted sum). With arity fixed at 1, these require a struct array
+  as input. Is this ergonomic enough, or do we need a separate mechanism?
+
+- **Window functions**: Window functions (rolling sum, rank) operate on sliding windows
+  rather than fixed groups. They share the "operate within parent boundaries" property
+  but have different execution semantics. Should they be a separate trait or an
+  extension of `AggregateFnVTable`?
+
+- **Canonical list decision**: Option A (ListArray) vs Option B (ListViewArray). This
+  can be decided independently of the aggregate framework and implemented as a
+  preparatory or follow-up change.
+
+- **`CollectList` semantics**: `CollectList` is the identity aggregate — it returns
+  the elements as a list. For the list-scalar case, `list_collect(list_col)` is a no-op.
+  For GROUP BY, it materializes the grouped elements into a list column. Should this
+  be a built-in aggregate or handled separately?
+
+- **Null list handling**: When a list entry is null, should `flush()` push null into
+  the output? Or should the caller skip accumulation and use a separate mechanism to
+  mark null outputs? The accumulator currently has no way to signal "this group is null"
+  without receiving any elements.
+
+## Future Possibilities
+
+- **Partial aggregation with `state()` / `merge()`**: Adding `state()` and
+  `merge_batch()` methods to `Accumulator` (like DataFusion) for distributed aggregation
+  where partial results must be serialized and merged across nodes. Not needed for
+  the sequential processing model in this RFC.
+ +- **`GroupsAccumulator`**: A separate trait (like DataFusion's) that manages all groups + simultaneously with `group_indices` per row, enabling vectorized hash-based grouping. + Our per-group flush model handles ordered GROUP BY; this extension would handle + unordered GROUP BY with hash tables. + +- **Aggregate push-down in file scanning**: Using `ListAggregate` reduce rules to + push aggregates into `LayoutReader`, computing aggregates during file scan without + materializing full columns. + +- **`list_distinct`, `list_sort`, `list_reverse`**: List transformation functions that + don't reduce elements to a scalar. These are not aggregates and would remain as + `ScalarFnVTable` implementations. + +- **Nested aggregation**: `list_sum(list_of_lists)` producing a list of sums at the + inner level. The duality recurses naturally. + +- **GPU execution**: `execute_grouped` with monotonic offsets maps cleanly to GPU + segmented reduction primitives (e.g., `cub::DeviceSegmentedReduce`). + +- **ListView scatter optimization**: If profiling shows that rebuilding scrambled + ListViews to sorted form is a bottleneck, aggregate implementations could accept + `(offsets, sizes)` pairs for scattered access. This would be opt-in per aggregate. From c335c0d48f52afb3ad08377f2ad27bec8fb00278 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 07:59:21 -0500 Subject: [PATCH 3/9] RFCs Signed-off-by: Nicholas Gates --- ...02-aggregate-functions-and-list-scalars.md | 62 ++++++++++--------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/proposals/0002-aggregate-functions-and-list-scalars.md b/proposals/0002-aggregate-functions-and-list-scalars.md index 854de89..0cc053a 100644 --- a/proposals/0002-aggregate-functions-and-list-scalars.md +++ b/proposals/0002-aggregate-functions-and-list-scalars.md @@ -115,15 +115,15 @@ the next one. 
The accumulator owns its output buffer and returns all results at pub trait Accumulator: Send + Sync { /// Feed a batch of element values for the current group. /// Can be called multiple times per group (e.g., chunked elements). - fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()>; + fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()>; /// Finalize the current group: push its result into the output buffer /// and reset internal state for the next group. - fn flush(&mut self) -> VortexResult<()>; + fn flush(&mut self, ctx: &mut ExecutionCtx) -> VortexResult<()>; /// Return all flushed results as a single array. /// Length = number of flush() calls. - fn finish(self: Box) -> VortexResult; + fn finish(self: Box, ctx: &mut ExecutionCtx) -> VortexResult; } ``` @@ -134,10 +134,10 @@ This handles all aggregation cases uniformly: let mut acc = aggregate.accumulator(element_dtype)?; for i in 0..n_lists { let group_elements = elements.slice(offsets[i]..offsets[i+1])?; - acc.accumulate(&group_elements)?; - acc.flush()?; + acc.accumulate(&group_elements, ctx)?; + acc.flush(ctx)?; } -acc.finish() // → ArrayRef of length n_lists +acc.finish(ctx) // → ArrayRef of length n_lists ``` **Case 2 — Large lists with chunked elements:** @@ -148,11 +148,11 @@ let mut acc = aggregate.accumulator(element_dtype)?; for i in 0..n_lists { let group_elements = elements.slice(offsets[i]..offsets[i+1])?; for chunk in iter_chunks(&group_elements) { - acc.accumulate(&chunk)?; + acc.accumulate(&chunk, ctx)?; } - acc.flush()?; + acc.flush(ctx)?; } -acc.finish() +acc.finish(ctx) ``` **Case 3 — Ungrouped full-column aggregation:** @@ -160,10 +160,10 @@ One group, fold across chunks: ```rust let mut acc = aggregate.accumulator(dtype)?; for chunk in chunked_array.chunks() { - acc.accumulate(&chunk)?; + acc.accumulate(&chunk, ctx)?; } -acc.flush()?; -acc.finish() // → 1-element array +acc.flush(ctx)?; +acc.finish(ctx) // → 1-element array ``` **Case 4 — 
ListView with scattered elements:** @@ -228,16 +228,17 @@ pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync { options: &Self::Options, input: &ArrayRef, offsets: &ArrayRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { let n_groups = offsets.len() - 1; let mut acc = self.accumulator(options, input.dtype())?; for i in 0..n_groups { let start = scalar_at(offsets, i)?; let end = scalar_at(offsets, i + 1)?; - acc.accumulate(&input.slice(start..end)?)?; - acc.flush()?; + acc.accumulate(&input.slice(start..end)?, ctx)?; + acc.flush(ctx)?; } - acc.finish() + acc.finish(ctx) } /// Ungrouped full-column aggregation returning a scalar. @@ -248,10 +249,11 @@ pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync { &self, options: &Self::Options, input: &ArrayRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { let n = input.len(); let offsets: ArrayRef = buffer![0u64, n as u64].into_array(); - let result = self.execute_grouped(options, input, &offsets)?; + let result = self.execute_grouped(options, input, &offsets, ctx)?; result.scalar_at(0) } } @@ -278,13 +280,13 @@ struct MeanAccumulator { } impl Accumulator for MeanAccumulator { - fn accumulate(&mut self, batch: &ArrayRef) -> VortexResult<()> { - self.running_sum += Sum.execute_scalar(&EmptyOptions, batch)?.as_f64()?; - self.running_count += Count.execute_scalar(&EmptyOptions, batch)?.as_u64()?; + fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()> { + self.running_sum += Sum.execute_scalar(&EmptyOptions, batch, ctx)?.as_f64()?; + self.running_count += Count.execute_scalar(&EmptyOptions, batch, ctx)?.as_u64()?; Ok(()) } - fn flush(&mut self) -> VortexResult<()> { + fn flush(&mut self, _ctx: &mut ExecutionCtx) -> VortexResult<()> { let mean = if self.running_count > 0 { self.running_sum / self.running_count as f64 } else { @@ -296,7 +298,7 @@ impl Accumulator for MeanAccumulator { Ok(()) } - fn finish(self: Box) -> VortexResult { + fn finish(self: Box, _ctx: &mut 
ExecutionCtx) -> VortexResult { Ok(PrimitiveArray::from_vec(self.output).into_array()) } } @@ -373,6 +375,7 @@ impl ScalarFnVTable for ListAggregate { args: ExecutionArgs, ) -> VortexResult { let list_input = &args.inputs[0]; + let ctx = args.ctx; // Execute the child into a ListArray, decompose, and aggregate. let list = list_input.to_list()?; let elements = list.elements(); @@ -388,13 +391,13 @@ impl ScalarFnVTable for ListAggregate { offset_at(&offsets, i)..offset_at(&offsets, i+1) )?; for chunk in iter_chunks(&group_elements) { - acc.accumulate(&chunk)?; + acc.accumulate(&chunk, ctx)?; } - acc.flush()?; + acc.flush(ctx)?; } - acc.finish() + acc.finish(ctx) } else { - options.aggregate_fn.execute_grouped(&elements, &offsets) + options.aggregate_fn.execute_grouped(&elements, &offsets, ctx) } } @@ -452,6 +455,7 @@ fn execute_ordered_group_by( value_column: &ArrayRef, key_column: &ArrayRef, // sorted aggregate: &AggregateFnRef, + ctx: &mut ExecutionCtx, ) -> VortexResult { // Step 1: Compute group offsets from run boundaries in the key column. // E.g., key = [A, A, A, B, B, C] → offsets = [0, 3, 5, 6] @@ -617,15 +621,16 @@ is canonical for `DType::List`. ```rust fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult { let list_input = &args.inputs[0]; + let ctx = args.ctx; let listview = list_input.to_listview()?; if listview.is_zero_copy_to_list() { // Fast path: offsets are sorted, use directly let offsets = build_list_offsets_from_list_view(&listview); - options.aggregate_fn.execute_grouped(&listview.elements(), &offsets) + options.aggregate_fn.execute_grouped(&listview.elements(), &offsets, ctx) } else { // Slow path: rebuild to sorted form first let list = list_from_list_view(listview)?; - options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets()) + options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets(), ctx) } } ``` @@ -691,8 +696,7 @@ This RFC does not change the file format or wire format. 
`ListAggregate` produce ## Drawbacks - **New trait surface area.** `AggregateFnVTable` and `Accumulator` are new traits to - learn, though they closely mirror `ScalarFnVTable` and the existing accumulator pattern - in `compute::sum`. + learn, though they closely mirror `ScalarFnVTable`. - **Reduce rule coverage.** Not all encoding × aggregate combinations will have optimized reduce_parent rules initially. The fallback (canonicalize list, then accumulator loop) From 8b67c21d627c9fbd9ab1de66cc9e7dd06f3c577f Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 20:42:20 -0500 Subject: [PATCH 4/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 1050 +++++++++----------------------- 1 file changed, 275 insertions(+), 775 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 7ad5811..0500359 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,923 +4,423 @@ ## Summary -Replace the current execution model with a cleaner architecture that: -- Enables decompress-into-buffer (caller-owned output) for zero-copy decompression -- Passes a `Mask` to the execution method, forcing encodings to handle row selection -- Cleans up the reduce/execute boundary and removes migration-era adaptor boilerplate +Replace the current execution VTable with a cleaner model. `reduce` and `reduce_parent` remain +metadata-only in both proposals and are identical. The proposals differ in how `execute` and +`execute_parent` work: -Two candidate designs are presented: **Option A** (three-phase recursive) and **Option B** -(cleaned-up iterative). Both share the `Mask`-based row selection and the reduce boundary cleanup. They -differ in how materialization is structured -- Option A passes a caller-owned builder through -recursive descent, Option B keeps an iterative loop with typed execution steps. 
+- **Proposal A (Scheduler-driven)**: `execute` returns an `ExecutionStep` — either requesting + the scheduler to execute a specific child, or declaring that it is done. The scheduler drives + all iteration. `execute_parent` is retained and returns an `ArrayRef` (any encoding). + +- **Proposal B (Canonical builder)**: `execute` pushes its result into a caller-owned + `CanonicalBuilder`. The result is always canonical. `execute_parent` is retained and also + pushes into the builder. ## Motivation -**No decompress-into-buffer.** The current execution model has no way to decompress an array -into a caller-provided output buffer. Each encoding allocates its own output during execution. -This means ChunkedArray must decompress each chunk separately and concatenate the results -- -an extra copy of the entire column. Exporters (DuckDB, Arrow) cannot write directly into their -output format; they must receive Vortex's output and copy it. This is the primary pain point. - -**Stack overflows from recursive execution.** The current executor is recursive: each -`execute` call may recurse into children, which recurse into their children. On deep array -trees (deeply nested expressions, many-chunk arrays) this overflows the stack. The recursion -depth is bounded by a 128-iteration limit, but this limit applies to the outer loop -- inner -recursive calls have no bound. - -**Easy to forget selection implementations.** Filter (boolean mask), Slice (contiguous range), -and Take (integer indices) are separate adaptor traits. Each encoding can implement one and -forget the others. Fusing selection with decompression is really just -`execute_parent(FilterArray)` -- which is fine -- but the current design makes it easy to miss. -Passing a `Mask` parameter to the execution method forces every encoding to handle -selection, eliminating the gap. - -**Adaptor explosion is migration noise.** The adaptor types (FilterReduceAdaptor, -SliceExecuteAdaptor, TakeExecuteAdaptor, etc.) 
are a bridge from the old compute kernel -dispatch to the new VTable world. The reduce/execute distinction is already enforced -- we just -haven't deprecated the APIs that don't require an `ExecutionCtx`. The ~40+ adaptor impls are -migration artifacts, not a fundamental design problem, but they add noise and make the codebase -harder to navigate. - -**Strict metadata-only reduce is useful for GPU.** When buffers live on a GPU device, -host-side code cannot easily access them. A clean metadata-only reduce boundary is useful -(though not strictly required) for GPU execution paths. +**No decompress-into-buffer.** Encodings allocate their own output during execution. +ChunkedArray must decompress each chunk separately and concatenate — an extra copy of the +entire column. Exporters (DuckDB, Arrow) can't write directly into their output format. -## Shared design +**Stack overflow from recursion.** The current executor recurses into children. Deep encoding +trees overflow the stack. -Both options share these design elements. +**Unclear execute/reduce boundary.** Some `execute_parent` implementations are metadata-only +and belong in `reduce_parent`. The boundary isn't enforced. -### Mask as row selection +## Shared design -The existing `Mask` type (in `vortex-mask`) already represents row selection with optimized -internal variants: `AllTrue(usize)`, `AllFalse(usize)`, and `Values(Arc)` for a -full bitmap. It already supports `slice()` for restricting to a range. No new type is needed -- -`Mask` is passed directly to the execution method as the row selection parameter. +### reduce / reduce_parent (identical in both proposals) -Composition is mask-over-mask: slicing a mask produces a narrower mask. The main addition is a -`compose` method (or equivalent) for restricting one mask by another, and `apply` for -compacting buffer elements. +Both proposals keep these methods with unchanged signatures. They are **strictly metadata-only** +— they never read data buffers. 
```rust -// Existing type, extended with: -impl Mask { - /// Restrict self by other: keep only positions where both are true. - pub fn compose(&self, other: &Mask) -> Mask; - - /// Apply mask to a buffer, returning a new buffer with only selected elements. - pub fn apply(&self, data: &Buffer) -> BufferMut; -} +fn reduce(array: &Self::Array) -> VortexResult>; +fn reduce_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, +) -> VortexResult>; ``` -**Take is not selection.** Take (integer indices) can reorder rows, repeat rows, and introduce -nulls via out-of-bounds indices. These semantics are fundamentally different from subsetting. -Gather remains an encoding-internal concern -- DictArray handles codes → values lookup inside -its own materialization. +`reduce` rewrites an array using only metadata. Filter(AllTrue(x)) → x. Constant folding. -**`scalar_at` remains a separate VTable method.** Single-element access via a builder -round-trip would regress hot paths like RunEnd binary search probing individual positions. +`reduce_parent` rewrites a parent from a child's perspective. Dict child pushes ScalarFn into +values. RunEnd child pulls ScalarFn through to its values. FSST child rewrites Compare by +compressing the RHS literal. -### CanonicalBuilder - -`CanonicalBuilder` is specifically for the decompress-into case: a caller creates a builder, -and the encoding writes directly into it. This is structurally incompatible with iterative -execution (where each step returns an owned result), so `CanonicalBuilder` only applies to -Option A's recursive model. +The framework runs these to a fixpoint before execution begins (and, in Proposal A, between +execution steps). -It replaces the current `dyn ArrayBuilder` with a closed enum. The current `ArrayBuilder` is a -trait object -- callers get a `Box` and can only interact with it through -generic methods like `append_scalar()` and `extend_from_array()`. 
Encodings cannot know what -concrete builder they are writing into, so they must produce intermediate arrays or scalars and -let the builder copy the data in. +Implementations currently misplaced in `execute_parent` that are metadata-only (Dict + Compare, +ALP + Compare, FoR + Compare, FSST + Compare) move to `reduce_parent`. Implementations that +are really canonical-type compute kernels (Primitive + Compare, Bool + FillNull, Decimal + +Between) move into `ScalarFn::execute`. -`CanonicalBuilder` is an enum that mirrors the `Canonical` variants in mutable builder form: +### FilterArray -```rust -pub enum CanonicalBuilder { - Null(NullBuilder), - Bool(BoolBuilder), - Primitive(PrimitiveBuilder), - Decimal(DecimalBuilder), - VarBinView(VarBinViewBuilder), - List(ListViewBuilder), - FixedSizeList(FixedSizeListBuilder), - Struct(StructBuilder), - Extension(ExtensionBuilder), -} +FilterArray continues to exist as a lazy wrapper in both models. It is not subsumed by the +execution method. Both models handle it — the difference is what `execute_parent` can return +when a child encoding wants to handle a FilterArray parent (see comparison section). -impl CanonicalBuilder { - pub fn new(dtype: &DType) -> Self { ... } - pub fn finish(self) -> Canonical { ... } -} -``` - -Because it is an enum (not `dyn`), encodings can match on it to access the concrete builder -type directly. This enables zero-copy decompression paths that are impossible with `dyn -ArrayBuilder`. 
For example, FSST can match for the `VarBinView` variant and push decompressed -buffers and views directly into the builder's internal storage, avoiding an intermediate string -allocation: - -```rust -fn canonicalize_into(fsst: &FSSTArray, builder: &mut CanonicalBuilder, mask, ctx) { - let CanonicalBuilder::VarBinView(vbv_builder) = builder else { - unreachable!("FSST must canonicalize into VarBinView"); - }; - let (buffers, views) = fsst_decode_views(fsst, vbv_builder.completed_block_count(), ctx)?; - vbv_builder.push_buffer_and_adjusted_views(&buffers, &views, ...); -} -``` - -`CanonicalBuilder` is only canonical at the root level. For Struct, field arrays are -accumulated as `Vec` (potentially still compressed/chunked) rather than being -recursively decompressed into nested builders. +--- -### Reduce boundary cleanup +## Proposal A: Scheduler-driven iterative execution -Both options retain `reduce` and `reduce_parent` unchanged: +### VTable ```rust fn reduce(array: &Self::Array) -> VortexResult>; -fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, -) -> VortexResult>; -``` +fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize) + -> VortexResult>; -Both are strictly metadata-only, never reading data buffers. This boundary is already enforced -today -- the remaining work is deprecating the APIs that don't require an `ExecutionCtx` and -migrating the misplaced implementations. Specifically: the metadata-only rewrites currently -registered as `execute_parent` (Dict + Compare, ALP + Compare, FoR + Compare, FSST + Compare) -move to `reduce_parent`. The canonical-type kernels currently in `execute_parent` (Primitive + -Compare, Bool + FillNull, Decimal + Between) move into the scalar function's `execute` method, -where they belong. +fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) + -> VortexResult; -### Constants - -`ConstantArray` continues to exist as an encoding. 
It is special-cased only in ScalarFnArray -materialization -- one check to avoid expanding constants before passing to the scalar function. -This preserves the `Columnar` enum: - -```rust -pub enum Columnar { - Canonical(Canonical), - Constant(ConstantArray), -} +fn execute_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx) + -> VortexResult>; ``` -**Open question:** The current scalar representation uses heap-allocated value trees for nested -types (structs, lists). For deeply nested types, this is expensive. An alternative is to hold -constants as length-1 arrays instead of scalars, avoiding the scalar value tree entirely. This -is orthogonal to the execution model choice but may affect `ConstantArray`'s internal -representation. - -### Decompression cache and CSE - -`ExecutionCtx` holds a pointer-identity cache (`HashMap<*const dyn Array, ArrayRef>`) scoped -to a single materialization call. When the same array node appears as a child of multiple -ScalarFnArray nodes (e.g., `x + x`), it is materialized once. When a ZStd-compressed array is -sliced into many chunks, the first slice decompresses the block and caches the result. The -cache is dropped after materialization returns. - ---- - -## Option A: Three-phase recursive model - -### VTable surface (3 methods) - ```rust -pub trait VTable { - fn reduce(array: &Self::Array) -> VortexResult>; - fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, - ) -> VortexResult>; +pub enum ExecutionStep { + /// Ask the scheduler to execute the child at this index to columnar, + /// replace it, then call execute on this array again. + ExecuteChild(usize), - /// Materialize this array into a canonical builder, fusing the given row mask - /// with decompression. This is where all buffer-reading work happens. 
-    fn canonicalize_into(
-        array: &Self::Array,
-        builder: &mut CanonicalBuilder,
-        mask: &Mask,
-        ctx: &mut ExecutionCtx,
-    ) -> VortexResult<()>;
+    /// Execution is complete.
+    Done(Columnar),
 }
 ```
 
-| Method | Purpose | Buffer access |
-|--------------------|---------------------------------------------|---------------|
-| `reduce` | Self-rewrite (metadata only) | Never |
-| `reduce_parent` | Rewrite parent (expression push-down, etc.) | Never |
-| `canonicalize_into`| Decompress with fused mask | Yes |
+The encoding never recurses into children. It yields control back to the scheduler, telling it
+what work is needed. The scheduler maintains the work stack.
 
-### Three-phase pipeline
+### Scheduler
 
 ```rust
-pub fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult {
-    // Phase 1: optimize tree (reduce rules, expression push-down)
-    let array = array.optimize_recursive()?;
-
-    // Phase 2: peel selection wrappers
-    let (array, mask) = array.peel_selection();
-
-    // Phase 3: single recursive descent, no loop
-    let mut builder = CanonicalBuilder::new(array.dtype());
-    array.vtable().canonicalize_into(&array, &mut builder, &mask, ctx)?;
-    Ok(builder.finish())
+fn execute_to_columnar(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Columnar> {
+    let mut array = optimize_recursive(array)?;
+
+    loop {
+        if let Some(c) = array.as_columnar() { return Ok(c); }
+
+        // Try execute_parent (child-driven specialized execution)
+        if let Some(rewritten) = try_execute_parent(&array, ctx)? {
+            array = optimize_recursive(rewritten)?;
+            continue;
+        }
+
+        match array.vtable().execute(&array, ctx)? {
+            ExecutionStep::ExecuteChild(i) => {
+                let child = array.child(i);
+                let executed = execute_to_columnar(child, ctx)?;
+                array = array.with_child(i, executed.into_array());
+                array = optimize_recursive(array)?;
+            }
+            ExecutionStep::Done(result) => return Ok(result),
+        }
+    }
 }
 ```
 
-**Phase 1** runs `reduce`/`reduce_parent` in a loop until the tree is stable. 
All
-metadata-only rewrites happen here: expression push-down into Dict values, constant folding,
-comparison target transformation for ALP/FoR/FSST.
+After each child execution, `optimize_recursive` runs again — reduce rules can fire on the
+new tree shape. For truly bounded stack depth, an explicit work stack replaces the recursive
+`execute_to_columnar(child)` call.
 
-**Phase 2** strips Filter and Slice wrappers from the root, composing them into a single
-`Mask`. *(Open question: does lifting selection to the root provide meaningful benefit
-beyond what passing `Mask` through recursion already gives? It may be orthogonal to the
-core proposal.)*
+### Per-encoding examples
 
-**Phase 3** is a single recursive descent. Each encoding decompresses with the fused selection.
-No iteration. Selection wrappers deeper in the tree compose themselves into the selection
-during descent:
+**DictArray** — requests codes first, then gathers:
 
 ```rust
-// SliceArray's canonicalize_into: slicing composes as masking out the excluded rows
-fn canonicalize_into(slice: &SliceArray, builder, mask, ctx) {
-    let composed = mask.slice(slice.range());
-    slice.child().canonicalize_into(builder, &composed, ctx)
+fn execute(dict: &DictArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    if !dict.codes().is_columnar() {
+        return Ok(ExecutionStep::ExecuteChild(0));
+    }
+    let codes = dict.codes().as_primitive();
+    let gathered = gather(dict.values(), &codes, ctx)?;
+    Ok(ExecutionStep::Done(gathered))
 }
 ```
 
-### Per-encoding canonicalize_into
-
-- **BitPacked**: unpack only selected positions into PrimitiveBuilder
-- **FSST**: decompress only selected strings into VarBinViewBuilder
-- **RunEnd**: binary-search ends to find runs intersecting the selection, expand only those
-- **Dict**: materialize codes with selection, gather from values
-- **Chunked**: split selection across chunks, recurse per-chunk
-- **Struct**: pass selection to each field's materialization
-- **Constant**: create scalar 
repeated to selection count
-
-For fused decompression patterns (e.g., Dict-RLE), the parent checks inline:
+**ScalarFnArray** — requests children left-to-right, then evaluates:
 
 ```rust
-fn canonicalize_into(dict: &DictArray, builder, mask, ctx) {
-    if let Some(runend) = dict.codes().as_opt::() {
-        return fused_dict_rle(dict.values(), runend, builder, mask, ctx);
-    }
-
-    let mut code_builder = primitive_builder(dict.codes().dtype());
-    dict.codes().canonicalize_into(&mut code_builder, mask, ctx)?;
-    let codes = code_builder.finish_into_primitive();
-    take_into_builder(builder, dict.values(), &codes, ctx)
+fn execute(sfn: &ScalarFnArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    for (i, child) in sfn.children().iter().enumerate() {
+        if !child.is_columnar() {
+            return Ok(ExecutionStep::ExecuteChild(i));
+        }
+    }
+    let result = sfn.scalar_fn().execute(sfn.columnar_children(), ctx)?;
+    Ok(ExecutionStep::Done(result))
 }
 ```
 
-### ScalarFnArray materialization
+**FilterArray** — requests child, then applies mask:
 
 ```rust
-fn canonicalize_into(
-    array: &ScalarFnArray,
-    builder: &mut CanonicalBuilder,
-    mask: &Mask,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let children: Vec = array.children().iter()
-        .map(|child| {
-            if let Some(c) = child.as_opt::() {
-                return Ok(Columnar::Constant(c.clone()));
-            }
-            let mut child_builder = CanonicalBuilder::new(child.dtype());
-            child.canonicalize_into(&mut child_builder, mask, ctx)?;
-            Ok(Columnar::Canonical(child_builder.finish()))
-        })
-        .try_collect()?;
-
-    let result = array.scalar_fn().execute(children, ctx)?;
-    builder.extend_from_array(&result);
-    Ok(())
+fn execute(filter: &FilterArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    if !filter.child().is_columnar() {
+        return Ok(ExecutionStep::ExecuteChild(0));
+    }
+    let filtered = filter.mask().apply_to(filter.child().as_canonical())?;
+    Ok(ExecutionStep::Done(Columnar::Canonical(filtered)))
 }
 ```
 
-### Trade-offs
-
-**Strengths:**
-
-- **Minimal VTable surface.** 3 
methods, down from 5. Each has a clear, non-overlapping role. -- **No iteration loop.** Materialization is a single recursive descent. Predictable, easy to - debug, easy to reason about performance. (Note: still recursive -- see weakness below.) -- **Mask fused with decompression.** Each encoding handles row selection once, in one place. -- **Strict reduce/canonicalize boundary.** reduce = metadata-only. canonicalize_into = - buffer-reading. No ambiguity, works on GPU. - -**Weaknesses:** - -- **No encoding-preserving mask application.** `canonicalize_into` always decompresses into a - `CanonicalBuilder`. There is no VTable mechanism for an encoding to absorb a mask without - decompressing. Exporters that want to preserve an encoding (DuckDB FSST vectors, Arrow - dictionary arrays) must call encoding-specific filter methods outside the framework. See - trade-off #2 for the concrete FSST example. +**BitPacked** — leaf encoding, decompresses directly: -- **Fused patterns require parent-knows-child.** Dict must check `as_opt::()` to - fuse Dict-RLE. An external encoding can't volunteer a fused path for a built-in parent - without modifying the parent. The known fused patterns (Dict-RLE) are all between encodings - in `vortex-array`, so this isn't a practical problem today, but it limits extensibility. - -- **Exporters can't inspect intermediates.** After optimize + peel, the exporter sees the tree - as-is. If the root is an opaque encoding that would decode to Dict, the exporter can't - discover that -- Phase 3 goes straight to canonical. The DuckDB dictionary export use case - only works when DictArray is already visible after Phase 1, which is the common case (Vortex - files store Dict as DictArray) but not guaranteed. - -- **Still recursive.** `canonicalize_into` recurses into children. On deeply nested trees - (many layers of encoding, deeply nested structs/lists), this can still overflow the stack -- - the same class of problem the current executor has. 
An explicit work stack or trampoline
-  could mitigate this but adds implementation complexity.
+```rust
+fn execute(bp: &BitPackedArray, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
+    let primitive = unpack(bp)?;
+    Ok(ExecutionStep::Done(Columnar::Canonical(Canonical::Primitive(primitive))))
+}
+```
 
 ---
 
-## Option B: Cleaned-up iterative model
+## Proposal B: Canonical builder execution
 
-Keep the framework-driven iteration loop but clean up the VTable surface, enforce the
-reduce/execute boundary, and thread `Mask` through execution.
-
-### VTable surface (3 methods)
+### VTable
 
 ```rust
-pub trait VTable {
-    fn reduce(array: &Self::Array) -> VortexResult>;
-    fn reduce_parent(
-        array: &Self::Array,
-        parent: &ArrayRef,
-        child_idx: usize,
-    ) -> VortexResult>;
+fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>>;
+fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize)
+    -> VortexResult<Option<ArrayRef>>;
 
-    /// Take one execution step toward materialization.
-    /// Returns an ExecutionStep telling the framework what happened.
-    fn execute(
+fn execute(
     array: &Self::Array,
-    mask: &Mask,
+    builder: &mut CanonicalBuilder,
     ctx: &mut ExecutionCtx,
-    ) -> VortexResult;
-}
-```
+) -> VortexResult<()>;
 
-```rust
-pub enum ExecutionStep {
-    /// Fully materialized.
-    Done(Columnar),
-    /// Rewrote to a new array tree. Framework should optimize and execute again.
-    Rewrite(ArrayRef),
-    /// Composed the mask and delegated to child. Framework should continue
-    /// executing the returned array with the new mask. 
-    Peel(ArrayRef, Mask),
-}
+fn execute_parent(
+    array: &Self::Array,
+    parent: &ArrayRef,
+    child_idx: usize,
+    builder: &mut CanonicalBuilder,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<bool>; // true if handled
 ```
 
-| Method | Purpose | Buffer access |
-|-----------|---------------------------------------------|---------------|
-| `reduce` | Self-rewrite (metadata only) | Never |
-| `reduce_parent` | Rewrite parent (expression push-down) | Never |
-| `execute` | One step toward materialization | Yes |
+### CanonicalBuilder
 
-### Framework loop
+A closed enum mirroring `Canonical` in mutable builder form:
 
 ```rust
-pub fn execute_to_columnar(
-    mut array: ArrayRef,
-    mut mask: Mask,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult {
-    for _ in 0..MAX_ITERATIONS {
-        // Check termination
-        if let Some(c) = array.as_opt::() {
-            return Ok(Columnar::Constant(c.apply_mask(&mask)));
-        }
-        if let Some(c) = array.as_opt::() {
-            return Ok(Columnar::Canonical(mask.apply_to(c.into())?));
-        }
-
-        // Optimize (reduce/reduce_parent to fixpoint)
-        array = array.optimize_recursive()?;
-
-        // Execute one step
-        match array.vtable().execute(&array, &mask, ctx)? {
-            ExecutionStep::Done(c) => return Ok(c),
-            ExecutionStep::Rewrite(new_array) => {
-                array = new_array;
-            }
-            ExecutionStep::Peel(child, new_mask) => {
-                array = child;
-                mask = new_mask;
-            }
-        }
-    }
-    vortex_bail!("exceeded max iterations")
+pub enum CanonicalBuilder {
+    Null(NullBuilder),
+    Bool(BoolBuilder),
+    Primitive(PrimitiveBuilder),
+    Decimal(DecimalBuilder),
+    VarBinView(VarBinViewBuilder),
+    List(ListViewBuilder),
+    FixedSizeList(FixedSizeListBuilder),
+    Struct(StructBuilder),
+    Extension(ExtensionBuilder),
 }
 ```
 
-Note: `optimize_recursive()` runs inside the loop, after every step. This means reduce_parent
-rules can fire on intermediate forms exposed by execution -- the key capability that Option A
-lacks.
-
-### Per-encoding execute
+Because it is an enum (not `dyn`), encodings can match on the concrete variant. 
This enables +zero-copy decompression paths impossible with the current `dyn ArrayBuilder`. -Each encoding returns a typed step, making execution transparent: +### Framework ```rust -// FilterArray: compose masks, delegate -fn execute(filter: &FilterArray, mask, ctx) -> ExecutionStep { - let composed = mask.compose(&filter.mask()); - ExecutionStep::Peel(filter.child().clone(), composed) -} - -// SliceArray: slice the mask, delegate -fn execute(slice: &SliceArray, mask, ctx) -> ExecutionStep { - let composed = mask.slice(slice.range()); - ExecutionStep::Peel(slice.child().clone(), composed) -} - -// BitPacked: fully decompress with mask -fn execute(bp: &BitPackedArray, mask, ctx) -> ExecutionStep { - let mut builder = primitive_builder(bp.dtype()); - unpack_selected(bp, &mut builder, mask)?; - ExecutionStep::Done(Columnar::Canonical(builder.finish().into())) -} - -// Dict: decompress one step -- execute codes, then take -fn execute(dict: &DictArray, mask, ctx) -> ExecutionStep { - if !dict.codes().is_canonical() { - // Execute codes one step, rebuild dict - let stepped_codes = dict.codes().execute_one_step(mask, ctx)?; - let new_dict = DictArray::new(stepped_codes, dict.values().clone()); - return ExecutionStep::Rewrite(new_dict.into_array()); - } - // Codes are canonical -- do the take with mask - let result = take_canonical(dict.values(), dict.codes().as_primitive(), mask, ctx)?; - ExecutionStep::Done(Columnar::Canonical(result)) -} - -// ScalarFnArray: execute children, then apply function -fn execute(sfn: &ScalarFnArray, mask, ctx) -> ExecutionStep { - // Find first non-columnar child, execute it one step - for (i, child) in sfn.children().iter().enumerate() { - if !child.is_columnar() { - let stepped = child.execute_one_step(mask, ctx)?; - let new_sfn = sfn.replace_child(i, stepped); - return ExecutionStep::Rewrite(new_sfn.into_array()); +fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let array = optimize_recursive(array)?; + let 
mut builder = CanonicalBuilder::new(array.dtype()); + + // Try execute_parent + for child_idx in 0..array.nchildren() { + let child = array.child(child_idx); + if child.vtable().execute_parent(&child, &array, child_idx, &mut builder, ctx)? { + return Ok(builder.finish()); + } } - } - // All children are columnar -- apply function - let children = sfn.children_as_columnar(mask); - let result = sfn.scalar_fn().execute(children, ctx)?; - ExecutionStep::Done(Columnar::Canonical(result.into())) + + array.vtable().execute(&array, &mut builder, ctx)?; + Ok(builder.finish()) } ``` -### Fused patterns via reduce_parent +Single call. No iteration loop. No MAX_ITERATIONS. Reduce runs once, then one recursive +descent fills the builder. -Because `optimize_recursive()` runs after every step, fused patterns work naturally. Dict -executes its codes one step (RunEnd → Primitive), then on the next iteration, -`optimize_recursive()` runs and reduce_parent rules can fire on the new tree. The Dict-RLE -fused path can be implemented as a reduce_parent rule on RunEnd that recognizes the Dict parent -and rewrites the tree, or Dict can check inline in its execute -- both work because the -framework loops. +### Per-encoding examples -### Exporter tree inspection - -Exporters drive the loop directly, inspecting after each step: +**FSST** — zero-copy push into VarBinViewBuilder: ```rust -fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let mut mask = Mask::new_true(array.len()); - - for _ in 0..MAX_ITERATIONS { - array = array.optimize_recursive()?; - let (peeled, m) = array.peel_selection(); - array = peeled; - mask = mask.compose(&m); - - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &mask, ctx); - } - if array.is_canonical() { - return export_canonical(mask.apply_to(array)?, ctx); - } - - match array.vtable().execute(&array, &mask, ctx)? 
{ - ExecutionStep::Done(c) => return export_columnar(c, ctx), - ExecutionStep::Rewrite(new) => { array = new; } - ExecutionStep::Peel(child, m) => { array = child; mask = m; } - } - } - vortex_bail!("exceeded max iterations") +fn execute(fsst: &FSSTArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let CanonicalBuilder::VarBinView(vbv) = builder else { unreachable!() }; + let (buffers, views) = fsst_decompress(fsst, vbv.completed_block_count())?; + vbv.push_buffers_and_views(&buffers, &views); + Ok(()) } ``` -The exporter inspects the tree after every step. If an opaque encoding decodes to DictArray -during execution, the exporter sees it on the next iteration and can export it as a dictionary. - -### Trade-offs - -**Strengths:** - -- **Encoding-preserving mask application.** An encoding can implement `execute(mask)` to - absorb the mask and return `Rewrite(still_encoded)`. FSST can filter without decompressing, - and exporters see the result through the standard loop -- no encoding-specific logic needed - for the mask-application step. -- **Exporters inspect intermediates.** The DuckDB exporter can find DictArray even when it's - hidden behind an opaque encoding, because it inspects after each step. -- **Child-volunteers extensibility preserved.** A fused decompression pattern can be expressed - as a reduce_parent rule or an inline check, without closing the door on external encodings - discovering patterns through the iteration. -- **Same 3 VTable methods.** reduce, reduce_parent, execute. Clean. -- **No stack overflow risk.** The outer loop is iterative -- each step returns control to the - framework. Stack depth is bounded to one execution step, regardless of tree depth. - -**Weaknesses:** - -- **Iteration loop remains.** Execution is still iterative with a MAX_ITERATIONS bound. Each - step produces a new array tree, and the framework loops. 
This is more predictable than before - (no execute_parent, Mask threaded through, typed ExecutionStep) but still requires - tracing through iterations to understand execution. The recursive model's "one call, done" - is easier to reason about. -- **Intermediate allocations.** Each `Rewrite` step allocates a new array tree. The recursive - model writes directly into the builder with no intermediates. For hot paths (large batches, - tight loops), the allocation overhead may be measurable. -- **Mask and the loop interact awkwardly.** When an encoding returns `Peel(child, - new_mask)`, the mask changes mid-loop. The framework must track the current mask across - iterations. In the recursive model, the mask flows naturally through function arguments. -- **No decompress-into-buffer.** The iterative model returns `Columnar` from `Done` -- the - encoding owns its output. There is no way to pass a caller-owned `CanonicalBuilder` through - the loop, because the framework doesn't know whether this step is final or intermediate. - This is the primary pain point that motivated this RFC, and Option B does not address it. - ChunkedArray still can't share a builder across chunks. Exporters still can't write directly - into their output format. - ---- - -## Fundamental trade-offs - -The two options differ on three architectural axes. These are genuine trade-offs -- each option -wins on some axes and loses on others. They are not fixable with clever engineering; they follow -from the structural decision of recursion vs. iteration. - -### 1. Caller-owned output buffers (Option A wins) - -In Option A, the caller creates a `CanonicalBuilder` and passes it down through the recursive -descent. Every encoding writes directly into this caller-owned buffer. The buffer exists before -materialization begins and is filled during a single pass. - -In Option B, each encoding returns `ExecutionStep::Done(Columnar)` -- it allocates and owns its -output. The caller receives the finished result. 
There is no way to pass a builder through the -loop because the encoding doesn't know at call time whether this is the final step or an -intermediate rewrite that will be fed back into the loop. - -**Why this matters:** - -- **ChunkedArray concat avoidance.** With a caller-owned builder, ChunkedArray iterates its - chunks and each chunk writes into the same builder. The output is a single contiguous - allocation. Without it, each chunk produces its own `Columnar`, and the framework must - concatenate them -- an extra copy of the entire column. - -- **FSST zero-copy push.** FSST can decompress string views and data buffers directly into a - `VarBinViewBuilder`'s internal storage, adjusting view offsets in-place. This avoids - materializing an intermediate string array. With `Done(Columnar)`, FSST must allocate the - `VarBinView` canonical array itself, losing the ability to share buffer space with other - chunks or avoid a final copy. - -- **Pre-allocated output.** When the output length is known (common case: the selection count), - the builder can pre-allocate once. The iterative model allocates per step. - -This trade-off is structural: passing a mutable builder down requires a call stack (recursion). -A loop that yields intermediate values cannot thread a mutable builder through iterations -without fundamentally changing the `ExecutionStep` API (e.g., adding a `Done`-with-builder -variant), which would complicate the common case where an encoding returns `Rewrite`. - -### 2. Encoding-preserving mask application (Option B wins) - -Some encodings can apply a row mask **without decompressing**. FSST can filter its compressed -data and produce a smaller FSST array. This matters when the downstream consumer natively -supports the encoding -- DuckDB supports FSST vectors, so exporting a filtered FSST directly -avoids decompression entirely. - -**Concrete example:** `filter(scalar_fn(upper, [fsst(data)]), mask)` - -Both options share the reduce phase. 
FSST's reduce_parent pushes `upper` into the symbol -table (metadata-only, analogous to Dict pushing scalar functions into values). After reduce + -peel, both options see: `fsst_upper(data)` + mask. - -In **Option B**, FSST's standard `execute(mask)` method handles this: +**ChunkedArray** — shared builder, no concat: ```rust -fn execute(fsst: &FSSTArray, mask, ctx) -> ExecutionStep { - let filtered = fsst.filter_preserving(&mask)?; // still FSST - ExecutionStep::Rewrite(filtered.into_array()) +fn execute(chunked: &ChunkedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + for chunk in chunked.chunks() { + canonicalize_into(chunk, builder, ctx)?; + } + // All chunks wrote into the same builder. No concatenation needed. + Ok(()) } ``` -The DuckDB exporter drives the loop, sees FSST on the next iteration, and exports directly. -The standard path would execute again next iteration and decompress -- no harm. Any encoding -can implement this pattern through the general VTable. 
- -In **Option A**, the exporter must use encoding-specific logic outside the framework: +**DictArray** — executes codes into sub-builder, then gathers: ```rust -fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> DuckDBVector { - let array = array.optimize_recursive()?; - let (array, mask) = array.peel_selection(); - - // Encoding-specific: exporter must know about FSST - if let Some(fsst) = array.as_opt::() { - let filtered = fsst.filter_preserving(&mask)?; - return export_fsst(filtered, ctx); - } - - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &mask, ctx)?; - export_canonical(builder.finish()) +fn execute(dict: &DictArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let mut code_builder = CanonicalBuilder::new(dict.codes().dtype()); + canonicalize_into(dict.codes(), &mut code_builder, ctx)?; + let codes = code_builder.finish().into_primitive(); + gather_into_builder(dict.values(), &codes, builder, ctx) } ``` -The general `canonicalize_into` path always decompresses -- there is no VTable mechanism for -an encoding to absorb a mask without decompressing. The exporter must know about each encoding -it wants to preserve and call encoding-specific methods. - -**The difference:** Option B handles encoding-preserving mask application through the standard -`execute` VTable method -- any encoding can participate. Option A requires the exporter to have -encoding-specific knowledge for each format it wants to preserve. In practice, exporters -already need encoding-specific logic to export to native formats (DuckDB FSST vectors, Arrow -dictionary arrays), so the additional burden is the mask-application call rather than just an -`as_opt` check. - -This extends to cross-crate encodings. An external encoding that can filter itself without -decompressing just implements `execute(mask) → Rewrite(filtered_self)` in Option B. 
In -Option A, every exporter that wants to preserve that encoding must be updated to call its -specific filter method. - -### 3. Exporter intermediate inspection (Option B wins) - -In Option B, exporters drive the loop directly and can inspect the array tree after each step. -After an encoding applies a mask via `Rewrite`, the exporter sees the result on the next -iteration and can export it in its native format. - -In Option A, exporters see the tree after Phase 1 (optimize) and Phase 2 (peel selection). -When the encoding the exporter cares about is already visible after peel (the common case -- -Dict is stored as DictArray, FSST is stored as FSSTArray), the exporter can intercept and -handle it with encoding-specific logic. This covers the typical Vortex file scan path. The -gap is when an encoding is hidden behind a layer that must be executed to reveal it, or when -the exporter wants the encoding to absorb the mask through the general VTable (see trade-off -#2 above). - -### Summary - -| Trade-off | Winner | Practical impact today | -|-----------|--------|----------------------| -| Caller-owned output buffers | **Option A** | **Concrete.** ChunkedArray concat avoidance and FSST zero-copy decompression are measurable. | -| Encoding-preserving mask application | **Option B** | **Concrete.** FSST filter-without-decompress, Dict filter-without-decompress. Exporters need encoding-specific logic in Option A. | -| Exporter intermediate inspection | **Option B** | **Moderate.** Exporters can discover encodings revealed by execution. Fixed by peel_selection when the filter is at the root. 
| - -Secondary differences: - -| Aspect | Option A | Option B | -|--------|----------|----------| -| Stack overflow risk | Still recursive (can overflow on deep trees) | Iterative loop, bounded stack depth | -| Intermediate allocations | None (direct to builder) | One array tree per `Rewrite` step | -| Debuggability | One recursive call, stack trace shows position | Loop iterations, requires tracing | -| Mask threading | Natural function argument | Mutable state across iterations | -| Fused patterns (Dict-RLE) | Parent checks child inline | reduce_parent on intermediates, or inline | -| GPU kernel fusion potential | Clean tree maps to single kernel | Step-by-step harder to fuse | -| Determinism | Always same call sequence | Iteration count varies with input | - -## What changes (shared) - -**Removed (both options):** - -- `execute_parent` VTable method -- metadata-only rewrites move to `reduce_parent`; - canonical-type kernels move into `ScalarFn::execute`. -- `append_to_builder` VTable method. -- Selection-related adaptors -- `FilterReduceAdaptor`, `FilterExecuteAdaptor`, - `SliceReduceAdaptor`, `SliceExecuteAdaptor`, and all their per-encoding implementations. - `TakeExecuteAdaptor` is also removed (Take/gather is not row masking; it remains an - encoding-internal concern). - -**Retained (both options):** - -- `reduce` / `reduce_parent` -- strictly metadata-only. -- `ScalarFnArray` -- per-node lazy computation with `reduce_parent` push-down. -- Filter/Slice array wrappers -- for lazy representation. Could be unified into a single - `MaskedSelectionArray` in the future. Take remains separate (gather semantics, not masking). -- `scalar_at` -- separate VTable method for single-element access. - -**New (both options):** - -- `Mask::compose()` and `Mask::apply()` methods on the existing `Mask` type. -- `Array::peel_selection()` -- strip selection wrappers from root. - -**New (Option A only):** - -- `CanonicalBuilder` -- one-level-deep builder enum for decompress-into. 
-- `canonicalize_into` VTable method -- single recursive descent with fused mask. - -**New (Option B only):** - -- `ExecutionStep` enum -- typed return from execute, replacing bare `ArrayRef`. -- `execute` takes `&Mask` parameter -- row mask threaded through the loop. - -## Worked example: DuckDB dictionary export - -A Vortex file scan produces a chunk with a dictionary column. The query has a filter predicate -(`name = 'alice'`) evaluated to a mask, and a projection (`upper(name)`). The array tree: - -``` -filter( - scalar_fn(upper, [ - dict( - codes: bitpacked([0,1,0,2,1,0,...]), - values: fsst(["alice","bob","charlie"]) - ) - ]), - mask -) -``` - -**Optimize.** `reduce_parent` pushes `upper` into dict values: - -``` -filter( - dict( - codes: bitpacked([0,1,0,2,1,0,...]), - values: scalar_fn(upper, [fsst(["alice","bob","charlie"])]) - ), - mask -) -``` - -**Peel selections.** Filter is at the root, so it peels: - -``` -array = dict(codes: bitpacked(...), values: scalar_fn(upper, [fsst(...)])) -mask = filter_mask -``` - -**Export (Option A).** The exporter inspects the tree after optimize + peel: +**FilterArray** — execute child, apply mask: ```rust -fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let array = array.optimize_recursive()?; - let (array, mask) = array.peel_selection(); - - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &mask, ctx); - } - - let mut builder = CanonicalBuilder::new(array.dtype()); - array.canonicalize_into(&mut builder, &mask, ctx)?; - export_canonical(builder.finish()) +fn execute(filter: &FilterArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let mut sub = CanonicalBuilder::new(filter.child().dtype()); + canonicalize_into(filter.child(), &mut sub, ctx)?; + let canonical = sub.finish(); + let filtered = filter.mask().apply_to(canonical)?; + builder.extend_from_canonical(&filtered); + Ok(()) } ``` -**Export (Option B).** The exporter drives the loop, 
inspecting after each step: +**BitPacked** — unpack directly into PrimitiveBuilder: ```rust -fn export_chunk(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let (mut array, mut mask) = (array, Mask::new_true(array.len())); - for _ in 0..MAX_ITERATIONS { - array = array.optimize_recursive()?; - let (peeled, m) = array.peel_selection(); - array = peeled; - mask = mask.compose(&m); - - if let Some(dict) = array.as_opt::() { - return export_dict(dict, &mask, ctx); - } - if array.is_canonical() { - return export_canonical(mask.apply_to(array)?); - } - match array.vtable().execute(&array, &mask, ctx)? { - ExecutionStep::Done(c) => return export_columnar(c), - ExecutionStep::Rewrite(a) => { array = a; } - ExecutionStep::Peel(a, m) => { array = a; mask = m; } - } - } - vortex_bail!("export did not converge") +fn execute(bp: &BitPackedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { + let CanonicalBuilder::Primitive(pb) = builder else { unreachable!() }; + unpack_into(bp, pb)?; + Ok(()) } ``` -Both options produce the same result: DuckDB gets a dictionary vector with 3 uppercase values -and only the selected codes. The difference is that Option B can discover DictArray even if it -was hidden behind an opaque encoding, while Option A only sees what's visible after Phase 1. - -## Migration path - -The migration is the same for both options. The key difference is Phase 3/5 -- whether -encodings implement `canonicalize_into` (Option A) or return `ExecutionStep` (Option B). - -**Phase 1: Foundation types.** Add `Mask::compose()`/`Mask::apply()`, introduce `CanonicalBuilder`, -`Array::peel_selection()`. All additive. - -**Phase 2: Add new execution method with fallback.** Add `canonicalize_into` (Option A) or -typed `execute` (Option B) with a default that bridges to the old path. Both old and new paths -work. +--- -**Phase 3: Migrate canonical encodings.** One per PR. Independent, parallelizable. 
+## Comparison -**Phase 4: Migrate selection wrappers.** Filter, Slice, Chunked, Masked compose into -`Mask`. +### What Proposal A can do that Proposal B cannot -**Phase 5: Migrate compressed encodings.** Mask-fused decompression. Independent per -encoding. +**1. Encoding-preserving execute_parent.** -**Phase 6: Move execute_parent → reduce_parent.** Metadata-only rewrites become reduce rules. -Canonical-type kernels move into ScalarFn::execute. Remove execute_parent. +In A, `execute_parent` returns `Option` — the result can be in *any* encoding. In B, +`execute_parent` pushes into a `CanonicalBuilder` — the result is always canonical. -**Phase 7: Delete old path.** Remove old `execute` method, the `Executable` trait, and all -selection adaptors. +Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray +parent. -**Phase 8: Wire up exporters.** DuckDB, Arrow exporters migrate to new API. +- In A, it can return a filtered FSST array (still FSST-encoded). An exporter driving the + scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no + decompression. +- In B, it must push decompressed VarBinView into the builder. The FSST encoding is gone. -## Compatibility +Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the +downstream consumer natively supports the compressed form. -This RFC does not change the file format or wire format. All changes are internal to the -execution engine. +**2. Cross-step optimization.** -**Public API breakage:** +In A, the scheduler runs `optimize_recursive` after each child execution. Reduce rules fire on +the new tree shape. Patterns only visible after partial execution can still be optimized. -- `Executable` trait and `execute::()` are removed. Callers migrate to `canonicalize()` / - `execute_to_columnar()`. -- Third-party encodings must migrate to the new execution method. The fallback default - (Phase 2) provides a bridge during migration. 
+Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes +BitPacked codes to PrimitiveArray, the tree becomes +`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes +`upper` into values — this optimization fires between steps. -## Drawbacks +In B, reduce runs once before the single recursive descent. If a pattern only becomes visible +after partial execution, it's missed. -- **Migration effort.** ~33 encodings must migrate. The phased approach with a fallback default - mitigates this. +**3. Bounded stack depth.** -- **Mask composition complexity.** `Mask::compose` must be correct for all combinations of - internal representations (`AllTrue`, `AllFalse`, `Values`). This is concentrated, well-tested - complexity, but getting it wrong causes data corruption. +In A, the scheduler can use an explicit work stack instead of recursion. Stack depth is O(1) +regardless of encoding depth. In B, `execute` recurses into children — stack depth equals +encoding tree depth. -## Alternatives +### What Proposal B can do that Proposal A cannot -### Keep the iterative execution model as-is +**1. Caller-owned output buffers (decompress-into).** -Address individual pain points without restructuring. Avoids migration cost but does not -solve decompress-into-buffer and perpetuates the adaptor boilerplate. +The builder is created by the caller and passed down. This enables three concrete optimizations: -### Use a single ExpressionArray instead of per-node ScalarFnArray +- **ChunkedArray without concat.** One builder, each chunk writes into it. Single contiguous + allocation. In A, each chunk returns its own Columnar; the framework must concat them — an + extra copy of the entire column. -Prevents `reduce_parent` from interacting with individual expression nodes -- the key -optimization for dictionary-encoded predicates. 
+- **FSST zero-copy push.** FSST decompresses views and data buffers directly into + VarBinViewBuilder's internal storage, adjusting view offsets to account for already-completed + blocks. In A, FSST must allocate its own VarBinViewArray. -### Use `dyn ArrayBuilder` instead of `CanonicalBuilder` enum +- **Pre-allocated output.** When output length is known, the builder pre-allocates once. In A, + each Done allocates independently. -More extensible but prevents zero-copy builder paths. FSST pushing directly into -`VarBinViewBuilder` depends on knowing the builder variant at compile time. +**2. No intermediate allocations.** -### Subsume `scalar_at` into Mask +Every encoding writes directly into the final output buffer. No temporary array trees between +steps. For hot paths on large batches, this is measurable. -Deferred. Single-element access via a builder round-trip would regress hot paths. Revisit -after profiling. +**3. No iteration bound.** -## Prior Art +Single recursive descent. No MAX_ITERATIONS. No risk of non-convergence. -- **DataFusion** has `PhysicalExpr::evaluate_selection`, threading an explicit selection - through expression evaluation. +### Summary table -## Unresolved Questions +| Capability | Proposal A | Proposal B | +|-------------------------------------|------------|------------| +| Caller-owned output buffers | No | **Yes** | +| ChunkedArray without concat | No | **Yes** | +| FSST zero-copy into builder | No | **Yes** | +| Pre-allocated output | No | **Yes** | +| No intermediate allocations | No | **Yes** | +| Encoding-preserving execute_parent | **Yes** | No | +| Exporter intercepts intermediates | **Yes** | No | +| Cross-step optimization | **Yes** | No | +| Bounded stack depth (explicit stack)| **Yes** | No | +| No iteration bound / convergence | No | **Yes** | -- **Mask composition correctness**: `Mask::compose` and `Mask::slice` interactions need - careful implementation and thorough testing. 
+### The core trade-off -- **CanonicalBuilder design**: The exact interface for one-level-deep building, especially for - List and Extension types, needs design work. +Proposal A optimizes for **flexibility**: intermediate encodings are visible, exporters can +intercept, reduce rules fire between steps. The cost is that every encoding allocates its own +output and ChunkedArray must concat. -- **Aggregate and window functions**: Out of scope. AggregateFnArray and WindowFnArray will - need their own materialization paths. +Proposal B optimizes for **allocation efficiency**: caller-owned buffers, zero-copy +decompression, no intermediates. The cost is that execution always produces canonical — encodings +can't preserve themselves through execution, and exporters must use encoding-specific logic +outside the framework. -- **Option A vs Option B**: This RFC presents both. The decision reduces to: are caller-owned - output buffers (ChunkedArray concat avoidance, FSST zero-copy decompression) worth giving up - encoding-preserving mask application through the general VTable (FSST filter-without-decompress, - Dict filter-without-decompress for exporters)? Both trade-offs are concrete. In Option A, - exporters need encoding-specific filter methods; in Option B, ChunkedArray can't share a - builder across chunks. +## Compatibility -## Future Possibilities +No file format or wire format changes. All changes are internal to the execution engine. -### Mask pull-up +Public API breakage: the `Executable` trait and `execute::()` method are replaced. +Third-party encodings must migrate. A default implementation bridging to the old path eases +migration. -When a filter mask is nested inside an expression (`scalar_fn(upper, [filter(dict(...), mask)])`), -`peel_selection()` can't extract it. A future optimization could pull masks out of expressions -to the root, enabling further optimizations. 
+## Unresolved Questions -### Aggregate and window functions +- **CanonicalBuilder design (Proposal B):** The exact interface for one-level-deep building, + especially for List and Extension types, needs design work. Struct fields are accumulated as + `Vec` (potentially compressed), not recursively built. -`AggregateFnArray` and `WindowFnArray` will follow a similar deferred pattern with -`reduce`/`reduce_parent` optimization and `Mask`-based materialization. +- **Explicit work stack (Proposal A):** The scheduler can be recursive (simple, but same stack + overflow risk as today) or use an explicit work stack (bounded depth, but more complex). + The RFC assumes an explicit stack is feasible but doesn't specify the implementation. -### GPU kernel fusion +- **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid + expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. This is + orthogonal to the proposal choice. -The strict metadata-only reduce constraint means the optimized tree can be shipped to a GPU -context. A GPU-aware materialization path could fuse the entire optimized tree into a single -kernel launch. +- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that shared + sub-arrays (e.g., `x + x`) are only executed once. Orthogonal to the proposal choice. From ce38bb9841cff9c876d1d3f208fbadf299804953 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 20:59:49 -0500 Subject: [PATCH 5/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 351 +++++++++++---------------------- 1 file changed, 115 insertions(+), 236 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 0500359..1e84171 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,17 +4,11 @@ ## Summary -Replace the current execution VTable with a cleaner model. 
`reduce` and `reduce_parent` remain -metadata-only in both proposals and are identical. The proposals differ in how `execute` and -`execute_parent` work: - -- **Proposal A (Scheduler-driven)**: `execute` returns an `ExecutionStep` — either requesting - the scheduler to execute a specific child, or declaring that it is done. The scheduler drives - all iteration. `execute_parent` is retained and returns an `ArrayRef` (any encoding). - -- **Proposal B (Canonical builder)**: `execute` pushes its result into a caller-owned - `CanonicalBuilder`. The result is always canonical. `execute_parent` is retained and also - pushes into the builder. +Replace the current execution VTable with a scheduler-driven model. `reduce` and +`reduce_parent` remain metadata-only. `execute` returns an `ExecutionStep` telling the scheduler +which child to execute next, or that execution is done. `execute_parent` is retained and returns +an `ArrayRef` in any encoding. The scheduler drives all iteration and runs reduce rules between +steps. ## Motivation @@ -28,12 +22,12 @@ trees overflow the stack. **Unclear execute/reduce boundary.** Some `execute_parent` implementations are metadata-only and belong in `reduce_parent`. The boundary isn't enforced. -## Shared design +## Design -### reduce / reduce_parent (identical in both proposals) +### reduce / reduce_parent (unchanged) -Both proposals keep these methods with unchanged signatures. They are **strictly metadata-only** -— they never read data buffers. +These methods keep their current signatures. They are **strictly metadata-only** — they never +read data buffers. ```rust fn reduce(array: &Self::Array) -> VortexResult>; @@ -50,8 +44,7 @@ fn reduce_parent( values. RunEnd child pulls ScalarFn through to its values. FSST child rewrites Compare by compressing the RHS literal. -The framework runs these to a fixpoint before execution begins (and, in Proposal A, between -execution steps). 
+The framework runs these to a fixpoint before execution begins and between execution steps. Implementations currently misplaced in `execute_parent` that are metadata-only (Dict + Compare, ALP + Compare, FoR + Compare, FSST + Compare) move to `reduce_parent`. Implementations that @@ -60,26 +53,21 @@ Between) move into `ScalarFn::execute`. ### FilterArray -FilterArray continues to exist as a lazy wrapper in both models. It is not subsumed by the -execution method. Both models handle it — the difference is what `execute_parent` can return -when a child encoding wants to handle a FilterArray parent (see comparison section). - ---- - -## Proposal A: Scheduler-driven iterative execution +FilterArray continues to exist as a lazy wrapper. It is not subsumed by the execution method +signature. -### VTable +### execute / execute_parent ```rust -fn reduce(array: &Self::Array) -> VortexResult>; -fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize) - -> VortexResult>; - fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult; -fn execute_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx) - -> VortexResult>; +fn execute_parent( + array: &Self::Array, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, +) -> VortexResult>; ``` ```rust @@ -96,6 +84,9 @@ pub enum ExecutionStep { The encoding never recurses into children. It yields control back to the scheduler, telling it what work is needed. The scheduler maintains the work stack. +`execute_parent` returns `Option` — the result can be in **any encoding**, not just +canonical. This is critical for encoding-preserving execution (see examples below). 
+ ### Scheduler ```rust @@ -178,249 +169,137 @@ fn execute(bp: &BitPackedArray, ctx: &mut ExecutionCtx) -> VortexResult VortexResult>; -fn reduce_parent(array: &Self::Array, parent: &ArrayRef, child_idx: usize) - -> VortexResult>; - -fn execute( - array: &Self::Array, - builder: &mut CanonicalBuilder, - ctx: &mut ExecutionCtx, -) -> VortexResult<()>; +Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes +BitPacked codes to PrimitiveArray, the tree becomes +`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes +`upper` into values — this optimization fires between steps. -fn execute_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, - builder: &mut CanonicalBuilder, - ctx: &mut ExecutionCtx, -) -> VortexResult; // true if handled -``` +### Encoding-preserving execute_parent -### CanonicalBuilder +`execute_parent` returns `Option` in any encoding. This enables exporters to +intercept intermediate forms without decompressing. -A closed enum mirroring `Canonical` in mutable builder form: +Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray +parent and returns a filtered FSST array (still FSST-encoded). An exporter driving the +scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no +decompression needed. -```rust -pub enum CanonicalBuilder { - Null(NullBuilder), - Bool(BoolBuilder), - Primitive(PrimitiveBuilder), - Decimal(DecimalBuilder), - VarBinView(VarBinViewBuilder), - List(ListViewBuilder), - FixedSizeList(FixedSizeListBuilder), - Struct(StructBuilder), - Extension(ExtensionBuilder), -} -``` +Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the +downstream consumer natively supports the compressed form. -Because it is an enum (not `dyn`), encodings can match on the concrete variant. 
This enables -zero-copy decompression paths impossible with the current `dyn ArrayBuilder`. +### Exporter integration -### Framework +Exporters drive the scheduler loop directly and inspect the array after each step: ```rust -fn canonicalize(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let array = optimize_recursive(array)?; - let mut builder = CanonicalBuilder::new(array.dtype()); - - // Try execute_parent - for child_idx in 0..array.nchildren() { - let child = array.child(child_idx); - if child.vtable().execute_parent(&child, &array, child_idx, &mut builder, ctx)? { - return Ok(builder.finish()); - } - } - - array.vtable().execute(&array, &mut builder, ctx)?; - Ok(builder.finish()) -} -``` - -Single call. No iteration loop. No MAX_ITERATIONS. Reduce runs once, then one recursive -descent fills the builder. - -### Per-encoding examples - -**FSST** — zero-copy push into VarBinViewBuilder: +fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let mut array = optimize_recursive(array)?; -```rust -fn execute(fsst: &FSSTArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let CanonicalBuilder::VarBinView(vbv) = builder else { unreachable!() }; - let (buffers, views) = fsst_decompress(fsst, vbv.completed_block_count())?; - vbv.push_buffers_and_views(&buffers, &views); - Ok(()) -} -``` + loop { + if let Some(dict) = array.as_opt::() { + return export_dict(dict, ctx); + } + if let Some(fsst) = array.as_opt::() { + return export_fsst(fsst, ctx); + } + if let Some(c) = array.as_columnar() { + return export_columnar(c, ctx); + } -**ChunkedArray** — shared builder, no concat: + if let Some(rewritten) = try_execute_parent(&array, ctx)? 
{ + array = optimize_recursive(rewritten)?; + continue; + } -```rust -fn execute(chunked: &ChunkedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - for chunk in chunked.chunks() { - canonicalize_into(chunk, builder, ctx)?; + match array.vtable().execute(&array, ctx)? { + ExecutionStep::ExecuteChild(i) => { + let child = array.child(i); + let executed = execute_to_columnar(child, ctx)?; + array = array.with_child(i, executed.into_array()); + array = optimize_recursive(array)?; + } + ExecutionStep::Done(c) => return export_columnar(c, ctx), + } } - // All chunks wrote into the same builder. No concatenation needed. - Ok(()) -} -``` - -**DictArray** — executes codes into sub-builder, then gathers: - -```rust -fn execute(dict: &DictArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let mut code_builder = CanonicalBuilder::new(dict.codes().dtype()); - canonicalize_into(dict.codes(), &mut code_builder, ctx)?; - let codes = code_builder.finish().into_primitive(); - gather_into_builder(dict.values(), &codes, builder, ctx) } ``` -**FilterArray** — execute child, apply mask: +If an opaque encoding decodes to DictArray during execution, the exporter discovers it on the +next iteration and exports it natively. This is impossible in a model that always produces +canonical output. 
-```rust -fn execute(filter: &FilterArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let mut sub = CanonicalBuilder::new(filter.child().dtype()); - canonicalize_into(filter.child(), &mut sub, ctx)?; - let canonical = sub.finish(); - let filtered = filter.mask().apply_to(canonical)?; - builder.extend_from_canonical(&filtered); - Ok(()) -} -``` - -**BitPacked** — unpack directly into PrimitiveBuilder: - -```rust -fn execute(bp: &BitPackedArray, builder: &mut CanonicalBuilder, ctx: &mut ExecutionCtx) -> VortexResult<()> { - let CanonicalBuilder::Primitive(pb) = builder else { unreachable!() }; - unpack_into(bp, pb)?; - Ok(()) -} -``` - ---- +### Decompress-into-buffer -## Comparison +The scheduler-driven model does not natively support caller-owned output buffers (each +`Done(Columnar)` allocates its own output). The two concrete cases where this matters — +ChunkedArray concat and FSST zero-copy push — can be addressed at the framework level: -### What Proposal A can do that Proposal B cannot +- **ChunkedArray**: The scheduler special-cases ChunkedArray by pre-allocating a single output + buffer and copying each chunk's `Columnar` result into it. One allocation + N memcpys rather + than N allocations + one concat. Not zero-copy, but eliminates the extra full-column copy. -**1. Encoding-preserving execute_parent.** +- **FSST**: The extra allocation is the views array (16 bytes per string). The actual string + data buffers are shared via `Arc` — they are not copied. The overhead is bounded and small + relative to decompression cost. -In A, `execute_parent` returns `Option` — the result can be in *any* encoding. In B, -`execute_parent` pushes into a `CanonicalBuilder` — the result is always canonical. - -Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray -parent. +These are framework-level optimizations, not VTable concerns. They don't require encoding +authors to think about builders. 
-- In A, it can return a filtered FSST array (still FSST-encoded). An exporter driving the - scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no - decompression. -- In B, it must push decompressed VarBinView into the builder. The FSST encoding is gone. - -Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the -downstream consumer natively supports the compressed form. - -**2. Cross-step optimization.** - -In A, the scheduler runs `optimize_recursive` after each child execution. Reduce rules fire on -the new tree shape. Patterns only visible after partial execution can still be optimized. - -Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes -BitPacked codes to PrimitiveArray, the tree becomes -`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes -`upper` into values — this optimization fires between steps. - -In B, reduce runs once before the single recursive descent. If a pattern only becomes visible -after partial execution, it's missed. - -**3. Bounded stack depth.** - -In A, the scheduler can use an explicit work stack instead of recursion. Stack depth is O(1) -regardless of encoding depth. In B, `execute` recurses into children — stack depth equals -encoding tree depth. - -### What Proposal B can do that Proposal A cannot - -**1. Caller-owned output buffers (decompress-into).** - -The builder is created by the caller and passed down. This enables three concrete optimizations: - -- **ChunkedArray without concat.** One builder, each chunk writes into it. Single contiguous - allocation. In A, each chunk returns its own Columnar; the framework must concat them — an - extra copy of the entire column. +## Compatibility -- **FSST zero-copy push.** FSST decompresses views and data buffers directly into - VarBinViewBuilder's internal storage, adjusting view offsets to account for already-completed - blocks. 
In A, FSST must allocate its own VarBinViewArray. +No file format or wire format changes. All changes are internal to the execution engine. -- **Pre-allocated output.** When output length is known, the builder pre-allocates once. In A, - each Done allocates independently. +Public API breakage: the `Executable` trait and `execute::()` method are replaced. +Third-party encodings must migrate. A default implementation bridging to the old path eases +migration. -**2. No intermediate allocations.** +## Alternatives -Every encoding writes directly into the final output buffer. No temporary array trees between -steps. For hot paths on large batches, this is measurable. +### Canonical builder model -**3. No iteration bound.** +Instead of returning `ExecutionStep`, `execute` could push results into a caller-owned +`CanonicalBuilder` (a closed enum mirroring `Canonical` in mutable builder form). Each encoding +decompresses directly into the builder in a single recursive descent. `execute_parent` would +also push into the builder. -Single recursive descent. No MAX_ITERATIONS. No risk of non-convergence. +This model natively supports decompress-into-buffer: ChunkedArray writes all chunks into one +builder (no concat), FSST pushes views directly into VarBinViewBuilder (zero-copy). No +iteration loop, no MAX_ITERATIONS. 
-### Summary table +However, requiring canonical output from `execute` is a structural limitation that cannot be +worked around: -| Capability | Proposal A | Proposal B | -|-------------------------------------|------------|------------| -| Caller-owned output buffers | No | **Yes** | -| ChunkedArray without concat | No | **Yes** | -| FSST zero-copy into builder | No | **Yes** | -| Pre-allocated output | No | **Yes** | -| No intermediate allocations | No | **Yes** | -| Encoding-preserving execute_parent | **Yes** | No | -| Exporter intercepts intermediates | **Yes** | No | -| Cross-step optimization | **Yes** | No | -| Bounded stack depth (explicit stack)| **Yes** | No | -| No iteration bound / convergence | No | **Yes** | +- **No encoding-preserving execute_parent.** `execute_parent` must push canonical into the + builder. FSST can't return a filtered FSST array — the encoding is always lost. Exporters + must use encoding-specific logic outside the framework for every format they want to preserve. -### The core trade-off +- **No cross-step optimization.** Reduce runs once before the single descent. Patterns visible + only after partial execution are missed. -Proposal A optimizes for **flexibility**: intermediate encodings are visible, exporters can -intercept, reduce rules fire between steps. The cost is that every encoding allocates its own -output and ChunkedArray must concat. +- **No exporter intermediate inspection.** Exporters see the tree after reduce, before + execution. Encodings hidden behind opaque wrappers are never visible. -Proposal B optimizes for **allocation efficiency**: caller-owned buffers, zero-copy -decompression, no intermediates. The cost is that execution always produces canonical — encodings -can't preserve themselves through execution, and exporters must use encoding-specific logic -outside the framework. +- **Stack overflow.** `execute` recurses into children. Stack depth equals encoding tree depth. 
 
-## Compatibility
-
-No file format or wire format changes. All changes are internal to the execution engine.
-
-Public API breakage: the `Executable` trait and `execute::<T>()` method are replaced.
-Third-party encodings must migrate. A default implementation bridging to the old path eases
-migration.
+The scheduler-driven model's flexibility wins (encoding-preserving intermediates, cross-step
+optimization, exporter interception, bounded stack) are structurally unrecoverable in the
+builder model. The builder model's allocation wins (shared builder, zero-copy push) are
+recoverable in the scheduler model via framework-level optimizations. This asymmetry makes the
+scheduler-driven model strictly preferable.
 
 ## Unresolved Questions
 
-- **CanonicalBuilder design (Proposal B):** The exact interface for one-level-deep building,
-  especially for List and Extension types, needs design work. Struct fields are accumulated as
-  `Vec<ArrayRef>` (potentially compressed), not recursively built.
-
-- **Explicit work stack (Proposal A):** The scheduler can be recursive (simple, but same stack
-  overflow risk as today) or use an explicit work stack (bounded depth, but more complex).
-  The RFC assumes an explicit stack is feasible but doesn't specify the implementation.
+- **Explicit work stack:** The scheduler can be recursive (simple, but same stack overflow risk
+  as today) or use an explicit work stack (bounded depth, but more complex). The RFC assumes an
+  explicit stack is feasible but doesn't specify the implementation.
 
 - **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid
-  expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. This is
-  orthogonal to the proposal choice.
+  expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved.
 
-- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that shared
-  sub-arrays (e.g., `x + x`) are only executed once. 
Orthogonal to the proposal choice. +- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that + shared sub-arrays (e.g., `x + x`) are only executed once. From 287fe87897a2eca6126002c245c9aca1204a148c Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 21:23:14 -0500 Subject: [PATCH 6/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 147 ++++++++++++++++++++++++++------- 1 file changed, 118 insertions(+), 29 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index 1e84171..e5eb448 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -67,13 +67,13 @@ fn execute_parent( parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx, -) -> VortexResult>; +) -> VortexResult>; ``` ```rust pub enum ExecutionStep { - /// Ask the scheduler to execute the child at this index to columnar, - /// replace it, then call execute on this array again. + /// Ask the scheduler to execute the child at this index one step, + /// replace it, then call execute (or execute_parent) on this array again. ExecuteChild(usize), /// Execution is complete. @@ -81,43 +81,95 @@ pub enum ExecutionStep { } ``` -The encoding never recurses into children. It yields control back to the scheduler, telling it -what work is needed. The scheduler maintains the work stack. +Both `execute` and `execute_parent` return `ExecutionStep`. The encoding never recurses into +children — it yields control back to the scheduler, telling it what work is needed. The +scheduler maintains the work stack. + +`execute_parent` returns `Option`: +- `None` — the child cannot handle this parent, fall through to the parent's own `execute`. +- `Some(ExecuteChild(i))` — the child needs its own child at index `i` executed one step before + it can handle the parent. The scheduler does this, then retries `execute_parent`. 
+- `Some(Done(result))` — the child handled the parent, here is the result in **any encoding** + (not just canonical). This is critical for encoding-preserving execution. + +Making `execute_parent` iterative is necessary because some parent-handling logic requires +data access to the child's own children. For example, `Slice(RunEnd(ends=PCodec(...)))`: +RunEnd's `execute_parent` sees the Slice parent and wants to binary-search its `ends` to find +the physical indices for the slice boundaries. But `ends` is PCodec-compressed — each +`scalar_at` probe during binary search would decompress the full array. Instead, RunEnd returns +`ExecuteChild(0)` to request its ends be executed first. The scheduler decompresses PCodec to +PrimitiveArray, then retries `execute_parent`. RunEnd now binary-searches canonical ends and +returns an efficiently sliced RunEnd array. -`execute_parent` returns `Option` — the result can be in **any encoding**, not just -canonical. This is critical for encoding-preserving execution (see examples below). 
+```rust +// RunEnd's execute_parent for Slice +fn execute_parent( + re: &RunEndArray, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx, +) -> VortexResult> { + let Some(slice) = parent.as_opt::() else { return Ok(None) }; + + if !re.ends().is_columnar() { + // Need canonical ends for binary search + return Ok(Some(ExecutionStep::ExecuteChild(0))); + } + + // Ends are canonical — binary search is cheap + let physical_start = re.find_physical_index(slice.start())?; + let physical_end = re.find_physical_index(slice.end())?; + let sliced = RunEndArray::new( + re.ends().slice(physical_start..physical_end)?, + re.values().slice(physical_start..physical_end)?, + slice.start() + re.offset(), + slice.len(), + )?; + Ok(Some(ExecutionStep::Done(Columnar::from(sliced)))) +} +``` ### Scheduler ```rust -fn execute_to_columnar(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { +fn execute_one_step(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let mut array = optimize_recursive(array)?; - loop { - if let Some(c) = array.as_columnar() { return Ok(c); } - - // Try execute_parent (child-driven specialized execution) - if let Some(rewritten) = try_execute_parent(&array, ctx)? { - array = optimize_recursive(rewritten)?; - continue; + if array.as_columnar().is_some() { return Ok(array); } + + // Try execute_parent (child-driven specialized execution) + for child_idx in 0..array.nchildren() { + let child = array.child(child_idx); + match child.vtable().execute_parent(&child, &array, child_idx, ctx)? 
{ + None => continue, + Some(ExecutionStep::ExecuteChild(i)) => { + // Child needs its own child executed first — do one step, retry + let grandchild = child.child(i); + let stepped = execute_one_step(grandchild, ctx)?; + let new_child = child.with_child(i, stepped); + array = array.with_child(child_idx, new_child); + return Ok(array); + } + Some(ExecutionStep::Done(result)) => { + return Ok(result.into_array()); + } } + } - match array.vtable().execute(&array, ctx)? { - ExecutionStep::ExecuteChild(i) => { - let child = array.child(i); - let executed = execute_to_columnar(child, ctx)?; - array = array.with_child(i, executed.into_array()); - array = optimize_recursive(array)?; - } - ExecutionStep::Done(result) => return Ok(result), + // Fall through to the array's own execute + match array.vtable().execute(&array, ctx)? { + ExecutionStep::ExecuteChild(i) => { + let child = array.child(i); + let stepped = execute_one_step(child, ctx)?; + array = array.with_child(i, stepped); + Ok(array) } + ExecutionStep::Done(result) => Ok(result.into_array()), } } ``` -After each child execution, `optimize_recursive` runs again — reduce rules can fire on the -new tree shape. For truly bounded stack depth, an explicit work stack replaces the recursive -`execute_to_columnar(child)` call. +Each call to `execute_one_step` makes one unit of progress. The top-level +`execute_to_columnar` simply loops until the result is columnar. After each step, +`optimize_recursive` runs again — reduce rules can fire on the new tree shape. For truly +bounded stack depth, an explicit work stack replaces the recursive calls. ### Per-encoding examples @@ -250,6 +302,46 @@ ChunkedArray concat and FSST zero-copy push — can be addressed at the framewor These are framework-level optimizations, not VTable concerns. They don't require encoding authors to think about builders. +### Execution cache in ExecutionCtx + +The scheduler executes each child one step at a time, replacing it in the parent tree. 
This +means the result of executing an array is only visible to its immediate parent — if the same +`Arc` appears as a child of multiple parents (e.g., `a < 10 & a > 5` where `a` is +the same ArrayRef), it will be executed independently each time. + +`ExecutionCtx` holds a cache keyed by the raw pointer of the source array (`Arc::as_ptr()`). +The cache entry stores a clone of the source `Arc` (to pin the pointer address — preventing +deallocation and reuse) alongside the one-step execution result. + +```rust +pub struct ExecutionCtx { + // ... + cache: HashMap<*const dyn Array, CacheEntry>, +} + +struct CacheEntry { + /// Holds a strong reference to the source array, preventing the Arc from + /// being deallocated and its pointer reused for a different array. + _source: ArrayRef, + /// The result of executing the source array one step. + result: ArrayRef, +} +``` + +**When to cache.** Not every array should be cached — most are executed exactly once, and +caching adds overhead (HashMap insert + Arc clone). The `Arc::strong_count` is used as a +heuristic: if `strong_count > 1`, the array is referenced from multiple places and its +execution result is likely to be reused. This is an imperfect approximation — strong_count is a +global reference count, not local to the array tree being executed. An array may have +`strong_count > 1` because the layout reader or scan builder holds a reference, not because it +appears twice in the expression tree. But it never misses a genuinely shared sub-expression (if +shared, the count is > 1), and false positives only cost memory, not correctness. + +**Scope.** The cache lives in `ExecutionCtx`, not in individual arrays. Users can share an +`ExecutionCtx` across multiple independent array executions (e.g., multiple columns in a scan), +so the cache naturally deduplicates work across columns that share sub-arrays (shared +dictionaries, common filter masks). The cache is dropped when the `ExecutionCtx` is dropped. 
+ ## Compatibility No file format or wire format changes. All changes are internal to the execution engine. @@ -300,6 +392,3 @@ scheduler-driven model strictly preferable. - **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. - -- **Decompression cache / CSE:** `ExecutionCtx` can hold a pointer-identity cache so that - shared sub-arrays (e.g., `x + x`) are only executed once. From 16cac214d9dac00099b63589780a9d710a98fff5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 22:03:32 -0500 Subject: [PATCH 7/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- proposals/0001-execution-v2.md | 494 ++++++++++++++------------------- 1 file changed, 210 insertions(+), 284 deletions(-) diff --git a/proposals/0001-execution-v2.md b/proposals/0001-execution-v2.md index e5eb448..e06eaba 100644 --- a/proposals/0001-execution-v2.md +++ b/proposals/0001-execution-v2.md @@ -4,76 +4,51 @@ ## Summary -Replace the current execution VTable with a scheduler-driven model. `reduce` and -`reduce_parent` remain metadata-only. `execute` returns an `ExecutionStep` telling the scheduler -which child to execute next, or that execution is done. `execute_parent` is retained and returns -an `ArrayRef` in any encoding. The scheduler drives all iteration and runs reduce rules between -steps. +Replace the current execution model with a scheduler-driven design. `reduce` and +`reduce_parent` remain metadata-only. `execute` returns an `ExecutionStep` — requesting a child +be executed or declaring completion. The scheduler drives all iteration using an explicit work +stack, runs reduce rules between steps, and caches results for shared sub-expressions. ## Motivation -**No decompress-into-buffer.** Encodings allocate their own output during execution. -ChunkedArray must decompress each chunk separately and concatenate — an extra copy of the -entire column. 
Exporters (DuckDB, Arrow) can't write directly into their output format. - **Stack overflow from recursion.** The current executor recurses into children. Deep encoding -trees overflow the stack. +trees overflow the stack. The 128-iteration limit applies to the outer loop, not inner +recursive calls. + +**Repeated decompression.** Some operations access the same compressed child multiple times. +Each access decompresses independently. For example, binary search on RunEnd's PCodec-compressed +ends calls `scalar_at` per probe, each decompressing the full array. + +**Shared sub-expressions.** `a < 10 & a > 5` references the same array `a` twice. The current +model executes it independently for each reference. **Unclear execute/reduce boundary.** Some `execute_parent` implementations are metadata-only and belong in `reduce_parent`. The boundary isn't enforced. ## Design -### reduce / reduce_parent (unchanged) - -These methods keep their current signatures. They are **strictly metadata-only** — they never -read data buffers. +### VTable methods ```rust +// Metadata-only rewrites. Never read data buffers. fn reduce(array: &Self::Array) -> VortexResult>; fn reduce_parent( - array: &Self::Array, - parent: &ArrayRef, - child_idx: usize, + array: &Self::Array, parent: &ArrayRef, child_idx: usize, ) -> VortexResult>; -``` - -`reduce` rewrites an array using only metadata. Filter(AllTrue(x)) → x. Constant folding. - -`reduce_parent` rewrites a parent from a child's perspective. Dict child pushes ScalarFn into -values. RunEnd child pulls ScalarFn through to its values. FSST child rewrites Compare by -compressing the RHS literal. - -The framework runs these to a fixpoint before execution begins and between execution steps. - -Implementations currently misplaced in `execute_parent` that are metadata-only (Dict + Compare, -ALP + Compare, FoR + Compare, FSST + Compare) move to `reduce_parent`. 
Implementations that
-are really canonical-type compute kernels (Primitive + Compare, Bool + FillNull, Decimal +
-Between) move into `ScalarFn::execute`.
-
-### FilterArray
-
-FilterArray continues to exist as a lazy wrapper. It is not subsumed by the execution method
-signature.
-
-### execute / execute_parent
-
-```rust
-fn execute(array: &Self::Array, ctx: &mut ExecutionCtx)
-    -> VortexResult<ExecutionStep>;
+// Execution. May read data buffers.
+fn execute(
+    array: &Self::Array, ctx: &mut ExecutionCtx,
+) -> VortexResult<ExecutionStep>;
 
 fn execute_parent(
-    array: &Self::Array,
-    parent: &ArrayRef,
-    child_idx: usize,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<Option<ExecutionStep>>;
+    array: &Self::Array, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<ArrayRef>>;
 ```
 
 ```rust
 pub enum ExecutionStep {
-    /// Ask the scheduler to execute the child at this index one step,
-    /// replace it, then call execute (or execute_parent) on this array again.
+    /// Execute the child at this index to columnar, replace it,
+    /// then call execute on this array again.
     ExecuteChild(usize),
 
     /// Execution is complete.
@@ -81,117 +56,149 @@ pub enum ExecutionStep {
 }
 
-Both `execute` and `execute_parent` return `ExecutionStep`. The encoding never recurses into
-children — it yields control back to the scheduler, telling it what work is needed. The
-scheduler maintains the work stack.
-
-`execute_parent` returns `Option<ExecutionStep>`:
-- `None` — the child cannot handle this parent, fall through to the parent's own `execute`.
-- `Some(ExecuteChild(i))` — the child needs its own child at index `i` executed one step before
-  it can handle the parent. The scheduler does this, then retries `execute_parent`.
-- `Some(Done(result))` — the child handled the parent, here is the result in **any encoding**
-  (not just canonical). This is critical for encoding-preserving execution.
-
-Making `execute_parent` iterative is necessary because some parent-handling logic requires
-data access to the child's own children. 
For example, `Slice(RunEnd(ends=PCodec(...)))`: -RunEnd's `execute_parent` sees the Slice parent and wants to binary-search its `ends` to find -the physical indices for the slice boundaries. But `ends` is PCodec-compressed — each -`scalar_at` probe during binary search would decompress the full array. Instead, RunEnd returns -`ExecuteChild(0)` to request its ends be executed first. The scheduler decompresses PCodec to -PrimitiveArray, then retries `execute_parent`. RunEnd now binary-searches canonical ends and -returns an efficiently sliced RunEnd array. +**reduce / reduce_parent** are unchanged. Strictly metadata-only. The framework runs them to a +fixpoint before execution and between execution steps. Implementations currently misplaced in +`execute_parent` that are metadata-only (Dict + Compare, ALP + Compare, FoR + Compare, FSST + +Compare) move to `reduce_parent`. -```rust -// RunEnd's execute_parent for Slice -fn execute_parent( - re: &RunEndArray, parent: &ArrayRef, child_idx: usize, ctx: &mut ExecutionCtx, -) -> VortexResult> { - let Some(slice) = parent.as_opt::() else { return Ok(None) }; +**execute** returns `ExecutionStep`. The encoding never recurses into children — it yields +control to the scheduler. - if !re.ends().is_columnar() { - // Need canonical ends for binary search - return Ok(Some(ExecutionStep::ExecuteChild(0))); - } +- `ExecuteChild(i)` asks the scheduler to execute child `i` to columnar, replace it, and call + `execute` again. +- `Done(columnar)` returns the final columnar result. - // Ends are canonical — binary search is cheap - let physical_start = re.find_physical_index(slice.start())?; - let physical_end = re.find_physical_index(slice.end())?; - let sliced = RunEndArray::new( - re.ends().slice(physical_start..physical_end)?, - re.values().slice(physical_start..physical_end)?, - slice.start() + re.offset(), - slice.len(), - )?; - Ok(Some(ExecutionStep::Done(Columnar::from(sliced)))) -} -``` +**execute_parent** returns `Option`. 
`None` means the child can't handle this parent.
+`Some(result)` means it handled the parent — the result can be in **any encoding**, not just
+canonical, enabling encoding-preserving execution (e.g., FSST returning a filtered FSST array
+to a DuckDB exporter).
+
+### Constant handling
+
+When an encoding matches on a specific child type (e.g., `as_opt::<PrimitiveArray>()`), a
+`ConstantArray` child won't match — it's columnar but not Primitive. Encodings should either
+check for constants explicitly, or — preferably — register a reduce rule that handles the
+constant case before execution runs. For example:
+`Dict(Constant(code), values)` → `Constant(values.scalar_at(code))`.
+
+Note: an encoding that blindly returns `ExecuteChild(i)` when its child is already columnar
+(but doesn't match the expected concrete type) will loop forever. This is trivial for the
+scheduler to detect — if `ExecuteChild(i)` is returned and child `i` is already columnar, the
+scheduler can abort with an error.

### Scheduler

+The scheduler uses an explicit work stack, bounding stack depth regardless of encoding tree
+depth.
+
```rust
-fn execute_one_step(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
-    let mut array = optimize_recursive(array)?;
-
-    if array.as_columnar().is_some() { return Ok(array); }
-
-    // Try execute_parent (child-driven specialized execution)
-    for child_idx in 0..array.nchildren() {
-        let child = array.child(child_idx);
-        match child.vtable().execute_parent(&child, &array, child_idx, ctx)? 
{ - None => continue, - Some(ExecutionStep::ExecuteChild(i)) => { - // Child needs its own child executed first — do one step, retry - let grandchild = child.child(i); - let stepped = execute_one_step(grandchild, ctx)?; - let new_child = child.with_child(i, stepped); - array = array.with_child(child_idx, new_child); - return Ok(array); +fn execute_to_columnar(root: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let mut current = optimize_recursive(root, ctx)?; + let mut stack: Vec<(ArrayRef, usize)> = vec![]; // (parent, child_idx) + + loop { + // Check if current is columnar — return to parent or finish + if let Some(c) = current.as_columnar() { + match stack.pop() { + None => return Ok(c), + Some((parent, child_idx)) => { + current = parent.with_child(child_idx, c.into_array()); + current = optimize_recursive(current, ctx)?; + continue; + } + } + } + + // Try execute_parent + if let Some(rewritten) = try_execute_parent(¤t, ctx)? { + current = optimize_recursive(rewritten, ctx)?; + continue; + } + + // Execute + match current.vtable().execute(¤t, ctx)? { + ExecutionStep::ExecuteChild(i) => { + let child = current.child(i); + stack.push((current, i)); + current = optimize_recursive(child, ctx)?; } - Some(ExecutionStep::Done(result)) => { - return Ok(result.into_array()); + ExecutionStep::Done(result) => { + current = result.into_array(); } } } +} +``` - // Fall through to the array's own execute - match array.vtable().execute(&array, ctx)? { - ExecutionStep::ExecuteChild(i) => { - let child = array.child(i); - let stepped = execute_one_step(child, ctx)?; - array = array.with_child(i, stepped); - Ok(array) - } - ExecutionStep::Done(result) => Ok(result.into_array()), +### Execution cache + +If the same `Arc` appears as a child of multiple parents (e.g., `a < 10 & a > 5`), +the scheduler executes it independently each time. The execution cache deduplicates this work. 
+
+The cache lives in `ExecutionCtx`, keyed by `ByPtr` — a newtype that implements
+`Hash` and `Eq` via `Arc::as_ptr()`. Because the key holds a clone of the `Arc`, the source
+array cannot be deallocated while cached, so the pointer cannot be reused for a different array.
+
+```rust
+struct ByPtr(ArrayRef);
+
+impl Hash for ByPtr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Arc::as_ptr(&self.0).hash(state);
+    }
+}
+
+impl PartialEq for ByPtr {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
    }
}
+
+pub struct ExecutionCtx {
+    cache: HashMap<ByPtr, ArrayRef>, // source → one-step result
+    // ...
+}
```
-Each call to `execute_one_step` makes one unit of progress. The top-level
-`execute_to_columnar` simply loops until the result is columnar. After each step,
-`optimize_recursive` runs again — reduce rules can fire on the new tree shape. For truly
-bounded stack depth, an explicit work stack replaces the recursive calls.
+**When to cache.** Three options:
+
+1. **Always cache.** Memory explodes — most arrays are executed once and never revisited.
-### Per-encoding examples
+2. **Pre-scan the tree.** Walk the tree before execution, count pointer occurrences, cache only
+   shared nodes. Accurate for a single tree, but doesn't work across multiple trees executed
+   with the same `ExecutionCtx` (e.g., scan columns sharing a dictionary or filter mask).
-**DictArray** — requests codes first, then gathers:
+3. **`Arc::strong_count > 1` heuristic.** O(1) to check. Over-caches when external references
+   exist (layout reader, scan builder) but never under-caches for a genuinely shared
+   sub-expression. Works across independent tree executions within the same `ExecutionCtx`.
+   False positives cost memory, not correctness.
+
+We use option 3. The cache is dropped when the `ExecutionCtx` is dropped. 
+ +### Examples + +**DictArray** — execute codes into Primitive, then gather: ```rust fn execute(dict: &DictArray, ctx: &mut ExecutionCtx) -> VortexResult { - if !dict.codes().is_columnar() { + let Some(codes) = dict.codes().as_opt::() else { return Ok(ExecutionStep::ExecuteChild(0)); - } - let codes = dict.codes().as_primitive(); - let gathered = gather(dict.values(), &codes, ctx)?; + }; + let gathered = gather(dict.values(), codes, ctx)?; Ok(ExecutionStep::Done(gathered)) } ``` -**ScalarFnArray** — requests children left-to-right, then evaluates: +Note: if codes is a `ConstantArray`, the scheduler returns it as columnar. A reduce rule +`Dict(Constant(code), values) → Constant(values.scalar_at(code))` handles this before execute +runs. + +**ScalarFnArray** — columnarize children left-to-right, then evaluate: ```rust fn execute(sfn: &ScalarFnArray, ctx: &mut ExecutionCtx) -> VortexResult { for (i, child) in sfn.children().iter().enumerate() { - if !child.is_columnar() { + if child.as_opt::().is_none() { return Ok(ExecutionStep::ExecuteChild(i)); } } @@ -200,195 +207,114 @@ fn execute(sfn: &ScalarFnArray, ctx: &mut ExecutionCtx) -> VortexResult VortexResult { - if !filter.child().is_columnar() { + let Some(child) = filter.child().as_opt::() else { return Ok(ExecutionStep::ExecuteChild(0)); - } - let filtered = filter.mask().apply_to(filter.child().as_canonical())?; + }; + let filtered = filter.mask().apply_to(child.into())?; Ok(ExecutionStep::Done(Columnar::Canonical(filtered))) } ``` -**BitPacked** — leaf encoding, decompresses directly: +**BitPacked** — leaf, decompresses directly: ```rust fn execute(bp: &BitPackedArray, ctx: &mut ExecutionCtx) -> VortexResult { - let primitive = unpack(bp)?; - Ok(ExecutionStep::Done(Columnar::Canonical(Canonical::Primitive(primitive)))) + Ok(ExecutionStep::Done(Columnar::Canonical(Canonical::Primitive(unpack(bp)?)))) } ``` ### Cross-step optimization -Because `optimize_recursive` runs after each child execution, patterns that 
only become visible -after partial execution can still be optimized. - -Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. After the scheduler executes -BitPacked codes to PrimitiveArray, the tree becomes -`ScalarFn(upper, [Dict(Primitive(codes), values)])`. A reduce_parent rule on Dict pushes -`upper` into values — this optimization fires between steps. - -### Encoding-preserving execute_parent +`optimize_recursive` runs when the scheduler pops from the stack (after a child reaches +columnar and is replaced into its parent). Reduce rules fire on the new tree shape. -`execute_parent` returns `Option` in any encoding. This enables exporters to -intercept intermediate forms without decompressing. +Example: `ScalarFn(upper, [Dict(BitPacked(codes), values)])`. Pre-execution optimization fires +a reduce_parent rule on Dict, pushing `upper` into values. The tree becomes +`Dict(BitPacked(codes), upper(values))`. Dict then executes its codes into Primitive and +gathers. -Concrete example: `Filter(FSST(data), mask)`. FSST's `execute_parent` sees the FilterArray -parent and returns a filtered FSST array (still FSST-encoded). An exporter driving the -scheduler sees FSST on the next iteration and exports it as a DuckDB FSST vector — no -decompression needed. +### Encoding-preserving execution -Same applies to DictArray for DuckDB dictionary vector export, or any encoding where the -downstream consumer natively supports the compressed form. +`execute_parent` returns results in any encoding. Exporters drive the scheduler loop and +inspect the tree after each step. If an encoding the exporter cares about becomes visible +(DictArray for DuckDB dictionary vectors, FSST for DuckDB FSST vectors), the exporter +intercepts it without decompressing. 
-### Exporter integration - -Exporters drive the scheduler loop directly and inspect the array after each step: - -```rust -fn export_chunk(mut array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let mut array = optimize_recursive(array)?; +### Removing ExecutionCtx from VTable methods - loop { - if let Some(dict) = array.as_opt::() { - return export_dict(dict, ctx); - } - if let Some(fsst) = array.as_opt::() { - return export_fsst(fsst, ctx); - } - if let Some(c) = array.as_columnar() { - return export_columnar(c, ctx); - } +The `execute` and `execute_parent` signatures shown above accept `&mut ExecutionCtx`. This gives +encodings the ability to recursively execute children, bypassing the scheduler's caching and +cross-step optimization. Nothing in the type system prevents it. - if let Some(rewritten) = try_execute_parent(&array, ctx)? { - array = optimize_recursive(rewritten)?; - continue; - } +A stronger design: remove `ExecutionCtx` from the VTable method signatures entirely. The +scheduler owns the execution state (cache, tracing). `execute` receives no context. The method signature itself +communicates "return a step, don't execute anything." - match array.vtable().execute(&array, ctx)? { - ExecutionStep::ExecuteChild(i) => { - let child = array.child(i); - let executed = execute_to_columnar(child, ctx)?; - array = array.with_child(i, executed.into_array()); - array = optimize_recursive(array)?; - } - ExecutionStep::Done(c) => return export_columnar(c, ctx), - } - } -} -``` +This also eliminates the current ergonomic friction of +`let ctx = session.create_execution_ctx(); array.execute(&mut ctx)` — callers just call the +scheduler directly. -If an opaque encoding decodes to DictArray during execution, the exporter discovers it on the -next iteration and exports it natively. This is impossible in a model that always produces -canonical output. 
+If `execute_parent` also yields `ExecutionStep` (see unresolved questions), the same argument +applies: it gets resource access but not execution power. The scheduler is the only code that +drives execution. ### Decompress-into-buffer -The scheduler-driven model does not natively support caller-owned output buffers (each -`Done(Columnar)` allocates its own output). The two concrete cases where this matters — -ChunkedArray concat and FSST zero-copy push — can be addressed at the framework level: - -- **ChunkedArray**: The scheduler special-cases ChunkedArray by pre-allocating a single output - buffer and copying each chunk's `Columnar` result into it. One allocation + N memcpys rather - than N allocations + one concat. Not zero-copy, but eliminates the extra full-column copy. - -- **FSST**: The extra allocation is the views array (16 bytes per string). The actual string - data buffers are shared via `Arc` — they are not copied. The overhead is bounded and small - relative to decompression cost. - -These are framework-level optimizations, not VTable concerns. They don't require encoding -authors to think about builders. - -### Execution cache in ExecutionCtx - -The scheduler executes each child one step at a time, replacing it in the parent tree. This -means the result of executing an array is only visible to its immediate parent — if the same -`Arc` appears as a child of multiple parents (e.g., `a < 10 & a > 5` where `a` is -the same ArrayRef), it will be executed independently each time. - -`ExecutionCtx` holds a cache keyed by the raw pointer of the source array (`Arc::as_ptr()`). -The cache entry stores a clone of the source `Arc` (to pin the pointer address — preventing -deallocation and reuse) alongside the one-step execution result. - -```rust -pub struct ExecutionCtx { - // ... 
- cache: HashMap<*const dyn Array, CacheEntry>, -} - -struct CacheEntry { - /// Holds a strong reference to the source array, preventing the Arc from - /// being deallocated and its pointer reused for a different array. - _source: ArrayRef, - /// The result of executing the source array one step. - result: ArrayRef, -} -``` - -**When to cache.** Not every array should be cached — most are executed exactly once, and -caching adds overhead (HashMap insert + Arc clone). The `Arc::strong_count` is used as a -heuristic: if `strong_count > 1`, the array is referenced from multiple places and its -execution result is likely to be reused. This is an imperfect approximation — strong_count is a -global reference count, not local to the array tree being executed. An array may have -`strong_count > 1` because the layout reader or scan builder holds a reference, not because it -appears twice in the expression tree. But it never misses a genuinely shared sub-expression (if -shared, the count is > 1), and false positives only cost memory, not correctness. - -**Scope.** The cache lives in `ExecutionCtx`, not in individual arrays. Users can share an -`ExecutionCtx` across multiple independent array executions (e.g., multiple columns in a scan), -so the cache naturally deduplicates work across columns that share sub-arrays (shared -dictionaries, common filter masks). The cache is dropped when the `ExecutionCtx` is dropped. - -## Compatibility - -No file format or wire format changes. All changes are internal to the execution engine. - -Public API breakage: the `Executable` trait and `execute::()` method are replaced. -Third-party encodings must migrate. A default implementation bridging to the old path eases -migration. +This model does not support caller-owned output buffers. Each `Done(Columnar)` allocates its +own output. ChunkedArray cannot share a builder across chunks, and FSST cannot push views +directly into a caller's VarBinViewBuilder. 
This is a trade-off we accept in exchange for +encoding-preserving execution, cross-step optimization, and bounded stack depth. ## Alternatives ### Canonical builder model -Instead of returning `ExecutionStep`, `execute` could push results into a caller-owned -`CanonicalBuilder` (a closed enum mirroring `Canonical` in mutable builder form). Each encoding -decompresses directly into the builder in a single recursive descent. `execute_parent` would -also push into the builder. - -This model natively supports decompress-into-buffer: ChunkedArray writes all chunks into one -builder (no concat), FSST pushes views directly into VarBinViewBuilder (zero-copy). No -iteration loop, no MAX_ITERATIONS. - -However, requiring canonical output from `execute` is a structural limitation that cannot be -worked around: +`execute` pushes results into a caller-owned `CanonicalBuilder` (closed enum mirroring +`Canonical` in mutable builder form). Single recursive descent, no iteration loop. Natively +supports decompress-into-buffer: ChunkedArray writes all chunks into one builder, FSST pushes +views directly into VarBinViewBuilder. -- **No encoding-preserving execute_parent.** `execute_parent` must push canonical into the - builder. FSST can't return a filtered FSST array — the encoding is always lost. Exporters - must use encoding-specific logic outside the framework for every format they want to preserve. +However, always producing canonical output has structural limitations: -- **No cross-step optimization.** Reduce runs once before the single descent. Patterns visible - only after partial execution are missed. +- **No encoding-preserving execution.** `execute_parent` pushes into a builder, so results are + always canonical. FSST can handle a FilterArray parent (fused filter + decompress into the + builder), but it can't return a filtered FSST array for a DuckDB exporter. Exporters that + want to preserve an encoding must use encoding-specific logic outside the framework. 
+- **No cross-step optimization.** Reduce runs once before the single descent. +- **Stack overflow.** Recurses into children; stack depth equals encoding tree depth. -- **No exporter intermediate inspection.** Exporters see the tree after reduce, before - execution. Encodings hidden behind opaque wrappers are never visible. - -- **Stack overflow.** `execute` recurses into children. Stack depth equals encoding tree depth. - -The scheduler-driven model's allocation wins (encoding-preserving intermediates, cross-step -optimization, exporter interception, bounded stack) are structurally unrecoverable in the -builder model. The builder model's allocation wins (shared builder, zero-copy push) are -recoverable in the scheduler model via framework-level optimizations. This asymmetry makes the -scheduler-driven model strictly preferable. +The scheduler model's wins (encoding preservation, cross-step optimization, bounded stack) are +structurally unrecoverable in the builder model. The builder model's win (zero-copy +decompress-into-buffer) is a real cost of the scheduler model, but one we accept given the +asymmetry. ## Unresolved Questions -- **Explicit work stack:** The scheduler can be recursive (simple, but same stack overflow risk - as today) or use an explicit work stack (bounded depth, but more complex). The RFC assumes an - explicit stack is feasible but doesn't specify the implementation. - -- **Constants:** `ConstantArray` continues to exist. ScalarFnArray special-cases it to avoid - expanding constants. The `Columnar` enum (`Canonical | Constant`) is preserved. +- **Explicit work stack details.** The scheduler sketch shows the concept. The exact data + structure (e.g., handling multiple `ExecuteChild` calls from the same parent for different + children) needs design work. + +- **Iterative execute_parent.** The current design has `execute_parent` return + `Option`. 
An alternative is to return `Option`, allowing it to + request child execution before handling the parent. The execution cache may be sufficient for + cases where `execute_parent` needs data access to children (e.g., RunEnd binary-searching + compressed ends — the cache ensures decompression happens once). We have no compelling + example for or against iterative `execute_parent` yet. + +- **Targeted child execution.** `ExecuteChild(i)` currently executes the child to columnar. + An alternative is to allow the encoding to specify a matcher for when it should be re-entered + — e.g., `ExecuteChildInto(i, ArrayId)` would execute child `i` until it matches a specific + encoding or reaches columnar, whichever comes first. This enables early re-entry before full + columnarization — for example, an encoding could request its child be executed until it becomes + a DictArray, then operate on the dictionary directly. The trade-off is additional complexity in + the scheduler and ExecutionStep enum. + +- **FilterArray.** FilterArray continues to exist as a lazy wrapper. It is not subsumed by the + execution method. Whether to unify Filter/Slice/Take wrappers is orthogonal. 
From 531c24c5e43463ec81952d50365b781c0d423927 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 27 Feb 2026 22:04:52 -0500 Subject: [PATCH 8/9] Make ExecutionArgs a dyn trait Signed-off-by: Nicholas Gates --- ...02-aggregate-functions-and-list-scalars.md | 862 ------------------ ...xecution-v2.md => 0019-array-execution.md} | 3 +- 2 files changed, 1 insertion(+), 864 deletions(-) delete mode 100644 proposals/0002-aggregate-functions-and-list-scalars.md rename proposals/{0001-execution-v2.md => 0019-array-execution.md} (99%) diff --git a/proposals/0002-aggregate-functions-and-list-scalars.md b/proposals/0002-aggregate-functions-and-list-scalars.md deleted file mode 100644 index 0cc053a..0000000 --- a/proposals/0002-aggregate-functions-and-list-scalars.md +++ /dev/null @@ -1,862 +0,0 @@ -- Start Date: 2026-02-26 -- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000) -- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000) - -## Summary - -Introduce `AggregateFnVTable` to Vortex as a trait for defining aggregate operations, and a -single `ListAggregate` scalar function that applies any aggregate to a list column. There is -no dedicated `AggregateFnArray` — `ListAggregate` is a `ScalarFnVTable` implementation whose -lazy evaluation is handled by the existing `ScalarFnArray` infrastructure. Child encodings -match on `ExactScalarFn` in their reduce_parent rules for aggregate-specific -optimizations. - -The key insight is the **aggregate/list-scalar duality**: a list column stored as -`(offsets, elements)` is a pre-materialized grouping. Computing `list_sum(list_column)` is -literally a grouped `sum` over the flat elements array partitioned by offsets. Rather than -implementing N separate list scalar functions (`list_sum`, `list_min`, `list_max`, ...), -we implement N aggregate functions and a single `ListAggregate` scalar function that applies -any aggregate to a list column. 
- -The `Accumulator` trait is the core primitive. It processes one group at a time via -`accumulate(batch)` / `flush()` / `finish()`, supporting streaming through chunked elements, -very large lists, and ungrouped column-level aggregation. `execute_grouped` is a convenience -built on top of the accumulator. - -Streaming ordered GROUP BY falls out naturally: the query engine constructs a `ListArray` from -`(flat_column, group_offsets)` per batch — a zero-copy operation — and wraps it in -`ScalarFnArray(ListAggregate(Sum), [list_array])`. - -This RFC also evaluates whether the canonical list encoding should switch from `ListViewArray` -(offsets+sizes) to `ListArray` (offsets-only), since the choice directly affects how cleanly -the duality works. - -## Motivation - -### Aggregate functions exist but in the old framework - -Vortex has two compute systems: - -- **`ScalarFnVTable`**: element-wise functions (1 input row → 1 output row) with lazy - `ScalarFnArray` evaluation, expression trees, and reduce/reduce_parent optimization. -- **`ComputeFn`**: kernel-based operations like `sum`, `min_max`, `nan_count`. Standalone, - not part of the expression tree, dispatched to per-encoding kernels via `inventory`. - -Aggregate operations like `sum` already exist, but as `ComputeFn` kernels. They cannot -participate in the expression tree or optimizer. This means: - -- No lazy evaluation that defers computation and can be optimized via reduce_parent. -- No way to push aggregate operations through Dict values, RunEnd runs, or other encodings - via the reduce rule framework. -- No unified framework for list scalar functions and grouped aggregates. - -### List scalar functions are redundant with aggregate functions - -The only list-specific scalar function today is `list_contains`. Adding `list_sum`, -`list_min`, `list_max`, `list_count`, `list_mean`, etc. 
as individual `ScalarFnVTable` -implementations would duplicate the aggregation logic that already exists in `ComputeFn` -kernels. If Vortex had first-class aggregate functions, each list scalar function would -simply be "apply this aggregate to each list" — one `ListAggregate` scalar function -parameterized by the aggregate, rather than N separate implementations. - -### The aggregate/list-scalar duality - -Consider a list column `[[1,2,3], [4,5]]` stored as: - -``` -elements: [1, 2, 3, 4, 5] -offsets: [0, 3, 5] -``` - -Computing `list_sum` yields `[6, 9]`. This is identical to computing a grouped `sum` over -elements partitioned by offset ranges `[0..3, 3..5]`. Every aggregate function has a -corresponding list scalar function: - -| Aggregate | List scalar | Grouped operation | -|--------------|------------------------|-----------------------------------| -| `sum(col)` | `list_sum(list_col)` | Sum elements per group | -| `count(col)` | `list_count(list_col)` | Count non-null elements per group | -| `min(col)` | `list_min(list_col)` | Min element per group | -| `max(col)` | `list_max(list_col)` | Max element per group | -| `mean(col)` | `list_mean(list_col)` | Mean of elements per group | - -### Streaming ordered GROUP BY - -For databases with sorted storage, GROUP BY over a sorted key is a streaming operation: -groups arrive in order and don't cross batch boundaries. This maps directly to the -aggregate/list-scalar duality: - -1. For each batch, compute group boundaries from the sorted key (run boundaries → offsets). -2. Construct `ListArray::new(value_column, group_offsets, ...)` — zero-copy. -3. Wrap in `ScalarFnArray(ListAggregate(Sum), [list_array])`. -4. Execute → one result per group. - -No hash tables, no shuffling. The same `execute_grouped` path serves both list scalar -functions and streaming ordered GROUP BY. - -### Expensive list interop - -The current canonical list encoding is `ListViewArray` (offsets+sizes, out-of-order allowed). 
-Converting between `ListArray` and `ListViewArray` is expensive — see the rebuild
-infrastructure with modes `MakeZeroCopyToList`, `TrimElements`, `MakeExact`, and the
-`is_zero_copy_to_list` flag. This complexity exists primarily to support out-of-order
-offsets, which complicates the aggregate duality since `execute_grouped` needs monotonic
-offsets.
-
-## Design
-
-### `Accumulator` — the core primitive
-
-The `Accumulator` trait is the fundamental aggregation interface. It processes one group
-at a time: the caller feeds element batches, then flushes to finalize the group and start
-the next one. The accumulator owns its output buffer and returns all results at the end.
-
-```rust
-pub trait Accumulator: Send + Sync {
-    /// Feed a batch of element values for the current group.
-    /// Can be called multiple times per group (e.g., chunked elements).
-    fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()>;
-
-    /// Finalize the current group: push its result into the output buffer
-    /// and reset internal state for the next group.
-    fn flush(&mut self, ctx: &mut ExecutionCtx) -> VortexResult<()>;
-
-    /// Return all flushed results as a single array.
-    /// Length = number of flush() calls.
-    fn finish(self: Box<Self>, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef>;
-}
-```
-
-This handles all aggregation cases uniformly:
-
-**Case 1 — List scalar with contiguous elements:**
-```rust
-let mut acc = aggregate.accumulator(element_dtype)?;
-for i in 0..n_lists {
-    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
-    acc.accumulate(&group_elements, ctx)?;
-    acc.flush(ctx)?;
-}
-acc.finish(ctx) // → ArrayRef of length n_lists
-```
-
-**Case 2 — Large lists with chunked elements:**
-A list's elements may be a `ChunkedArray`. 
The accumulator handles this by feeding
-each chunk separately before flushing:
-```rust
-let mut acc = aggregate.accumulator(element_dtype)?;
-for i in 0..n_lists {
-    let group_elements = elements.slice(offsets[i]..offsets[i+1])?;
-    for chunk in iter_chunks(&group_elements) {
-        acc.accumulate(&chunk, ctx)?;
-    }
-    acc.flush(ctx)?;
-}
-acc.finish(ctx)
-```
-
-**Case 3 — Ungrouped full-column aggregation:**
-One group, fold across chunks:
-```rust
-let mut acc = aggregate.accumulator(dtype)?;
-for chunk in chunked_array.chunks() {
-    acc.accumulate(&chunk, ctx)?;
-}
-acc.flush(ctx)?;
-acc.finish(ctx) // → 1-element array
-```
-
-**Case 4 — ListView with scattered elements:**
-Canonicalize to ListArray first (rebuilding to sorted form), then use Case 1 or 2.
-The accumulator itself always sees contiguous element batches.
-
-### `AggregateFnVTable`
-
-A new trait parallel to `ScalarFnVTable`. The `accumulator()` method is the required
-core — `execute_grouped` and `execute_scalar` have default implementations built on it:
-
-```rust
-pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync {
-    type Options: 'static + Send + Sync + Clone + Debug + Display + PartialEq + Eq + Hash;
-
-    fn id(&self) -> AggregateFnId;
-
-    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>>;
-    fn deserialize(
-        &self,
-        metadata: &[u8],
-        session: &VortexSession,
-    ) -> VortexResult<Self::Options>;
-
-    /// Arity is always 1. For multi-column aggregates (e.g., covariance), the input
-    /// should be a struct array containing the columns.
-    fn arity(&self, _options: &Self::Options) -> Arity {
-        Arity::Exact(1)
-    }
-
-    /// Result dtype for each group (not a list — the scalar result type).
-    fn return_dtype(
-        &self,
-        options: &Self::Options,
-        input_dtypes: &[DType],
-    ) -> VortexResult<DType>;
-
-    /// Format in SQL style, e.g. `sum(col)`. 
- fn fmt_sql( - &self, - options: &Self::Options, - expr: &Expression, - f: &mut Formatter<'_>, - ) -> fmt::Result; - - /// Create an accumulator for streaming aggregation. - /// - /// This is the core primitive. All other execution methods have default - /// implementations built on this. - fn accumulator( - &self, - options: &Self::Options, - input_dtype: &DType, - ) -> VortexResult>; - - /// One-shot grouped execution over contiguous elements + monotonic offsets. - /// - /// Default: loop over groups, slice elements, accumulate each, flush, finish. - /// Override for vectorized one-shot paths (e.g., SIMD segmented sum). - fn execute_grouped( - &self, - options: &Self::Options, - input: &ArrayRef, - offsets: &ArrayRef, - ctx: &mut ExecutionCtx, - ) -> VortexResult { - let n_groups = offsets.len() - 1; - let mut acc = self.accumulator(options, input.dtype())?; - for i in 0..n_groups { - let start = scalar_at(offsets, i)?; - let end = scalar_at(offsets, i + 1)?; - acc.accumulate(&input.slice(start..end)?, ctx)?; - acc.flush(ctx)?; - } - acc.finish(ctx) - } - - /// Ungrouped full-column aggregation returning a scalar. - /// - /// Default: single-group execute_grouped with offsets [0, n]. - /// Replaces standalone `ComputeFn` kernels like `compute::sum()`. 
- fn execute_scalar( - &self, - options: &Self::Options, - input: &ArrayRef, - ctx: &mut ExecutionCtx, - ) -> VortexResult { - let n = input.len(); - let offsets: ArrayRef = buffer![0u64, n as u64].into_array(); - let result = self.execute_grouped(options, input, &offsets, ctx)?; - result.scalar_at(0) - } -} -``` - -Key differences from `ScalarFnVTable`: - -| Property | `ScalarFnVTable` | `AggregateFnVTable` | -|----------|-------------------|----------------------| -| Row semantics | 1 input → 1 output | N inputs → 1 output per group | -| Core primitive | `execute(args) -> ArrayRef` | `accumulator() -> Box` | -| Input shape | All children same length | Flat elements + offsets defining variable-size groups | -| Streaming | N/A | accumulate/flush/finish across chunks | - -### Streaming AVG example - -To illustrate how the accumulator works for a stateful aggregate, consider `Mean`: - -```rust -struct MeanAccumulator { - running_sum: f64, - running_count: u64, - output: Vec, -} - -impl Accumulator for MeanAccumulator { - fn accumulate(&mut self, batch: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()> { - self.running_sum += Sum.execute_scalar(&EmptyOptions, batch, ctx)?.as_f64()?; - self.running_count += Count.execute_scalar(&EmptyOptions, batch, ctx)?.as_u64()?; - Ok(()) - } - - fn flush(&mut self, _ctx: &mut ExecutionCtx) -> VortexResult<()> { - let mean = if self.running_count > 0 { - self.running_sum / self.running_count as f64 - } else { - f64::NAN - }; - self.output.push(mean); - self.running_sum = 0.0; - self.running_count = 0; - Ok(()) - } - - fn finish(self: Box, _ctx: &mut ExecutionCtx) -> VortexResult { - Ok(PrimitiveArray::from_vec(self.output).into_array()) - } -} -``` - -For a list column `[[1.0, 2.0, 3.0], [4.0, 5.0]]` where the first list's elements are -chunked as `[1.0, 2.0]` and `[3.0]`: - -1. `accumulate([1.0, 2.0])` → running_sum=3.0, running_count=2 -2. `accumulate([3.0])` → running_sum=6.0, running_count=3 -3. 
`flush()` → push 2.0, reset -4. `accumulate([4.0, 5.0])` → running_sum=9.0, running_count=2 -5. `flush()` → push 4.5, reset -6. `finish()` → `[2.0, 4.5]` - -### Built-in aggregates - -The initial set, each implementing `AggregateFnVTable`: - -```rust -pub struct Sum; // sum of elements per group -pub struct Count; // count of non-null elements per group -pub struct Min; // minimum element per group -pub struct Max; // maximum element per group -pub struct Mean; // mean of elements per group (returns f64) -pub struct Any; // logical OR per group (bool input) -pub struct All; // logical AND per group (bool input) -pub struct CollectList; // identity: collect elements into list (returns list) -``` - -**Migration strategy**: `execute_scalar` replaces standalone `ComputeFn` kernel dispatch -(e.g., `Sum::execute_scalar` replaces `compute::sum()`). The `accumulator` implementations -can start simple (canonicalize + iterate) and be optimized over time with encoding-aware -fast paths. - -### `ListAggregate` scalar function - -A single `ScalarFnVTable` implementation that bridges list columns to the aggregate system. -Because `ListAggregate` is a scalar function, wrapping it in an expression produces a -`ScalarFnArray` — reusing the existing lazy evaluation, slicing, and reduce infrastructure -with no new array type needed. 
- -```rust -pub struct ListAggregate; - -pub struct ListAggregateOptions { - pub aggregate_fn: AggregateFnRef, -} - -impl ScalarFnVTable for ListAggregate { - type Options = ListAggregateOptions; - - fn id(&self) -> ScalarFnId { - ScalarFnId::from("vortex.list.aggregate") - } - - fn arity(&self, _options: &Self::Options) -> Arity { - Arity::Exact(1) - } - - fn return_dtype( - &self, - options: &Self::Options, - arg_dtypes: &[DType], - ) -> VortexResult { - let element_dtype = arg_dtypes[0].as_list_element() - .ok_or_else(|| vortex_err!("ListAggregate input must be a list"))?; - options.aggregate_fn.return_dtype(&[element_dtype.clone()]) - } - - fn execute( - &self, - options: &Self::Options, - args: ExecutionArgs, - ) -> VortexResult { - let list_input = &args.inputs[0]; - let ctx = args.ctx; - // Execute the child into a ListArray, decompose, and aggregate. - let list = list_input.to_list()?; - let elements = list.elements(); - let offsets = list.offsets(); - - // If elements are chunked, use the accumulator to stream through chunks. - // Otherwise, use one-shot execute_grouped. - if elements.is::() { - let mut acc = options.aggregate_fn.accumulator(elements.dtype())?; - let n_groups = offsets.len() - 1; - for i in 0..n_groups { - let group_elements = elements.slice( - offset_at(&offsets, i)..offset_at(&offsets, i+1) - )?; - for chunk in iter_chunks(&group_elements) { - acc.accumulate(&chunk, ctx)?; - } - acc.flush(ctx)?; - } - acc.finish(ctx) - } else { - options.aggregate_fn.execute_grouped(&elements, &offsets, ctx) - } - } - - // ... 
-} -``` - -Expression-level sugar: - -```rust -pub fn list_sum(list: Expression) -> Expression { - ListAggregate.new_expr( - ListAggregateOptions { aggregate_fn: Sum.bind(EmptyOptions) }, - [list], - ) -} - -pub fn list_count(list: Expression) -> Expression { - ListAggregate.new_expr( - ListAggregateOptions { aggregate_fn: Count.bind(EmptyOptions) }, - [list], - ) -} - -// list_min, list_max, list_mean, list_any, list_all analogously -``` - -This is one scalar function parameterized by the aggregate — not N separate functions. -Analogous to DuckDB's `list_aggregate(list, 'sum')`. - -**Why this works without a dedicated array type:** - -1. **Lazy evaluation.** `ScalarFnArray(ListAggregate(Sum), [list_col])` defers computation - until `execute()` is called, just as it does for any other scalar function. -2. **Slicing.** `ScalarFnArray` already slices its children. Slicing the list child - correctly adjusts both grouping and elements. -3. **Reduce rules.** `ListAggregate` can implement `ScalarFnVTable::reduce` for self-reduce - optimizations (constant folding, statistics). Encoding-specific reduce_parent rules - use `ExactScalarFn` to match on the parent (see below). -4. **Type safety.** The single child has `DType::List` — enforced by `return_dtype`. - -### Streaming ordered GROUP BY - -For databases with sorted storage, GROUP BY over a sorted key maps directly to the -aggregate/list-scalar duality. The query engine processes batches where: - -1. Groups are defined by a sorted key column (e.g., `customer_id`). -2. Groups are ordered and complete within each batch (no cross-batch groups). -3. No hash tables or shuffling needed. - -**Physical execution per batch:** - -```rust -fn execute_ordered_group_by( - value_column: &ArrayRef, - key_column: &ArrayRef, // sorted - aggregate: &AggregateFnRef, - ctx: &mut ExecutionCtx, -) -> VortexResult { - // Step 1: Compute group offsets from run boundaries in the key column. 
- // E.g., key = [A, A, A, B, B, C] → offsets = [0, 3, 5, 6] - let group_offsets = compute_run_offsets(key_column)?; - - // Step 2: Wrap as a list — zero-copy, just references. - let list = ListArray::new(value_column.clone(), group_offsets, Validity::NonNullable); - - // Step 3: Create lazy ScalarFnArray with ListAggregate. - let scalar_fn = ListAggregate.bind(ListAggregateOptions { - aggregate_fn: aggregate.clone(), - }); - let result = ScalarFnArray::try_new(scalar_fn, vec![list.into_array()])?; - - // Step 4: Execute (or let the optimizer reduce first). - result.into_array().execute(ctx) -} -``` - -The same `execute_grouped` serves both list scalar functions (where the list already exists -in the schema) and GROUP BY (where the list is constructed from a flat column + -sort-derived offsets). - -### Reduce rules - -Since `ListAggregate` is a `ScalarFnVTable`, its reduce rules use the existing -`ScalarFnArray` optimization infrastructure. - -**Self-reduce rules (via `ScalarFnVTable::reduce`):** - -`ListAggregate` implements `ScalarFnVTable::reduce` to handle aggregate-specific -optimizations when the list child's structure is known: - -- **Constant list folding**: If the list child is a `ConstantArray` of list scalars, - compute the aggregate directly. `ListAggregate(Sum, Constant([[1,2,3]], 1000))` - → `Constant(6, 1000)`. - -- **Count from metadata**: `ListAggregate(Count, list)` can use list sizes from metadata - without touching element data. - -- **Min/Max from statistics**: If element-level statistics are available, `Min`/`Max` can - short-circuit without decompression. - -- **Sum of constant elements**: `ListAggregate(Sum, list)` where elements are constant - → `constant_value * group_size` per group. - -**Parent-reduce rules (encoding-specific, via `ExactScalarFn`):** - -Child encodings register reduce_parent rules that match on `ExactScalarFn`. 
-The `ExactScalarFn` matcher provides typed access to `ListAggregateOptions`, from which the -specific `AggregateFnRef` can be inspected: - -```rust -impl ArrayParentReduceRule for DictAggregateRule { - type Parent = ExactScalarFn; - - fn reduce_parent( - &self, - array: &DictArray, - parent: ScalarFnArrayView<'_, ListAggregate>, - child_idx: usize, - ) -> VortexResult> { - let aggregate_fn = &parent.options.aggregate_fn; - - // Min/Max over Dict elements → Min/Max(values) - if aggregate_fn.is::() || aggregate_fn.is::() { - // Min/max of dictionary values is the global min/max. - let values = array.values(); - return Ok(Some( - ListAggregate.new_expr_with( - parent.options.clone(), - // Rebuild list with dict values as elements - // ... - ).into_array() - )); - } - - Ok(None) - } -} -``` - -Example encoding-specific rules: - -- **Dict**: `ListAggregate(Min, List(Dict(codes, values)))` → `ListAggregate(Min, List(values))`. - `Max` similarly. `Sum` cannot push down without frequency weighting. - -- **RunEnd**: `ListAggregate(Sum, List(RunEnd(values, run_ends)))` → weighted sum. - `ListAggregate(Min/Max, List(RunEnd))` → `ListAggregate(Min/Max, List(values))`. - -These rules operate on the **elements** within the list child. The list's offsets define -the grouping; the elements are what the encoding wraps. A list child like -`ListArray(elements: DictArray, offsets: ...)` allows the Dict reduce_parent rule to fire -on the elements when the `ScalarFnArray(ListAggregate)` is being optimized. - -### Interaction with `list_contains` - -The existing `list_contains` function is NOT an aggregate — it's a true scalar function -(takes a list + needle, returns bool per row). It stays as-is. `ListAggregate` is -specifically for operations that reduce list elements to a scalar per list. - -## Canonical list representation - -The choice of canonical list encoding directly affects how cleanly the aggregate duality -works. Currently, `ListViewArray` is canonical for `DType::List`. 
This section evaluates -whether to switch to `ListArray`. - -### Option A: Switch canonical to `ListArray` (offsets-only) - -Change `Canonical::List` from `ListViewArray` to `ListArray`. - -`ListArray` stores elements + an (n+1)-length monotonically increasing offsets array. -List `i` spans `elements[offsets[i]..offsets[i+1]]`. - -**Advantages for the aggregate duality:** - -- `execute_grouped` receives offsets directly from the list — no conversion needed. -- Offsets are monotonic by construction, matching the `execute_grouped` contract. -- One array (n+1 integers) vs two arrays (2n integers) — less memory. -- Offset deltas (list sizes) compress extremely well with delta/FoR encoding. -- Direct match to Arrow `ListArray` and DuckDB's list format — zero-copy Arrow export. - -**Disadvantages:** - -- **Breaking change.** All code calling `to_canonical()` on list data gets `ListArray` - instead of `ListViewArray`. Migration required. -- **No out-of-order offsets.** Operations like filter/take that scramble row order must - rebuild the elements array (or produce a `ListViewArray` as a non-canonical intermediate). -- **Sequential dependency in offsets.** Reading `offsets[i+1]` depends on `offsets[i]` for - computing list size — slightly less SIMD-friendly than separate sizes. - -**Migration path:** - -1. Add `to_list()` on `ToCanonical` alongside `to_listview()`. -2. Change `Canonical::List` to hold `ListArray`. -3. `ListViewArray` becomes a non-canonical encoding (like `VarBinArray` for strings). -4. Update `list_contains` and all list compute paths. - -### Option B: Keep `ListViewArray` as canonical - -Keep the current design where `ListViewArray` (offsets + sizes, out-of-order allowed) -is canonical for `DType::List`. - -**Advantages:** - -- No breaking change. -- SIMD-friendly: sizes can be read independently without sequential offset dependency. -- Out-of-order offsets enable better compression when list order doesn't match element order. 
-- Filter/take produce valid `ListViewArray` directly without rebuilding. - -**Disadvantages for the aggregate duality:** - -- `execute_grouped` requires monotonic offsets. Out-of-order `ListViewArray` must be - rebuilt to sorted form first (the `MakeZeroCopyToList` path), which copies data. -- The ZCTL flag, rebuild modes, and validation code remain as ongoing complexity. -- Arrow/DuckDB export still requires conversion. -- Two arrays (2n integers) instead of one (n+1 integers). - -**Impact on `ListAggregate::execute`:** - -`ListAggregate::execute` would need to handle the non-sorted case: - -```rust -fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult { - let list_input = &args.inputs[0]; - let ctx = args.ctx; - let listview = list_input.to_listview()?; - if listview.is_zero_copy_to_list() { - // Fast path: offsets are sorted, use directly - let offsets = build_list_offsets_from_list_view(&listview); - options.aggregate_fn.execute_grouped(&listview.elements(), &offsets, ctx) - } else { - // Slow path: rebuild to sorted form first - let list = list_from_list_view(listview)?; - options.aggregate_fn.execute_grouped(&list.elements(), &list.offsets(), ctx) - } -} -``` - -### ListView as direct aggregate input - -An alternative to canonicalizing to ListArray before executing is to support `ListView` -directly in `execute_grouped`. A ListView with out-of-order offsets represents -variable-length groups that may reference scattered regions of the elements array. - -**Advantages:** - -- More powerful — supports aggregation over non-contiguous element ranges. -- Avoids the expensive rebuild-to-sorted-form step for scrambled ListViews. - -**Disadvantages:** - -- **Complexity in aggregate implementations.** Each aggregate must handle potentially - scattered, non-contiguous element access patterns instead of simple contiguous slices. 
-- **Unpredictable decompression cost.** A ListView with sparse, scattered offsets may - trigger excessive random-access decompression of the elements array. In the worst case - this is more expensive than one-pass decompression of all elements into contiguous form. -- **SIMD unfriendly.** Contiguous offset ranges map to sequential memory access patterns - that vectorize well. Scattered access does not. - -For now, this RFC assumes `execute_grouped` takes monotonic (n+1) offsets. Aggregates -over ListView canonicalize to sorted form first. Supporting scattered access as an -optimization can be explored in the future. - -### Comparison - -| Criterion | ListArray (Option A) | ListViewArray (Option B) | -|-----------|---------------------|--------------------------| -| Aggregate duality | Direct — offsets define groups | Indirect — must ensure sorted or rebuild | -| Arrow/DuckDB export | Zero-copy | Requires conversion | -| Memory | n+1 integers | 2n integers | -| Filter/take | Must rebuild elements | Produces valid array directly | -| SIMD for size access | Sequential offset dependency | Independent size reads | -| Offset compression | Delta/FoR on monotonic data | May be non-monotonic | -| Breaking change | Yes | No | - -## Compatibility - -This RFC does not change the file format or wire format. `ListAggregate` produces a -`ScalarFnArray` at runtime (like any other scalar function). It is not persisted to disk. - -**Public API additions:** - -- `Accumulator` trait — core streaming aggregation primitive. -- `AggregateFnVTable` trait and built-in implementations (Sum, Count, Min, Max, Mean, - Any, All, CollectList). -- `ListAggregate` scalar function. -- Expression constructors: `list_sum()`, `list_count()`, `list_min()`, `list_max()`, - `list_mean()`, `list_any()`, `list_all()`. - -**If canonical list changes (Option A):** - -- `Canonical::List` changes from `ListViewArray` to `ListArray`. 
-- `to_canonical()` on list data returns `ListArray` instead of `ListViewArray`. -- `ListViewArray` remains available as a non-canonical encoding. -- Migration: callers using `to_listview()` should transition to `to_list()`. - -## Drawbacks - -- **New trait surface area.** `AggregateFnVTable` and `Accumulator` are new traits to - learn, though they closely mirror `ScalarFnVTable`. - -- **Reduce rule coverage.** Not all encoding × aggregate combinations will have optimized - reduce_parent rules initially. The fallback (canonicalize list, then accumulator loop) - is correct but may be slower. - -- **Canonical list change (if Option A).** Breaking change affecting all code that calls - `to_canonical()` on list data. - -## Alternatives - -### List scalar functions as separate `ScalarFnVTable` implementations - -Implement `ListSum`, `ListCount`, `ListMin`, etc. as individual scalar functions -(like `list_contains`), each operating directly on list arrays without an aggregate -abstraction. - -**Rejected because:** Duplicates logic across N functions. No shared optimization rules. -No path to GROUP BY. Each function must independently handle list decomposition. This is -exactly the redundancy that motivates this RFC. - -### Aggregates as `ComputeFn` only - -Keep aggregates in the kernel-based `ComputeFn` system. Implement list scalar functions -by dispatching to `ComputeFn` with offset ranges. - -**Rejected because:** No lazy evaluation, no expression tree participation, no -reduce_parent optimization. This is the status quo for `sum`/`min_max` and has the -limitations this RFC addresses. 
- -### Dedicated `AggregateFnArray` array type - -Introduce a new array type `AggregateFnArray` parallel to `ScalarFnArray`, wrapping -a single list child and an `AggregateFnRef`: - -```rust -struct AggregateFnArray { - aggregate_fn: AggregateFnRef, - dtype: DType, - child: ArrayRef, // must have DType::List - stats: ArrayStats, -} -``` - -With a `ListAggregate` scalar function that returns an `AggregateFnArray` from its -`execute` method, creating two lazy wrappers in sequence: -`ScalarFnArray(ListAggregate) → AggregateFnArray → actual computation`. - -**Rejected because:** `AggregateFnArray` is structurally identical to -`ScalarFnArray` with one child — it duplicates the lazy evaluation, slicing, and reduce -infrastructure. Child encodings can already match on specific scalar functions using -`ExactScalarFn`, which provides typed access to `ListAggregateOptions` -and the embedded `AggregateFnRef`. No new array type is needed. - -### No accumulator — one-shot `execute_grouped` only - -Make `execute_grouped` the only execution method, with no streaming accumulation. - -**Rejected because:** Doesn't handle chunked elements. A list column may have elements -stored as a `ChunkedArray`. Without an accumulator, all elements must be materialized -into a single contiguous array before aggregation, defeating the purpose of chunked -storage. Additionally, aggregates like `Mean` need `running_sum` and `running_count` -state across element batches, and numerically stable summation (Kahan) requires ordered -accumulation. The existing `sum_with_accumulator` pattern in `ComputeFn` demonstrates -this need. - -### Grouped accumulator tracking N groups simultaneously - -An accumulator that manages all groups at once: `accumulate(input, offsets)` where offsets -has N+1 entries per call, and sub-offsets are split at chunk boundaries. - -**Rejected because:** Over-engineered for our use cases. 
For list scalar functions and -ordered GROUP BY, groups are processed sequentially — each group's elements are fully -available before the next group starts. The per-group flush model is simpler and handles -all cases. If vectorized all-groups-at-once processing proves necessary, it can be added -as a separate `GroupsAccumulator` trait (see Future Possibilities). - -## Prior Art - -- **Apache Arrow**: Separates `ListArray` (offsets-only) from `ListView` (offsets+sizes). - Arrow Compute has aggregate kernels separate from scalar kernels. ListView is a recent - addition, not widely adopted. - -- **DuckDB**: Uses offsets-only lists. Has `list_aggregate(list, 'func_name')` — a single - function parameterized by the aggregate name. Separates scalar and aggregate function - registries. - -- **Apache DataFusion**: Two-tier accumulator design. `AggregateUDFImpl` has a required - `accumulator()` factory method (like ours). The `Accumulator` trait handles one group at - a time with `update_batch()` → `evaluate()`, plus `state()` and `merge_batch()` for - partial aggregation (serializing intermediate state for distributed execution). The - optional `GroupsAccumulator` trait manages all groups simultaneously with - `group_indices: &[usize]` per row for vectorized hash-based grouping, plus - `evaluate(EmitTo::First(n))` for streaming emission. Our `accumulate/flush/finish` - corresponds to DataFusion's per-group `Accumulator` with built-in output buffering. - Their `state/merge` and `GroupsAccumulator` are future extensions for us (see below). - -- **Velox**: `AggregateFunction` with `addRawInput`, `addIntermediateResults`, - `extractValues` — full streaming accumulator model. Uses row-major (offset, size) - pairs for lists. - -- **Polars**: Separate scalar and aggregate expression systems. List namespace functions - like `list.sum()`, `list.min()` internally dispatch to grouped aggregation over the - list's flat values. 
- -## Unresolved Questions - -- **`list_contains` integration**: Should `list_contains` be reimplemented as a - `ListAggregate` with a `Contains` aggregate, or does it remain a separate scalar - function? It doesn't fit the "reduce N to 1" pattern cleanly since it takes both a - list and a needle. - -- **Multi-column aggregates**: Some aggregates operate on multiple columns - (e.g., covariance, weighted sum). With arity fixed at 1, these require a struct array - as input. Is this ergonomic enough, or do we need a separate mechanism? - -- **Window functions**: Window functions (rolling sum, rank) operate on sliding windows - rather than fixed groups. They share the "operate within parent boundaries" property - but have different execution semantics. Should they be a separate trait or an - extension of `AggregateFnVTable`? - -- **Canonical list decision**: Option A (ListArray) vs Option B (ListViewArray). This - can be decided independently of the aggregate framework and implemented as a - preparatory or follow-up change. - -- **`CollectList` semantics**: `CollectList` is the identity aggregate — it returns - the elements as a list. For the list-scalar case, `list_collect(list_col)` is a no-op. - For GROUP BY, it materializes the grouped elements into a list column. Should this - be a built-in aggregate or handled separately? - -- **Null list handling**: When a list entry is null, should `flush()` push null into - the output? Or should the caller skip accumulation and use a separate mechanism to - mark null outputs? The accumulator currently has no way to signal "this group is null - without receiving any elements." - -## Future Possibilities - -- **Partial aggregation with `state()` / `merge()`**: Adding `state()` and - `merge_batch()` methods to `Accumulator` (like DataFusion) for distributed aggregation - where partial results must be serialized and merged across nodes. Not needed for - the sequential processing model in this RFC. 
- -- **`GroupsAccumulator`**: A separate trait (like DataFusion's) that manages all groups - simultaneously with `group_indices` per row, enabling vectorized hash-based grouping. - Our per-group flush model handles ordered GROUP BY; this extension would handle - unordered GROUP BY with hash tables. - -- **Aggregate push-down in file scanning**: Using `ListAggregate` reduce rules to - push aggregates into `LayoutReader`, computing aggregates during file scan without - materializing full columns. - -- **`list_distinct`, `list_sort`, `list_reverse`**: List transformation functions that - don't reduce elements to a scalar. These are not aggregates and would remain as - `ScalarFnVTable` implementations. - -- **Nested aggregation**: `list_sum(list_of_lists)` producing a list of sums at the - inner level. The duality recurses naturally. - -- **GPU execution**: `execute_grouped` with monotonic offsets maps cleanly to GPU - segmented reduction primitives (e.g., `cub::DeviceSegmentedReduce`). - -- **ListView scatter optimization**: If profiling shows that rebuilding scrambled - ListViews to sorted form is a bottleneck, aggregate implementations could accept - `(offsets, sizes)` pairs for scattered access. This would be opt-in per aggregate. 
diff --git a/proposals/0001-execution-v2.md b/proposals/0019-array-execution.md
similarity index 99%
rename from proposals/0001-execution-v2.md
rename to proposals/0019-array-execution.md
index e06eaba..1123572 100644
--- a/proposals/0001-execution-v2.md
+++ b/proposals/0019-array-execution.md
@@ -1,6 +1,5 @@
 - Start Date: 2026-02-25
-- RFC PR: [vortex-data/rfcs#0000](https://github.com/vortex-data/rfcs/pull/0000)
-- Tracking Issue: [vortex-data/vortex#0000](https://github.com/vortex-data/vortex/issues/0000)
+- RFC PR: [vortex-data/rfcs#19](https://github.com/vortex-data/rfcs/pull/19)
 
 ## Summary
 
From 69485dcfc1d187186bb34151bbc20120970d3c82 Mon Sep 17 00:00:00 2001
From: Nicholas Gates
Date: Fri, 27 Feb 2026 22:05:40 -0500
Subject: [PATCH 9/9] Make ExecutionArgs a dyn trait

Signed-off-by: Nicholas Gates
---
 index.ts   |  5 +----
 styles.css | 64 ------------------------------------------------------
 2 files changed, 1 insertion(+), 68 deletions(-)

diff --git a/index.ts b/index.ts
index 980e60a..0b1f5a9 100644
--- a/index.ts
+++ b/index.ts
@@ -86,8 +86,6 @@ function baseHTML(title: string, content: string, cssPath: string = "styles.css"
-
-
     <div class="container">
       <div class="site-header">
@@ -108,8 +106,7 @@ ${content} Vortex RFC Archive
- - ${liveReload ? `\n ` : ""} + ${liveReload ? `\n ` : ""} `; } diff --git a/styles.css b/styles.css index 94495b3..d74bf70 100644 --- a/styles.css +++ b/styles.css @@ -8,16 +8,6 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; - --hl-keyword: #a626a4; - --hl-string: #50a14f; - --hl-comment: #a0a1a7; - --hl-function: #4078f2; - --hl-number: #986801; - --hl-type: #c18401; - --hl-built-in: #e45649; - --hl-attr: #986801; - --hl-symbol: #0184bc; - --hl-meta: #a0a1a7; } @media (prefers-color-scheme: dark) { @@ -31,16 +21,6 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; - --hl-keyword: #c678dd; - --hl-string: #98c379; - --hl-comment: #5c6370; - --hl-function: #61afef; - --hl-number: #d19a66; - --hl-type: #e5c07b; - --hl-built-in: #e06c75; - --hl-attr: #d19a66; - --hl-symbol: #56b6c2; - --hl-meta: #5c6370; } } @@ -54,16 +34,6 @@ --link-hover: #9acbf7; --code-bg: #252525; --accent: #ccc; - --hl-keyword: #c678dd; - --hl-string: #98c379; - --hl-comment: #5c6370; - --hl-function: #61afef; - --hl-number: #d19a66; - --hl-type: #e5c07b; - --hl-built-in: #e06c75; - --hl-attr: #d19a66; - --hl-symbol: #56b6c2; - --hl-meta: #5c6370; } :root[data-theme="light"] { @@ -76,16 +46,6 @@ --link-hover: #004499; --code-bg: #f5f5f5; --accent: #333; - --hl-keyword: #a626a4; - --hl-string: #50a14f; - --hl-comment: #a0a1a7; - --hl-function: #4078f2; - --hl-number: #986801; - --hl-type: #c18401; - --hl-built-in: #e45649; - --hl-attr: #986801; - --hl-symbol: #0184bc; - --hl-meta: #a0a1a7; } * { @@ -499,27 +459,3 @@ footer { color: var(--fg-muted); font-size: 0.875rem; } - -/* Syntax highlighting (highlight.js token classes) */ -.hljs { background: var(--code-bg); color: var(--fg); } -.hljs-keyword, -.hljs-selector-tag { color: var(--hl-keyword); } -.hljs-string, -.hljs-addition { color: var(--hl-string); } -.hljs-comment, -.hljs-quote { color: var(--hl-comment); font-style: italic; } -.hljs-title, -.hljs-title.function_ { color: var(--hl-function); } -.hljs-number, 
-.hljs-literal { color: var(--hl-number); } -.hljs-type, -.hljs-title.class_ { color: var(--hl-type); } -.hljs-built_in { color: var(--hl-built-in); } -.hljs-attr, -.hljs-variable, -.hljs-template-variable { color: var(--hl-attr); } -.hljs-symbol, -.hljs-link { color: var(--hl-symbol); } -.hljs-meta, -.hljs-deletion { color: var(--hl-meta); } -.hljs-punctuation { color: var(--fg-muted); }