From 635930ca0310e868b6ea043e178ea672636353b0 Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Mon, 16 Feb 2026 21:54:40 +0000 Subject: [PATCH 1/3] Add feature-flagged Table Metadata --- Cargo.toml | 5 + pyo3/Cargo.lock | 2 +- pyo3/Cargo.toml | 1 + pyo3/src/ffi/to_py.rs | 82 +++++++++++-- pyo3/src/ffi/to_rust.rs | 79 +++++++++++-- src/ffi/arrow_c_ffi.rs | 95 ++++++++++++++- src/kernels/broadcast/array.rs | 80 ++++++------- src/kernels/broadcast/array_view.rs | 70 +++++------ src/kernels/broadcast/field_array.rs | 60 +++++----- src/kernels/broadcast/scalar.rs | 60 +++++----- src/kernels/broadcast/super_array.rs | 10 +- src/kernels/broadcast/super_array_view.rs | 20 ++-- src/kernels/broadcast/super_table.rs | 10 +- src/kernels/broadcast/super_table_view.rs | 100 ++++++++-------- src/kernels/broadcast/table.rs | 20 ++-- src/kernels/broadcast/table_view.rs | 80 ++++++------- src/structs/chunked/super_table.rs | 29 +++-- src/structs/table.rs | 109 +++++++++++++----- src/structs/views/chunked/super_table_view.rs | 6 +- src/structs/views/table_view.rs | 13 +-- 20 files changed, 594 insertions(+), 337 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ff2dc4d..65b23f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -195,6 +195,11 @@ size = [] # Adds Arena, Table::from_arena, and an optimised consolidate path. arena = [] +# Adds an optional `metadata: BTreeMap` field to `Table`. +# Captures schema-level metadata that Arrow producers like PyArrow embed +# in the top-level ArrowSchema.metadata, e.g. pandas categorical ordering. +table_metadata = [] + # Adds pandas-style selection for Table and TableV with .c() and .r() methods select = [] diff --git a/pyo3/Cargo.lock b/pyo3/Cargo.lock index 2150879..e99e775 100644 --- a/pyo3/Cargo.lock +++ b/pyo3/Cargo.lock @@ -46,7 +46,7 @@ dependencies = [ [[package]] name = "minarrow" -version = "0.9.1" +version = "0.9.2" dependencies = [ "libc", "num-traits", diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml index ce6b928..03ab533 100644 --- a/pyo3/Cargo.toml +++ b/pyo3/Cargo.toml @@ -32,6 +32,7 @@ extension-module = ["pyo3/extension-module"] datetime = ["minarrow/datetime"] extended_numeric_types = ["minarrow/extended_numeric_types"] extended_categorical = ["minarrow/extended_categorical"] +table_metadata = ["minarrow/table_metadata"] [build-dependencies] pyo3-build-config = "0.23" diff --git a/pyo3/src/ffi/to_py.rs b/pyo3/src/ffi/to_py.rs index 398e33d..fe76e91 100644 --- a/pyo3/src/ffi/to_py.rs +++ b/pyo3/src/ffi/to_py.rs @@ -127,15 +127,20 @@ fn arrow_type_to_pyarrow<'py>( } } -/// Builds an Arrow metadata map containing the table name, if non-empty. -fn table_name_metadata(name: &str) -> Option> { - if name.is_empty() { - None - } else { - let mut m = std::collections::BTreeMap::new(); - m.insert(TABLE_NAME_KEY.to_string(), name.to_string()); - Some(m) +/// Builds the Arrow schema metadata map for a stream export. +/// +/// Always includes the table name when non-empty. When the `table_metadata` +/// feature is enabled, the table's metadata entries are included too. +fn build_stream_metadata(table: &Table) -> Option> { + #[cfg(feature = "table_metadata")] + let mut m = table.metadata.clone(); + #[cfg(not(feature = "table_metadata"))] + let mut m = std::collections::BTreeMap::new(); + + if !table.name.is_empty() { + m.insert(TABLE_NAME_KEY.to_string(), table.name.clone()); } + if m.is_empty() { None } else { Some(m) } } // PyArrow conversion - legacy C data interface @@ -218,8 +223,24 @@ pub fn table_to_py<'py>(table: &Table, py: Python<'py>) -> PyResult = table + .metadata + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + if !table.name.is_empty() { + meta_entries.push((TABLE_NAME_KEY.to_string(), table.name.clone())); + } + if !meta_entries.is_empty() { + let metadata = meta_entries.into_py_dict(py)?; + schema = schema.call_method1("with_metadata", (metadata,))?; + } + } + #[cfg(not(feature = "table_metadata"))] if !table.name.is_empty() { let metadata = [(TABLE_NAME_KEY, &table.name)].into_py_dict(py)?; schema = schema.call_method1("with_metadata", (metadata,))?; @@ -254,6 +275,22 @@ pub fn super_table_to_py<'py>( } let py_fields_list = PyList::new(py, &py_fields)?; let mut schema = pyarrow.call_method1("schema", (py_fields_list,))?; + #[cfg(feature = "table_metadata")] + { + let mut meta_entries: Vec<(String, String)> = super_table + .metadata() + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + if !super_table.name.is_empty() { + meta_entries.push((TABLE_NAME_KEY.to_string(), super_table.name.clone())); + } + if !meta_entries.is_empty() { + let metadata = meta_entries.into_py_dict(py)?; + schema = schema.call_method1("with_metadata", (metadata,))?; + } + } + #[cfg(not(feature = "table_metadata"))] if !super_table.name.is_empty() { let metadata = [(TABLE_NAME_KEY, &super_table.name)].into_py_dict(py)?; schema = schema.call_method1("with_metadata", (metadata,))?; @@ -287,7 +324,28 @@ pub fn super_table_to_py<'py>( PyMinarrowError::PyArrow(format!("Failed to create PyArrow Table: {}", e)) })?; - // Attach table name as schema metadata if present + // Attach table name and metadata as schema metadata if present + #[cfg(feature = "table_metadata")] + { + let mut meta_entries: Vec<(String, String)> = super_table + .metadata() + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + if !super_table.name.is_empty() { + meta_entries.push((TABLE_NAME_KEY.to_string(), super_table.name.clone())); + } + if !meta_entries.is_empty() { + let metadata = meta_entries.into_py_dict(py)?; + return py_table + .call_method1("replace_schema_metadata", (metadata,)) + .map_err(|e| { + PyMinarrowError::PyArrow(format!("Failed to set schema metadata: {}", e)) + .into() + }); + } + } + #[cfg(not(feature = "table_metadata"))] if !super_table.name.is_empty() { let metadata = [(TABLE_NAME_KEY, &super_table.name)] .into_py_dict(py)?; @@ -477,7 +535,7 @@ pub fn table_to_stream_capsule<'py>(table: &Table, py: Python<'py>) -> PyResult< }) .collect(); - let metadata = table_name_metadata(&table.name); + let metadata = build_stream_metadata(table); let stream = export_record_batch_stream_with_metadata(vec![columns], fields, metadata); let stream_ptr = Box::into_raw(stream); @@ -543,7 +601,7 @@ pub fn super_table_to_stream_capsule<'py>( }) .collect(); - let metadata = table_name_metadata(&super_table.name); + let metadata = build_stream_metadata(&super_table.batches[0]); let stream = export_record_batch_stream_with_metadata(batches, fields, metadata); let stream_ptr = Box::into_raw(stream); diff --git a/pyo3/src/ffi/to_rust.rs b/pyo3/src/ffi/to_rust.rs index 07ae3de..e50aea5 100644 --- a/pyo3/src/ffi/to_rust.rs +++ b/pyo3/src/ffi/to_rust.rs @@ -49,6 +49,35 @@ fn extract_table_name_from_pyarrow_schema(schema: &Bound) -> String { .unwrap_or_default() } +/// Splits imported stream metadata into the table name and remaining entries. +/// +/// The `minarrow:table_name` key is extracted as the table name. All other +/// entries are returned as remaining metadata for `Table::new_with_metadata`. +#[cfg(feature = "table_metadata")] +fn split_stream_metadata( + metadata: Option>, +) -> (String, std::collections::BTreeMap) { + match metadata { + None => (String::new(), std::collections::BTreeMap::new()), + Some(mut m) => { + let name = m.remove(TABLE_NAME_KEY).unwrap_or_default(); + (name, m) + } + } +} + +/// Extracts the table name from imported stream metadata. +#[cfg(not(feature = "table_metadata"))] +fn extract_table_name( + metadata: &Option>, +) -> String { + metadata + .as_ref() + .and_then(|m| m.get(TABLE_NAME_KEY)) + .cloned() + .unwrap_or_default() +} + // PyCapsule helpers /// Attempts to import a single array via the `__arrow_c_array__` PyCapsule protocol. @@ -305,11 +334,12 @@ pub fn record_batch_to_rust(obj: &Bound) -> PyMinarrowResult) -> PyMinarrowResult) -> PyMinarrowResult { // Try PyCapsule stream if let Some(result) = try_capsule_record_batch_stream(obj) { let (batches, metadata) = result?; - let table_name = metadata - .as_ref() - .and_then(|m| m.get(TABLE_NAME_KEY)) - .cloned() - .unwrap_or_default(); + + #[cfg(feature = "table_metadata")] + let (table_name, remaining_meta) = split_stream_metadata(metadata); + #[cfg(not(feature = "table_metadata"))] + let table_name = extract_table_name(&metadata); + if batches.is_empty() { return Ok(SuperTable::new(table_name)); } @@ -414,7 +457,19 @@ pub fn table_to_rust(obj: &Bound) -> PyMinarrowResult { .into_iter() .map(|(array, field)| FieldArray::new(field, (*array).clone())) .collect(); - tables.push(Arc::new(minarrow::Table::new(table_name.clone(), Some(cols)))); + #[cfg(feature = "table_metadata")] + let table = if remaining_meta.is_empty() { + minarrow::Table::new(table_name.clone(), Some(cols)) + } else { + minarrow::Table::new_with_metadata( + table_name.clone(), + Some(cols), + remaining_meta.clone(), + ) + }; + #[cfg(not(feature = "table_metadata"))] + let table = minarrow::Table::new(table_name.clone(), Some(cols)); + tables.push(Arc::new(table)); } return Ok(SuperTable::from_batches(tables, None)); diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs index 274320f..d91ec38 100644 --- a/src/ffi/arrow_c_ffi.rs +++ b/src/ffi/arrow_c_ffi.rs @@ -1739,8 +1739,35 @@ struct ArrayStreamHolder { /// /// The resulting `ArrowArray` has format `"+s"` with one child per column. /// Callers must eventually call the release callback on the returned pointers. +/// +/// When the `table_metadata` feature is enabled, an optional `metadata` parameter +/// is accepted and encoded into the top-level struct ArrowSchema. +#[cfg(not(feature = "table_metadata"))] pub fn export_struct_to_c( columns: Vec<(Arc, Schema)>, +) -> (*mut ArrowArray, *mut ArrowSchema) { + export_struct_to_c_inner(columns, None) +} + +/// Exports a single record batch (a set of column arrays with a shared schema) +/// as an Arrow struct array + struct schema pair. +/// +/// The resulting `ArrowArray` has format `"+s"` with one child per column. +/// Callers must eventually call the release callback on the returned pointers. +/// +/// When `metadata` is provided, it is encoded and attached to the top-level +/// struct ArrowSchema. +#[cfg(feature = "table_metadata")] +pub fn export_struct_to_c( + columns: Vec<(Arc, Schema)>, + metadata: Option>, +) -> (*mut ArrowArray, *mut ArrowSchema) { + export_struct_to_c_inner(columns, metadata) +} + +fn export_struct_to_c_inner( + columns: Vec<(Arc, Schema)>, + metadata: Option>, ) -> (*mut ArrowArray, *mut ArrowSchema) { let n_cols = columns.len(); let n_rows = if n_cols > 0 { @@ -1783,16 +1810,22 @@ pub fn export_struct_to_c( let format_cstr = CString::new("+s").unwrap(); let name_cstr = CString::new("").unwrap(); + let metadata_bytes = metadata.map(|m| encode_arrow_metadata(&m)); + let metadata_ptr = metadata_bytes + .as_ref() + .map(|b| b.as_ptr() as *const i8) + .unwrap_or(ptr::null()); + let struct_holder = Box::new(StructSchemaHolder { format_cstr, name_cstr, - metadata_bytes: None, + metadata_bytes, }); let struct_sch = Box::new(ArrowSchema { format: struct_holder.format_cstr.as_ptr(), name: struct_holder.name_cstr.as_ptr(), - metadata: ptr::null(), + metadata: metadata_ptr, flags: 0, n_children: n_cols as i64, children: children_sch_ptr, @@ -2096,7 +2129,7 @@ unsafe extern "C" fn rb_stream_get_next( let batch = holder.batches[holder.cursor].clone(); holder.cursor += 1; - let (arr_ptr, _sch_ptr) = export_struct_to_c(batch); + let (arr_ptr, _sch_ptr) = export_struct_to_c_inner(batch, None); // Copy the exported struct array into the caller's out pointer unsafe { @@ -3092,4 +3125,60 @@ mod tests { ); } } + + #[test] + #[cfg(feature = "table_metadata")] + fn test_table_metadata_round_trip() { + use super::{ + export_record_batch_stream_with_metadata, import_record_batch_stream_with_metadata, + }; + use std::collections::BTreeMap; + + // Build a simple column + let mut arr = IntegerArray::::default(); + arr.push(1); + arr.push(2); + arr.push(3); + let array = Arc::new(Array::from_int32(arr)); + let field = Field::new("col1", ArrowType::Int32, false, None); + let col_schema = Schema { + fields: vec![field.clone()], + metadata: Default::default(), + }; + + // Schema-level metadata simulating pandas categorical info + let mut table_meta = BTreeMap::new(); + table_meta.insert( + "pandas".to_string(), + r#"{"columns":[{"name":"col1","pandas_type":"categorical","metadata":{"ordered":true}}]}"#.to_string(), + ); + + // Export as stream with metadata + let stream = export_record_batch_stream_with_metadata( + vec![vec![(array, col_schema)]], + vec![field], + Some(table_meta.clone()), + ); + let stream_ptr = Box::into_raw(stream); + + // Import with metadata + let (batches, schema_meta) = + unsafe { import_record_batch_stream_with_metadata(stream_ptr) }; + + // Verify metadata survived the roundtrip + assert_eq!(schema_meta, Some(table_meta.clone())); + + // Verify we can construct a Table with the imported metadata + let imported_cols: Vec<_> = batches[0] + .iter() + .map(|(arr, f)| crate::FieldArray::from_arr(f.name.as_str(), (**arr).clone())) + .collect(); + let table = crate::Table::new_with_metadata( + "test".to_string(), + Some(imported_cols), + table_meta.clone(), + ); + assert_eq!(table.metadata(), &table_meta); + assert_eq!(table.n_rows(), 3); + } } diff --git a/src/kernels/broadcast/array.rs b/src/kernels/broadcast/array.rs index 7f8a66d..ce25e47 100644 --- a/src/kernels/broadcast/array.rs +++ b/src/kernels/broadcast/array.rs @@ -546,8 +546,8 @@ mod tests { let table_arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let table_arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table_arr1, @@ -557,9 +557,9 @@ mod tests { table_arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_array_to_table(ArithmeticOperator::Add, &arr, &table).unwrap(); @@ -587,14 +587,14 @@ mod tests { let arr = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); let table_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 10, 10])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_array_to_table(ArithmeticOperator::Multiply, &arr, &table).unwrap(); @@ -616,25 +616,25 @@ mod tests { // Create SuperTableView with 2 slices let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![40, 50, 60])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -690,24 +690,24 @@ mod tests { // Create SuperTable with 2 batches let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "batch1".to_string(), - }; + 3, + "batch1".to_string(), + ); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "batch2".to_string(), - }; + 3, + "batch2".to_string(), + ); let super_table = SuperTable::from_batches( vec![Arc::new(table1), Arc::new(table2)], @@ -744,24 +744,24 @@ mod tests { // Create Cube with 2 tables let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "table1".to_string(), - }; + 3, + "table1".to_string(), + ); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "table2".to_string(), - }; + 3, + "table2".to_string(), + ); let cube = Cube { tables: vec![table1, table2], diff --git a/src/kernels/broadcast/array_view.rs b/src/kernels/broadcast/array_view.rs index 370c43d..807fae6 100644 --- a/src/kernels/broadcast/array_view.rs +++ b/src/kernels/broadcast/array_view.rs @@ -154,8 +154,8 @@ mod tests { // Create a table with 2 columns let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -165,9 +165,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_arrayview_to_table(ArithmeticOperator::Add, &array_view, &table).unwrap(); @@ -198,14 +198,14 @@ mod tests { // Create a table and table view let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 10, 10])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let result = broadcast_arrayview_to_tableview( @@ -234,8 +234,8 @@ mod tests { // Create a table view let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -245,9 +245,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let result = broadcast_arrayview_to_tableview( @@ -283,25 +283,25 @@ mod tests { // Create SuperTableView with 2 slices let table1_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table1_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let table2_arr = Array::from_int32(IntegerArray::from_slice(&vec64![40, 50, 60])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table2_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -347,25 +347,25 @@ mod tests { // Create a SuperTableView with 6 total rows (mismatch) let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![40, 50, 60])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { diff --git a/src/kernels/broadcast/field_array.rs b/src/kernels/broadcast/field_array.rs index 62f6257..27b1f49 100644 --- a/src/kernels/broadcast/field_array.rs +++ b/src/kernels/broadcast/field_array.rs @@ -140,24 +140,24 @@ mod tests { ); // Create SuperTableView with 2 slices - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -195,24 +195,24 @@ mod tests { #[test] fn test_supertableview_to_fieldarray() { // Create SuperTableView with 2 slices - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![400, 500, 600])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -261,24 +261,24 @@ mod tests { ); // Create SuperTableView with 6 total rows - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { diff --git a/src/kernels/broadcast/scalar.rs b/src/kernels/broadcast/scalar.rs index f001fd4..05b7de8 100644 --- a/src/kernels/broadcast/scalar.rs +++ b/src/kernels/broadcast/scalar.rs @@ -919,8 +919,8 @@ mod tests { // Create a table with 2 columns let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -930,9 +930,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); // Create a scalar: 5 let scalar = Scalar::Int32(5); @@ -960,14 +960,14 @@ mod tests { #[test] fn test_scalar_to_table_multiply() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let scalar = Scalar::Int32(10); @@ -987,14 +987,14 @@ mod tests { fn test_scalar_to_tableview_subtract() { // Create a table let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let scalar = Scalar::Int32(50); @@ -1016,8 +1016,8 @@ mod tests { fn test_scalar_to_tableview_divide() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -1027,9 +1027,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let scalar = Scalar::Int32(1000); @@ -1178,24 +1178,24 @@ mod tests { #[test] fn test_scalar_to_supertable() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let super_table = SuperTable::from_batches( vec![Arc::new(table1), Arc::new(table2)], diff --git a/src/kernels/broadcast/super_array.rs b/src/kernels/broadcast/super_array.rs index 4b1d1f1..294a7fe 100644 --- a/src/kernels/broadcast/super_array.rs +++ b/src/kernels/broadcast/super_array.rs @@ -676,14 +676,14 @@ mod tests { // Create a single-column table: [[10, 20, 30]] let table_arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), table_arr, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let result = broadcast_superarray_to_table(ArithmeticOperator::Add, &super_array, &table).unwrap(); diff --git a/src/kernels/broadcast/super_array_view.rs b/src/kernels/broadcast/super_array_view.rs index 59f20e2..edca89f 100644 --- a/src/kernels/broadcast/super_array_view.rs +++ b/src/kernels/broadcast/super_array_view.rs @@ -80,14 +80,14 @@ mod tests { // TableView: [[10, 20, 30]] (1 column, 3 rows) let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let result = broadcast_superarrayview_to_tableview( @@ -116,14 +116,14 @@ mod tests { // Create TableView with 5 rows (mismatched) let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 5, - name: "test".to_string(), - }; + 5, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 5); let result = broadcast_superarrayview_to_tableview( diff --git a/src/kernels/broadcast/super_table.rs b/src/kernels/broadcast/super_table.rs index 6e9413c..0b73e74 100644 --- a/src/kernels/broadcast/super_table.rs +++ b/src/kernels/broadcast/super_table.rs @@ -649,14 +649,14 @@ mod tests { } fn create_test_table(name: &str, data1: &[i32], data2: &[i32]) -> Table { - Table { - cols: vec![ + Table::build( + vec![ create_field_array("col1", data1), create_field_array("col2", data2), ], - n_rows: 3, - name: name.to_string(), - } + 3, + name.to_string(), + ) } fn create_super_table(batches: Vec) -> SuperTable { diff --git a/src/kernels/broadcast/super_table_view.rs b/src/kernels/broadcast/super_table_view.rs index 8e1b96b..296fe93 100644 --- a/src/kernels/broadcast/super_table_view.rs +++ b/src/kernels/broadcast/super_table_view.rs @@ -243,25 +243,25 @@ mod tests { fn test_supertableview_to_scalar_add() { // Create SuperTableView with 2 slices let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -299,25 +299,25 @@ mod tests { fn test_supertableview_to_arrayview_multiply() { // Create SuperTableView with 2 slices let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 6, 7])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -360,25 +360,25 @@ mod tests { fn test_supertableview_to_arrayview_length_mismatch() { // Create SuperTableView with 6 elements let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![4, 5, 6])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { @@ -422,14 +422,14 @@ mod tests { // Create Table: [[10, 20, 30, 40, 50, 60]] let arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50, 60])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr, )], - n_rows: 6, - name: "test".to_string(), - }; + 6, + "test".to_string(), + ); let result = broadcast_superarrayview_to_table( ArithmeticOperator::Subtract, @@ -476,14 +476,14 @@ mod tests { // Create Table with 5 rows (mismatch) let arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr, )], - n_rows: 5, - name: "test".to_string(), - }; + 5, + "test".to_string(), + ); let result = broadcast_superarrayview_to_table(ArithmeticOperator::Add, &super_array_view, &table); @@ -500,25 +500,25 @@ mod tests { fn test_supertableview_to_array_divide() { // Create SuperTableView with 2 slices let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![400, 500, 600])); - let table2 = Table { - cols: vec![FieldArray::new( + let table2 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let super_table_view = SuperTableV { diff --git a/src/kernels/broadcast/table.rs b/src/kernels/broadcast/table.rs index ae3d7cb..cffc7ad 100644 --- a/src/kernels/broadcast/table.rs +++ b/src/kernels/broadcast/table.rs @@ -630,14 +630,14 @@ mod tests { use crate::ffi::arrow_dtype::ArrowType; // Table with 3 rows to match each chunk size - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])), )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let field = Field::new("data".to_string(), ArrowType::Int32, false, None); let chunks = vec![ @@ -676,14 +676,14 @@ mod tests { fn test_broadcast_table_to_superarrayview() { use crate::ffi::arrow_dtype::ArrowType; - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3, 4, 5, 6])), )], - n_rows: 6, - name: "test".to_string(), - }; + 6, + "test".to_string(), + ); let arr = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30, 40, 50, 60])); let field = Field::new("data".to_string(), ArrowType::Int32, false, None); diff --git a/src/kernels/broadcast/table_view.rs b/src/kernels/broadcast/table_view.rs index 7edf4af..ccfebf1 100644 --- a/src/kernels/broadcast/table_view.rs +++ b/src/kernels/broadcast/table_view.rs @@ -186,8 +186,8 @@ mod tests { // Create two tables let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); - let table1 = Table { - cols: vec![ + let table1 = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -197,15 +197,15 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr3 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 5, 5])); let arr4 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 100, 100])); - let table2 = Table { - cols: vec![ + let table2 = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr3, @@ -215,9 +215,9 @@ mod tests { arr4, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let result = @@ -246,20 +246,20 @@ mod tests { fn test_tableview_to_tableview_column_mismatch() { // Create tables with different numbers of columns let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3])); - let table1 = Table { - cols: vec![FieldArray::new( + let table1 = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view1 = TableV::from_table(table1, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 5, 5])); let arr3 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 10, 10])); - let table2 = Table { - cols: vec![ + let table2 = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr2, @@ -269,9 +269,9 @@ mod tests { arr3, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view2 = TableV::from_table(table2, 0, 3); let result = @@ -290,8 +290,8 @@ mod tests { fn test_tableview_to_scalar_multiply() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![2, 3, 4])); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![5, 6, 7])); - let table = Table { - cols: vec![ + let table = Table::build( + vec![ FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, @@ -301,9 +301,9 @@ mod tests { arr2, ), ], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let scalar = Scalar::Int32(10); @@ -330,14 +330,14 @@ mod tests { #[test] fn test_tableview_to_arrayview_subtract() { let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![100, 200, 300])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 3, - name: "test".to_string(), - }; + 3, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 3); let arr2 = Array::from_int32(IntegerArray::from_slice(&vec64![10, 20, 30])); @@ -369,14 +369,14 @@ mod tests { // Create table with 6 rows let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3, 4, 5, 6])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 6, - name: "test".to_string(), - }; + 6, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 6); // Create SuperArrayView with 2 chunks of 3 elements each @@ -426,14 +426,14 @@ mod tests { use crate::{FieldArray as FA, SuperArray, SuperArrayV}; let arr1 = Array::from_int32(IntegerArray::from_slice(&vec64![1, 2, 3, 4, 5])); - let table = Table { - cols: vec![FieldArray::new( + let table = Table::build( + vec![FieldArray::new( Field::new("col1".to_string(), ArrowType::Int32, false, None), arr1, )], - n_rows: 5, - name: "test".to_string(), - }; + 5, + "test".to_string(), + ); let table_view = TableV::from_table(table, 0, 5); // Create SuperArrayView with 6 elements (mismatch) diff --git a/src/structs/chunked/super_table.rs b/src/structs/chunked/super_table.rs index 81482ea..01cbd6f 100644 --- a/src/structs/chunked/super_table.rs +++ b/src/structs/chunked/super_table.rs @@ -348,6 +348,15 @@ impl SuperTable { self.batches.get(idx) } + /// Returns the schema-level metadata from the first batch, or an empty map + /// if there are no batches. + #[cfg(feature = "table_metadata")] + pub fn metadata(&self) -> &std::collections::BTreeMap { + static EMPTY: std::sync::LazyLock> = + std::sync::LazyLock::new(std::collections::BTreeMap::new); + self.batches.first().map(|b| b.metadata()).unwrap_or(&EMPTY) + } + // Return a new BatchedTable over a sub-range of rows. #[cfg(feature = "views")] pub fn view(&self, offset: usize, len: usize) -> SuperTableV { @@ -688,11 +697,13 @@ impl SuperTable { }); } - Table { - cols: unified_cols, - n_rows: self.n_rows, - name: self.name, + #[allow(unused_mut)] + let mut table = Table::build(unified_cols, self.n_rows, self.name); + #[cfg(feature = "table_metadata")] + { + table.metadata = self.batches[0].metadata.clone(); } + table } /// Arena-based consolidation: writes all column buffers into a single @@ -874,11 +885,7 @@ mod tests { for c in &cols { assert_eq!(c.len(), n_rows, "all columns must have same len for Table"); } - Table { - cols, - n_rows, - name: "batch".to_string(), - } + Table::build(cols, n_rows, "batch".to_string()) } #[test] @@ -1460,6 +1467,8 @@ mod tests { ], n_rows: 2, name: "batch".into(), + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), }); let b2 = Arc::new(Table { cols: vec![ @@ -1482,6 +1491,8 @@ mod tests { ], n_rows: 1, name: "batch".into(), + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), }); let st = SuperTable::from_batches(vec![b1, b2], None); let result = st.consolidate(); diff --git a/src/structs/table.rs b/src/structs/table.rs index b4a92c2..25a20a3 100644 --- a/src/structs/table.rs +++ b/src/structs/table.rs @@ -92,9 +92,28 @@ pub struct Table { pub n_rows: usize, /// Table name pub name: String, + /// Schema-level metadata as key-value pairs. + /// Captures metadata that Arrow producers like PyArrow embed + /// in the top-level ArrowSchema.metadata, e.g. pandas categorical ordering. + #[cfg(feature = "table_metadata")] + pub metadata: std::collections::BTreeMap, } impl Table { + /// Internal constructor handling the conditional metadata field. + /// All code paths that build a `Table` from parts should go through here + /// so the `#[cfg]` lives in one place. + #[inline(always)] + pub(crate) fn build(cols: Vec, n_rows: usize, name: String) -> Self { + Self { + cols, + n_rows, + name, + #[cfg(feature = "table_metadata")] + metadata: std::collections::BTreeMap::new(), + } + } + /// Constructs a new Table with a specified name and optional columns. /// If `cols` is provided, the number of rows will be inferred from the first column. pub fn new(name: String, cols: Option>) -> Self { @@ -108,19 +127,35 @@ impl Table { name }; - Self { cols, n_rows, name } + Self::build(cols, n_rows, name) + } + + /// Constructs a new Table with schema-level metadata. + /// + /// Use this when importing data from Arrow producers that embed metadata + /// in the top-level schema, e.g. pandas categorical ordering via PyArrow. + #[cfg(feature = "table_metadata")] + pub fn new_with_metadata( + name: String, + cols: Option>, + metadata: std::collections::BTreeMap, + ) -> Self { + let mut table = Self::new(name, cols); + table.metadata = metadata; + table + } + + /// Returns a reference to the schema-level metadata. + #[cfg(feature = "table_metadata")] + pub fn metadata(&self) -> &std::collections::BTreeMap { + &self.metadata } /// Constructs a new, empty Table with a globally unique name. pub fn new_empty() -> Self { let id = UNNAMED_COUNTER.fetch_add(1, Ordering::Relaxed); let name = format!("UnnamedTable{}", id); - - Self { - cols: Vec::new(), - n_rows: 0, - name, - } + Self::build(Vec::new(), 0, name) } /// Build a Table from an Arena and its collected array regions. @@ -314,11 +349,13 @@ impl Table { .map(|fa| fa.slice_clone(offset, len)) .collect(); let name = format!("{}[{}, {})", self.name, offset, offset + len); - Table { - cols, - n_rows: len, - name, + #[allow(unused_mut)] + let mut table = Table::build(cols, len, name); + #[cfg(feature = "table_metadata")] + { + table.metadata = self.metadata.clone(); } + table } /// Returns a zero-copy view over rows `[offset, offset+len)`. @@ -548,16 +585,23 @@ impl Table { right_cols.push(right_field); } - let left_table = Table { - cols: left_cols, - n_rows: index, - name: format!("{}_left", self.name), + let left_table = Table::build(left_cols, index, format!("{}_left", self.name)); + let right_table = Table::build( + right_cols, + self.n_rows - index, + format!("{}_right", self.name), + ); + #[cfg(feature = "table_metadata")] + let left_table = { + let mut t = left_table; + t.metadata = self.metadata.clone(); + t }; - - let right_table = Table { - cols: right_cols, - n_rows: self.n_rows - index, - name: format!("{}_right", self.name), + #[cfg(feature = "table_metadata")] + let right_table = { + let mut t = right_table; + t.metadata = self.metadata.clone(); + t }; Ok(SuperTable::from_batches( @@ -754,12 +798,15 @@ impl Concatenate for Table { // Create result table let n_rows = result_cols.first().map(|c| c.len()).unwrap_or(0); let name = format!("{}+{}", self.name, other.name); + let table = Table::build(result_cols, n_rows, name); + #[cfg(feature = "table_metadata")] + let table = { + let mut t = table; + t.metadata = self.metadata; + t + }; - Ok(Table { - cols: result_cols, - n_rows, - name, - }) + Ok(table) } } @@ -821,11 +868,15 @@ fn consolidate_vec_concat(tables: Vec
) -> Table { let n_rows = unified_cols.first().map(|c| c.len()).unwrap_or(0); let name = tables[0].name.clone(); - Table { - cols: unified_cols, - n_rows, - name, + let table = Table::build(unified_cols, n_rows, name); + #[cfg(feature = "table_metadata")] + { + let mut t = table; + t.metadata = tables[0].metadata.clone(); + t } + #[cfg(not(feature = "table_metadata"))] + table } impl Display for Table { diff --git a/src/structs/views/chunked/super_table_view.rs b/src/structs/views/chunked/super_table_view.rs index 88ab716..a1cd5de 100644 --- a/src/structs/views/chunked/super_table_view.rs +++ b/src/structs/views/chunked/super_table_view.rs @@ -256,11 +256,7 @@ mod tests { /// One-column `Table` with Int32 data fn table(name: &str, vals: &[i32]) -> Table { - Table { - cols: Vec::from(vec![fa_i32(name, vals)]), - n_rows: vals.len(), - name: name.to_string(), - } + Table::build(vec![fa_i32(name, vals)], vals.len(), name.to_string()) } /// Handy lens into the first column of a 1-column table fn col_vals(t: &Table) -> Vec { diff --git a/src/structs/views/table_view.rs b/src/structs/views/table_view.rs index 81ecfa5..2fd44f7 100644 --- a/src/structs/views/table_view.rs +++ b/src/structs/views/table_view.rs @@ -261,12 +261,7 @@ impl TableV { .collect(); let n_rows = self.len; - - Table { - cols, - n_rows, - name: self.name.clone(), - } + Table::build(cols, n_rows, self.name.clone()) } /// Apply a transformation to each column view, producing a new table. @@ -669,11 +664,7 @@ impl TableV { }) .collect(); - Table { - cols, - n_rows: indices.len(), - name: self.name.clone(), - } + Table::build(cols, indices.len(), self.name.clone()) } } From a6f3d226495d7bdd05bf519761d6d9dd67efa8ca Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Mon, 16 Feb 2026 23:59:03 +0000 Subject: [PATCH 2/3] Fix test config --- pyo3/Cargo.toml | 12 +++--------- pyo3/examples/pycapsule_exchange.rs | 2 +- pyo3/src/lib.rs | 3 --- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/pyo3/Cargo.toml b/pyo3/Cargo.toml index 03ab533..8c61b7b 100644 --- a/pyo3/Cargo.toml +++ b/pyo3/Cargo.toml @@ -17,6 +17,8 @@ keywords = [ categories = ["external-ffi-bindings", "api-bindings"] description = "PyO3 bindings for MinArrow - zero-copy Arrow interop with Python via PyArrow" +[workspace] + [lib] name = "minarrow_pyo3" crate-type = ["cdylib", "rlib"] @@ -27,7 +29,7 @@ pyo3 = { version = "0.23" } thiserror = "2" [features] -default = ["extension-module", "datetime", "extended_numeric_types", "extended_categorical"] +default = ["datetime", "extended_numeric_types", "extended_categorical"] extension-module = ["pyo3/extension-module"] datetime = ["minarrow/datetime"] extended_numeric_types = ["minarrow/extended_numeric_types"] @@ -37,14 +39,6 @@ table_metadata = ["minarrow/table_metadata"] [build-dependencies] pyo3-build-config = "0.23" -[[example]] -name = "atomic_tests" -path = "tests/atomic_tests.rs" - -[[example]] -name = "python_roundtrip" -path = "tests/python_roundtrip.rs" - [[example]] name = "pycapsule_exchange" path = "examples/pycapsule_exchange.rs" diff --git a/pyo3/examples/pycapsule_exchange.rs b/pyo3/examples/pycapsule_exchange.rs index c0ea3d8..c2c9e97 100644 --- a/pyo3/examples/pycapsule_exchange.rs +++ b/pyo3/examples/pycapsule_exchange.rs @@ -248,7 +248,7 @@ fn example_4_import_from_pyarrow_table(py: Python<'_>) -> PyResult<()> { let result = to_rust::try_capsule_record_batch_stream(&py_table); match result { - Some(Ok(batches)) => { + Some(Ok((batches, _metadata))) => { println!(" Imported {} batch(es) into MinArrow", batches.len()); for (batch_idx, batch) in batches.iter().enumerate() { println!(" Batch {}: {} columns", batch_idx, batch.len()); diff --git a/pyo3/src/lib.rs b/pyo3/src/lib.rs index 2941b06..e67d41a 100644 --- a/pyo3/src/lib.rs +++ b/pyo3/src/lib.rs @@ -103,9 +103,6 @@ pub mod error; pub mod ffi; pub mod types; -#[cfg(test)] -mod tests; - // Re-export the main types for ease of use pub use error::{PyMinarrowError, PyMinarrowResult}; pub use types::{PyArray, PyChunkedArray, PyField, PyRecordBatch, PyTable}; From 4262d3af5dc7b089bd7784f344d967c1b041865c Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Sun, 15 Mar 2026 23:20:49 +0000 Subject: [PATCH 3/3] Conditional broadcast handling for table_metadata --- src/ffi/arrow_c_ffi.rs | 19 ++------------ src/kernels/broadcast/array.rs | 10 +++++++- src/kernels/broadcast/array_view.rs | 10 +++++++- src/kernels/broadcast/cube.rs | 10 +++++++- src/kernels/broadcast/scalar.rs | 10 +++++++- src/kernels/broadcast/table.rs | 40 ++++++++++++++++++++++++++--- src/structs/chunked/super_table.rs | 4 +++ src/structs/table.rs | 2 +- 8 files changed, 79 insertions(+), 26 deletions(-) diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs index d91ec38..67b6ffa 100644 --- a/src/ffi/arrow_c_ffi.rs +++ b/src/ffi/arrow_c_ffi.rs @@ -1734,21 +1734,6 @@ struct ArrayStreamHolder { // ── Struct array helpers ──────────────────────────────────────────────── -/// Exports a single record batch (a set of column arrays with a shared schema) -/// as an Arrow struct array + struct schema pair. -/// -/// The resulting `ArrowArray` has format `"+s"` with one child per column. -/// Callers must eventually call the release callback on the returned pointers. -/// -/// When the `table_metadata` feature is enabled, an optional `metadata` parameter -/// is accepted and encoded into the top-level struct ArrowSchema. -#[cfg(not(feature = "table_metadata"))] -pub fn export_struct_to_c( - columns: Vec<(Arc, Schema)>, -) -> (*mut ArrowArray, *mut ArrowSchema) { - export_struct_to_c_inner(columns, None) -} - /// Exports a single record batch (a set of column arrays with a shared schema) /// as an Arrow struct array + struct schema pair. /// @@ -1756,8 +1741,8 @@ pub fn export_struct_to_c( /// Callers must eventually call the release callback on the returned pointers. /// /// When `metadata` is provided, it is encoded and attached to the top-level -/// struct ArrowSchema. -#[cfg(feature = "table_metadata")] +/// struct ArrowSchema. The `table_metadata` feature controls whether metadata +/// is preserved on the minarrow rust side during import. pub fn export_struct_to_c( columns: Vec<(Arc, Schema)>, metadata: Option>, diff --git a/src/kernels/broadcast/array.rs b/src/kernels/broadcast/array.rs index ce25e47..8efb8f5 100644 --- a/src/kernels/broadcast/array.rs +++ b/src/kernels/broadcast/array.rs @@ -208,7 +208,15 @@ pub fn broadcast_array_to_table( }) .collect(); - Ok(Table::new(table.name.clone(), Some(new_cols?))) + let table_out = Table::new(table.name.clone(), Some(new_cols?)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for Array-SuperTable broadcasting - broadcast array to each table batch diff --git a/src/kernels/broadcast/array_view.rs b/src/kernels/broadcast/array_view.rs index 807fae6..1ce10b9 100644 --- a/src/kernels/broadcast/array_view.rs +++ b/src/kernels/broadcast/array_view.rs @@ -52,7 +52,15 @@ pub fn broadcast_arrayview_to_table( }) .collect(); - Ok(Table::new(table.name.clone(), Some(field_arrays))) + let table_out = Table::new(table.name.clone(), Some(field_arrays)); + #[cfg(feature = "table_metadata")] + { + let mut t = table_out; + t.metadata = table.metadata.clone(); + return Ok(t); + } + #[cfg(not(feature = "table_metadata"))] + Ok(table_out) } /// Helper function for arrayview-tableview broadcasting - work with views diff --git a/src/kernels/broadcast/cube.rs b/src/kernels/broadcast/cube.rs index 4a3483d..f6b238b 100644 --- a/src/kernels/broadcast/cube.rs +++ b/src/kernels/broadcast/cube.rs @@ -727,7 +727,15 @@ fn merge_supertable_batches(super_table: &crate::SuperTable) -> Result