From 20220ccba1765a6ce5748b168d0798e7fddef682 Mon Sep 17 00:00:00 2001 From: PB <37089506+pbower@users.noreply.github.com> Date: Sun, 29 Mar 2026 07:46:12 +1100 Subject: [PATCH] Add default_categorical_8 feature --- Cargo.toml | 16 +- examples/ffi/apache_arrow_ffi.rs | 6 + examples/ffi/polars_ffi.rs | 6 + examples/print/print_arrays.rs | 11 +- examples/print/print_table.rs | 7 +- src/conversions.rs | 42 +++-- src/enums/array.rs | 178 +++++++++++------- src/enums/collections/text_array.rs | 51 +++-- src/enums/value/conversions.rs | 3 +- src/ffi/arrow_c_ffi.rs | 44 ++++- src/ffi/arrow_dtype.rs | 14 +- src/kernels/broadcast/scalar.rs | 14 +- src/macros.rs | 3 +- src/structs/arena.rs | 15 +- src/structs/chunked/super_table.rs | 1 + src/structs/field.rs | 3 +- src/structs/field_array.rs | 3 +- src/structs/table.rs | 1 + src/structs/views/array_view.rs | 9 +- src/structs/views/chunked/super_array_view.rs | 34 +++- .../views/collections/text_array_view.rs | 6 +- src/structs/views/table_view.rs | 3 +- src/traits/byte_size.rs | 3 +- src/traits/print.rs | 3 +- tests/arrow_c_integration.rs | 1 + 25 files changed, 314 insertions(+), 163 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e6653d2..7a958b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,12 +71,18 @@ parallel_proc = ["rayon"] # in your build pipeline, as it's mostly C-code. c_ffi_tests = ['cc'] -# Adds Categorical8, Categorical16, and Categorical64. +# Swaps the default categorical type from CategoricalArray<u32> to CategoricalArray<u8>. +# When enabled, TextArray contains Categorical8 instead of Categorical32, keeping the +# match arm count at 1. Useful for SIMD-optimised workloads where categories fit +# within 256 values. +default_categorical_8 = [] + +# Adds Categorical16, Categorical64, and whichever of Categorical8/Categorical32 +# is not the current default. 
# -# Highly recommend keeping these off unless required -# E.g., constrained or embedded environments, as they add combinatorial -# weight to the binary and enum match arms -extended_categorical = [] +# Highly recommend keeping this off unless required, as it adds combinatorial +# weight to the binary and enum match arms. +extended_categorical = ["default_categorical_8"] # Adds UInt8, UInt16, Int8, Int16 types. # diff --git a/examples/ffi/apache_arrow_ffi.rs b/examples/ffi/apache_arrow_ffi.rs index 048b607..a521cf8 100644 --- a/examples/ffi/apache_arrow_ffi.rs +++ b/examples/ffi/apache_arrow_ffi.rs @@ -32,6 +32,7 @@ mod apache_arrow_test { }; use arrow::array::{ArrayRef, RecordBatch, make_array}; use minarrow::ffi::arrow_c_ffi::{export_to_c, import_from_c}; + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] use minarrow::ffi::arrow_dtype::CategoricalIndexType; use minarrow::ffi::schema::Schema; use minarrow::{Array, ArrowType, Field, FieldArray, NumericArray, Table, TextArray}; @@ -79,6 +80,7 @@ mod apache_arrow_test { let arr_string32 = Arc::new(minarrow::StringArray::::from_slice(&[ "abc", "def", "", ])) as Arc>; + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let arr_categorical32 = Arc::new(minarrow::CategoricalArray::::from_slices( &[0, 1, 2], &["A".to_string(), "B".to_string(), "C".to_string()], @@ -122,6 +124,7 @@ mod apache_arrow_test { let minarr_float64 = Array::NumericArray(NumericArray::Float64(arr_float64)); let minarr_bool = Array::BooleanArray(arr_bool); let minarr_string32 = Array::TextArray(TextArray::String32(arr_string32)); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let minarr_categorical32 = Array::TextArray(TextArray::Categorical32(arr_categorical32)); #[cfg(feature = "datetime")] let minarr_datetime32 = Array::TemporalArray(TemporalArray::Datetime32(arr_datetime32)); @@ -145,6 +148,7 @@ mod apache_arrow_test { let 
field_float64 = Field::new("float64", ArrowType::Float64, false, None); let field_bool = Field::new("bool", ArrowType::Boolean, false, None); let field_string32 = Field::new("string32", ArrowType::String, false, None); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let field_categorical32 = Field::new( "categorical32", ArrowType::Dictionary(CategoricalIndexType::UInt32), @@ -174,6 +178,7 @@ mod apache_arrow_test { let fa_float64 = FieldArray::new(field_float64, minarr_float64); let fa_bool = FieldArray::new(field_bool, minarr_bool); let fa_string32 = FieldArray::new(field_string32, minarr_string32); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let fa_categorical32 = FieldArray::new(field_categorical32, minarr_categorical32); #[cfg(feature = "datetime")] let fa_datetime32 = FieldArray::new(field_datetime32, minarr_datetime32); @@ -200,6 +205,7 @@ mod apache_arrow_test { cols.push(fa_float64); cols.push(fa_bool); cols.push(fa_string32); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] cols.push(fa_categorical32); #[cfg(feature = "datetime")] { diff --git a/examples/ffi/polars_ffi.rs b/examples/ffi/polars_ffi.rs index 20afddd..d9c625b 100644 --- a/examples/ffi/polars_ffi.rs +++ b/examples/ffi/polars_ffi.rs @@ -31,6 +31,7 @@ mod polars_roundtrip { use std::sync::Arc; use minarrow::ffi::arrow_c_ffi::{export_to_c, import_from_c}; + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] use minarrow::ffi::arrow_dtype::CategoricalIndexType; use minarrow::ffi::schema::Schema; use minarrow::{Array, ArrowType, Field, FieldArray, NumericArray, Table, TextArray}; @@ -82,6 +83,7 @@ mod polars_roundtrip { let arr_string32 = Arc::new(minarrow::StringArray::::from_slice(&[ "abc", "def", "", ])) as Arc>; + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let arr_categorical32 = 
Arc::new(minarrow::CategoricalArray::::from_slices( &[0, 1, 2], &["A".to_string(), "B".to_string(), "C".to_string()], @@ -125,6 +127,7 @@ mod polars_roundtrip { let minarr_float64 = Array::NumericArray(NumericArray::Float64(arr_float64)); let minarr_bool = Array::BooleanArray(arr_bool); let minarr_string32 = Array::TextArray(TextArray::String32(arr_string32)); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let minarr_categorical32 = Array::TextArray(TextArray::Categorical32(arr_categorical32)); #[cfg(feature = "datetime")] let minarr_datetime32 = Array::TemporalArray(TemporalArray::Datetime32(arr_datetime32)); @@ -148,6 +151,7 @@ mod polars_roundtrip { let field_float64 = Field::new("float64", ArrowType::Float64, false, None); let field_bool = Field::new("bool", ArrowType::Boolean, false, None); let field_string32 = Field::new("string32", ArrowType::String, false, None); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let field_categorical32 = Field::new( "categorical32", ArrowType::Dictionary(CategoricalIndexType::UInt32), @@ -176,6 +180,7 @@ mod polars_roundtrip { let fa_float64 = FieldArray::new(field_float64, minarr_float64); let fa_bool = FieldArray::new(field_bool, minarr_bool); let fa_string32 = FieldArray::new(field_string32, minarr_string32); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let fa_categorical32 = FieldArray::new(field_categorical32, minarr_categorical32); #[cfg(feature = "datetime")] let fa_datetime32 = FieldArray::new(field_datetime32, minarr_datetime32); @@ -202,6 +207,7 @@ mod polars_roundtrip { cols.push(fa_float64); cols.push(fa_bool); cols.push(fa_string32); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] cols.push(fa_categorical32); #[cfg(feature = "datetime")] { diff --git a/examples/print/print_arrays.rs b/examples/print/print_arrays.rs index 66f03a8..9b403ae 100644 --- 
a/examples/print/print_arrays.rs +++ b/examples/print/print_arrays.rs @@ -21,7 +21,7 @@ use std::sync::Arc; -use minarrow::aliases::{BoolArr, CatArr, FltArr, IntArr, StrArr}; +use minarrow::aliases::{BoolArr, FltArr, IntArr, StrArr}; use minarrow::enums::array::Array; use minarrow::{Bitmask, MaskedArray, NumericArray, Print, TextArray}; @@ -44,12 +44,6 @@ fn main() { // String and Dictionary/Categorical let col_str32 = StrArr::from_slice(&["red", "blue", "green", "yellow", "purple"]); - let col_cat32 = CatArr::::from_values( - ["apple", "banana", "cherry", "banana", "apple"] - .iter() - .copied(), - ); - // --- Print NumericArray, TextArray, TemporalArray enums println!("\n--- Enums: NumericArray, TextArray, TemporalArray ---"); NumericArray::Int32(Arc::new(col_i32.clone())).print(); @@ -60,7 +54,6 @@ fn main() { println!("\n"); TextArray::String32(Arc::new(col_str32.clone())).print(); println!("\n"); - let _ = &TextArray::Categorical32(Arc::new(col_cat32.clone())).print(); println!("\n--- Array (top-level) ---"); Array::from_int32(col_i32.clone()).print(); @@ -71,8 +64,6 @@ fn main() { println!("\n"); Array::from_string32(col_str32.clone()).print(); println!("\n"); - Array::from_categorical32(col_cat32.clone()).print(); - println!("\n"); // --- Print Array Views (ArrayV, NumericArrayV, TextArrayV, TemporalArrayV) #[cfg(feature = "views")] println!("\n--- Array Views ---"); diff --git a/examples/print/print_table.rs b/examples/print/print_table.rs index aa02587..492efb0 100644 --- a/examples/print/print_table.rs +++ b/examples/print/print_table.rs @@ -19,7 +19,9 @@ //! cargo run --example print_table //! 
--------------------------------------------------------- -use minarrow::aliases::{BoolArr, CatArr, FltArr, IntArr, StrArr}; +use minarrow::aliases::{BoolArr, FltArr, IntArr, StrArr}; +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] +use minarrow::aliases::CatArr; use minarrow::{Bitmask, FieldArray, MaskedArray, Print, Table}; #[cfg(feature = "datetime")] use minarrow::{DatetimeArray, enums::time_units::TimeUnit}; @@ -41,6 +43,7 @@ fn main() { // String and Dictionary/Categorical let col_str32 = StrArr::::from_slice(&["red", "blue", "green", "yellow", "purple"]); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let col_cat32 = CatArr::::from_values( ["apple", "banana", "cherry", "banana", "apple"] .iter() @@ -74,6 +77,7 @@ fn main() { let fa_f64 = FieldArray::from_arr("float64_col", col_f64); let fa_bool = FieldArray::from_arr("bool_col", col_bool); let fa_str32 = FieldArray::from_arr("utf8_col", col_str32); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] let fa_cat32 = FieldArray::from_arr("dict32_col", col_cat32); #[cfg(feature = "datetime")] let fa_dt32 = FieldArray::from_arr("datetime32_col", col_dt32); @@ -90,6 +94,7 @@ fn main() { tbl.add_col(fa_f64); tbl.add_col(fa_bool); tbl.add_col(fa_str32); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] tbl.add_col(fa_cat32); #[cfg(feature = "datetime")] tbl.add_col(fa_dt32); diff --git a/src/conversions.rs b/src/conversions.rs index 5bbbcba..df8295d 100644 --- a/src/conversions.rs +++ b/src/conversions.rs @@ -527,20 +527,22 @@ macro_rules! 
string_to_cat { }; } -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] string_to_cat!(u32, u8); #[cfg(feature = "extended_categorical")] string_to_cat!(u32, u16); +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] string_to_cat!(u32, u32); #[cfg(feature = "extended_categorical")] string_to_cat!(u32, u64); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] #[cfg(feature = "large_string")] string_to_cat!(u64, u8); #[cfg(feature = "extended_categorical")] #[cfg(feature = "large_string")] string_to_cat!(u64, u16); #[cfg(feature = "large_string")] +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] string_to_cat!(u64, u32); #[cfg(feature = "extended_categorical")] #[cfg(feature = "large_string")] @@ -586,20 +588,22 @@ macro_rules! cat_to_string { }; } -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_string!(u8, u32); #[cfg(feature = "extended_categorical")] cat_to_string!(u16, u32); +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] cat_to_string!(u32, u32); #[cfg(feature = "extended_categorical")] cat_to_string!(u64, u32); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] #[cfg(feature = "large_string")] cat_to_string!(u8, u64); #[cfg(feature = "large_string")] #[cfg(feature = "extended_categorical")] cat_to_string!(u16, u64); #[cfg(feature = "large_string")] +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] cat_to_string!(u32, u64); #[cfg(feature = "large_string")] #[cfg(feature = "extended_categorical")] @@ -644,7 +648,7 @@ impl TryFrom<&StringArray> for StringArray { } } -#[cfg(feature = "extended_categorical")] +#[cfg(any(feature = "default_categorical_8", feature = "extended_categorical"))] macro_rules! 
cat_to_cat_widen { ($src:ty, $dst:ty) => { impl From<&CategoricalArray<$src>> for CategoricalArray<$dst> { @@ -660,7 +664,7 @@ macro_rules! cat_to_cat_widen { }; } -#[cfg(feature = "extended_categorical")] +#[cfg(any(feature = "default_categorical_8", feature = "extended_categorical"))] macro_rules! cat_to_cat_narrow { ($src:ty, $dst:ty) => { impl TryFrom<&CategoricalArray<$src>> for CategoricalArray<$dst> { @@ -683,11 +687,11 @@ macro_rules! cat_to_cat_narrow { }; } -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_cat_widen!(u8, u16); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_cat_widen!(u8, u32); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_cat_widen!(u8, u64); #[cfg(feature = "extended_categorical")] cat_to_cat_widen!(u16, u32); @@ -695,11 +699,11 @@ cat_to_cat_widen!(u16, u32); cat_to_cat_widen!(u16, u64); #[cfg(feature = "extended_categorical")] cat_to_cat_widen!(u32, u64); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_cat_narrow!(u16, u8); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_cat_narrow!(u32, u8); -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] cat_to_cat_narrow!(u64, u8); #[cfg(feature = "extended_categorical")] cat_to_cat_narrow!(u32, u16); @@ -709,7 +713,7 @@ cat_to_cat_narrow!(u64, u16); cat_to_cat_narrow!(u64, u32); // identity conversions (Arc-clone) for completeness -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] impl From<&CategoricalArray> for CategoricalArray { fn from(c: &CategoricalArray) -> Self { c.clone() @@ -952,7 +956,7 @@ impl View for Arc> { type BufferT = u8; } -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] impl From>> for Array { fn from(a: Arc>) -> Self { 
Array::TextArray(TextArray::Categorical8(a)) @@ -960,7 +964,7 @@ impl From>> for Array { } #[cfg(feature = "views")] -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] impl View for Arc> { type BufferT = u8; } @@ -978,6 +982,7 @@ impl View for Arc> { type BufferT = u16; } +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] impl From>> for Array { fn from(a: Arc>) -> Self { Array::TextArray(TextArray::Categorical32(a)) @@ -985,6 +990,7 @@ impl From>> for Array { } #[cfg(feature = "views")] +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] impl View for Arc> { type BufferT = u32; } @@ -1191,7 +1197,7 @@ impl View for StringArray { type BufferT = u8; } -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] impl From> for Array { fn from(a: CategoricalArray) -> Self { Array::TextArray(TextArray::Categorical8(a.into())) @@ -1199,7 +1205,7 @@ impl From> for Array { } #[cfg(feature = "views")] -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] impl View for CategoricalArray { type BufferT = u8; } @@ -1217,6 +1223,7 @@ impl View for CategoricalArray { type BufferT = u16; } +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] impl From> for Array { fn from(a: CategoricalArray) -> Self { Array::TextArray(TextArray::Categorical32(a.into())) @@ -1224,6 +1231,7 @@ impl From> for Array { } #[cfg(feature = "views")] +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] impl View for CategoricalArray { type BufferT = u32; } diff --git a/src/enums/array.rs b/src/enums/array.rs index 7b13a09..6d3eb0e 100644 --- a/src/enums/array.rs +++ b/src/enums/array.rs @@ -95,7 +95,7 @@ use crate::{ /// ## Examples /// ```rust /// use minarrow::{ -/// Array, IntegerArray, NumericArray, arr_bool, arr_cat32, arr_f64, arr_i32, arr_i64, +/// Array, IntegerArray, 
NumericArray, arr_bool, arr_f64, arr_i32, arr_i64, /// arr_str32, vec64 /// }; /// @@ -104,7 +104,6 @@ use crate::{ /// let float_arr = arr_f64![0.5, 1.5, 2.5]; /// let bool_arr = arr_bool![true, false, true]; /// let str_arr = arr_str32!["a", "b", "c"]; -/// let cat_arr = arr_cat32!["x", "y", "x", "z"]; /// /// assert_eq!(int_arr.len(), 4); /// assert_eq!(str_arr.len(), 3); @@ -193,12 +192,13 @@ impl Array { } /// Creates an Array enum with a Categorical32 array. + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] pub fn from_categorical32(arr: CategoricalArray) -> Self { Array::TextArray(TextArray::Categorical32(Arc::new(arr))) } /// Creates an Array enum with a Categorical8 array. - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] pub fn from_categorical8(arr: CategoricalArray) -> Self { Array::TextArray(TextArray::Categorical8(Arc::new(arr))) } @@ -490,6 +490,7 @@ impl Array { } /// Returns a reference to the inner `CategoricalArray`. + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] pub fn try_cat32_ref(&self) -> Result<&CategoricalArray, MinarrowError> { match self { Array::TextArray(TextArray::Categorical32(arc)) => Ok(arc.as_ref()), @@ -503,7 +504,7 @@ impl Array { } /// Returns a reference to the inner `CategoricalArray`. 
- #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] pub fn try_cat8_ref(&self) -> Result<&CategoricalArray, MinarrowError> { match self { Array::TextArray(TextArray::Categorical8(arc)) => Ok(arc.as_ref()), @@ -684,7 +685,7 @@ impl Array { ))) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(cat) => { let mut out = Vec64::with_capacity(cat.len()); let mut mask = Bitmask::with_capacity(cat.len()); @@ -736,6 +737,7 @@ impl Array { NumericArray::Int32(Arc::new(IntegerArray::::from_vec64(out, Some(mask)))) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(cat) => { let mut out = Vec64::with_capacity(cat.len()); let mut mask = Bitmask::with_capacity(cat.len()); @@ -1146,10 +1148,11 @@ impl Array { match_arm!(TextArray, String32, StringArray); #[cfg(feature = "large_string")] match_arm!(TextArray, String64, StringArray); - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] match_arm!(TextArray, Categorical8, CategoricalArray); #[cfg(feature = "extended_categorical")] match_arm!(TextArray, Categorical16, CategoricalArray); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] match_arm!(TextArray, Categorical32, CategoricalArray); #[cfg(feature = "extended_categorical")] match_arm!(TextArray, Categorical64, CategoricalArray); @@ -1216,10 +1219,11 @@ impl Array { match_arm!(TextArray, String32, StringArray); #[cfg(feature = "large_string")] match_arm!(TextArray, String64, StringArray); - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] match_arm!(TextArray, Categorical8, CategoricalArray); #[cfg(feature = "extended_categorical")] match_arm!(TextArray, Categorical16, CategoricalArray); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] match_arm!(TextArray, Categorical32, 
CategoricalArray); #[cfg(feature = "extended_categorical")] match_arm!(TextArray, Categorical64, CategoricalArray); @@ -1284,10 +1288,11 @@ impl Array { match_inner_type!(TextArray, String32, StringArray); #[cfg(feature = "large_string")] match_inner_type!(TextArray, String64, StringArray); - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] match_inner_type!(TextArray, Categorical8, CategoricalArray); #[cfg(feature = "extended_categorical")] match_inner_type!(TextArray, Categorical16, CategoricalArray); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] match_inner_type!(TextArray, Categorical32, CategoricalArray); #[cfg(feature = "extended_categorical")] match_inner_type!(TextArray, Categorical64, CategoricalArray); @@ -1347,10 +1352,11 @@ impl Array { match_inner_type_mut!(TextArray, String32, StringArray); #[cfg(feature = "large_string")] match_inner_type_mut!(TextArray, String64, StringArray); - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] match_inner_type_mut!(TextArray, Categorical8, CategoricalArray); #[cfg(feature = "extended_categorical")] match_inner_type_mut!(TextArray, Categorical16, CategoricalArray); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] match_inner_type_mut!(TextArray, Categorical32, CategoricalArray); #[cfg(feature = "extended_categorical")] match_inner_type_mut!(TextArray, Categorical64, CategoricalArray); @@ -1432,7 +1438,7 @@ impl Array { "Strings use UTF-8 + offsets. Use logical accessor instead, or `slice_raw` if you do want byte access." 
) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => { cast_slice::(arr.data(), offset, len).expect("cast failed") } @@ -1440,6 +1446,7 @@ impl Array { TextArray::Categorical16(arr) => { cast_slice::(arr.data(), offset, len).expect("cast failed") } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => { cast_slice::(arr.data(), offset, len).expect("cast failed") } @@ -1527,10 +1534,11 @@ impl Array { TextArray::String64(a) if TypeId::of::() == TypeId::of::() => { cast_slice::(&a.data, offset, len) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) if TypeId::of::() == TypeId::of::() => { cast_slice::(&a.data, offset, len) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) if TypeId::of::() == TypeId::of::() => { cast_slice::(&a.data, offset, len) } @@ -1589,7 +1597,7 @@ impl Array { TextArray::String32(arr) => TextArray::String32(arr.slice_clone(offset, len)), #[cfg(feature = "large_string")] TextArray::String64(arr) => TextArray::String64(arr.slice_clone(offset, len)), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => { TextArray::Categorical8(arr.slice_clone(offset, len)) } @@ -1597,6 +1605,7 @@ impl Array { TextArray::Categorical16(arr) => { TextArray::Categorical16(arr.slice_clone(offset, len)) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => { TextArray::Categorical32(arr.slice_clone(offset, len)) } @@ -1645,10 +1654,11 @@ impl Array { TextArray::String32(_) => ArrowType::String, #[cfg(feature = "large_string")] TextArray::String64(_) => ArrowType::LargeString, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] 
TextArray::Categorical8(_) => ArrowType::Dictionary(CategoricalIndexType::UInt8), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => ArrowType::Dictionary(CategoricalIndexType::UInt16), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => ArrowType::Dictionary(CategoricalIndexType::UInt32), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(_) => ArrowType::Dictionary(CategoricalIndexType::UInt64), @@ -1680,8 +1690,9 @@ impl Array { pub fn is_categorical_array(&self) -> bool { match self { Array::TextArray(text) => match text { + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => true, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => true, #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => true, @@ -1708,8 +1719,9 @@ impl Array { TextArray::String32(_) => true, #[cfg(feature = "large_string")] TextArray::String64(_) => true, + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => false, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => false, #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => false, @@ -1733,8 +1745,9 @@ impl Array { TextArray::String32(_) => true, #[cfg(feature = "large_string")] TextArray::String64(_) => true, + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => true, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => true, #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => true, @@ -1892,10 +1905,11 @@ impl Array { TextArray::String32(arr) => arr.null_mask.as_ref(), #[cfg(feature = 
"large_string")] TextArray::String64(arr) => arr.null_mask.as_ref(), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => arr.null_mask.as_ref(), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => arr.null_mask.as_ref(), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => arr.null_mask.as_ref(), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => arr.null_mask.as_ref(), @@ -1957,8 +1971,9 @@ impl Array { TextArray::String32(a) => Some(Scalar::String32(a.get_str(idx)?.to_owned())), #[cfg(feature = "large_string")] TextArray::String64(a) => Some(Scalar::String64(a.get_str(idx)?.to_owned())), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => Some(Scalar::String32(a.get_str(idx)?.to_owned())), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => Some(Scalar::String32(a.get_str(idx)?.to_owned())), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(a) => Some(Scalar::String32(a.get_str(idx)?.to_owned())), @@ -2038,6 +2053,13 @@ impl Array { ArrowType::Dictionary(cat_idx) => { let strs: Vec<&str> = vec![""; n_rows]; match cat_idx { + #[cfg(feature = "default_categorical_8")] + CategoricalIndexType::UInt8 => { + let mut arr = CategoricalArray::::from_vec(strs, None); + arr.null_mask = Some(mask); + Array::TextArray(TextArray::Categorical8(Arc::new(arr))) + } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] CategoricalIndexType::UInt32 => { let mut arr = CategoricalArray::::from_vec(strs, None); arr.null_mask = Some(mask); @@ -2352,8 +2374,9 @@ impl Array { TextArray::String32(s) => s.get_str(i).cmp(&s.get_str(j)), #[cfg(feature = "large_string")] TextArray::String64(s) => s.get_str(i).cmp(&s.get_str(j)), + 
#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(c) => c.get_str(i).cmp(&c.get_str(j)), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(c) => c.get_str(i).cmp(&c.get_str(j)), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(c) => c.get_str(i).cmp(&c.get_str(j)), @@ -2411,8 +2434,9 @@ impl Array { TextArray::String32(s) => s.get_str(idx).hash(state), #[cfg(feature = "large_string")] TextArray::String64(s) => s.get_str(idx).hash(state), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(c) => c.get_str(idx).hash(state), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(c) => c.get_str(idx).hash(state), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(c) => c.get_str(idx).hash(state), @@ -2480,10 +2504,11 @@ impl Array { TextArray::String64(arr) => { Arc::make_mut(arr).set_null_mask(Some(mask)); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => { Arc::make_mut(arr).set_null_mask(Some(mask)); } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => { Arc::make_mut(arr).set_null_mask(Some(mask)); } @@ -2582,7 +2607,7 @@ impl Array { TextArray::String32(a) => (a.data.as_ptr(), a.data.len(), 1), #[cfg(feature = "large_string")] TextArray::String64(a) => (a.data.as_ptr(), a.data.len(), 1), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => ( a.data.as_ptr() as *const u8, a.len(), @@ -2594,6 +2619,7 @@ impl Array { a.len(), std::mem::size_of::(), ), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => ( a.data.as_ptr() as *const u8, 
a.len(), @@ -2654,7 +2680,7 @@ impl Array { TextArray::String32(a) => a.null_mask.as_ref().map(|m| (m.as_ptr(), m.len())), #[cfg(feature = "large_string")] TextArray::String64(a) => a.null_mask.as_ref().map(|m| (m.as_ptr(), m.len())), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => { a.null_mask.as_ref().map(|m| (m.as_ptr(), m.capacity())) } @@ -2662,6 +2688,7 @@ impl Array { TextArray::Categorical16(a) => { a.null_mask.as_ref().map(|m| (m.as_ptr(), m.capacity())) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => { a.null_mask.as_ref().map(|m| (m.as_ptr(), m.capacity())) } @@ -2729,10 +2756,11 @@ impl Array { TextArray::String32(a) => a.null_count(), #[cfg(feature = "large_string")] TextArray::String64(a) => a.null_count(), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => a.null_count(), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(a) => a.null_count(), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => a.null_count(), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(a) => a.null_count(), @@ -3102,7 +3130,7 @@ impl Array { crate::ffi::arrow_dtype::ArrowType::Dictionary(idx) => { let key: polars_arrow::datatypes::IntegerType = match idx { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] crate::ffi::arrow_dtype::CategoricalIndexType::UInt8 => { polars_arrow::datatypes::IntegerType::UInt8 } @@ -3110,6 +3138,7 @@ impl Array { crate::ffi::arrow_dtype::CategoricalIndexType::UInt16 => { polars_arrow::datatypes::IntegerType::UInt16 } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] crate::ffi::arrow_dtype::CategoricalIndexType::UInt32 => { polars_arrow::datatypes::IntegerType::UInt32 } @@ 
-3581,7 +3610,7 @@ macro_rules! arr_str64 { // ======== Categorical ======== -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] #[macro_export] macro_rules! arr_cat8 { ($v:expr) => { @@ -3615,6 +3644,7 @@ macro_rules! arr_cat16 { }; } +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[macro_export] macro_rules! arr_cat32 { ($v:expr) => { @@ -3925,7 +3955,7 @@ macro_rules! arr_str64_opt { // ======== Categorical ======== -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] #[macro_export] macro_rules! arr_cat8_opt { ($v:expr) => {{ @@ -3967,6 +3997,7 @@ macro_rules! arr_cat16_opt { }}; } +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[macro_export] macro_rules! arr_cat32_opt { ($v:expr) => {{ @@ -4055,11 +4086,14 @@ mod tests { ArrowType::Boolean ); - let dict32 = Array::from_categorical32(CategoricalArray::::default()); - assert_eq!( - dict32.arrow_type(), - ArrowType::Dictionary(CategoricalIndexType::UInt32) - ); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] + { + let dict32 = Array::from_categorical32(CategoricalArray::::default()); + assert_eq!( + dict32.arrow_type(), + ArrowType::Dictionary(CategoricalIndexType::UInt32) + ); + } } #[test] @@ -4124,6 +4158,7 @@ mod tests { assert_eq!(sz, 1); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn test_data_ptr_and_len_for_dictionary() { let mut dict = CategoricalArray::::default(); @@ -4256,6 +4291,7 @@ mod tests { } } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn test_num_from_categorical_array() { let arr = StringArray::::from_slice(&["42", "hi", "999"]); @@ -4453,6 +4489,7 @@ mod tests { assert!(array.try_str32_ref().is_err()); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn 
test_try_cat32_ref_success() { let arr = CategoricalArray::::from_vec(vec!["a", "b", "a"], None); @@ -4802,6 +4839,7 @@ mod macro_tests { // ===== categorical ===== + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn arr_cat32_vec64_dense() { let v = vec64!["red", "green", "red"]; @@ -4816,6 +4854,7 @@ mod macro_tests { } } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn arr_cat32_vec64_opt() { let v = vec64![Some("red"), None::<&str>, Some("blue")]; @@ -4997,23 +5036,26 @@ mod macro_tests { #[test] fn test_categorical_types() { - let arr = arr_cat32!(vec64!["red", "green", "red"]); - if let Array::TextArray(TextArray::Categorical32(a)) = arr { - assert_eq!(a.get_str(0), Some("red")); - assert_eq!(a.get_str(1), Some("green")); - assert_eq!(a.get_str(2), Some("red")); - } else { - panic!("Wrong variant"); - } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] + { + let arr = arr_cat32!(vec64!["red", "green", "red"]); + if let Array::TextArray(TextArray::Categorical32(a)) = arr { + assert_eq!(a.get_str(0), Some("red")); + assert_eq!(a.get_str(1), Some("green")); + assert_eq!(a.get_str(2), Some("red")); + } else { + panic!("Wrong variant"); + } - let arr = arr_cat32_opt!(vec64![Some("red"), None::<&str>, Some("blue")]); - if let Array::TextArray(TextArray::Categorical32(a)) = arr { - assert_eq!(a.get_str(0), Some("red")); - assert_eq!(a.get_str(1), None); - assert_eq!(a.get_str(2), Some("blue")); - assert_mask(&a.null_mask, &[true, false, true]); - } else { - panic!("Wrong variant"); + let arr = arr_cat32_opt!(vec64![Some("red"), None::<&str>, Some("blue")]); + if let Array::TextArray(TextArray::Categorical32(a)) = arr { + assert_eq!(a.get_str(0), Some("red")); + assert_eq!(a.get_str(1), None); + assert_eq!(a.get_str(2), Some("blue")); + assert_mask(&a.null_mask, &[true, false, true]); + } else { + panic!("Wrong variant"); + } } 
#[cfg(feature = "extended_categorical")] @@ -5118,15 +5160,18 @@ mod macro_tests { } // Categorical - let arr = arr_cat32!["x", "y", "x", "z"]; - if let Array::TextArray(TextArray::Categorical32(a)) = arr { - assert_eq!(a.get_str(0), Some("x")); - assert_eq!(a.get_str(1), Some("y")); - assert_eq!(a.get_str(2), Some("x")); - assert_eq!(a.get_str(3), Some("z")); - assert!(a.null_mask.is_none()); - } else { - panic!("Wrong variant"); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] + { + let arr = arr_cat32!["x", "y", "x", "z"]; + if let Array::TextArray(TextArray::Categorical32(a)) = arr { + assert_eq!(a.get_str(0), Some("x")); + assert_eq!(a.get_str(1), Some("y")); + assert_eq!(a.get_str(2), Some("x")); + assert_eq!(a.get_str(3), Some("z")); + assert!(a.null_mask.is_none()); + } else { + panic!("Wrong variant"); + } } } @@ -5164,14 +5209,17 @@ mod macro_tests { panic!("Wrong variant"); } - let arr = arr_cat32_opt![Some("red"), None::<&str>, Some("blue")]; - if let Array::TextArray(TextArray::Categorical32(a)) = arr { - assert_eq!(a.get_str(0), Some("red")); - assert_eq!(a.get_str(1), None); - assert_eq!(a.get_str(2), Some("blue")); - assert!(a.null_mask.is_some()); - } else { - panic!("Wrong variant"); + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] + { + let arr = arr_cat32_opt![Some("red"), None::<&str>, Some("blue")]; + if let Array::TextArray(TextArray::Categorical32(a)) = arr { + assert_eq!(a.get_str(0), Some("red")); + assert_eq!(a.get_str(1), None); + assert_eq!(a.get_str(2), Some("blue")); + assert!(a.null_mask.is_some()); + } else { + panic!("Wrong variant"); + } } } diff --git a/src/enums/collections/text_array.rs b/src/enums/collections/text_array.rs index 5af0b14..e3a840d 100644 --- a/src/enums/collections/text_array.rs +++ b/src/enums/collections/text_array.rs @@ -76,10 +76,11 @@ pub enum TextArray { String32(Arc>), #[cfg(feature = "large_string")] String64(Arc>), - 
#[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] Categorical8(Arc>), #[cfg(feature = "extended_categorical")] Categorical16(Arc>), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Categorical32(Arc>), #[cfg(feature = "extended_categorical")] Categorical64(Arc>), @@ -95,10 +96,11 @@ impl TextArray { TextArray::String32(arr) => arr.len(), #[cfg(feature = "large_string")] TextArray::String64(arr) => arr.len(), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => arr.len(), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => arr.len(), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => arr.len(), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => arr.len(), @@ -113,10 +115,11 @@ impl TextArray { TextArray::String32(arr) => arr.null_mask.as_ref(), #[cfg(feature = "large_string")] TextArray::String64(arr) => arr.null_mask.as_ref(), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => arr.null_mask.as_ref(), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => arr.null_mask.as_ref(), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => arr.null_mask.as_ref(), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => arr.null_mask.as_ref(), @@ -140,7 +143,7 @@ impl TextArray { (TextArray::String32(a), TextArray::String32(b)) => Arc::make_mut(a).append_array(b), #[cfg(feature = "large_string")] (TextArray::String64(a), TextArray::String64(b)) => Arc::make_mut(a).append_array(b), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] (TextArray::Categorical8(a), TextArray::Categorical8(b)) => { 
Arc::make_mut(a).append_array(b) } @@ -148,6 +151,7 @@ impl TextArray { (TextArray::Categorical16(a), TextArray::Categorical16(b)) => { Arc::make_mut(a).append_array(b) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] (TextArray::Categorical32(a), TextArray::Categorical32(b)) => { Arc::make_mut(a).append_array(b) } @@ -165,10 +169,11 @@ impl TextArray { (TextArray::String32(a), TextArray::String32(b)) => Arc::make_mut(a).append_range(b, offset, len), #[cfg(feature = "large_string")] (TextArray::String64(a), TextArray::String64(b)) => Arc::make_mut(a).append_range(b, offset, len), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] (TextArray::Categorical8(a), TextArray::Categorical8(b)) => Arc::make_mut(a).append_range(b, offset, len), #[cfg(feature = "extended_categorical")] (TextArray::Categorical16(a), TextArray::Categorical16(b)) => Arc::make_mut(a).append_range(b, offset, len), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] (TextArray::Categorical32(a), TextArray::Categorical32(b)) => Arc::make_mut(a).append_range(b, offset, len), #[cfg(feature = "extended_categorical")] (TextArray::Categorical64(a), TextArray::Categorical64(b)) => Arc::make_mut(a).append_range(b, offset, len), @@ -198,7 +203,7 @@ impl TextArray { (TextArray::String64(a), TextArray::String64(b)) => { Arc::make_mut(a).insert_rows(index, b) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] (TextArray::Categorical8(a), TextArray::Categorical8(b)) => { Arc::make_mut(a).insert_rows(index, b) } @@ -206,6 +211,7 @@ impl TextArray { (TextArray::Categorical16(a), TextArray::Categorical16(b)) => { Arc::make_mut(a).insert_rows(index, b) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] (TextArray::Categorical32(a), TextArray::Categorical32(b)) => { Arc::make_mut(a).insert_rows(index, b) } @@ -249,6 +255,7 @@ 
impl TextArray { TextArray::String64(Arc::new(right)), )) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => { let (left, right) = Arc::try_unwrap(a) .unwrap_or_else(|arc| (*arc).clone()) @@ -258,7 +265,7 @@ impl TextArray { TextArray::Categorical32(Arc::new(right)), )) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => { let (left, right) = Arc::try_unwrap(a) .unwrap_or_else(|arc| (*arc).clone()) @@ -325,6 +332,7 @@ impl TextArray { /// Returns a reference to the inner `CategoricalArray` if the variant matches. /// No conversion or cloning is performed. + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] pub fn cat32_ref(&self) -> Result<&CategoricalArray, MinarrowError> { match self { TextArray::Categorical32(arr) => Ok(arr), @@ -354,7 +362,7 @@ impl TextArray { /// Returns a reference to the inner `CategoricalArray` if the variant matches. /// No conversion or cloning is performed. 
- #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] pub fn cat8_ref(&self) -> Result<&CategoricalArray, MinarrowError> { match self { TextArray::Categorical8(arr) => Ok(arr), @@ -394,10 +402,11 @@ impl TextArray { }, #[cfg(feature = "large_string")] TextArray::String64(arr) => Ok(StringArray::::try_from(&*arr)?), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => Ok(StringArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => Ok(StringArray::::try_from(&*arr)?), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => Ok(StringArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => Ok(StringArray::::try_from(&*arr)?), @@ -417,10 +426,11 @@ impl TextArray { Err(shared) => Ok((*shared).clone()), }, TextArray::String32(arr) => Ok(StringArray::::from(&*arr)), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => Ok(StringArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => Ok(StringArray::::try_from(&*arr)?), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => Ok(StringArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => Ok(StringArray::::try_from(&*arr)?), @@ -432,6 +442,7 @@ impl TextArray { /// /// - Converts via `From` or `TryFrom`, depending on the inner type /// - Uses *CloneOnWrite (COW)* when it's already a `Categorical32`. 
+ #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] pub fn cat32(self) -> Result, MinarrowError> { match self { TextArray::Categorical32(arr) => match Arc::try_unwrap(arr) { @@ -441,7 +452,7 @@ impl TextArray { TextArray::String32(arr) => Ok(CategoricalArray::::try_from(&*arr)?), #[cfg(feature = "large_string")] TextArray::String64(arr) => Ok(CategoricalArray::::try_from(&*arr)?), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => Ok(CategoricalArray::::from(&*arr)), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => Ok(CategoricalArray::::from(&*arr)), @@ -465,10 +476,11 @@ impl TextArray { TextArray::String32(arr) => Ok(CategoricalArray::::try_from(&*arr)?), #[cfg(feature = "large_string")] TextArray::String64(arr) => Ok(CategoricalArray::::try_from(&*arr)?), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => Ok(CategoricalArray::::from(&*arr)), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => Ok(CategoricalArray::::from(&*arr)), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => Ok(CategoricalArray::::from(&*arr)), TextArray::Null => Err(MinarrowError::NullError { message: None }), } @@ -478,7 +490,7 @@ impl TextArray { /// /// - Converts via `From` or `TryFrom`, depending on the inner type /// - Uses *CloneOnWrite (COW)* when it's already a `Categorical8`. 
- #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] pub fn cat8(self) -> Result, MinarrowError> { match self { TextArray::Categorical8(arr) => match Arc::try_unwrap(arr) { @@ -490,6 +502,7 @@ impl TextArray { TextArray::String64(arr) => Ok(CategoricalArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => Ok(CategoricalArray::::try_from(&*arr)?), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => Ok(CategoricalArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => Ok(CategoricalArray::::try_from(&*arr)?), @@ -511,8 +524,9 @@ impl TextArray { TextArray::String32(arr) => Ok(CategoricalArray::::try_from(&*arr)?), #[cfg(feature = "large_string")] TextArray::String64(arr) => Ok(CategoricalArray::::try_from(&*arr)?), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => Ok(CategoricalArray::::from(&*arr)), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => Ok(CategoricalArray::::try_from(&*arr)?), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => Ok(CategoricalArray::::try_from(&*arr)?), @@ -527,7 +541,7 @@ impl Display for TextArray { TextArray::String32(arr) => write_text_array_with_header(f, "String32", arr.as_ref()), #[cfg(feature = "large_string")] TextArray::String64(arr) => write_text_array_with_header(f, "String64", arr.as_ref()), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => { write_text_array_with_header(f, "Categorical8", arr.as_ref()) } @@ -535,6 +549,7 @@ impl Display for TextArray { TextArray::Categorical16(arr) => { write_text_array_with_header(f, "Categorical16", arr.as_ref()) } + #[cfg(any(not(feature = "default_categorical_8"), 
feature = "extended_categorical"))] TextArray::Categorical32(arr) => { write_text_array_with_header(f, "Categorical32", arr.as_ref()) } @@ -582,7 +597,7 @@ impl Concatenate for TextArray { let b = Arc::try_unwrap(b).unwrap_or_else(|arc| (*arc).clone()); Ok(TextArray::String64(Arc::new(a.concat(b)?))) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] (TextArray::Categorical8(a), TextArray::Categorical8(b)) => { let a = Arc::try_unwrap(a).unwrap_or_else(|arc| (*arc).clone()); let b = Arc::try_unwrap(b).unwrap_or_else(|arc| (*arc).clone()); @@ -594,6 +609,7 @@ impl Concatenate for TextArray { let b = Arc::try_unwrap(b).unwrap_or_else(|arc| (*arc).clone()); Ok(TextArray::Categorical16(Arc::new(a.concat(b)?))) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] (TextArray::Categorical32(a), TextArray::Categorical32(b)) => { let a = Arc::try_unwrap(a).unwrap_or_else(|arc| (*arc).clone()); let b = Arc::try_unwrap(b).unwrap_or_else(|arc| (*arc).clone()); @@ -625,10 +641,11 @@ fn text_variant_name(arr: &TextArray) -> &'static str { TextArray::String32(_) => "String32", #[cfg(feature = "large_string")] TextArray::String64(_) => "String64", - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => "Categorical8", #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => "Categorical16", + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => "Categorical32", #[cfg(feature = "extended_categorical")] TextArray::Categorical64(_) => "Categorical64", diff --git a/src/enums/value/conversions.rs b/src/enums/value/conversions.rs index 70499a8..08db17c 100644 --- a/src/enums/value/conversions.rs +++ b/src/enums/value/conversions.rs @@ -1390,7 +1390,7 @@ macro_rules! 
val_str64 { // Categorical Array Values -#[cfg(feature = "extended_categorical")] +#[cfg(feature = "default_categorical_8")] #[macro_export] macro_rules! val_cat8 { ($($x:tt)*) => { @@ -1406,6 +1406,7 @@ macro_rules! val_cat16 { }; } +#[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[macro_export] macro_rules! val_cat32 { ($($x:tt)*) => { diff --git a/src/ffi/arrow_c_ffi.rs b/src/ffi/arrow_c_ffi.rs index 5ba47b7..10f9076 100644 --- a/src/ffi/arrow_c_ffi.rs +++ b/src/ffi/arrow_c_ffi.rs @@ -348,10 +348,11 @@ pub fn fmt_c(dtype: ArrowType) -> CString { // ---- dictionary (categorical) ---- ArrowType::Dictionary(idx) => match idx { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] CategoricalIndexType::UInt8 => b"C", #[cfg(feature = "extended_categorical")] CategoricalIndexType::UInt16 => b"S", + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] CategoricalIndexType::UInt32 => b"I", #[cfg(feature = "extended_categorical")] CategoricalIndexType::UInt64 => b"L", @@ -444,6 +445,7 @@ pub fn export_to_c(array: Arc, schema: Schema) -> (*mut ArrowArray, *mut Array::TextArray(TextArray::String64(s)) => { export_string_array_to_c(&array, schema, s.len() as i64) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(cat)) => export_categorical_array_to_c( &array, schema, @@ -451,7 +453,7 @@ pub fn export_to_c(array: Arc, schema: Schema) -> (*mut ArrowArray, *mut &cat.unique_values, 32, ), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] Array::TextArray(TextArray::Categorical8(cat)) => export_categorical_array_to_c( &array, schema, @@ -567,10 +569,11 @@ fn export_categorical_array_to_c( let mut field = schema.fields[0].clone(); field.dtype = match index_bits { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] 8 => 
ArrowType::Dictionary(crate::ffi::arrow_dtype::CategoricalIndexType::UInt8), #[cfg(feature = "extended_categorical")] 16 => ArrowType::Dictionary(crate::ffi::arrow_dtype::CategoricalIndexType::UInt16), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] 32 => ArrowType::Dictionary(crate::ffi::arrow_dtype::CategoricalIndexType::UInt32), #[cfg(feature = "extended_categorical")] 64 => ArrowType::Dictionary(crate::ffi::arrow_dtype::CategoricalIndexType::UInt64), @@ -720,15 +723,24 @@ pub unsafe fn import_from_c(arr_ptr: *const ArrowArray, sch_ptr: *const ArrowSch }; // if the array owns a dictionary, map the physical index dtype ➜ CategoricalIndexType + #[allow(unreachable_code)] let maybe_cat_index = if is_dict { Some(match dtype { #[cfg(feature = "extended_numeric_types")] - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] ArrowType::Int8 | ArrowType::UInt8 => CategoricalIndexType::UInt8, #[cfg(feature = "extended_numeric_types")] #[cfg(feature = "extended_categorical")] ArrowType::Int16 | ArrowType::UInt16 => CategoricalIndexType::UInt16, + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] ArrowType::Int32 | ArrowType::UInt32 => CategoricalIndexType::UInt32, + // When default_categorical_8 is on without extended_categorical, + // 32-bit dictionary indices cannot be imported as CategoricalArray + #[cfg(all(feature = "default_categorical_8", not(feature = "extended_categorical")))] + ArrowType::Int32 | ArrowType::UInt32 => panic!( + "FFI import: Arrow dictionary uses 32-bit indices but only CategoricalArray is available. \ + Enable `extended_categorical` to import dictionaries with 32-bit indices." 
+ ), #[cfg(feature = "extended_numeric_types")] #[cfg(feature = "extended_categorical")] ArrowType::Int64 | ArrowType::UInt64 => CategoricalIndexType::UInt64, @@ -868,12 +880,12 @@ pub unsafe fn import_from_c_owned( ArrowType::Dictionary(i) => i.clone(), #[cfg(feature = "extended_numeric_types")] ArrowType::Int8 | ArrowType::UInt8 => { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] { CategoricalIndexType::UInt8 } - #[cfg(not(feature = "extended_categorical"))] - panic!("Extended categorical not enabled") + #[cfg(not(feature = "default_categorical_8"))] + panic!("default_categorical_8 not enabled") } #[cfg(feature = "extended_numeric_types")] ArrowType::Int16 | ArrowType::UInt16 => { @@ -884,6 +896,7 @@ pub unsafe fn import_from_c_owned( #[cfg(not(feature = "extended_categorical"))] panic!("Extended categorical not enabled") } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] ArrowType::Int32 | ArrowType::UInt32 => CategoricalIndexType::UInt32, #[cfg(feature = "extended_numeric_types")] ArrowType::Int64 | ArrowType::UInt64 => { @@ -984,11 +997,12 @@ unsafe fn import_array_zero_copy( let idx_type = match dtype.clone() { ArrowType::Dictionary(idx) => idx, _ => { + #[allow(unused_imports)] use crate::ffi::arrow_dtype::CategoricalIndexType; match dtype { #[cfg(all( feature = "extended_numeric_types", - feature = "extended_categorical" + feature = "default_categorical_8" ))] ArrowType::Int8 | ArrowType::UInt8 => CategoricalIndexType::UInt8, #[cfg(all( @@ -996,6 +1010,7 @@ unsafe fn import_array_zero_copy( feature = "extended_categorical" ))] ArrowType::Int16 | ArrowType::UInt16 => CategoricalIndexType::UInt16, + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] ArrowType::Int32 | ArrowType::UInt32 => CategoricalIndexType::UInt32, #[cfg(all( feature = "extended_numeric_types", @@ -1519,7 +1534,7 @@ unsafe fn import_categorical( // Build codes & wrap match 
index_type { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] CategoricalIndexType::UInt8 => { let codes_buf = unsafe { build_codes::(codes_ptr, len, ownership) }; let arr = CategoricalArray::::new(codes_buf, dict_strings, null_mask); @@ -1531,6 +1546,7 @@ unsafe fn import_categorical( let arr = CategoricalArray::::new(codes_buf, dict_strings, null_mask); Arc::new(Array::TextArray(TextArray::Categorical16(Arc::new(arr)))) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] CategoricalIndexType::UInt32 => { let codes_buf = unsafe { build_codes::(codes_ptr, len, ownership) }; let arr = CategoricalArray::::new(codes_buf, dict_strings, null_mask); @@ -2533,11 +2549,19 @@ unsafe fn field_from_c_schema(schema: &ArrowSchema) -> crate::Field { // dictionary field describes the value type. use crate::ffi::arrow_dtype::CategoricalIndexType; let index_type = match fmt { - #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))] + #[cfg(feature = "default_categorical_8")] b"c" | b"C" => CategoricalIndexType::UInt8, #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))] b"s" | b"S" => CategoricalIndexType::UInt16, + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] b"i" | b"I" => CategoricalIndexType::UInt32, + // When default_categorical_8 is on without extended_categorical, + // 32-bit dictionary indices cannot be imported as CategoricalArray + #[cfg(all(feature = "default_categorical_8", not(feature = "extended_categorical")))] + b"i" | b"I" => panic!( + "FFI import: Arrow dictionary uses 32-bit indices but only CategoricalArray is available. \ + Enable `extended_categorical` to import dictionaries with 32-bit indices." 
+ ), #[cfg(all(feature = "extended_numeric_types", feature = "extended_categorical"))] b"l" | b"L" => CategoricalIndexType::UInt64, _ => panic!( diff --git a/src/ffi/arrow_dtype.rs b/src/ffi/arrow_dtype.rs index f89c3cd..3ccf46d 100644 --- a/src/ffi/arrow_dtype.rs +++ b/src/ffi/arrow_dtype.rs @@ -131,8 +131,9 @@ pub enum ArrowType { /// - Smaller widths reduce memory footprint for low-cardinality data. /// - Larger widths enable more distinct categories without overflow. /// - Variant availability depends on feature flags: -/// - `UInt8`, `UInt16`, and `UInt64` require `extended_categorical`. -/// - `UInt32` is always available. +/// - `UInt8` requires `default_categorical_8` or `extended_categorical`. +/// - `UInt16` and `UInt64` require `extended_categorical`. +/// - `UInt32` is available unless `default_categorical_8` is enabled without `extended_categorical`. /// /// ## Interoperability /// - Maps directly to the integer index type in Apache Arrow's `DictionaryType`. @@ -140,10 +141,11 @@ pub enum ArrowType { #[derive(PartialEq, Clone, Debug)] pub enum CategoricalIndexType { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] UInt8, #[cfg(feature = "extended_categorical")] UInt16, + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] UInt32, #[cfg(feature = "extended_categorical")] UInt64, @@ -177,7 +179,7 @@ impl CategoricalArray { /// The arrow type that backs this array pub fn arrow_type() -> ArrowType { let t = TypeId::of::(); - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] if t == TypeId::of::() { return ArrowType::Dictionary(CategoricalIndexType::UInt8); } @@ -185,6 +187,7 @@ impl CategoricalArray { if t == TypeId::of::() { return ArrowType::Dictionary(CategoricalIndexType::UInt16); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] if t == TypeId::of::() { return 
ArrowType::Dictionary(CategoricalIndexType::UInt32); } @@ -296,10 +299,11 @@ impl Display for ArrowType { impl Display for CategoricalIndexType { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { match self { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] CategoricalIndexType::UInt8 => f.write_str("UInt8"), #[cfg(feature = "extended_categorical")] CategoricalIndexType::UInt16 => f.write_str("UInt16"), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] CategoricalIndexType::UInt32 => f.write_str("UInt32"), #[cfg(feature = "extended_categorical")] CategoricalIndexType::UInt64 => f.write_str("UInt64"), diff --git a/src/kernels/broadcast/scalar.rs b/src/kernels/broadcast/scalar.rs index 1ce8dfd..5ee071d 100644 --- a/src/kernels/broadcast/scalar.rs +++ b/src/kernels/broadcast/scalar.rs @@ -563,7 +563,7 @@ pub fn broadcast_scalar_to_text_arrayview( (Scalar::String64(val), TextArray::String64(_)) => { Array::from_string64(StringArray::from_slice(&[val.as_str()])) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] (Scalar::String32(val), TextArray::Categorical8(_)) => { Array::from_categorical8(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -571,6 +571,7 @@ pub fn broadcast_scalar_to_text_arrayview( (Scalar::String32(val), TextArray::Categorical16(_)) => { Array::from_categorical16(CategoricalArray::::from_values(vec![val.as_str()])) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] (Scalar::String32(val), TextArray::Categorical32(_)) => { Array::from_categorical32(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -578,7 +579,7 @@ pub fn broadcast_scalar_to_text_arrayview( (Scalar::String32(val), TextArray::Categorical64(_)) => { Array::from_categorical64(CategoricalArray::::from_values(vec![val.as_str()])) } - #[cfg(all(feature = "large_string", feature = "extended_categorical"))] + 
#[cfg(all(feature = "large_string", feature = "default_categorical_8"))] (Scalar::String64(val), TextArray::Categorical8(_)) => { Array::from_categorical8(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -586,7 +587,7 @@ pub fn broadcast_scalar_to_text_arrayview( (Scalar::String64(val), TextArray::Categorical16(_)) => { Array::from_categorical16(CategoricalArray::::from_values(vec![val.as_str()])) } - #[cfg(feature = "large_string")] + #[cfg(all(feature = "large_string", any(not(feature = "default_categorical_8"), feature = "extended_categorical")))] (Scalar::String64(val), TextArray::Categorical32(_)) => { Array::from_categorical32(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -661,7 +662,7 @@ pub fn broadcast_text_arrayview_to_scalar( (TextArray::String64(_), Scalar::String64(val)) => { Array::from_string64(StringArray::from_slice(&[val.as_str()])) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] (TextArray::Categorical8(_), Scalar::String32(val)) => { Array::from_categorical8(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -669,6 +670,7 @@ pub fn broadcast_text_arrayview_to_scalar( (TextArray::Categorical16(_), Scalar::String32(val)) => { Array::from_categorical16(CategoricalArray::::from_values(vec![val.as_str()])) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] (TextArray::Categorical32(_), Scalar::String32(val)) => { Array::from_categorical32(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -676,7 +678,7 @@ pub fn broadcast_text_arrayview_to_scalar( (TextArray::Categorical64(_), Scalar::String32(val)) => { Array::from_categorical64(CategoricalArray::::from_values(vec![val.as_str()])) } - #[cfg(all(feature = "large_string", feature = "extended_categorical"))] + #[cfg(all(feature = "large_string", feature = "default_categorical_8"))] (TextArray::Categorical8(_), Scalar::String64(val)) => { 
Array::from_categorical8(CategoricalArray::::from_values(vec![val.as_str()])) } @@ -684,7 +686,7 @@ pub fn broadcast_text_arrayview_to_scalar( (TextArray::Categorical16(_), Scalar::String64(val)) => { Array::from_categorical16(CategoricalArray::::from_values(vec![val.as_str()])) } - #[cfg(feature = "large_string")] + #[cfg(all(feature = "large_string", any(not(feature = "default_categorical_8"), feature = "extended_categorical")))] (TextArray::Categorical32(_), Scalar::String64(val)) => { Array::from_categorical32(CategoricalArray::::from_values(vec![val.as_str()])) } diff --git a/src/macros.rs b/src/macros.rs index cd82df7..4dd58fa 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -737,10 +737,11 @@ macro_rules! match_array { TextArray::String32(a) => a.$method($($args),*), #[cfg(feature = "large_string")] TextArray::String64(a) => a.$method($($args),*), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => a.$method($($args),*), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(a) => a.$method($($args),*), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => a.$method($($args),*), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(a) => a.$method($($args),*), diff --git a/src/structs/arena.rs b/src/structs/arena.rs index 774144e..0f15195 100644 --- a/src/structs/arena.rs +++ b/src/structs/arena.rs @@ -684,6 +684,7 @@ impl AAMaker { } // --- Categorical --- + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] ( ArrowType::Dictionary(CategoricalIndexType::UInt32), AAMaker::Categorical { @@ -701,7 +702,7 @@ impl AAMaker { ), ))) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] ( ArrowType::Dictionary(CategoricalIndexType::UInt8), AAMaker::Categorical { @@ -870,10 +871,11 @@ pub(crate) fn consolidate_array_arena(chunks: &[&Array], 
dtype: &ArrowType) -> A .sum(); total_bytes += align64(data_bytes); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => { total_bytes += align64(n_rows * 4); } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => { total_bytes += align64(n_rows); } @@ -986,6 +988,7 @@ pub(crate) fn consolidate_array_arena(chunks: &[&Array], dtype: &ArrowType) -> A let total_data: usize = slices.iter().map(|(_, d, _)| d.len()).sum(); arena.write_string_slices(&slices, n_rows, total_data, has_nulls) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => { let slices: Vec<_> = chunks .iter() @@ -1013,7 +1016,7 @@ pub(crate) fn consolidate_array_arena(chunks: &[&Array], dtype: &ArrowType) -> A unreachable!() } } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => { let slices: Vec<_> = chunks .iter() @@ -1256,10 +1259,11 @@ pub(crate) fn consolidate_tables_arena( .sum(); total_bytes += align64(data_bytes); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => { total_bytes += align64(n_rows * 4); } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => { total_bytes += align64(n_rows); } @@ -1384,6 +1388,7 @@ pub(crate) fn consolidate_tables_arena( let total_data: usize = slices.iter().map(|(_, d, _)| d.len()).sum(); arena.write_string_slices(&slices, n_rows, total_data, has_nulls) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => { let slices: Vec<_> = tables .iter() @@ -1415,7 +1420,7 @@ pub(crate) fn consolidate_tables_arena( unreachable!() } } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = 
"default_categorical_8")] TextArray::Categorical8(_) => { let slices: Vec<_> = tables .iter() diff --git a/src/structs/chunked/super_table.rs b/src/structs/chunked/super_table.rs index 7ec4df3..c8bdebd 100644 --- a/src/structs/chunked/super_table.rs +++ b/src/structs/chunked/super_table.rs @@ -1676,6 +1676,7 @@ mod tests { } } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn test_consolidate_arena_categorical() { use crate::CategoricalArray; diff --git a/src/structs/field.rs b/src/structs/field.rs index d6bfeb5..0f64bc9 100644 --- a/src/structs/field.rs +++ b/src/structs/field.rs @@ -159,7 +159,7 @@ impl Field { a.is_nullable(), Some(metadata), ), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(a) => Field::new( name, ArrowType::Dictionary(CategoricalIndexType::UInt8), @@ -173,6 +173,7 @@ impl Field { a.is_nullable(), Some(metadata), ), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(a) => Field::new( name, ArrowType::Dictionary(CategoricalIndexType::UInt32), diff --git a/src/structs/field_array.rs b/src/structs/field_array.rs index 20e1935..c7fb566 100644 --- a/src/structs/field_array.rs +++ b/src/structs/field_array.rs @@ -343,10 +343,11 @@ pub fn create_field_for_array( TextArray::String32(_) => ArrowType::String, #[cfg(feature = "large_string")] TextArray::String64(_) => ArrowType::LargeString, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => ArrowType::Dictionary(CategoricalIndexType::UInt8), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => ArrowType::Dictionary(CategoricalIndexType::UInt16), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => ArrowType::Dictionary(CategoricalIndexType::UInt32), #[cfg(feature = 
"extended_categorical")] TextArray::Categorical64(_) => ArrowType::Dictionary(CategoricalIndexType::UInt64), diff --git a/src/structs/table.rs b/src/structs/table.rs index 37dd694..5527d25 100644 --- a/src/structs/table.rs +++ b/src/structs/table.rs @@ -1711,6 +1711,7 @@ mod tests { } } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn test_from_arena_boolean_and_categorical() { use crate::ffi::arrow_dtype::CategoricalIndexType; diff --git a/src/structs/views/array_view.rs b/src/structs/views/array_view.rs index f4194c8..1dd8d4a 100644 --- a/src/structs/views/array_view.rs +++ b/src/structs/views/array_view.rs @@ -155,10 +155,11 @@ impl ArrayV { Array::TextArray(TextArray::String32(arr)) => arr.get_str(self.offset + i), #[cfg(feature = "large_string")] Array::TextArray(TextArray::String64(arr)) => arr.get_str(self.offset + i), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] Array::TextArray(TextArray::Categorical8(arr)) => arr.get_str(self.offset + i), #[cfg(feature = "extended_categorical")] Array::TextArray(TextArray::Categorical16(arr)) => arr.get_str(self.offset + i), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(arr)) => arr.get_str(self.offset + i), #[cfg(feature = "extended_categorical")] Array::TextArray(TextArray::Categorical64(arr)) => arr.get_str(self.offset + i), @@ -188,7 +189,7 @@ impl ArrayV { Some(unsafe { arr.get_str_unchecked(self.offset + i) }) } } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] Array::TextArray(TextArray::Categorical8(arr)) => { if arr.is_null(self.offset + i) { None @@ -204,6 +205,7 @@ impl ArrayV { Some(unsafe { arr.get_str_unchecked(self.offset + i) }) } } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(arr)) => { if 
arr.is_null(self.offset + i) { None @@ -491,6 +493,7 @@ impl ArrayV { } Array::from_string64(new_arr) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => { let mut values: Vec<&str> = Vec::with_capacity(indices.len()); for &idx in indices { @@ -508,7 +511,7 @@ impl ArrayV { } Array::from_categorical32(new_arr) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => { let mut values: Vec<&str> = Vec::with_capacity(indices.len()); for &idx in indices { diff --git a/src/structs/views/chunked/super_array_view.rs b/src/structs/views/chunked/super_array_view.rs index c989b69..a1a4312 100644 --- a/src/structs/views/chunked/super_array_view.rs +++ b/src/structs/views/chunked/super_array_view.rs @@ -288,8 +288,9 @@ fn consolidate_text_slices(slices: &[ArrayV], first_text: &TextArray) -> Array { TextArray::String32(_) => consolidate_string_variant!(slices, String32, u32), #[cfg(feature = "large_string")] TextArray::String64(_) => consolidate_string_variant!(slices, String64, u64), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => consolidate_categorical_slices::(slices), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => consolidate_categorical_slices::(slices), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => consolidate_categorical_slices::(slices), @@ -308,8 +309,9 @@ fn consolidate_categorical_slices(slices: &[ArrayV // Extract the dictionary from the first slice let first_dict = match &slices[0].array { + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(arr)) => &arr.unique_values, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] 
Array::TextArray(TextArray::Categorical8(arr)) => &arr.unique_values, #[cfg(feature = "extended_categorical")] Array::TextArray(TextArray::Categorical16(arr)) => &arr.unique_values, @@ -321,8 +323,9 @@ fn consolidate_categorical_slices(slices: &[ArrayV // Verify all slices share the same dictionary (via pointer comparison) let all_same_dict = slices.iter().all(|s| { let dict = match &s.array { + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(arr)) => &arr.unique_values, - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] Array::TextArray(TextArray::Categorical8(arr)) => &arr.unique_values, #[cfg(feature = "extended_categorical")] Array::TextArray(TextArray::Categorical16(arr)) => &arr.unique_values, @@ -351,8 +354,9 @@ fn consolidate_categorical_slices(slices: &[ArrayV let total_len: usize = slices.iter().map(|s| s.len()).sum(); let has_nulls = slices.iter().any(|s| match &s.array { + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(arr)) => arr.null_mask().is_some(), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] Array::TextArray(TextArray::Categorical8(arr)) => arr.null_mask().is_some(), #[cfg(feature = "extended_categorical")] Array::TextArray(TextArray::Categorical16(arr)) => arr.null_mask().is_some(), @@ -371,6 +375,7 @@ fn consolidate_categorical_slices(slices: &[ArrayV for slice in slices { let (data, null_mask) = match &slice.array { + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] Array::TextArray(TextArray::Categorical32(arr)) => { // Type-punning since we know T matches let data_slice: &[T] = unsafe { @@ -384,7 +389,7 @@ fn consolidate_categorical_slices(slices: &[ArrayV arr.null_mask(), ) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] 
Array::TextArray(TextArray::Categorical8(arr)) => { let data_slice: &[T] = unsafe { std::slice::from_raw_parts( @@ -442,9 +447,14 @@ fn consolidate_categorical_slices(slices: &[ArrayV // Wrap in appropriate variant if std::mem::size_of::() == 4 { - Array::TextArray(TextArray::Categorical32(Arc::new(unsafe { - std::mem::transmute::, CategoricalArray>(result) - }))) + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] + { + Array::TextArray(TextArray::Categorical32(Arc::new(unsafe { + std::mem::transmute::, CategoricalArray>(result) + }))) + } + #[cfg(all(feature = "default_categorical_8", not(feature = "extended_categorical")))] + panic!("Categorical32 not enabled") } else if std::mem::size_of::() == 8 { #[cfg(feature = "extended_categorical")] { @@ -464,13 +474,13 @@ fn consolidate_categorical_slices(slices: &[ArrayV #[cfg(not(feature = "extended_categorical"))] panic!("Categorical16 not enabled") } else { - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] { Array::TextArray(TextArray::Categorical8(Arc::new(unsafe { std::mem::transmute::, CategoricalArray>(result) }))) } - #[cfg(not(feature = "extended_categorical"))] + #[cfg(not(feature = "default_categorical_8"))] panic!("Categorical8 not enabled") } } @@ -1128,6 +1138,7 @@ mod tests { // Categorical Array Consolidation Tests + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] fn fa_categorical(name: &str, vals: &[&str]) -> FieldArray { use crate::ffi::arrow_dtype::CategoricalIndexType; let string_arr = crate::StringArray::::from_slice(vals); @@ -1142,6 +1153,7 @@ mod tests { FieldArray::new(field, arr) } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn test_consolidate_categorical_single_chunk() { let fa1 = fa_categorical("cat", &["a", "b", "a", "c"]); @@ -1162,6 +1174,7 @@ mod tests { } } + #[cfg(any(not(feature = "default_categorical_8"), feature = 
"extended_categorical"))] #[test] fn test_consolidate_categorical_with_offset() { let fa1 = fa_categorical("cat", &["x", "y", "z", "w", "v"]); @@ -1180,6 +1193,7 @@ mod tests { } } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn test_consolidate_categorical_same_dict_multiple_chunks() { use crate::ffi::arrow_dtype::CategoricalIndexType; diff --git a/src/structs/views/collections/text_array_view.rs b/src/structs/views/collections/text_array_view.rs index d095a1d..4fc8651 100644 --- a/src/structs/views/collections/text_array_view.rs +++ b/src/structs/views/collections/text_array_view.rs @@ -131,8 +131,9 @@ impl TextArrayV { TextArray::String32(arr) => arr.get_str(phys_idx), #[cfg(feature = "large_string")] TextArray::String64(arr) => arr.get_str(phys_idx), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => arr.get_str(phys_idx), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => arr.get_str(phys_idx), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => arr.get_str(phys_idx), @@ -303,8 +304,9 @@ impl Display for TextArrayV { TextArray::String32(_) => "String32", #[cfg(feature = "large_string")] TextArray::String64(_) => "String64", + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => "Categorical32", - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => "Categorical8", #[cfg(feature = "extended_categorical")] TextArray::Categorical16(_) => "Categorical16", diff --git a/src/structs/views/table_view.rs b/src/structs/views/table_view.rs index ccbe30d..8057a67 100644 --- a/src/structs/views/table_view.rs +++ b/src/structs/views/table_view.rs @@ -494,6 +494,7 @@ impl TableV { } Array::from_string64(new_arr) } + #[cfg(any(not(feature = 
"default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(_) => { use crate::{Bitmask, Vec64}; use std::collections::HashMap; @@ -529,7 +530,7 @@ impl TableV { let new_arr = CategoricalArray::::new(codes, unique_values, null_mask); Array::from_categorical32(new_arr) } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(_) => { use crate::{Bitmask, Vec64}; use std::collections::HashMap; diff --git a/src/traits/byte_size.rs b/src/traits/byte_size.rs index dc7b2f3..8e5b7c4 100644 --- a/src/traits/byte_size.rs +++ b/src/traits/byte_size.rs @@ -190,10 +190,11 @@ impl ByteSize for TextArray { TextArray::String32(arr) => arr.est_bytes(), #[cfg(feature = "large_string")] TextArray::String64(arr) => arr.est_bytes(), - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(arr) => arr.est_bytes(), #[cfg(feature = "extended_categorical")] TextArray::Categorical16(arr) => arr.est_bytes(), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(arr) => arr.est_bytes(), #[cfg(feature = "extended_categorical")] TextArray::Categorical64(arr) => arr.est_bytes(), diff --git a/src/traits/print.rs b/src/traits/print.rs index 45dc3f0..38d529a 100644 --- a/src/traits/print.rs +++ b/src/traits/print.rs @@ -82,11 +82,12 @@ pub(crate) fn value_to_string(arr: &Array, idx: usize) -> String { TextArray::String32(s) => string_value(&s.offsets, &s.data, idx), #[cfg(feature = "large_string")] TextArray::String64(s) => string_value(&s.offsets, &s.data, idx), + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] TextArray::Categorical32(cat) => { let key = cat.data[idx] as usize; cat.unique_values[key].clone() } - #[cfg(feature = "extended_categorical")] + #[cfg(feature = "default_categorical_8")] TextArray::Categorical8(cat) => { let key = cat.data[idx] as usize; 
cat.unique_values[key].clone() diff --git a/tests/arrow_c_integration.rs b/tests/arrow_c_integration.rs index 3cbdda1..8fbbf2d 100644 --- a/tests/arrow_c_integration.rs +++ b/tests/arrow_c_integration.rs @@ -205,6 +205,7 @@ mod arrow_c_integration { ); } + #[cfg(any(not(feature = "default_categorical_8"), feature = "extended_categorical"))] #[test] fn rt_dict32() { let cat = minarrow::CategoricalArray::::from_slices(