Commit 45c8ae4

Support LargeUtf8 as partition column (#19942)
## Which issue does this PR close?

- Closes #19939

## Rationale for this change

Partitioned writes are supported for the Utf8 and Utf8View types, and supporting LargeUtf8 is fairly easy to do.

## What changes are included in this PR?

Support for LargeUtf8 partition values in Hive partitions, plus a test covering all string types.

## Are these changes tested?

Yes!

## Are there any user-facing changes?

Yes: something that was previously not supported now is. LargeUtf8 columns can be used as partition columns for partitioned writes.
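With this change, a DataFrame whose partition column is LargeUtf8 can be written out partitioned. Below is a minimal sketch of such a write; the table name `t`, the output path `/tmp/out`, and the column names are illustrative, and it assumes DataFusion's `DataFrameWriteOptions::with_partition_by` API:

```rust
use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, Int32Array, LargeStringArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::datasource::MemTable;
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    // A small table whose partition column `part` is LargeUtf8.
    let schema = Arc::new(Schema::new(vec![
        Field::new("part", DataType::LargeUtf8, false),
        Field::new("value", DataType::Int32, false),
    ]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![
            Arc::new(LargeStringArray::from(vec!["a", "b"])) as ArrayRef,
            Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
        ],
    )?;

    let ctx = SessionContext::new();
    ctx.register_table("t", Arc::new(MemTable::try_new(schema, vec![vec![batch]])?))?;

    // Write one Hive-style directory per distinct `part` value,
    // e.g. /tmp/out/part=a/... and /tmp/out/part=b/...
    ctx.table("t")
        .await?
        .write_parquet(
            "/tmp/out",
            DataFrameWriteOptions::new().with_partition_by(vec!["part".to_string()]),
            None,
        )
        .await?;
    Ok(())
}
```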
1 parent 0c82ade commit 45c8ae4

2 files changed (+35 −22 lines)


datafusion/core/tests/dataframe/mod.rs

Lines changed: 27 additions & 20 deletions

@@ -43,6 +43,7 @@ use datafusion_functions_nested::make_array::make_array_udf;
 use datafusion_functions_window::expr_fn::{first_value, lead, row_number};
 use insta::assert_snapshot;
 use object_store::local::LocalFileSystem;
+use rstest::rstest;
 use std::collections::HashMap;
 use std::fs;
 use std::path::Path;
@@ -5615,30 +5616,33 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> {
     Ok(())
 }
 
+#[rstest]
+#[case(DataType::Utf8)]
+#[case(DataType::LargeUtf8)]
+#[case(DataType::Utf8View)]
 #[tokio::test]
-async fn write_partitioned_parquet_results() -> Result<()> {
-    // create partitioned input file and context
-    let tmp_dir = TempDir::new()?;
-
-    let ctx = SessionContext::new();
-
+async fn write_partitioned_parquet_results(#[case] string_type: DataType) -> Result<()> {
     // Create an in memory table with schema C1 and C2, both strings
     let schema = Arc::new(Schema::new(vec![
-        Field::new("c1", DataType::Utf8, false),
-        Field::new("c2", DataType::Utf8, false),
+        Field::new("c1", string_type.clone(), false),
+        Field::new("c2", string_type.clone(), false),
     ]));
 
-    let record_batch = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(StringArray::from(vec!["abc", "def"])),
-            Arc::new(StringArray::from(vec!["123", "456"])),
-        ],
-    )?;
+    let columns = [
+        Arc::new(StringArray::from(vec!["abc", "def"])) as ArrayRef,
+        Arc::new(StringArray::from(vec!["123", "456"])) as ArrayRef,
+    ]
+    .map(|col| arrow::compute::cast(&col, &string_type).unwrap())
+    .to_vec();
+
+    let record_batch = RecordBatch::try_new(schema.clone(), columns)?;
 
     let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![record_batch]])?);
 
     // Register the table in the context
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+    let ctx = SessionContext::new();
     ctx.register_table("test", mem_table)?;
 
     let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
@@ -5665,6 +5669,7 @@ async fn write_partitioned_parquet_results() -> Result<()> {
 
     // Check that the c2 column is gone and that c1 is abc.
     let results = filter_df.collect().await?;
+    insta::allow_duplicates! {
     assert_snapshot!(
         batches_to_string(&results),
         @r"
@@ -5674,7 +5679,7 @@ async fn write_partitioned_parquet_results() -> Result<()> {
     | abc |
     +-----+
     "
-    );
+    )};
 
     // Read the entire set of parquet files
     let df = ctx
@@ -5687,17 +5692,19 @@ async fn write_partitioned_parquet_results() -> Result<()> {
 
     // Check that the df has the entire set of data
     let results = df.collect().await?;
-    assert_snapshot!(
-        batches_to_sort_string(&results),
-        @r"
+    insta::allow_duplicates! {
+    assert_snapshot!(
+        batches_to_sort_string(&results),
+        @r"
     +-----+-----+
     | c1  | c2  |
     +-----+-----+
     | abc | 123 |
     | def | 456 |
     +-----+-----+
     "
-    );
+    )
+    };
 
     Ok(())
 }
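The test builds its input once as Utf8 and casts it to each `#[case]` type, and it wraps the snapshot assertions in `insta::allow_duplicates!` so the same inline snapshot can be asserted once per rstest case. A minimal sketch of the cast step, assuming only arrow-rs:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;

fn main() {
    // Build the data once as Utf8 ...
    let utf8: ArrayRef = Arc::new(StringArray::from(vec!["abc", "def"]));
    // ... then cast to the string flavor under test (LargeUtf8 here),
    // so one test body covers Utf8, LargeUtf8, and Utf8View.
    let large = cast(&utf8, &DataType::LargeUtf8).expect("string-to-string cast");
    assert_eq!(large.data_type(), &DataType::LargeUtf8);
}
```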

datafusion/datasource/src/write/demux.rs

Lines changed: 8 additions & 2 deletions

@@ -35,8 +35,8 @@ use arrow::datatypes::{DataType, Schema};
 use datafusion_common::cast::{
     as_boolean_array, as_date32_array, as_date64_array, as_float16_array,
     as_float32_array, as_float64_array, as_int8_array, as_int16_array, as_int32_array,
-    as_int64_array, as_string_array, as_string_view_array, as_uint8_array,
-    as_uint16_array, as_uint32_array, as_uint64_array,
+    as_int64_array, as_large_string_array, as_string_array, as_string_view_array,
+    as_uint8_array, as_uint16_array, as_uint32_array, as_uint64_array,
 };
 use datafusion_common::{exec_datafusion_err, internal_datafusion_err, not_impl_err};
 use datafusion_common_runtime::SpawnedTask;
@@ -397,6 +397,12 @@ fn compute_partition_keys_by_row<'a>(
                 partition_values.push(Cow::from(array.value(i)));
             }
         }
+        DataType::LargeUtf8 => {
+            let array = as_large_string_array(col_array)?;
+            for i in 0..rb.num_rows() {
+                partition_values.push(Cow::from(array.value(i)));
+            }
+        }
         DataType::Utf8View => {
             let array = as_string_view_array(col_array)?;
             for i in 0..rb.num_rows() {
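The new `LargeUtf8` arm mirrors the existing `Utf8` and `Utf8View` arms: downcast the column to the concrete array type, then copy each row's value into the partition keys. A self-contained sketch of that extraction, using a plain arrow-rs downcast in place of `as_large_string_array` (which is essentially the same downcast plus error handling); the surrounding `compute_partition_keys_by_row` machinery is omitted:

```rust
use std::borrow::Cow;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, LargeStringArray};

fn main() {
    // Stand-in for the partition column handed to compute_partition_keys_by_row.
    let col_array: ArrayRef = Arc::new(LargeStringArray::from(vec!["2024", "2025"]));

    // Downcast to the concrete LargeUtf8 array type.
    let array = col_array
        .as_any()
        .downcast_ref::<LargeStringArray>()
        .expect("column is LargeUtf8");

    // One partition key per row, borrowed straight from the array.
    let mut partition_values: Vec<Cow<str>> = Vec::new();
    for i in 0..array.len() {
        partition_values.push(Cow::from(array.value(i)));
    }
    assert_eq!(partition_values, ["2024", "2025"]);
}
```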
