Skip to content

Commit 8023947

Browse files
authored
fix: maintain inner list nullability for array_sort (#19948)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #19947 ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> #17657 introduced a regression since it cloned the inner field in the execution path but in the `return_type` function it still set nullability to true. Fix to ensure we maintain the field of the inner field as is. ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> Change `return_type` to just pass through the input datatype as is. Also refactor away usage of a null buffer builder in favour of copying the input array null buffer. ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Added tests. ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> No. <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent d1eea07 commit 8023947

File tree

2 files changed

+34
-27
lines changed

2 files changed

+34
-27
lines changed

datafusion/functions-nested/src/sort.rs

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,14 @@
1818
//! [`ScalarUDFImpl`] definitions for array_sort function.
1919
2020
use crate::utils::make_scalar_function;
21-
use arrow::array::{
22-
Array, ArrayRef, GenericListArray, NullBufferBuilder, OffsetSizeTrait, new_null_array,
23-
};
21+
use arrow::array::{Array, ArrayRef, GenericListArray, OffsetSizeTrait, new_null_array};
2422
use arrow::buffer::OffsetBuffer;
2523
use arrow::compute::SortColumn;
2624
use arrow::datatypes::{DataType, FieldRef};
2725
use arrow::{compute, compute::SortOptions};
2826
use datafusion_common::cast::{as_large_list_array, as_list_array, as_string_array};
2927
use datafusion_common::utils::ListCoercion;
30-
use datafusion_common::{Result, exec_err, plan_err};
28+
use datafusion_common::{Result, exec_err};
3129
use datafusion_expr::{
3230
ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
3331
ScalarUDFImpl, Signature, TypeSignature, Volatility,
@@ -134,18 +132,7 @@ impl ScalarUDFImpl for ArraySort {
134132
}
135133

136134
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
137-
match &arg_types[0] {
138-
DataType::Null => Ok(DataType::Null),
139-
DataType::List(field) => {
140-
Ok(DataType::new_list(field.data_type().clone(), true))
141-
}
142-
DataType::LargeList(field) => {
143-
Ok(DataType::new_large_list(field.data_type().clone(), true))
144-
}
145-
arg_type => {
146-
plan_err!("{} does not support type {arg_type}", self.name())
147-
}
148-
}
135+
Ok(arg_types[0].clone())
149136
}
150137

151138
fn invoke_with_args(
@@ -206,11 +193,11 @@ fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
206193
}
207194
DataType::List(field) => {
208195
let array = as_list_array(&args[0])?;
209-
array_sort_generic(array, field, sort_options)
196+
array_sort_generic(array, Arc::clone(field), sort_options)
210197
}
211198
DataType::LargeList(field) => {
212199
let array = as_large_list_array(&args[0])?;
213-
array_sort_generic(array, field, sort_options)
200+
array_sort_generic(array, Arc::clone(field), sort_options)
214201
}
215202
// Signature should prevent this arm ever occurring
216203
_ => exec_err!("array_sort expects list for first argument"),
@@ -219,18 +206,16 @@ fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
219206

220207
fn array_sort_generic<OffsetSize: OffsetSizeTrait>(
221208
list_array: &GenericListArray<OffsetSize>,
222-
field: &FieldRef,
209+
field: FieldRef,
223210
sort_options: Option<SortOptions>,
224211
) -> Result<ArrayRef> {
225212
let row_count = list_array.len();
226213

227214
let mut array_lengths = vec![];
228215
let mut arrays = vec![];
229-
let mut valid = NullBufferBuilder::new(row_count);
230216
for i in 0..row_count {
231217
if list_array.is_null(i) {
232218
array_lengths.push(0);
233-
valid.append_null();
234219
} else {
235220
let arr_ref = list_array.value(i);
236221

@@ -253,25 +238,22 @@ fn array_sort_generic<OffsetSize: OffsetSizeTrait>(
253238
};
254239
array_lengths.push(sorted_array.len());
255240
arrays.push(sorted_array);
256-
valid.append_non_null();
257241
}
258242
}
259243

260-
let buffer = valid.finish();
261-
262244
let elements = arrays
263245
.iter()
264246
.map(|a| a.as_ref())
265247
.collect::<Vec<&dyn Array>>();
266248

267249
let list_arr = if elements.is_empty() {
268-
GenericListArray::<OffsetSize>::new_null(Arc::clone(field), row_count)
250+
GenericListArray::<OffsetSize>::new_null(field, row_count)
269251
} else {
270252
GenericListArray::<OffsetSize>::new(
271-
Arc::clone(field),
253+
field,
272254
OffsetBuffer::from_lengths(array_lengths),
273255
Arc::new(compute::concat(elements.as_slice())?),
274-
buffer,
256+
list_array.nulls().cloned(),
275257
)
276258
};
277259
Ok(Arc::new(list_arr))

datafusion/sqllogictest/test_files/array.slt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2577,6 +2577,31 @@ NULL NULL
25772577
NULL NULL
25782578
NULL NULL
25792579

2580+
# maintains inner nullability
2581+
query ?T
2582+
select array_sort(column1), arrow_typeof(array_sort(column1))
2583+
from values
2584+
(arrow_cast([], 'List(non-null Int32)')),
2585+
(arrow_cast(NULL, 'List(non-null Int32)')),
2586+
(arrow_cast([1, 3, 5, -5], 'List(non-null Int32)'))
2587+
;
2588+
----
2589+
[] List(non-null Int32)
2590+
NULL List(non-null Int32)
2591+
[-5, 1, 3, 5] List(non-null Int32)
2592+
2593+
query ?T
2594+
select column1, arrow_typeof(column1)
2595+
from values (array_sort(arrow_cast([1, 3, 5, -5], 'LargeList(non-null Int32)')));
2596+
----
2597+
[-5, 1, 3, 5] LargeList(non-null Int32)
2598+
2599+
query ?T
2600+
select column1, arrow_typeof(column1)
2601+
from values (array_sort(arrow_cast([1, 3, 5, -5], 'FixedSizeList(4 x non-null Int32)')));
2602+
----
2603+
[-5, 1, 3, 5] List(non-null Int32)
2604+
25802605
query ?
25812606
select array_sort([struct('foo', 3), struct('foo', 1), struct('bar', 1)])
25822607
----

0 commit comments

Comments
 (0)