From 41f9312460ed61e4cc32ec19b8a1848e7f90a0c6 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 5 Apr 2026 17:05:00 -0500 Subject: [PATCH 1/4] Add `arrow_field(expr)` scalar UDF Adds a new introspection function that returns a struct containing the complete Arrow Field information for any expression: name, data_type, nullable, and metadata. This unifies what `arrow_typeof`, `arrow_metadata`, and `is_nullable` provide individually. Co-Authored-By: Claude Opus 4.6 (1M context) --- datafusion/functions/src/core/arrow_field.rs | 162 ++++++++++++++++++ datafusion/functions/src/core/mod.rs | 7 + .../sqllogictest/test_files/arrow_field.slt | 104 +++++++++++ 3 files changed, 273 insertions(+) create mode 100644 datafusion/functions/src/core/arrow_field.rs create mode 100644 datafusion/sqllogictest/test_files/arrow_field.slt diff --git a/datafusion/functions/src/core/arrow_field.rs b/datafusion/functions/src/core/arrow_field.rs new file mode 100644 index 0000000000000..7159f6e1bbb67 --- /dev/null +++ b/datafusion/functions/src/core/arrow_field.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::{ + Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray, +}; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{Result, ScalarValue, utils::take_function_args}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_macros::user_doc; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Other Functions"), + description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.", + syntax_example = "arrow_field(expression)", + sql_example = r#"```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: Int64(1), data_type: Int64, ...} | ++----------------------------------------------+ + +> select arrow_field(1)['data_type']; ++-----------------------------------+ +| arrow_field(Int64(1))[data_type] | ++-----------------------------------+ +| Int64 | ++-----------------------------------+ +```"#, + argument( + name = "expression", + description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators." 
+ ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ArrowFieldFunc { + signature: Signature, +} + +impl Default for ArrowFieldFunc { + fn default() -> Self { + Self::new() + } +} + +impl ArrowFieldFunc { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } + + fn return_struct_type() -> DataType { + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("data_type", DataType::Utf8, false), + Field::new("nullable", DataType::Boolean, false), + Field::new( + "metadata", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + )), + false, + ), + false, + ), + ])) + } +} + +impl ScalarUDFImpl for ArrowFieldFunc { + fn name(&self) -> &str { + "arrow_field" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(Self::return_struct_type()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + let [_arg] = take_function_args(self.name(), args.args)?; + let field = &args.arg_fields[0]; + + // Build the name array + let name_array = + Arc::new(StringArray::from(vec![field.name().as_str()])) as Arc<dyn Array>; + + // Build the data_type array + let data_type_str = format!("{}", field.data_type()); + let data_type_array = + Arc::new(StringArray::from(vec![data_type_str.as_str()])) as Arc<dyn Array>; + + // Build the nullable array + let nullable_array = + Arc::new(BooleanArray::from(vec![field.is_nullable()])) as Arc<dyn Array>; + + // Build the metadata map array (same pattern as arrow_metadata.rs) + let metadata = field.metadata(); + let mut map_builder = + MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + + let mut entries: Vec<_> = metadata.iter().collect(); + entries.sort_by_key(|(k, _)| *k); + + for (k, v) in entries { + map_builder.keys().append_value(k); 
+ map_builder.values().append_value(v); + } + map_builder.append(true)?; + + let metadata_array = Arc::new(map_builder.finish()) as Arc<dyn Array>; + + // Build the struct + let DataType::Struct(fields) = Self::return_struct_type() else { + unreachable!() + }; + + let struct_array = StructArray::new( + fields, + vec![name_array, data_type_array, nullable_array, metadata_array], + None, + ); + + Ok(ColumnarValue::Scalar(ScalarValue::try_from_array( + &struct_array, + 0, + )?)) + } + + fn documentation(&self) -> Option<&Documentation> { + self.doc() + } +} diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index e8737612a1dcf..fc24dc5a24511 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF; use std::sync::Arc; pub mod arrow_cast; +pub mod arrow_field; pub mod arrow_metadata; pub mod arrow_try_cast; pub mod arrowtypeof; @@ -59,6 +60,7 @@ make_udf_function!(union_extract::UnionExtractFun, union_extract); make_udf_function!(union_tag::UnionTagFunc, union_tag); make_udf_function!(version::VersionFunc, version); make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata); +make_udf_function!(arrow_field::ArrowFieldFunc, arrow_field); pub mod expr_fn { use datafusion_expr::{Expr, Literal}; @@ -91,6 +93,10 @@ pub mod expr_fn { arrow_typeof, "Returns the Arrow type of the input expression.", arg1 + ),( + arrow_field, + "Returns the Arrow field info (name, data_type, nullable, metadata) of the input expression.", + arg1 ),( arrow_metadata, "Returns the metadata of the input expression", @@ -147,6 +153,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> { nullif(), arrow_cast(), arrow_try_cast(), + arrow_field(), arrow_metadata(), nvl(), nvl2(), diff --git a/datafusion/sqllogictest/test_files/arrow_field.slt b/datafusion/sqllogictest/test_files/arrow_field.slt new file mode 100644 index 0000000000000..03254b12e5d11 --- /dev/null +++ 
b/datafusion/sqllogictest/test_files/arrow_field.slt @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# arrow_field on integer literal +query ? +SELECT arrow_field(1) +---- +{name: lit, data_type: Int64, nullable: false, metadata: {}} + +# arrow_field on null literal +query ? +SELECT arrow_field(null) +---- +{name: lit, data_type: Null, nullable: true, metadata: {}} + +# arrow_field on boolean literal +query ? +SELECT arrow_field(true) +---- +{name: lit, data_type: Boolean, nullable: false, metadata: {}} + +# arrow_field on string literal +query ? +SELECT arrow_field('foo') +---- +{name: lit, data_type: Utf8, nullable: false, metadata: {}} + +# arrow_field on float literal +query ? +SELECT arrow_field(1.0) +---- +{name: lit, data_type: Float64, nullable: false, metadata: {}} + +# arrow_field on list +query ? 
+SELECT arrow_field(ARRAY[1,2,3]) +---- +{name: lit, data_type: List(Int64), nullable: false, metadata: {}} + +# arrow_field struct field access - data_type +query T +SELECT arrow_field(1)['data_type'] +---- +Int64 + +# arrow_field struct field access - nullable +query B +SELECT arrow_field(1)['nullable'] +---- +false + +# arrow_field struct field access - name +query T +SELECT arrow_field(1)['name'] +---- +lit + +# arrow_field with table columns +statement ok +CREATE TABLE arrow_field_test(x INT NOT NULL, y TEXT) AS VALUES (1, 'a'); + +query ? +SELECT arrow_field(x) FROM arrow_field_test +---- +{name: x, data_type: Int32, nullable: false, metadata: {}} + +query ? +SELECT arrow_field(y) FROM arrow_field_test +---- +{name: y, data_type: Utf8View, nullable: true, metadata: {}} + +# arrow_field column access - name reflects column name +query T +SELECT arrow_field(x)['name'] FROM arrow_field_test +---- +x + +# arrow_field column access - nullability +query B +SELECT arrow_field(x)['nullable'] FROM arrow_field_test +---- +false + +query B +SELECT arrow_field(y)['nullable'] FROM arrow_field_test +---- +true + +statement ok +DROP TABLE arrow_field_test; From c4e9831f9a0423d0fe253ec3d8ffc617dec75594 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 6 Apr 2026 10:07:38 -0500 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Martin Grigorov --- datafusion/functions/src/core/arrow_field.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/core/arrow_field.rs b/datafusion/functions/src/core/arrow_field.rs index 7159f6e1bbb67..ad25e6216367d 100644 --- a/datafusion/functions/src/core/arrow_field.rs +++ b/datafusion/functions/src/core/arrow_field.rs @@ -36,7 +36,7 @@ use std::sync::Arc; +----------------------------------------------+ | arrow_field(Int64(1)) | +----------------------------------------------+ -| {name: Int64(1), data_type: Int64, 
...} | +| {name: lit, data_type: Int64, ...} | +----------------------------------------------+ > select arrow_field(1)['data_type']; @@ -51,7 +51,7 @@ use std::sync::Arc; description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators." ) )] -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct ArrowFieldFunc { signature: Signature, } @@ -107,15 +107,14 @@ impl ScalarUDFImpl for ArrowFieldFunc { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { - let [_arg] = take_function_args(self.name(), args.args)?; - let field = &args.arg_fields[0]; + let [field] = take_function_args(self.name(), args.arg_fields)?; // Build the name array let name_array = Arc::new(StringArray::from(vec![field.name().as_str()])) as Arc<dyn Array>; // Build the data_type array - let data_type_str = format!("{}", field.data_type()); + let data_type_str = field.data_type().to_string(); let data_type_array = Arc::new(StringArray::from(vec![data_type_str.as_str()])) as Arc<dyn Array>; @@ -140,7 +139,7 @@ impl ScalarUDFImpl for ArrowFieldFunc { let metadata_array = Arc::new(map_builder.finish()) as Arc<dyn Array>; // Build the struct - let DataType::Struct(fields) = Self::return_struct_type() else { + let &DataType::Struct(fields) = args.return_type() else { unreachable!() }; From 79148eb3044cd8ce29dad9058332b7738b083590 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 7 Apr 2026 07:40:50 -0500 Subject: [PATCH 3/4] Fix arrow_field CI and expand tests --- datafusion/functions/src/core/arrow_field.rs | 3 ++- .../sqllogictest/test_files/arrow_field.slt | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/core/arrow_field.rs b/datafusion/functions/src/core/arrow_field.rs index ad25e6216367d..f8efb611b1614 100644 --- a/datafusion/functions/src/core/arrow_field.rs +++ 
b/datafusion/functions/src/core/arrow_field.rs @@ -107,6 +107,7 @@ impl ScalarUDFImpl for ArrowFieldFunc { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + let return_type = args.return_type().clone(); let [field] = take_function_args(self.name(), args.arg_fields)?; // Build the name array @@ -139,7 +140,7 @@ impl ScalarUDFImpl for ArrowFieldFunc { let metadata_array = Arc::new(map_builder.finish()) as Arc<dyn Array>; // Build the struct - let &DataType::Struct(fields) = args.return_type() else { + let DataType::Struct(fields) = return_type else { unreachable!() }; diff --git a/datafusion/sqllogictest/test_files/arrow_field.slt b/datafusion/sqllogictest/test_files/arrow_field.slt index 03254b12e5d11..a2f005e14aaa4 100644 --- a/datafusion/sqllogictest/test_files/arrow_field.slt +++ b/datafusion/sqllogictest/test_files/arrow_field.slt @@ -51,6 +51,24 @@ SELECT arrow_field(ARRAY[1,2,3]) ---- {name: lit, data_type: List(Int64), nullable: false, metadata: {}} +# arrow_field on map +query ? +SELECT arrow_field(MAP {'a': 1, 'b': 2}) +---- +{name: lit, data_type: Map("entries": non-null Struct("key": non-null Utf8, "value": Int64), unsorted), nullable: false, metadata: {}} + +# arrow_field on struct +query ? +SELECT arrow_field({a: 1, b: 'foo'}) +---- +{name: lit, data_type: Struct("a": Int64, "b": Utf8), nullable: false, metadata: {}} + +# arrow_field on dictionary +query ? 
+SELECT arrow_field(arrow_cast('foo', 'Dictionary(Int32, Utf8)')) +---- +{name: lit, data_type: Dictionary(Int32, Utf8), nullable: false, metadata: {}} + # arrow_field struct field access - data_type query T SELECT arrow_field(1)['data_type'] From 4afd60a9a581b1a0b10a99398133892301a31246 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 7 Apr 2026 08:55:54 -0500 Subject: [PATCH 4/4] lint --- .../source/user-guide/sql/scalar_functions.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index c303b43fc8844..4192345439532 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -5282,6 +5282,7 @@ union_tag(union_expression) ## Other Functions - [arrow_cast](#arrow_cast) +- [arrow_field](#arrow_field) - [arrow_metadata](#arrow_metadata) - [arrow_try_cast](#arrow_try_cast) - [arrow_typeof](#arrow_typeof) @@ -5326,6 +5327,36 @@ arrow_cast(expression, datatype) +---------------------------+---------------------+ ``` +### `arrow_field` + +Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata. + +```sql +arrow_field(expression) +``` + +#### Arguments + +- **expression**: Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators. 
+ +#### Example + +```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: lit, data_type: Int64, ...} | ++----------------------------------------------+ + +> select arrow_field(1)['data_type']; ++-----------------------------------+ +| arrow_field(Int64(1))[data_type] | ++-----------------------------------+ +| Int64 | ++-----------------------------------+ +``` + ### `arrow_metadata` Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.