Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions datafusion/functions/src/core/arrow_field.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{
Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray,
};
use arrow::datatypes::{DataType, Field, Fields};
use datafusion_common::{Result, ScalarValue, utils::take_function_args};
use datafusion_expr::{
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
Volatility,
};
use datafusion_macros::user_doc;
use std::sync::Arc;

/// Scalar UDF implementation registered under the SQL name `arrow_field`.
///
/// The function reports the Arrow field information of its single argument
/// (name, data type, nullability, and metadata) as a struct value. The
/// user-facing documentation is declared in the `#[user_doc]` attribute below.
#[user_doc(
doc_section(label = "Other Functions"),
description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.",
syntax_example = "arrow_field(expression)",
sql_example = r#"```sql
> select arrow_field(1);
+----------------------------------------------+
| arrow_field(Int64(1)) |
+----------------------------------------------+
| {name: lit, data_type: Int64, ...} |
+----------------------------------------------+

> select arrow_field(1)['data_type'];
+-----------------------------------+
| arrow_field(Int64(1))[data_type] |
+-----------------------------------+
| Int64 |
+-----------------------------------+
```"#,
argument(
name = "expression",
description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators."
)
)]
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct ArrowFieldFunc {
/// Signature handed out by `ScalarUDFImpl::signature`; set once in `new()`.
signature: Signature,
}

impl Default for ArrowFieldFunc {
fn default() -> Self {
Self::new()
}
}

impl ArrowFieldFunc {
    /// Creates the function with a signature built by
    /// `Signature::any(1, Volatility::Immutable)`.
    pub fn new() -> Self {
        let signature = Signature::any(1, Volatility::Immutable);
        Self { signature }
    }

    /// Builds the struct type returned by `arrow_field`:
    ///
    /// * `name`: non-null `Utf8`
    /// * `data_type`: non-null `Utf8`
    /// * `nullable`: non-null `Boolean`
    /// * `metadata`: non-null, unsorted `Map` whose `entries` struct has a
    ///   non-null `Utf8` `keys` field and a nullable `Utf8` `values` field
    fn return_struct_type() -> DataType {
        // Key/value pair layout of a single map entry.
        let entry_members = vec![
            Field::new("keys", DataType::Utf8, false),
            Field::new("values", DataType::Utf8, true),
        ];
        let entries_field =
            Field::new("entries", DataType::Struct(Fields::from(entry_members)), false);

        // `false` => the map keys are not declared as sorted.
        let metadata_type = DataType::Map(Arc::new(entries_field), false);

        let members = vec![
            Field::new("name", DataType::Utf8, false),
            Field::new("data_type", DataType::Utf8, false),
            Field::new("nullable", DataType::Boolean, false),
            Field::new("metadata", metadata_type, false),
        ];
        DataType::Struct(Fields::from(members))
    }
}

impl ScalarUDFImpl for ArrowFieldFunc {
    /// SQL-visible name of the function.
    fn name(&self) -> &str {
        "arrow_field"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// The output type does not depend on the argument types: it is always
    /// the struct described by `return_struct_type`.
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(Self::return_struct_type())
    }

    /// Describes the argument's field (not its values) as a single scalar
    /// struct with the members `name`, `data_type`, `nullable` and `metadata`.
    ///
    /// Returns an error if the argument count is not exactly one, or if
    /// building the metadata map or extracting the scalar fails.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        // Clone the output type before `arg_fields` is moved out of `args`.
        let output_type = args.return_type().clone();
        let [input] = take_function_args(self.name(), args.arg_fields)?;

        // One-row column holding the field name.
        let name_col: Arc<dyn Array> =
            Arc::new(StringArray::from(vec![input.name().as_str()]));

        // One-row column holding the textual form of the data type.
        let type_text = input.data_type().to_string();
        let type_col: Arc<dyn Array> =
            Arc::new(StringArray::from(vec![type_text.as_str()]));

        // One-row column holding the nullability flag.
        let nullable_col: Arc<dyn Array> =
            Arc::new(BooleanArray::from(vec![input.is_nullable()]));

        // One-row map column holding the metadata (same pattern as
        // arrow_metadata.rs). Entries are ordered by key so the output is
        // deterministic.
        let mut sorted_metadata = input.metadata().iter().collect::<Vec<_>>();
        sorted_metadata.sort_by(|lhs, rhs| lhs.0.cmp(rhs.0));

        let mut metadata_builder =
            MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
        for (key, value) in sorted_metadata {
            metadata_builder.keys().append_value(key);
            metadata_builder.values().append_value(value);
        }
        // Close the single (always valid) map row, even when it is empty.
        metadata_builder.append(true)?;
        let metadata_col: Arc<dyn Array> = Arc::new(metadata_builder.finish());

        // `return_type` always yields a struct, so any other variant here
        // would be a broken invariant.
        let DataType::Struct(struct_fields) = output_type else {
            unreachable!()
        };

        let columns = vec![name_col, type_col, nullable_col, metadata_col];
        let info = StructArray::new(struct_fields, columns, None);

        // Row 0 of the one-row struct array becomes the scalar result.
        let scalar = ScalarValue::try_from_array(&info, 0)?;
        Ok(ColumnarValue::Scalar(scalar))
    }

    /// Returns the documentation via `self.doc()` (presumably generated by the
    /// `#[user_doc]` attribute on the struct — confirm against the macro).
    fn documentation(&self) -> Option<&Documentation> {
        self.doc()
    }
}
7 changes: 7 additions & 0 deletions datafusion/functions/src/core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF;
use std::sync::Arc;

pub mod arrow_cast;
pub mod arrow_field;
pub mod arrow_metadata;
pub mod arrow_try_cast;
pub mod arrowtypeof;
Expand Down Expand Up @@ -59,6 +60,7 @@ make_udf_function!(union_extract::UnionExtractFun, union_extract);
make_udf_function!(union_tag::UnionTagFunc, union_tag);
make_udf_function!(version::VersionFunc, version);
make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
make_udf_function!(arrow_field::ArrowFieldFunc, arrow_field);

pub mod expr_fn {
use datafusion_expr::{Expr, Literal};
Expand Down Expand Up @@ -91,6 +93,10 @@ pub mod expr_fn {
arrow_typeof,
"Returns the Arrow type of the input expression.",
arg1
),(
arrow_field,
"Returns the Arrow field info (name, data_type, nullable, metadata) of the input expression.",
arg1
),(
arrow_metadata,
"Returns the metadata of the input expression",
Expand Down Expand Up @@ -147,6 +153,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
nullif(),
arrow_cast(),
arrow_try_cast(),
arrow_field(),
arrow_metadata(),
nvl(),
nvl2(),
Expand Down
122 changes: 122 additions & 0 deletions datafusion/sqllogictest/test_files/arrow_field.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# arrow_field on integer literal
query ?
SELECT arrow_field(1)
----
{name: lit, data_type: Int64, nullable: false, metadata: {}}

# arrow_field on null literal
query ?
SELECT arrow_field(null)
----
{name: lit, data_type: Null, nullable: true, metadata: {}}

# arrow_field on boolean literal
query ?
SELECT arrow_field(true)
----
{name: lit, data_type: Boolean, nullable: false, metadata: {}}

# arrow_field on string literal
query ?
SELECT arrow_field('foo')
----
{name: lit, data_type: Utf8, nullable: false, metadata: {}}

# arrow_field on float literal
query ?
SELECT arrow_field(1.0)
----
{name: lit, data_type: Float64, nullable: false, metadata: {}}

# arrow_field on list
query ?
SELECT arrow_field(ARRAY[1,2,3])
----
{name: lit, data_type: List(Int64), nullable: false, metadata: {}}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to have tests for more complex types like Map and Struct too! Maybe a Dictionary too.


# arrow_field on map
query ?
SELECT arrow_field(MAP {'a': 1, 'b': 2})
----
{name: lit, data_type: Map("entries": non-null Struct("key": non-null Utf8, "value": Int64), unsorted), nullable: false, metadata: {}}

# arrow_field on struct
query ?
SELECT arrow_field({a: 1, b: 'foo'})
----
{name: lit, data_type: Struct("a": Int64, "b": Utf8), nullable: false, metadata: {}}

# arrow_field on dictionary
query ?
SELECT arrow_field(arrow_cast('foo', 'Dictionary(Int32, Utf8)'))
----
{name: lit, data_type: Dictionary(Int32, Utf8), nullable: false, metadata: {}}

# arrow_field struct field access - data_type
query T
SELECT arrow_field(1)['data_type']
----
Int64

# arrow_field struct field access - nullable
query B
SELECT arrow_field(1)['nullable']
----
false

# arrow_field struct field access - name
query T
SELECT arrow_field(1)['name']
----
lit

# arrow_field with table columns
statement ok
CREATE TABLE arrow_field_test(x INT NOT NULL, y TEXT) AS VALUES (1, 'a');

query ?
SELECT arrow_field(x) FROM arrow_field_test
----
{name: x, data_type: Int32, nullable: false, metadata: {}}

query ?
SELECT arrow_field(y) FROM arrow_field_test
----
{name: y, data_type: Utf8View, nullable: true, metadata: {}}

# arrow_field column access - name reflects column name
query T
SELECT arrow_field(x)['name'] FROM arrow_field_test
----
x

# arrow_field column access - nullability
query B
SELECT arrow_field(x)['nullable'] FROM arrow_field_test
----
false

query B
SELECT arrow_field(y)['nullable'] FROM arrow_field_test
----
true

statement ok
DROP TABLE arrow_field_test;
31 changes: 31 additions & 0 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -5282,6 +5282,7 @@ union_tag(union_expression)
## Other Functions

- [arrow_cast](#arrow_cast)
- [arrow_field](#arrow_field)
- [arrow_metadata](#arrow_metadata)
- [arrow_try_cast](#arrow_try_cast)
- [arrow_typeof](#arrow_typeof)
Expand Down Expand Up @@ -5326,6 +5327,36 @@ arrow_cast(expression, datatype)
+---------------------------+---------------------+
```

### `arrow_field`

Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.

```sql
arrow_field(expression)
```

#### Arguments

- **expression**: Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators.

#### Example

```sql
> select arrow_field(1);
+----------------------------------------------+
| arrow_field(Int64(1)) |
+----------------------------------------------+
| {name: lit, data_type: Int64, ...} |
+----------------------------------------------+

> select arrow_field(1)['data_type'];
+-----------------------------------+
| arrow_field(Int64(1))[data_type] |
+-----------------------------------+
| Int64 |
+-----------------------------------+
```

### `arrow_metadata`

Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.
Expand Down
Loading