Skip to content

Commit eddade7

Browse files
Omega359 and alamb authored
Migrate documentation for all core functions from scalar_functions.md to code (#12854)
* Migrate documentation for all core functions from scalar_functions.md to code #12801
* Fixed formatting issue, regenerated documentation
* Update docs/source/user-guide/sql/scalar_functions.md

Co-authored-by: Andrew Lamb <[email protected]>

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 3bc7714 commit eddade7

File tree

14 files changed

+768
-267
lines changed

14 files changed

+768
-267
lines changed

datafusion/core/src/bin/print_functions_docs.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ fn print_docs(
108108
.collect::<Vec<_>>();
109109

110110
// write out section header
111-
let _ = writeln!(docs, "## {} ", doc_section.label);
111+
let _ = writeln!(docs, "\n## {} \n", doc_section.label);
112112

113113
if let Some(description) = doc_section.description {
114114
let _ = writeln!(docs, "{description}");

datafusion/expr/src/udf_docs.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ impl DocumentationBuilder {
155155
///
156156
/// ```text
157157
/// <arg_name>:
158-
/// <expression_type> expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
158+
/// <expression_type> expression to operate on. Can be a constant, column, or function, and any combination of operators.
159159
/// ```
160160
pub fn with_standard_argument(
161161
self,

datafusion/functions/src/core/arrow_cast.rs

+38-3
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,19 @@
1717

1818
//! [`ArrowCastFunc`]: Implementation of the `arrow_cast`
1919
20-
use std::any::Any;
21-
2220
use arrow::datatypes::DataType;
2321
use datafusion_common::{
2422
arrow_datafusion_err, internal_err, plan_datafusion_err, plan_err, DataFusionError,
2523
ExprSchema, Result, ScalarValue,
2624
};
25+
use std::any::Any;
26+
use std::sync::OnceLock;
2727

28+
use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER;
2829
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
2930
use datafusion_expr::{
30-
ColumnarValue, Expr, ExprSchemable, ScalarUDFImpl, Signature, Volatility,
31+
ColumnarValue, Documentation, Expr, ExprSchemable, ScalarUDFImpl, Signature,
32+
Volatility,
3133
};
3234

3335
/// Implements casting to arbitrary arrow types (rather than SQL types)
@@ -131,6 +133,39 @@ impl ScalarUDFImpl for ArrowCastFunc {
131133
// return the newly written argument to DataFusion
132134
Ok(ExprSimplifyResult::Simplified(new_expr))
133135
}
136+
137+
fn documentation(&self) -> Option<&Documentation> {
138+
Some(get_arrow_cast_doc())
139+
}
140+
}
141+
142+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
143+
144+
fn get_arrow_cast_doc() -> &'static Documentation {
145+
DOCUMENTATION.get_or_init(|| {
146+
Documentation::builder()
147+
.with_doc_section(DOC_SECTION_OTHER)
148+
.with_description("Casts a value to a specific Arrow data type.")
149+
.with_syntax_example("arrow_cast(expression, datatype)")
150+
.with_sql_example(
151+
r#"```sql
152+
> select arrow_cast(-5, 'Int8') as a,
153+
arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b,
154+
arrow_cast('bar', 'LargeUtf8') as c,
155+
arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d
156+
;
157+
+----+-----+-----+---------------------------+
158+
| a | b | c | d |
159+
+----+-----+-----+---------------------------+
160+
| -5 | foo | bar | 2023-01-02T12:53:02+08:00 |
161+
+----+-----+-----+---------------------------+
162+
```"#,
163+
)
164+
.with_argument("expression", "Expression to cast. The expression can be a constant, column, or function, and any combination of operators.")
165+
.with_argument("datatype", "[Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. The format is the same as that returned by [`arrow_typeof`]")
166+
.build()
167+
.unwrap()
168+
})
134169
}
135170

136171
/// Returns the requested type from the arguments

datafusion/functions/src/core/arrowtypeof.rs

+34-1
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717

1818
use arrow::datatypes::DataType;
1919
use datafusion_common::{exec_err, Result, ScalarValue};
20-
use datafusion_expr::ColumnarValue;
20+
use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER;
21+
use datafusion_expr::{ColumnarValue, Documentation};
2122
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2223
use std::any::Any;
24+
use std::sync::OnceLock;
2325

2426
#[derive(Debug)]
2527
pub struct ArrowTypeOfFunc {
@@ -69,4 +71,35 @@ impl ScalarUDFImpl for ArrowTypeOfFunc {
6971
"{input_data_type}"
7072
))))
7173
}
74+
75+
fn documentation(&self) -> Option<&Documentation> {
76+
Some(get_arrowtypeof_doc())
77+
}
78+
}
79+
80+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
81+
82+
fn get_arrowtypeof_doc() -> &'static Documentation {
83+
DOCUMENTATION.get_or_init(|| {
84+
Documentation::builder()
85+
.with_doc_section(DOC_SECTION_OTHER)
86+
.with_description(
87+
"Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.",
88+
)
89+
.with_syntax_example("arrow_typeof(expression)")
90+
.with_sql_example(
91+
r#"```sql
92+
> select arrow_typeof('foo'), arrow_typeof(1);
93+
+---------------------------+------------------------+
94+
| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) |
95+
+---------------------------+------------------------+
96+
| Utf8 | Int64 |
97+
+---------------------------+------------------------+
98+
```
99+
"#,
100+
)
101+
.with_argument("expression", "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators.")
102+
.build()
103+
.unwrap()
104+
})
72105
}

datafusion/functions/src/core/coalesce.rs

+26-17
Original file line numberDiff line numberDiff line change
@@ -47,23 +47,6 @@ impl CoalesceFunc {
4747
}
4848
}
4949

50-
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
51-
52-
fn get_coalesce_doc() -> &'static Documentation {
53-
DOCUMENTATION.get_or_init(|| {
54-
Documentation::builder()
55-
.with_doc_section(DOC_SECTION_CONDITIONAL)
56-
.with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.")
57-
.with_syntax_example("coalesce(expression1[, ..., expression_n])")
58-
.with_argument(
59-
"expression1, expression_n",
60-
"Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary."
61-
)
62-
.build()
63-
.unwrap()
64-
})
65-
}
66-
6750
impl ScalarUDFImpl for CoalesceFunc {
6851
fn as_any(&self) -> &dyn Any {
6952
self
@@ -164,6 +147,32 @@ impl ScalarUDFImpl for CoalesceFunc {
164147
}
165148
}
166149

150+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
151+
152+
fn get_coalesce_doc() -> &'static Documentation {
153+
DOCUMENTATION.get_or_init(|| {
154+
Documentation::builder()
155+
.with_doc_section(DOC_SECTION_CONDITIONAL)
156+
.with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.")
157+
.with_syntax_example("coalesce(expression1[, ..., expression_n])")
158+
.with_sql_example(r#"```sql
159+
> select coalesce(null, null, 'datafusion');
160+
+----------------------------------------+
161+
| coalesce(NULL,NULL,Utf8("datafusion")) |
162+
+----------------------------------------+
163+
| datafusion |
164+
+----------------------------------------+
165+
```"#,
166+
)
167+
.with_argument(
168+
"expression1, expression_n",
169+
"Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary."
170+
)
171+
.build()
172+
.unwrap()
173+
})
174+
}
175+
167176
#[cfg(test)]
168177
mod test {
169178
use arrow::datatypes::DataType;

datafusion/functions/src/core/getfield.rs

+64-10
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ use datafusion_common::cast::{as_map_array, as_struct_array};
2323
use datafusion_common::{
2424
exec_err, plan_datafusion_err, plan_err, ExprSchema, Result, ScalarValue,
2525
};
26-
use datafusion_expr::{ColumnarValue, Expr, ExprSchemable};
26+
use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER;
27+
use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable};
2728
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2829
use std::any::Any;
29-
use std::sync::Arc;
30+
use std::sync::{Arc, OnceLock};
3031

3132
#[derive(Debug)]
3233
pub struct GetFieldFunc {
@@ -133,7 +134,7 @@ impl ScalarUDFImpl for GetFieldFunc {
133134
DataType::Struct(fields) if fields.len() == 2 => {
134135
// Arrow's MapArray is essentially a ListArray of structs with two columns. They are
135136
// often named "key", and "value", but we don't require any specific naming here;
136-
// instead, we assume that the second columnis the "value" column both here and in
137+
// instead, we assume that the second column is the "value" column both here and in
137138
// execution.
138139
let value_field = fields.get(1).expect("fields should have exactly two members");
139140
Ok(value_field.data_type().clone())
@@ -155,7 +156,7 @@ impl ScalarUDFImpl for GetFieldFunc {
155156
"Only UTF8 strings are valid as an indexed field in a struct"
156157
),
157158
(DataType::Null, _) => Ok(DataType::Null),
158-
(other, _) => plan_err!("The expression to get an indexed field is only valid for `List`, `Struct`, `Map` or `Null` types, got {other}"),
159+
(other, _) => plan_err!("The expression to get an indexed field is only valid for `Struct`, `Map` or `Null` types, got {other}"),
159160
}
160161
}
161162

@@ -190,7 +191,7 @@ impl ScalarUDFImpl for GetFieldFunc {
190191
let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?;
191192

192193
// note that this array has more entries than the expected output/input size
193-
// because maparray is flatten
194+
// because map_array is flattened
194195
let original_data = map_array.entries().column(1).to_data();
195196
let capacity = Capacities::Array(original_data.len());
196197
let mut mutable =
@@ -205,7 +206,7 @@ impl ScalarUDFImpl for GetFieldFunc {
205206
keys.slice(start, end-start).
206207
iter().enumerate().
207208
find(|(_, t)| t.unwrap());
208-
if maybe_matched.is_none(){
209+
if maybe_matched.is_none() {
209210
mutable.extend_nulls(1);
210211
continue
211212
}
@@ -224,14 +225,67 @@ impl ScalarUDFImpl for GetFieldFunc {
224225
}
225226
}
226227
(DataType::Struct(_), name) => exec_err!(
227-
"get indexed field is only possible on struct with utf8 indexes. \
228-
Tried with {name:?} index"
228+
"get_field is only possible on struct with utf8 indexes. \
229+
Received with {name:?} index"
229230
),
230231
(DataType::Null, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)),
231232
(dt, name) => exec_err!(
232-
"get indexed field is only possible on lists with int64 indexes or struct \
233-
with utf8 indexes. Tried {dt:?} with {name:?} index"
233+
"get_field is only possible on maps with utf8 indexes or struct \
234+
with utf8 indexes. Received {dt:?} with {name:?} index"
234235
),
235236
}
236237
}
238+
239+
fn documentation(&self) -> Option<&Documentation> {
240+
Some(get_getfield_doc())
241+
}
242+
}
243+
244+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
245+
246+
fn get_getfield_doc() -> &'static Documentation {
247+
DOCUMENTATION.get_or_init(|| {
248+
Documentation::builder()
249+
.with_doc_section(DOC_SECTION_OTHER)
250+
.with_description(r#"Returns a field within a map or a struct with the given key.
251+
Note: most users invoke `get_field` indirectly via field access
252+
syntax such as `my_struct_col['field_name']` which results in a call to
253+
`get_field(my_struct_col, 'field_name')`."#)
254+
.with_syntax_example("get_field(expression1, expression2)")
255+
.with_sql_example(r#"```sql
256+
> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow');
257+
> select struct(idx, v) from t as c;
258+
+-------------------------+
259+
| struct(c.idx,c.v) |
260+
+-------------------------+
261+
| {c0: data, c1: fusion} |
262+
| {c0: apache, c1: arrow} |
263+
+-------------------------+
264+
> select get_field((select struct(idx, v) from t), 'c0');
265+
+-----------------------+
266+
| struct(t.idx,t.v)[c0] |
267+
+-----------------------+
268+
| data |
269+
| apache |
270+
+-----------------------+
271+
> select get_field((select struct(idx, v) from t), 'c1');
272+
+-----------------------+
273+
| struct(t.idx,t.v)[c1] |
274+
+-----------------------+
275+
| fusion |
276+
| arrow |
277+
+-----------------------+
278+
```
279+
"#)
280+
.with_argument(
281+
"expression1",
282+
"The map or struct to retrieve a field for."
283+
)
284+
.with_argument(
285+
"expression2",
286+
"The field name in the map or struct to retrieve data for. Must evaluate to a string."
287+
)
288+
.build()
289+
.unwrap()
290+
})
237291
}

datafusion/functions/src/core/named_struct.rs

+45-2
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@
1818
use arrow::array::StructArray;
1919
use arrow::datatypes::{DataType, Field, Fields};
2020
use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
21-
use datafusion_expr::{ColumnarValue, Expr, ExprSchemable};
21+
use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRUCT;
22+
use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable};
2223
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2324
use hashbrown::HashSet;
2425
use std::any::Any;
25-
use std::sync::Arc;
26+
use std::sync::{Arc, OnceLock};
2627

2728
/// put values in a struct array.
2829
fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
@@ -161,4 +162,46 @@ impl ScalarUDFImpl for NamedStructFunc {
161162
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
162163
named_struct_expr(args)
163164
}
165+
166+
fn documentation(&self) -> Option<&Documentation> {
167+
Some(get_named_struct_doc())
168+
}
169+
}
170+
171+
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
172+
173+
fn get_named_struct_doc() -> &'static Documentation {
174+
DOCUMENTATION.get_or_init(|| {
175+
Documentation::builder()
176+
.with_doc_section(DOC_SECTION_STRUCT)
177+
.with_description("Returns an Arrow struct using the specified name and input expressions pairs.")
178+
.with_syntax_example("named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input])")
179+
.with_sql_example(r#"
180+
For example, this query converts two columns `a` and `b` to a single column with
181+
a struct type of fields `field_a` and `field_b`:
182+
```sql
183+
> select * from t;
184+
+---+---+
185+
| a | b |
186+
+---+---+
187+
| 1 | 2 |
188+
| 3 | 4 |
189+
+---+---+
190+
> select named_struct('field_a', a, 'field_b', b) from t;
191+
+-------------------------------------------------------+
192+
| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) |
193+
+-------------------------------------------------------+
194+
| {field_a: 1, field_b: 2} |
195+
| {field_a: 3, field_b: 4} |
196+
+-------------------------------------------------------+
197+
```
198+
"#)
199+
.with_argument(
200+
"expression_n_name",
201+
"Name of the column field. Must be a constant string."
202+
)
203+
.with_argument("expression_n_input", "Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators.")
204+
.build()
205+
.unwrap()
206+
})
164207
}

0 commit comments

Comments
 (0)