From adf502d70b4e36f7e863ab5f3b4450882776b3b9 Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Tue, 23 Jul 2024 22:17:30 +0530 Subject: [PATCH 01/10] doc: why nullable of list item is set to true --- datafusion/functions-aggregate/src/array_agg.rs | 11 +++++++++++ .../functions-aggregate/src/bit_and_or_xor.rs | 9 +++++++++ datafusion/functions-aggregate/src/count.rs | 9 +++++++++ datafusion/functions-aggregate/src/nth_value.rs | 13 +++++++++---- datafusion/functions-aggregate/src/sum.rs | 9 +++++++++ 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 777a242aa27e..6905d475ff6e 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -89,6 +89,17 @@ impl AggregateUDFImpl for ArrayAgg { } fn state_fields(&self, args: StateFieldsArgs) -> Result> { + // The data type of the "item" in the list is equivalent to the + // data type of the first argument. + // + // However, `nullable` is set to `true` regardless of how it is + // set in the schema of the first argument. This ensures that + // the aggregate computation works even when null values are + // present in the list. + // + // This eliminates the need for special treatment of nulls. + // + // (applies to both "distinct_array_agg" and "array_agg") if args.is_distinct { return Ok(vec![Field::new_list( format_state_name(args.name, "distinct_array_agg"), diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index 6c2d6cb5285c..d3de2866fcac 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -198,6 +198,15 @@ impl AggregateUDFImpl for BitwiseOperation { fn state_fields(&self, args: StateFieldsArgs) -> Result> { if self.operation == BitwiseOperationType::Xor && args.is_distinct { + // The data type of the "item" in the list is equivalent to the + // data type of the returned value. + // + // However, `nullable` is set to `true` regardless of how it is + // set in the schema of the returned value. This ensures that + // the aggregate computation works even when null values are + // present in the list. + // + // This eliminates the need for special treatment of nulls. Ok(vec![Field::new_list( format_state_name( args.name, diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index 0ead22e90a16..e6de2b3b2294 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -123,6 +123,15 @@ impl AggregateUDFImpl for Count { fn state_fields(&self, args: StateFieldsArgs) -> Result> { if args.is_distinct { + // The data type of the "item" in the list is equivalent to the + // data type of the first argument. + // + // However, `nullable` is set to `true` regardless of how it is + // set in the schema of the first argument. This ensures that + // the aggregate computation works even when null values are + // present in the list. + // + // This eliminates the need for special treatment of nulls. Ok(vec![Field::new_list( format_state_name(args.name, "count distinct"), Field::new("item", args.input_type.clone(), true), diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index 9bbd68c9bdf6..eb62aa3513e5 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -132,12 +132,17 @@ impl AggregateUDFImpl for NthValueAgg { } fn state_fields(&self, args: StateFieldsArgs) -> Result> { + // The data type of the "item" in the list is equivalent to the + // data type of the first argument. + // + // However, `nullable` is set to `true` regardless of how it is + // set in the schema of the first argument. This ensures that + // the aggregate computation works even when null values are + // present in the list. + // + // This eliminates the need for special treatment of nulls. let mut fields = vec![Field::new_list( format_state_name(self.name(), "nth_value"), - // TODO: The nullability of the list element should be configurable. - // The hard-coded `true` should be changed once the field for - // nullability is added to `StateFieldArgs` struct. - // See: https://github.com/apache/datafusion/pull/11063 Field::new("item", args.input_type.clone(), true), false, )]; diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index a9f31dc05be9..b5488dbb3302 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -171,6 +171,15 @@ impl AggregateUDFImpl for Sum { } fn state_fields(&self, args: StateFieldsArgs) -> Result> { + // The data type of the "item" in the list is equivalent to the + // data type of the returned value. + // + // However, `nullable` is set to `true` regardless of how it is + // set in the schema of the returned value. This ensures that + // the aggregate computation works even when null values are + // present in the list. + // + // This eliminates the need for special treatment of nulls. if args.is_distinct { Ok(vec![Field::new_list( format_state_name(args.name, "sum distinct"), From e2c1bf1512b783d207286e8047cc32de17fa8dee Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Wed, 24 Jul 2024 18:19:15 +0530 Subject: [PATCH 02/10] Adds an external doc to avoid repeating text --- datafusion/functions-aggregate/COMMENTS.md | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 datafusion/functions-aggregate/COMMENTS.md diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md new file mode 100644 index 000000000000..3cdc36f672fc --- /dev/null +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -0,0 +1,64 @@ +# Nullable Setting For List Items + +When the accumulator state is a list of items and the type of the item +is either equivalent to the first-argument/returned-value, then +we set `nullable` argument of list item as `true` regardless of the +how `nullable` is defined in the schema of either the first argument or +the returned value. + +For example, in `NthValueAgg` we do this: +```rust +fn state_fields(&self, args: StateFieldsArgs) -> Result> { + let mut fields = vec![Field::new_list( + format_state_name(self.name(), "nth_value"), + Field::new("item", args.input_type.clone(), true), // always true + false, + )]; + // ... rest of the function code +} +``` + +By setting `nullable` to be always `true` like this we ensure that the +aggregate computation works even when nulls are present. + +The advantage of doing it this way is that it eliminates the need for +additional code and special treatment of nulls in the accumulator state. + +# Nullable Setting For List + +This depends on the aggregate implementation. + +In `ArrayAgg` the list is nullable, so it is set to `true`. +```rust +fn state_fields(&self, args: StateFieldsArgs) -> Result> { + if args.is_distinct { + return Ok(vec![Field::new_list( + format_state_name(args.name, "distinct_array_agg"), + Field::new("item", args.input_type.clone(), true), + true, // list maybe null + )]); + } + + let mut fields = vec![Field::new_list( + format_state_name(args.name, "array_agg"), + Field::new("item", args.input_type.clone(), true), + true, // list maybe null + )]; + // ... rest of the function code +} +``` + +Alternatively in `Sum` the list is not nullable, so it is set to `false`. +```rust +fn state_fields(&self, args: StateFieldsArgs) -> Result> { + if args.is_distinct { + Ok(vec![Field::new_list( + format_state_name(args.name, "sum distinct"), + Field::new("item", args.return_type.clone(), true), + false, // impossible for list to be null + )]) + } else { + // .. rest of the function code + } +} +``` \ No newline at end of file From d46f01c105afc044cb7a2ef3cffdc111d0e0898c Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Wed, 24 Jul 2024 22:43:15 +0530 Subject: [PATCH 03/10] rewrite --- datafusion/functions-aggregate/COMMENTS.md | 103 +++++++++------------ 1 file changed, 46 insertions(+), 57 deletions(-) diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md index 3cdc36f672fc..e53db860df66 100644 --- a/datafusion/functions-aggregate/COMMENTS.md +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -1,64 +1,53 @@ -# Nullable Setting For List Items - -When the accumulator state is a list of items and the type of the item -is either equivalent to the first-argument/returned-value, then -we set `nullable` argument of list item as `true` regardless of the -how `nullable` is defined in the schema of either the first argument or -the returned value. - -For example, in `NthValueAgg` we do this: +# Why Is List Item Always Nullable? + +## Motivation +There were independent proposals to make the `nullable` setting of list +items in accumulator state be configurable. This meant adding additional +fields which captured the `nullable` setting from schema in planning for +the first argument to the aggregation function, and the returned value. + +These fields were to be added to `StateFieldArgs`. But then we found out +that aggregate computation does not depend on it, and it can be avoided. + +This document exists to make that reasoning explicit. + +## Background +The list data type is used in the accumulator state for a few aggregate +functions like: +- `sum` +- `count` +- `array_agg` +- `bit_and`, `bit_or` and `bit_xor` +- `nth_value` + +In all of the above cases the data type of the list item is equivalent +to either the first argument of the aggregate function or the returned +value. + +For example, in `array_agg` the data type of item is equivalent to the +first argument and the definition looks like this: ```rust -fn state_fields(&self, args: StateFieldsArgs) -> Result> { - let mut fields = vec![Field::new_list( - format_state_name(self.name(), "nth_value"), - Field::new("item", args.input_type.clone(), true), // always true - false, - )]; - // ... rest of the function code -} +// `args` : `StateFieldArgs` +// `input_type` : data type of the first argument +let mut fields = vec![Field::new_list( + format_state_name(self.name(), "array_agg"), + Field::new("item", args.input_type.clone(), true /* nullable of list item */ ), + false, // nullable of list itself +)]; ``` -By setting `nullable` to be always `true` like this we ensure that the -aggregate computation works even when nulls are present. - -The advantage of doing it this way is that it eliminates the need for -additional code and special treatment of nulls in the accumulator state. - -# Nullable Setting For List +For all the aggregates listed above, the list item is always defined as +nullable. -This depends on the aggregate implementation. +## Computing Intermediate State -In `ArrayAgg` the list is nullable, so it is set to `true`. -```rust -fn state_fields(&self, args: StateFieldsArgs) -> Result> { - if args.is_distinct { - return Ok(vec![Field::new_list( - format_state_name(args.name, "distinct_array_agg"), - Field::new("item", args.input_type.clone(), true), - true, // list maybe null - )]); - } +By setting `nullable` to be always `true` like this we ensure that the +aggregate computation works even when nulls are present. The advantage +of doing it this way is that it eliminates the need for additional code +and special treatment of nulls in the accumulator state. - let mut fields = vec![Field::new_list( - format_state_name(args.name, "array_agg"), - Field::new("item", args.input_type.clone(), true), - true, // list maybe null - )]; - // ... rest of the function code -} -``` +## Nullable Of List Itself -Alternatively in `Sum` the list is not nullable, so it is set to `false`. -```rust -fn state_fields(&self, args: StateFieldsArgs) -> Result> { - if args.is_distinct { - Ok(vec![Field::new_list( - format_state_name(args.name, "sum distinct"), - Field::new("item", args.return_type.clone(), true), - false, // impossible for list to be null - )]) - } else { - // .. rest of the function code - } -} -``` \ No newline at end of file +The `nullable` of list itself is dependent on the aggregate. In the +case of `array_agg` the list is nullable(`true`), meanwhile for `sum` +the list is not nullable(`false`). \ No newline at end of file From c339eb041736821da229d89851bafc60a9042afb Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Wed, 24 Jul 2024 22:47:44 +0530 Subject: [PATCH 04/10] redirects to external doc --- datafusion/functions-aggregate/src/array_agg.rs | 13 ++----------- .../functions-aggregate/src/bit_and_or_xor.rs | 10 +--------- datafusion/functions-aggregate/src/count.rs | 10 +--------- datafusion/functions-aggregate/src/nth_value.rs | 10 +--------- datafusion/functions-aggregate/src/sum.rs | 10 +--------- 5 files changed, 6 insertions(+), 47 deletions(-) diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 6905d475ff6e..b31195829256 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -89,20 +89,10 @@ impl AggregateUDFImpl for ArrayAgg { } fn state_fields(&self, args: StateFieldsArgs) -> Result> { - // The data type of the "item" in the list is equivalent to the - // data type of the first argument. - // - // However, `nullable` is set to `true` regardless of how it is - // set in the schema of the first argument. This ensures that - // the aggregate computation works even when null values are - // present in the list. - // - // This eliminates the need for special treatment of nulls. - // - // (applies to both "distinct_array_agg" and "array_agg") if args.is_distinct { return Ok(vec![Field::new_list( format_state_name(args.name, "distinct_array_agg"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), true, )]); @@ -110,6 +100,7 @@ impl AggregateUDFImpl for ArrayAgg { let mut fields = vec![Field::new_list( format_state_name(args.name, "array_agg"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), true, )]; diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index d3de2866fcac..f6dd0bc20a83 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -198,20 +198,12 @@ impl AggregateUDFImpl for BitwiseOperation { fn state_fields(&self, args: StateFieldsArgs) -> Result> { if self.operation == BitwiseOperationType::Xor && args.is_distinct { - // The data type of the "item" in the list is equivalent to the - // data type of the returned value. - // - // However, `nullable` is set to `true` regardless of how it is - // set in the schema of the returned value. This ensures that - // the aggregate computation works even when null values are - // present in the list. - // - // This eliminates the need for special treatment of nulls. Ok(vec![Field::new_list( format_state_name( args.name, format!("{} distinct", self.name()).as_str(), ), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.return_type.clone(), true), false, )]) diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index e6de2b3b2294..56850d0e02a1 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -123,17 +123,9 @@ impl AggregateUDFImpl for Count { fn state_fields(&self, args: StateFieldsArgs) -> Result> { if args.is_distinct { - // The data type of the "item" in the list is equivalent to the - // data type of the first argument. - // - // However, `nullable` is set to `true` regardless of how it is - // set in the schema of the first argument. This ensures that - // the aggregate computation works even when null values are - // present in the list. - // - // This eliminates the need for special treatment of nulls. Ok(vec![Field::new_list( format_state_name(args.name, "count distinct"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), false, )]) diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index eb62aa3513e5..0c1f1f292822 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -132,17 +132,9 @@ impl AggregateUDFImpl for NthValueAgg { } fn state_fields(&self, args: StateFieldsArgs) -> Result> { - // The data type of the "item" in the list is equivalent to the - // data type of the first argument. - // - // However, `nullable` is set to `true` regardless of how it is - // set in the schema of the first argument. This ensures that - // the aggregate computation works even when null values are - // present in the list. - // - // This eliminates the need for special treatment of nulls. let mut fields = vec![Field::new_list( format_state_name(self.name(), "nth_value"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.input_type.clone(), true), false, )]; diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index b5488dbb3302..08e3908a5829 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -171,18 +171,10 @@ impl AggregateUDFImpl for Sum { } fn state_fields(&self, args: StateFieldsArgs) -> Result> { - // The data type of the "item" in the list is equivalent to the - // data type of the returned value. - // - // However, `nullable` is set to `true` regardless of how it is - // set in the schema of the returned value. This ensures that - // the aggregate computation works even when null values are - // present in the list. - // - // This eliminates the need for special treatment of nulls. if args.is_distinct { Ok(vec![Field::new_list( format_state_name(args.name, "sum distinct"), + // See COMMENTS.md to understand why nullable is set to true Field::new("item", args.return_type.clone(), true), false, )]) From 02dc91fdc59cd4a159c0fb1cd1d69a790e05dfb7 Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Wed, 24 Jul 2024 22:52:51 +0530 Subject: [PATCH 05/10] Adds ASF license --- datafusion/functions-aggregate/COMMENTS.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md index e53db860df66..3851bade7787 100644 --- a/datafusion/functions-aggregate/COMMENTS.md +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -1,3 +1,22 @@ + + # Why Is List Item Always Nullable? ## Motivation From f9c1b1ea36101200e97a02e5933dc2696edc088f Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Wed, 24 Jul 2024 22:56:16 +0530 Subject: [PATCH 06/10] Minor: formatting fixes --- datafusion/functions-aggregate/COMMENTS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md index 3851bade7787..31bb0a4eb205 100644 --- a/datafusion/functions-aggregate/COMMENTS.md +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -20,6 +20,7 @@ # Why Is List Item Always Nullable? ## Motivation + There were independent proposals to make the `nullable` setting of list items in accumulator state be configurable. This meant adding additional fields which captured the `nullable` setting from schema in planning for @@ -31,8 +32,10 @@ that aggregate computation does not depend on it, and it can be avoided. This document exists to make that reasoning explicit. ## Background + The list data type is used in the accumulator state for a few aggregate functions like: + - `sum` - `count` - `array_agg` @@ -45,6 +48,7 @@ value. For example, in `array_agg` the data type of item is equivalent to the first argument and the definition looks like this: + ```rust // `args` : `StateFieldArgs` // `input_type` : data type of the first argument @@ -69,4 +73,4 @@ and special treatment of nulls in the accumulator state. The `nullable` of list itself is dependent on the aggregate. In the case of `array_agg` the list is nullable(`true`), meanwhile for `sum` -the list is not nullable(`false`). \ No newline at end of file +the list is not nullable(`false`). From 23b3b63d41ed2968d2f9dbdc651631e780391036 Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Wed, 24 Jul 2024 23:04:41 +0530 Subject: [PATCH 07/10] Minor: copy edits --- datafusion/functions-aggregate/COMMENTS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md index 31bb0a4eb205..eb4c18ec9de4 100644 --- a/datafusion/functions-aggregate/COMMENTS.md +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -71,6 +71,6 @@ and special treatment of nulls in the accumulator state. ## Nullable Of List Itself -The `nullable` of list itself is dependent on the aggregate. In the -case of `array_agg` the list is nullable(`true`), meanwhile for `sum` -the list is not nullable(`false`). +The `nullable` of list itself depends on the aggregate. In the case of +`array_agg` the list is nullable(`true`), meanwhile for `sum` the list +is not nullable(`false`). From 039ea6c6c47e65b1f5db51b565f3501de77e1431 Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Thu, 25 Jul 2024 23:37:04 +0530 Subject: [PATCH 08/10] Retrigger CI From f5181173b0a2baee6231f45794d4c299d1e3b7c3 Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Fri, 26 Jul 2024 02:15:35 +0530 Subject: [PATCH 09/10] Fixes: name of aggregation in example In `array_agg` the list is nullable, so changed the example to `nth_value` where the list is not nullable to be correct. --- datafusion/functions-aggregate/COMMENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md index eb4c18ec9de4..3bc14b4f19c3 100644 --- a/datafusion/functions-aggregate/COMMENTS.md +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -53,7 +53,7 @@ first argument and the definition looks like this: // `args` : `StateFieldArgs` // `input_type` : data type of the first argument let mut fields = vec![Field::new_list( - format_state_name(self.name(), "array_agg"), + format_state_name(self.name(), "nth_value"), Field::new("item", args.input_type.clone(), true /* nullable of list item */ ), false, // nullable of list itself )]; From 1601390b5b7093ecf2e7205db24ed1cc44d66045 Mon Sep 17 00:00:00 2001 From: Jacob Sherin Date: Fri, 26 Jul 2024 02:19:09 +0530 Subject: [PATCH 10/10] Disambiguates list item nullability in copy --- datafusion/functions-aggregate/COMMENTS.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-aggregate/COMMENTS.md b/datafusion/functions-aggregate/COMMENTS.md index 3bc14b4f19c3..23a996faf007 100644 --- a/datafusion/functions-aggregate/COMMENTS.md +++ b/datafusion/functions-aggregate/COMMENTS.md @@ -64,10 +64,11 @@ nullable. ## Computing Intermediate State -By setting `nullable` to be always `true` like this we ensure that the -aggregate computation works even when nulls are present. The advantage -of doing it this way is that it eliminates the need for additional code -and special treatment of nulls in the accumulator state. +By setting `nullable` (of list item) to be always `true` like this we +ensure that the aggregate computation works even when nulls are +present. The advantage of doing it this way is that it eliminates the +need for additional code and special treatment of nulls in the +accumulator state. ## Nullable Of List Itself