15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
+ //! [`GroupValues`] trait for storing and interning group keys
19
+
18
20
use arrow:: record_batch:: RecordBatch ;
19
21
use arrow_array:: { downcast_primitive, ArrayRef } ;
20
22
use arrow_schema:: { DataType , SchemaRef } ;
@@ -37,18 +39,61 @@ use datafusion_physical_expr::binary_map::OutputType;
37
39
38
40
mod group_column;
39
41
40
- /// An interning store for group keys
42
+ /// Stores the group values during hash aggregation.
43
+ ///
44
+ /// # Background
45
+ ///
46
+ /// In a query such as `SELECT a, b, count(*) FROM t GROUP BY a, b`, the group values
47
+ /// identify each group, and correspond to all the distinct values of `(a,b)`.
48
+ ///
49
+ /// ```sql
50
+ /// -- Input has 4 rows with 3 distinct combinations of (a,b) ("groups")
51
+ /// create table t(a int, b varchar)
52
+ /// as values (1, 'a'), (2, 'b'), (1, 'a'), (3, 'c');
53
+ ///
54
+ /// select a, b, count(*) from t group by a, b;
55
+ /// ----
56
+ /// 1 a 2
57
+ /// 2 b 1
58
+ /// 3 c 1
59
+ /// ```
60
+ ///
61
+ /// # Design
62
+ ///
63
+ /// Managing group values is a performance critical operation in hash
64
+ /// aggregation. The major operations are:
65
+ ///
66
+ /// 1. Intern: Quickly finding existing and adding new group values
67
+ /// 2. Emit: Returning the group values as an array
68
+ ///
69
+ /// There are multiple specialized implementations of this trait, each optimized
70
+ /// for these operations on particular data types and numbers of columns.
71
+ /// See [`new_group_values`] for details.
72
+ ///
73
+ /// # Group Ids
74
+ ///
75
+ /// Each distinct group in a hash aggregation is identified by a unique group id
76
+ /// (usize) which is assigned by instances of this trait. Group ids are
77
+ /// contiguous (no gaps), starting from 0.
41
78
pub trait GroupValues : Send {
42
- /// Calculates the `groups` for each input row of `cols`
79
+ /// Calculates the group id for each input row of `cols`, assigning new
80
+ /// group ids as necessary.
81
+ ///
82
+ /// When the function returns, `groups` must contain the group id for each
83
+ /// row in `cols`.
84
+ ///
85
+ /// If a row has the same value as a previous row, the same group id is
86
+ /// assigned. If a row has a new value, the next available group id is
87
+ /// assigned.
43
88
fn intern ( & mut self , cols : & [ ArrayRef ] , groups : & mut Vec < usize > ) -> Result < ( ) > ;
44
89
45
- /// Returns the number of bytes used by this [`GroupValues`]
90
+ /// Returns the number of bytes of memory used by this [`GroupValues`]
46
91
fn size ( & self ) -> usize ;
47
92
48
93
/// Returns true if this [`GroupValues`] is empty
49
94
fn is_empty ( & self ) -> bool ;
50
95
51
- /// The number of values stored in this [`GroupValues`]
96
+ /// The number of values (distinct group values) stored in this [`GroupValues`]
52
97
fn len ( & self ) -> usize ;
53
98
54
99
/// Emits the group values
@@ -58,6 +103,7 @@ pub trait GroupValues: Send {
58
103
fn clear_shrink ( & mut self , batch : & RecordBatch ) ;
59
104
}
60
105
106
+ /// Return a specialized implementation of [`GroupValues`] for the given schema.
61
107
pub fn new_group_values ( schema : SchemaRef ) -> Result < Box < dyn GroupValues > > {
62
108
if schema. fields . len ( ) == 1 {
63
109
let d = schema. fields [ 0 ] . data_type ( ) ;
0 commit comments