Skip to content

Commit ef70b5a

Browse files
committed
Migrate documentation for regr* functions to code
1 parent eddade7 commit ef70b5a

File tree

3 files changed

+282
-179
lines changed

3 files changed

+282
-179
lines changed

datafusion/functions-aggregate/src/regr.rs

Lines changed: 153 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@
1717

1818
//! Defines physical expressions that can be evaluated at runtime during query execution
1919
20-
use std::any::Any;
21-
use std::fmt::Debug;
22-
2320
use arrow::array::Float64Array;
2421
use arrow::{
2522
array::{ArrayRef, UInt64Array},
@@ -29,10 +26,17 @@ use arrow::{
2926
};
3027
use datafusion_common::{downcast_value, plan_err, unwrap_or_internal_err, ScalarValue};
3128
use datafusion_common::{DataFusionError, Result};
29+
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
3230
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
3331
use datafusion_expr::type_coercion::aggregates::NUMERICS;
3432
use datafusion_expr::utils::format_state_name;
35-
use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
33+
use datafusion_expr::{
34+
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
35+
};
36+
use std::any::Any;
37+
use std::collections::HashMap;
38+
use std::fmt::Debug;
39+
use std::sync::OnceLock;
3640

3741
macro_rules! make_regr_udaf_expr_and_func {
3842
($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $REGR_TYPE:expr) => {
@@ -76,23 +80,7 @@ impl Regr {
7680
}
7781
}
7882

79-
/*
80-
#[derive(Debug)]
81-
pub struct Regr {
82-
name: String,
83-
regr_type: RegrType,
84-
expr_y: Arc<dyn PhysicalExpr>,
85-
expr_x: Arc<dyn PhysicalExpr>,
86-
}
87-
88-
impl Regr {
89-
pub fn get_regr_type(&self) -> RegrType {
90-
self.regr_type.clone()
91-
}
92-
}
93-
*/
94-
95-
#[derive(Debug, Clone)]
83+
#[derive(Debug, Clone, PartialEq, Hash, Eq)]
9684
#[allow(clippy::upper_case_acronyms)]
9785
pub enum RegrType {
9886
/// Variant for `regr_slope` aggregate expression
@@ -135,6 +123,148 @@ pub enum RegrType {
135123
SXY,
136124
}
137125

126+
impl RegrType {
127+
/// Returns the documentation stored for this `RegrType` variant.
///
/// The lookup is made in the map returned by `get_regr_docs()`, keyed by
/// `self`; the result is `None` when that map has no entry for the variant.
128+
fn documentation(&self) -> Option<&Documentation> {
129+
get_regr_docs().get(self)
130+
}
131+
}
132+
133+
// Process-wide cache of the `RegrType` -> `Documentation` map. It starts empty
// and is filled by the `get_or_init` call in `get_regr_docs()`.
static DOCUMENTATION: OnceLock<HashMap<RegrType, Documentation>> = OnceLock::new();
134+
/// Returns the shared map from `RegrType` to its `Documentation`.
///
/// The map is built inside `DOCUMENTATION.get_or_init`, so the closure below
/// runs only when the `OnceLock` has not been initialised yet; later calls
/// return the already-built map. One entry is inserted for each of `Slope`,
/// `Intercept`, `Count`, `R2`, `AvgX`, `AvgY`, `SXX`, `SYY` and `SXY`, every one
/// placed in `DOC_SECTION_STATISTICAL` with a description, a syntax example and
/// the two arguments `expression_y` and `expression_x`.
///
/// # Panics
///
/// Each builder result is passed through `.unwrap()`, so a failing `build()`
/// panics during initialisation.
fn get_regr_docs() -> &'static HashMap<RegrType, Documentation> {
135+
DOCUMENTATION.get_or_init(|| {
136+
let mut hash_map = HashMap::new();
137+
hash_map.insert(
138+
RegrType::Slope,
139+
Documentation::builder()
140+
.with_doc_section(DOC_SECTION_STATISTICAL)
141+
.with_description(
142+
"Returns the slope of the linear regression line for non-null pairs in aggregate columns. \
143+
Given input column Y and X: regr_slope(Y, X) returns the slope (k in Y = k*X + b) using minimal RSS fitting.",
144+
)
145+
.with_syntax_example("regr_slope(expression_y, expression_x)")
146+
.with_standard_argument("expression_y", "Expression")
147+
.with_standard_argument("expression_x", "Expression")
148+
.build()
149+
.unwrap()
150+
);
151+
152+
hash_map.insert(
153+
RegrType::Intercept,
154+
Documentation::builder()
155+
.with_doc_section(DOC_SECTION_STATISTICAL)
156+
.with_description(
157+
"Computes the y-intercept of the linear regression line. For the equation (y = kx + b), \
158+
this function returns b.",
159+
)
160+
.with_syntax_example("regr_intercept(expression_y, expression_x)")
161+
.with_standard_argument("expression_y", "Dependent variable")
162+
.with_standard_argument("expression_x", "Independent variable")
163+
.build()
164+
.unwrap()
165+
);
166+
167+
hash_map.insert(
168+
RegrType::Count,
169+
Documentation::builder()
170+
.with_doc_section(DOC_SECTION_STATISTICAL)
171+
.with_description(
172+
"Counts the number of non-null paired data points.",
173+
)
174+
.with_syntax_example("regr_count(expression_y, expression_x)")
175+
.with_standard_argument("expression_y", "Dependent variable")
176+
.with_standard_argument("expression_x", "Independent variable")
177+
.build()
178+
.unwrap()
179+
);
180+
181+
hash_map.insert(
182+
RegrType::R2,
183+
Documentation::builder()
184+
.with_doc_section(DOC_SECTION_STATISTICAL)
185+
.with_description(
186+
"Computes the square of the correlation coefficient between the independent and dependent variables.",
187+
)
188+
.with_syntax_example("regr_r2(expression_y, expression_x)")
189+
.with_standard_argument("expression_y", "Dependent variable")
190+
.with_standard_argument("expression_x", "Independent variable")
191+
.build()
192+
.unwrap()
193+
);
194+
195+
hash_map.insert(
196+
RegrType::AvgX,
197+
Documentation::builder()
198+
.with_doc_section(DOC_SECTION_STATISTICAL)
199+
.with_description(
200+
"Computes the average of the independent variable (input) expression_x for the non-null paired data points.",
201+
)
202+
.with_syntax_example("regr_avgx(expression_y, expression_x)")
203+
.with_standard_argument("expression_y", "Dependent variable")
204+
.with_standard_argument("expression_x", "Independent variable")
205+
.build()
206+
.unwrap()
207+
);
208+
209+
hash_map.insert(
210+
RegrType::AvgY,
211+
Documentation::builder()
212+
.with_doc_section(DOC_SECTION_STATISTICAL)
213+
.with_description(
214+
"Computes the average of the dependent variable (output) expression_y for the non-null paired data points.",
215+
)
216+
.with_syntax_example("regr_avgy(expression_y, expression_x)")
217+
.with_standard_argument("expression_y", "Dependent variable")
218+
.with_standard_argument("expression_x", "Independent variable")
219+
.build()
220+
.unwrap()
221+
);
222+
223+
hash_map.insert(
224+
RegrType::SXX,
225+
Documentation::builder()
226+
.with_doc_section(DOC_SECTION_STATISTICAL)
227+
.with_description(
228+
"Computes the sum of squares of the independent variable.",
229+
)
230+
.with_syntax_example("regr_sxx(expression_y, expression_x)")
231+
.with_standard_argument("expression_y", "Dependent variable")
232+
.with_standard_argument("expression_x", "Independent variable")
233+
.build()
234+
.unwrap()
235+
);
236+
237+
hash_map.insert(
238+
RegrType::SYY,
239+
Documentation::builder()
240+
.with_doc_section(DOC_SECTION_STATISTICAL)
241+
.with_description(
242+
"Computes the sum of squares of the dependent variable.",
243+
)
244+
.with_syntax_example("regr_syy(expression_y, expression_x)")
245+
.with_standard_argument("expression_y", "Dependent variable")
246+
.with_standard_argument("expression_x", "Independent variable")
247+
.build()
248+
.unwrap()
249+
);
250+
251+
hash_map.insert(
252+
RegrType::SXY,
253+
Documentation::builder()
254+
.with_doc_section(DOC_SECTION_STATISTICAL)
255+
.with_description(
256+
"Computes the sum of products of paired data points.",
257+
)
258+
.with_syntax_example("regr_sxy(expression_y, expression_x)")
259+
.with_standard_argument("expression_y", "Dependent variable")
260+
.with_standard_argument("expression_x", "Independent variable")
261+
.build()
262+
.unwrap()
263+
);
264+
hash_map
265+
})
266+
}
267+
138268
impl AggregateUDFImpl for Regr {
139269
fn as_any(&self) -> &dyn Any {
140270
self
@@ -198,22 +328,11 @@ impl AggregateUDFImpl for Regr {
198328
),
199329
])
200330
}
201-
}
202331

203-
/*
204-
impl PartialEq<dyn Any> for Regr {
205-
fn eq(&self, other: &dyn Any) -> bool {
206-
down_cast_any_ref(other)
207-
.downcast_ref::<Self>()
208-
.map(|x| {
209-
self.name == x.name
210-
&& self.expr_y.eq(&x.expr_y)
211-
&& self.expr_x.eq(&x.expr_x)
212-
})
213-
.unwrap_or(false)
332+
/// Returns the documentation for this aggregate by delegating to the
/// `documentation()` method of the `RegrType` held in `self.regr_type`.
fn documentation(&self) -> Option<&Documentation> {
333+
self.regr_type.documentation()
214334
}
215335
}
216-
*/
217336

218337
/// `RegrAccumulator` is used to compute linear regression aggregate functions
219338
/// by maintaining statistics needed to compute them in an online fashion.

docs/source/user-guide/sql/aggregate_functions.md

Lines changed: 0 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -204,15 +204,6 @@ last_value(expression [ORDER BY expression])
204204
- [stddev](#stddev)
205205
- [stddev_pop](#stddev_pop)
206206
- [stddev_samp](#stddev_samp)
207-
- [regr_avgx](#regr_avgx)
208-
- [regr_avgy](#regr_avgy)
209-
- [regr_count](#regr_count)
210-
- [regr_intercept](#regr_intercept)
211-
- [regr_r2](#regr_r2)
212-
- [regr_slope](#regr_slope)
213-
- [regr_sxx](#regr_sxx)
214-
- [regr_syy](#regr_syy)
215-
- [regr_sxy](#regr_sxy)
216207

217208
### `corr`
218209

@@ -313,142 +304,6 @@ stddev_samp(expression)
313304
- **expression**: Expression to operate on.
314305
Can be a constant, column, or function, and any combination of arithmetic operators.
315306

316-
### `regr_slope`
317-
318-
Returns the slope of the linear regression line for non-null pairs in aggregate columns.
319-
Given input column Y and X: regr_slope(Y, X) returns the slope (k in Y = k\*X + b) using minimal RSS fitting.
320-
321-
```
322-
regr_slope(expression1, expression2)
323-
```
324-
325-
#### Arguments
326-
327-
- **expression_y**: Expression to operate on.
328-
Can be a constant, column, or function, and any combination of arithmetic operators.
329-
- **expression_x**: Expression to operate on.
330-
Can be a constant, column, or function, and any combination of arithmetic operators.
331-
332-
### `regr_avgx`
333-
334-
Computes the average of the independent variable (input) `expression_x` for the non-null paired data points.
335-
336-
```
337-
regr_avgx(expression_y, expression_x)
338-
```
339-
340-
#### Arguments
341-
342-
- **expression_y**: Dependent variable.
343-
Can be a constant, column, or function, and any combination of arithmetic operators.
344-
- **expression_x**: Independent variable.
345-
Can be a constant, column, or function, and any combination of arithmetic operators.
346-
347-
### `regr_avgy`
348-
349-
Computes the average of the dependent variable (output) `expression_y` for the non-null paired data points.
350-
351-
```
352-
regr_avgy(expression_y, expression_x)
353-
```
354-
355-
#### Arguments
356-
357-
- **expression_y**: Dependent variable.
358-
Can be a constant, column, or function, and any combination of arithmetic operators.
359-
- **expression_x**: Independent variable.
360-
Can be a constant, column, or function, and any combination of arithmetic operators.
361-
362-
### `regr_count`
363-
364-
Counts the number of non-null paired data points.
365-
366-
```
367-
regr_count(expression_y, expression_x)
368-
```
369-
370-
#### Arguments
371-
372-
- **expression_y**: Dependent variable.
373-
Can be a constant, column, or function, and any combination of arithmetic operators.
374-
- **expression_x**: Independent variable.
375-
Can be a constant, column, or function, and any combination of arithmetic operators.
376-
377-
### `regr_intercept`
378-
379-
Computes the y-intercept of the linear regression line. For the equation \(y = kx + b\), this function returns `b`.
380-
381-
```
382-
regr_intercept(expression_y, expression_x)
383-
```
384-
385-
#### Arguments
386-
387-
- **expression_y**: Dependent variable.
388-
Can be a constant, column, or function, and any combination of arithmetic operators.
389-
- **expression_x**: Independent variable.
390-
Can be a constant, column, or function, and any combination of arithmetic operators.
391-
392-
### `regr_r2`
393-
394-
Computes the square of the correlation coefficient between the independent and dependent variables.
395-
396-
```
397-
regr_r2(expression_y, expression_x)
398-
```
399-
400-
#### Arguments
401-
402-
- **expression_y**: Dependent variable.
403-
Can be a constant, column, or function, and any combination of arithmetic operators.
404-
- **expression_x**: Independent variable.
405-
Can be a constant, column, or function, and any combination of arithmetic operators.
406-
407-
### `regr_sxx`
408-
409-
Computes the sum of squares of the independent variable.
410-
411-
```
412-
regr_sxx(expression_y, expression_x)
413-
```
414-
415-
#### Arguments
416-
417-
- **expression_y**: Dependent variable.
418-
Can be a constant, column, or function, and any combination of arithmetic operators.
419-
- **expression_x**: Independent variable.
420-
Can be a constant, column, or function, and any combination of arithmetic operators.
421-
422-
### `regr_syy`
423-
424-
Computes the sum of squares of the dependent variable.
425-
426-
```
427-
regr_syy(expression_y, expression_x)
428-
```
429-
430-
#### Arguments
431-
432-
- **expression_y**: Dependent variable.
433-
Can be a constant, column, or function, and any combination of arithmetic operators.
434-
- **expression_x**: Independent variable.
435-
Can be a constant, column, or function, and any combination of arithmetic operators.
436-
437-
### `regr_sxy`
438-
439-
Computes the sum of products of paired data points.
440-
441-
```
442-
regr_sxy(expression_y, expression_x)
443-
```
444-
445-
#### Arguments
446-
447-
- **expression_y**: Dependent variable.
448-
Can be a constant, column, or function, and any combination of arithmetic operators.
449-
- **expression_x**: Independent variable.
450-
Can be a constant, column, or function, and any combination of arithmetic operators.
451-
452307
## Approximate
453308

454309
- [approx_distinct](#approx_distinct)

0 commit comments

Comments
 (0)