Skip to content

Commit

Permalink
Initial work on apache#12432 to allow for generation of udf docs from…
Browse files Browse the repository at this point in the history
… embedded documentation in the code
  • Loading branch information
Omega359 committed Sep 28, 2024
1 parent 3892499 commit 6034ac8
Show file tree
Hide file tree
Showing 29 changed files with 1,557 additions and 42 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,21 @@ jobs:
# If you encounter an error, run './dev/update_config_docs.sh' and commit
./dev/update_config_docs.sh
git diff --exit-code
- name: Check if aggregate_functions.md has been modified
run: |
# If you encounter an error, run './dev/update_aggregate_docs.sh' and commit
./dev/update_aggregate_docs.sh
git diff --exit-code
- name: Check if scalar_functions.md has been modified
run: |
# If you encounter an error, run './dev/update_scalar_docs.sh' and commit
./dev/update_scalar_docs.sh
git diff --exit-code
- name: Check if window_functions.md has been modified
run: |
# If you encounter an error, run './dev/update_window_docs.sh' and commit
./dev/update_window_docs.sh
git diff --exit-code
# Verify MSRV for the crates which are directly used by other projects:
# - datafusion
Expand Down
3 changes: 3 additions & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ datafusion-expr-common = { workspace = true }
datafusion-functions-aggregate-common = { workspace = true }
datafusion-functions-window-common = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
indexmap = { workspace = true }
paste = "^1.0"
serde_json = { workspace = true }
sqlparser = { workspace = true }
Expand Down
8 changes: 5 additions & 3 deletions datafusion/expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ mod partition_evaluator;
mod table_source;
mod udaf;
mod udf;
mod udf_docs;
mod udwf;

pub mod conditional_expressions;
Expand Down Expand Up @@ -90,9 +91,10 @@ pub use logical_plan::*;
pub use partition_evaluator::PartitionEvaluator;
pub use sqlparser;
pub use table_source::{TableProviderFilterPushDown, TableSource, TableType};
pub use udaf::{AggregateUDF, AggregateUDFImpl, ReversedUDAF};
pub use udf::{ScalarUDF, ScalarUDFImpl};
pub use udwf::{WindowUDF, WindowUDFImpl};
pub use udaf::{aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF};
pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl};
pub use udf_docs::{DocSection, Documentation, DOCUMENTATION_NONE, DOC_SECTION_NONE};
pub use udwf::{window_doc_sections, WindowUDF, WindowUDFImpl};
pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits};

#[cfg(test)]
Expand Down
72 changes: 67 additions & 5 deletions datafusion/expr/src/udaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@ use crate::function::{
AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs,
};
use crate::groups_accumulator::GroupsAccumulator;
use crate::udf_docs::DOCUMENTATION_NONE;
use crate::utils::format_state_name;
use crate::utils::AggregateOrderSensitivity;
use crate::Signature;
use crate::{Accumulator, Expr};
use crate::{Documentation, Signature};

/// Logical representation of a user-defined [aggregate function] (UDAF).
///
Expand Down Expand Up @@ -248,6 +249,11 @@ impl AggregateUDF {
pub fn default_value(&self, data_type: &DataType) -> Result<ScalarValue> {
self.inner.default_value(data_type)
}

/// Returns this UDF's documentation that will be used to generate public documentation
///
/// This is a thin accessor: it forwards to `documentation()` on `self.inner`
/// and hands back the same borrowed [`Documentation`].
pub fn documentation(&self) -> &Documentation {
self.inner.documentation()
}
}

impl<F> From<F> for AggregateUDF
Expand All @@ -274,19 +280,31 @@ where
/// # use std::any::Any;
/// # use arrow::datatypes::DataType;
/// # use datafusion_common::{DataFusionError, plan_err, Result};
/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr};
/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr, Documentation};
/// # use datafusion_expr::{AggregateUDFImpl, AggregateUDF, Accumulator, function::{AccumulatorArgs, StateFieldsArgs}};
/// # use datafusion_expr::window_doc_sections::DOC_SECTION_AGGREGATE;
/// # use arrow::datatypes::Schema;
/// # use arrow::datatypes::Field;
/// # use indexmap::IndexMap;
///
/// #[derive(Debug, Clone)]
/// struct GeoMeanUdf {
/// signature: Signature
/// signature: Signature,
/// documentation: Documentation,
/// }
///
/// impl GeoMeanUdf {
/// fn new() -> Self {
/// Self {
/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable)
/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable),
/// documentation: Documentation {
/// doc_section: DOC_SECTION_AGGREGATE,
/// description: "calculates a geometric mean",
/// syntax_example: "geo_mean(2.0)",
/// sql_example: None,
/// arguments: Some(IndexMap::from([("arg_1", "The Float64 number for the geometric mean")])),
/// related_udfs: None,
/// }
/// }
/// }
/// }
Expand All @@ -298,7 +316,7 @@ where
/// fn signature(&self) -> &Signature { &self.signature }
/// fn return_type(&self, args: &[DataType]) -> Result<DataType> {
/// if !matches!(args.get(0), Some(&DataType::Float64)) {
/// return plan_err!("add_one only accepts Float64 arguments");
/// return plan_err!("geo_mean only accepts Float64 arguments");
/// }
/// Ok(DataType::Float64)
/// }
Expand All @@ -310,6 +328,9 @@ where
/// Field::new("ordering", DataType::UInt32, true)
/// ])
/// }
/// fn documentation(&self) -> &Documentation {
/// &self.documentation
/// }
/// }
///
/// // Create a new AggregateUDF from the implementation
Expand Down Expand Up @@ -564,6 +585,12 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
fn default_value(&self, data_type: &DataType) -> Result<ScalarValue> {
ScalarValue::try_from(data_type)
}

/// Returns the documentation for this Aggregate UDF for use
/// in generating publicly facing documentation.
///
/// The default implementation returns a reference to [`DOCUMENTATION_NONE`];
/// implementations that carry their own [`Documentation`] should override
/// this method and return a reference to it.
fn documentation(&self) -> &Documentation {
&DOCUMENTATION_NONE
}
}

impl PartialEq for dyn AggregateUDFImpl {
Expand Down Expand Up @@ -710,6 +737,41 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl {
fn is_descending(&self) -> Option<bool> {
self.inner.is_descending()
}

/// Forwards to the wrapped implementation (`self.inner`), so the aliased
/// UDF reports the same documentation as the function it wraps.
fn documentation(&self) -> &Documentation {
self.inner.documentation()
}
}

/// Aggregate UDF doc sections for use in public documentation
pub mod aggregate_doc_sections {
    use crate::DocSection;

    /// Builds a section with `include` set to `true`, the given label and no
    /// description.
    // NOTE(review): `include` presumably controls whether the section is
    // emitted in the generated docs — confirm against `DocSection`.
    const fn labeled(label: &'static str) -> DocSection {
        DocSection {
            include: true,
            label,
            description: None,
        }
    }

    /// Section labeled "General Functions".
    pub const DOC_SECTION_GENERAL: DocSection = labeled("General Functions");

    /// Section labeled "Statistical Functions".
    pub const DOC_SECTION_STATISTICAL: DocSection = labeled("Statistical Functions");

    /// Section labeled "Approximate Functions".
    pub const DOC_SECTION_APPROXIMATE: DocSection = labeled("Approximate Functions");

    /// Returns every aggregate doc section, in the order: general,
    /// statistical, approximate.
    pub fn doc_sections() -> Vec<DocSection> {
        Vec::from([
            DOC_SECTION_GENERAL,
            DOC_SECTION_STATISTICAL,
            DOC_SECTION_APPROXIMATE,
        ])
    }
}

#[cfg(test)]
Expand Down
133 changes: 129 additions & 4 deletions datafusion/expr/src/udf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
use crate::expr::schema_name_from_exprs_comma_seperated_without_space;
use crate::simplify::{ExprSimplifyResult, SimplifyInfo};
use crate::sort_properties::{ExprProperties, SortProperties};
use crate::{ColumnarValue, Expr, ScalarFunctionImplementation, Signature};
use crate::udf_docs::DOCUMENTATION_NONE;
use crate::{
ColumnarValue, Documentation, Expr, ScalarFunctionImplementation, Signature,
};
use arrow::datatypes::DataType;
use datafusion_common::{not_impl_err, ExprSchema, Result};
use datafusion_expr_common::interval_arithmetic::Interval;
Expand Down Expand Up @@ -274,6 +277,11 @@ impl ScalarUDF {
pub fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
self.inner.coerce_types(arg_types)
}

/// Returns this UDF's documentation that will be used to generate public documentation
///
/// This is a thin accessor: it forwards to `documentation()` on `self.inner`
/// and hands back the same borrowed [`Documentation`].
pub fn documentation(&self) -> &Documentation {
self.inner.documentation()
}
}

impl<F> From<F> for ScalarUDF
Expand All @@ -299,18 +307,30 @@ where
/// ```
/// # use std::any::Any;
/// # use arrow::datatypes::DataType;
/// # use indexmap::IndexMap;
/// # use datafusion_common::{DataFusionError, plan_err, Result};
/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility};
/// # use datafusion_expr::{col, ColumnarValue, Documentation, Signature, Volatility};
/// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF};
/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH;
///
/// #[derive(Debug)]
/// struct AddOne {
/// signature: Signature
/// signature: Signature,
/// documentation: Documentation,
/// }
///
/// impl AddOne {
/// fn new() -> Self {
/// Self {
/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable)
/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable),
/// documentation: Documentation {
/// doc_section: DOC_SECTION_MATH,
/// description: "Add one to an int32",
/// syntax_example: "add_one(2)",
/// sql_example: None,
/// arguments: Some(IndexMap::from([("arg_1", "The int32 number to add one to")])),
/// related_udfs: None,
/// }
/// }
/// }
/// }
Expand All @@ -328,6 +348,9 @@ where
/// }
/// // The actual implementation would add one to the argument
/// fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { unimplemented!() }
/// fn documentation(&self) -> &Documentation {
/// &self.documentation
/// }
/// }
///
/// // Create a new ScalarUDF from the implementation
Expand Down Expand Up @@ -596,6 +619,12 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
self.signature().hash(hasher);
hasher.finish()
}

/// Returns the documentation for this scalar UDF for use
/// in generating publicly facing documentation.
///
/// The default implementation returns a reference to [`DOCUMENTATION_NONE`];
/// implementations that carry their own [`Documentation`] should override
/// this method and return a reference to it.
fn documentation(&self) -> &Documentation {
&DOCUMENTATION_NONE
}
}

/// ScalarUDF that adds an alias to the underlying function. It is better to
Expand Down Expand Up @@ -709,4 +738,100 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
self.aliases.hash(hasher);
hasher.finish()
}

/// Forwards to the wrapped implementation (`self.inner`), so the aliased
/// UDF reports the same documentation as the function it wraps.
fn documentation(&self) -> &Documentation {
self.inner.documentation()
}
}

/// Scalar UDF doc sections for use in public documentation
pub mod scalar_doc_sections {
    use crate::DocSection;

    /// Builds a section with `include` set to `true`, the given label and no
    /// description.
    // NOTE(review): `include` presumably controls whether the section is
    // emitted in the generated docs — confirm against `DocSection`.
    const fn labeled(label: &'static str) -> DocSection {
        DocSection {
            include: true,
            label,
            description: None,
        }
    }

    /// Section labeled "Math Functions".
    pub const DOC_SECTION_MATH: DocSection = labeled("Math Functions");

    /// Section labeled "Conditional Functions".
    pub const DOC_SECTION_CONDITIONAL: DocSection = labeled("Conditional Functions");

    /// Section labeled "String Functions".
    pub const DOC_SECTION_STRING: DocSection = labeled("String Functions");

    /// Section labeled "Binary String Functions".
    pub const DOC_SECTION_BINARY_STRING: DocSection = labeled("Binary String Functions");

    /// Section labeled "Regular Expression Functions".
    ///
    /// This is the only scalar section that carries a description; the text
    /// is Markdown introducing the supported regular expression syntax.
    pub const DOC_SECTION_REGEX: DocSection = DocSection {
        description: Some(
            r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions)
regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
(minus support for several features including look-around and backreferences).
The following regular expression functions are supported:"#,
        ),
        label: "Regular Expression Functions",
        include: true,
    };

    /// Section labeled "Time and Date Functions".
    pub const DOC_SECTION_DATETIME: DocSection = labeled("Time and Date Functions");

    /// Section labeled "Array Functions".
    pub const DOC_SECTION_ARRAY: DocSection = labeled("Array Functions");

    /// Section labeled "Struct Functions".
    pub const DOC_SECTION_STRUCT: DocSection = labeled("Struct Functions");

    /// Section labeled "Map Functions".
    pub const DOC_SECTION_MAP: DocSection = labeled("Map Functions");

    /// Section labeled "Hashing Functions".
    pub const DOC_SECTION_HASHING: DocSection = labeled("Hashing Functions");

    /// Section labeled "Other Functions".
    pub const DOC_SECTION_OTHER: DocSection = labeled("Other Functions");

    /// Returns every scalar doc section, in the order: math, conditional,
    /// string, binary string, regex, date/time, array, struct, map, hashing,
    /// other.
    pub fn doc_sections() -> Vec<DocSection> {
        Vec::from([
            DOC_SECTION_MATH,
            DOC_SECTION_CONDITIONAL,
            DOC_SECTION_STRING,
            DOC_SECTION_BINARY_STRING,
            DOC_SECTION_REGEX,
            DOC_SECTION_DATETIME,
            DOC_SECTION_ARRAY,
            DOC_SECTION_STRUCT,
            DOC_SECTION_MAP,
            DOC_SECTION_HASHING,
            DOC_SECTION_OTHER,
        ])
    }
}
Loading

0 comments on commit 6034ac8

Please sign in to comment.