diff --git a/auto_doc.py b/auto_doc.py index 8ce69661a1..4d876a5fb3 100644 --- a/auto_doc.py +++ b/auto_doc.py @@ -61,7 +61,9 @@ }, "api/embedding_index_api.md": { "embedding_index": ["hsfs.embedding.EmbeddingIndex"], - "embedding_index_methods": keras_autodoc.get_methods("hsfs.embedding.EmbeddingIndex"), + "embedding_index_methods": keras_autodoc.get_methods( + "hsfs.embedding.EmbeddingIndex" + ), }, "api/spine_group_api.md": { "fg": ["hsfs.feature_group.SpineGroup"], @@ -256,6 +258,79 @@ "hsfs.core.explicit_provenance.Artifact" ), }, + "api/statistics_api.md": { + "statistics": ["hsfs.statistics.Statistics"], + "statistics_properties": keras_autodoc.get_properties( + "hsfs.statistics.Statistics" + ), + }, + "api/split_statistics_api.md": { + "split_statistics": ["hsfs.split_statistics.SplitStatistics"], + "split_statistics_properties": keras_autodoc.get_properties( + "hsfs.split_statistics.SplitStatistics" + ), + }, + "api/feature_descriptive_statistics_api.md": { + "feature_descriptive_statistics": [ + "hsfs.core.feature_descriptive_statistics.FeatureDescriptiveStatistics" + ], + "feature_descriptive_statistics_properties": keras_autodoc.get_properties( + "hsfs.core.feature_descriptive_statistics.FeatureDescriptiveStatistics" + ), + }, + "api/feature_monitoring_config_api.md": { + "feature_monitoring_config": [ + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig" + ], + "feature_monitoring_config_properties": keras_autodoc.get_properties( + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig" + ), + "feature_monitoring_config_methods": keras_autodoc.get_methods( + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig", + exclude=[ + "from_response_json", + "update_from_response_json", + "json", + "to_dict", + ], + ), + # from feature group + "feature_monitoring_config_creation_fg": [ + "hsfs.feature_group.FeatureGroup.create_statistics_monitoring", + "hsfs.feature_group.FeatureGroup.create_feature_monitoring", + ], + # from feature view + "feature_monitoring_config_creation_fv": [ + "hsfs.feature_view.FeatureView.create_statistics_monitoring", + "hsfs.feature_view.FeatureView.create_feature_monitoring", + ], + # retrieval + "feature_monitoring_config_retrieval_fg": [ + "hsfs.feature_group.FeatureGroup.get_feature_monitoring_configs", + ], + "feature_monitoring_config_retrieval_fv": [ + "hsfs.feature_view.FeatureView.get_feature_monitoring_configs", + ], + }, + "api/feature_monitoring_result_api.md": { + "feature_monitoring_result": [ + "hsfs.core.feature_monitoring_result.FeatureMonitoringResult" + ], + "feature_monitoring_result_retrieval": [ + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig.get_history" + ], + "feature_monitoring_result_properties": keras_autodoc.get_properties( + "hsfs.core.feature_monitoring_result.FeatureMonitoringResult" + ), + }, + "api/feature_monitoring_window_config_api.md": { + "feature_monitoring_window_config": [ + "hsfs.core.monitoring_window_config.MonitoringWindowConfig" + ], + "feature_monitoring_window_config_properties": keras_autodoc.get_properties( + "hsfs.core.monitoring_window_config.MonitoringWindowConfig" + ), + }, } hsfs_dir = pathlib.Path(__file__).resolve().parents[0] diff --git a/docs/templates/api/feature_descriptive_statistics_api.md b/docs/templates/api/feature_descriptive_statistics_api.md new file mode 100644 index 0000000000..3be8cccd36 --- /dev/null +++ b/docs/templates/api/feature_descriptive_statistics_api.md @@ -0,0 +1,7 @@ +# Feature Descriptive Statistics + 
+{{feature_descriptive_statistics}} + +## Properties + +{{feature_descriptive_statistics_properties}} diff --git a/docs/templates/api/feature_monitoring_config_api.md b/docs/templates/api/feature_monitoring_config_api.md new file mode 100644 index 0000000000..7ca9b46ff5 --- /dev/null +++ b/docs/templates/api/feature_monitoring_config_api.md @@ -0,0 +1,27 @@ +# Feature Monitoring Configuration + +{{feature_monitoring_config}} + +## Creation from Feature Group + +{{feature_monitoring_config_creation_fg}} + +## Creation from Feature View + +{{feature_monitoring_config_creation_fv}} + +## Retrieval from Feature Group + +{{feature_monitoring_config_retrieval_fg}} + +## Retrieval from Feature View + +{{feature_monitoring_config_retrieval_fv}} + +## Properties + +{{feature_monitoring_config_properties}} + +## Methods + +{{feature_monitoring_config_methods}} diff --git a/docs/templates/api/feature_monitoring_result_api.md b/docs/templates/api/feature_monitoring_result_api.md new file mode 100644 index 0000000000..5bfca1165c --- /dev/null +++ b/docs/templates/api/feature_monitoring_result_api.md @@ -0,0 +1,11 @@ +# Feature Monitoring Result + +{{feature_monitoring_result}} + +## Retrieval + +{{feature_monitoring_result_retrieval}} + +## Properties + +{{feature_monitoring_result_properties}} diff --git a/docs/templates/api/feature_monitoring_window_config_api.md b/docs/templates/api/feature_monitoring_window_config_api.md new file mode 100644 index 0000000000..53ef23ea2d --- /dev/null +++ b/docs/templates/api/feature_monitoring_window_config_api.md @@ -0,0 +1,7 @@ +# Feature Monitoring Window Configuration + +{{feature_monitoring_window_config}} + +## Properties + +{{feature_monitoring_window_config_properties}} diff --git a/docs/templates/api/split_statistics_api.md b/docs/templates/api/split_statistics_api.md new file mode 100644 index 0000000000..09053ac5d2 --- /dev/null +++ b/docs/templates/api/split_statistics_api.md @@ -0,0 +1,7 @@ +# Split Statistics + +{{split_statistics}} + +## Properties + +{{split_statistics_properties}} diff --git a/docs/templates/api/statistics_api.md b/docs/templates/api/statistics_api.md new file mode 100644 index 0000000000..27ed90c9da --- /dev/null +++ b/docs/templates/api/statistics_api.md @@ -0,0 +1,7 @@ +# Statistics + +{{statistics}} + +## Properties + +{{statistics_properties}} diff --git a/docs/templates/api/validation_report_api.md b/docs/templates/api/validation_report_api.md index 1a2fe4a117..435a87a03d 100644 --- a/docs/templates/api/validation_report_api.md +++ b/docs/templates/api/validation_report_api.md @@ -6,7 +6,7 @@ {{validation_report_validate}} -## Fetch Validation Reports +## Retrieval {{validation_report_get}} diff --git a/mkdocs.yml b/mkdocs.yml index b2555fa005..40b626da49 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -33,6 +33,14 @@ nav: - ValidationReport: generated/api/validation_report_api.md - Job: generated/api/job.md - Provenance Links: generated/api/links.md + - Statistics: + - Statistics: generated/api/statistics_api.md + - Split Statistics: generated/api/split_statistics_api.md + - Feature Descriptive Statistics: generated/api/feature_descriptive_statistics_api.md + - Feature Monitoring: + - Configuration: generated/api/feature_monitoring_config_api.md + - Result: generated/api/feature_monitoring_result_api.md + - Window: generated/api/feature_monitoring_window_config_api.md - Contributing: CONTRIBUTING.md - Community ↗: https://community.hopsworks.ai/ diff --git a/python/hsfs/core/feature_descriptive_statistics.py
b/python/hsfs/core/feature_descriptive_statistics.py index da9d5e6733..8568930498 100644 --- a/python/hsfs/core/feature_descriptive_statistics.py +++ b/python/hsfs/core/feature_descriptive_statistics.py @@ -214,72 +214,105 @@ def id(self) -> Optional[int]: @property def feature_type(self) -> str: + """Data type of the feature. It can be one of Boolean, Fractional, Integral, or String.""" return self._feature_type @property def feature_name(self) -> str: + """Name of the feature.""" return self._feature_name @property def count(self) -> int: + """Number of values.""" return self._count @property def completeness(self) -> Optional[float]: + """Fraction of non-null values in a column.""" return self._completeness @property def num_non_null_values(self) -> Optional[int]: + """Number of non-null values.""" return self._num_non_null_values @property def num_null_values(self) -> Optional[int]: + """Number of null values.""" return self._num_null_values @property def approx_num_distinct_values(self) -> Optional[int]: + """Approximate number of distinct values.""" return self._approx_num_distinct_values @property def min(self) -> Optional[float]: + """Minimum value.""" return self._min @property def max(self) -> Optional[float]: + """Maximum value.""" return self._max @property def sum(self) -> Optional[float]: + """Sum of all feature values.""" return self._sum @property def mean(self) -> Optional[float]: + """Mean value.""" return self._mean @property def stddev(self) -> Optional[float]: + """Standard deviation of the feature values.""" return self._stddev @property def percentiles(self) -> Optional[Mapping[str, float]]: + """Percentiles.""" return self._percentiles @property def distinctness(self) -> Optional[float]: + """Fraction of distinct values of a feature over the number of all its values. Distinct values occur at least once. + + !!! note "Example" + $[a, a, b]$ contains two distinct values $a$ and $b$, so distinctness is $2/3$. + """ return self._distinctness @property def entropy(self) -> Optional[float]: + """Entropy is a measure of the level of information contained in an event (feature value) when considering all possible events (all feature values). + Entropy is estimated using observed value counts as the negative sum of (value_count/total_count) * log(value_count/total_count). + + !!! note "Example" + $[a, b, b, c, c]$ has three distinct values with counts $[1, 2, 2]$. + + Entropy is then $(-1/5*log(1/5)-2/5*log(2/5)-2/5*log(2/5)) = 1.055$. + """ return self._entropy @property def uniqueness(self) -> Optional[float]: + """Fraction of unique values over the number of all values of a column. Unique values occur exactly once. + + !!! note "Example" + $[a, a, b]$ contains one unique value $b$, so uniqueness is $1/3$. 
+ """ return self._uniqueness @property def exact_num_distinct_values(self) -> Optional[int]: + """Exact number of distinct values.""" return self._exact_num_distinct_values @property def extended_statistics(self) -> Optional[dict]: + """Additional statistics computed on the feature values such as histograms and correlations.""" return self._extended_statistics diff --git a/python/hsfs/core/feature_monitoring_config.py b/python/hsfs/core/feature_monitoring_config.py index 54cd705512..b2c592d58a 100644 --- a/python/hsfs/core/feature_monitoring_config.py +++ b/python/hsfs/core/feature_monitoring_config.py @@ -219,10 +219,10 @@ def with_detection_window( window_length: Optional[str] = None, row_percentage: Optional[float] = None, ) -> "FeatureMonitoringConfig": - """Sets the detection window for the feature monitoring job. + """Sets the detection window of data to compute statistics on. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) # Compute statistics on a regular basis fg.create_statistics_monitoring( @@ -271,11 +271,11 @@ def with_reference_window( window_length: Optional[str] = None, row_percentage: Optional[float] = None, ) -> "FeatureMonitoringConfig": - """Sets the reference window for the feature monitoring job. + """Sets the reference window of data to compute statistics on. See also `with_reference_value(...)` and `with_reference_training_dataset(...)` for other reference options. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) # Setup feature monitoring and a detection window my_monitoring_config = fg.create_feature_monitoring(...).with_detection_window(...) @@ -285,8 +285,10 @@ def with_reference_window( window_length="1d", ).compare_on(...).save() ``` - !!! note - You must provide a comparison configuration via compare_on(...) before saving the feature monitoring config. + + !!! warning "Provide a comparison configuration" + You must provide a comparison configuration via `compare_on()` before saving the feature monitoring config. + # Arguments time_offset: The time offset from the current time to the start of the time window. window_length: The length of the time window. @@ -312,11 +314,11 @@ def with_reference_value( self, value: Optional[Union[float, int]] = None, ) -> "FeatureMonitoringConfig": - """Sets the reference value for the feature monitoring job. + """Sets the reference value to compare statistics with. See also `with_reference_window(...)` and `with_reference_training_dataset(...)` for other reference options. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) # Setup feature monitoring and a detection window my_monitoring_config = fg.create_feature_monitoring(...).with_detection_window(...) @@ -325,8 +327,10 @@ def with_reference_value( value=0.0, ).compare_on(...).save() ``` - !!! note - You must provide a comparison configuration via compare_on(...) before saving the feature monitoring config. + + !!! warning "Provide a comparison configuration" + You must provide a comparison configuration via `compare_on()` before saving the feature monitoring config. + # Arguments value: A float value to use as reference. 
# Returns @@ -342,11 +346,11 @@ def with_reference_training_dataset( self, training_dataset_version: Optional[int] = None, ) -> "FeatureMonitoringConfig": - """Sets the reference training dataset for the feature monitoring job. + """Sets the reference training dataset to compare statistics with. See also `with_reference_value(...)` and `with_reference_window(...)` for other reference options. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) # Setup feature monitoring and a detection window my_monitoring_config = fg.create_feature_monitoring(...).with_detection_window(...) @@ -356,8 +360,10 @@ def with_reference_training_dataset( training_dataset_version=3, ).compare_on(...).save() ``` - !!! note - You must provide a comparison configuration via compare_on(...) before saving the feature monitoring config. + + !!! warning "Provide a comparison configuration" + You must provide a comparison configuration via `compare_on()` before saving the feature monitoring config. + # Arguments training_dataset_version: The version of the training dataset to use as reference. # Returns @@ -376,10 +382,10 @@ def compare_on( strict: Optional[bool] = False, relative: Optional[bool] = False, ) -> "FeatureMonitoringConfig": - """Sets the comparison configuration for monitoring involving a reference window. + """Sets the statistics comparison criteria for feature monitoring with a reference window. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) # Setup feature monitoring, a detection window and a reference window my_monitoring_config = fg.create_feature_monitoring( @@ -393,8 +399,10 @@ def compare_on( relative=True, ).save() ``` + !!! note Detection window and reference window/value/training_dataset must be set prior to comparison configuration. + # Arguments metric: The metric to use for comparison. Different metric are available for different feature type. threshold: The threshold to apply to the difference to potentially trigger an alert. @@ -414,10 +422,10 @@ def compare_on( return self def save(self): - """Saves the feature monitoring configuration to the backend. + """Saves the feature monitoring configuration. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) # Setup feature monitoring and a detection window my_monitoring_config = fg.create_statistics_monitoring( @@ -448,11 +456,11 @@ def update(self): """Updates allowed fields of the saved feature monitoring configuration. !!! 
example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) - # fetch registered config by name - my_monitoring_config = fg.get_feature_monitoring_config(name="my_monitoring_config") - # update the percentage of rows to use when computing the statistics + # Fetch registered config by name + my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") + # Update the percentage of rows to use when computing the statistics my_monitoring_config.detection_window.row_percentage = 10 my_monitoring_config.update() ``` @@ -462,18 +470,21 @@ def update(self): return self._feature_monitoring_config_engine.update(self) def run_job(self): - """Trigger the monitoring job which computes statistics on detection and reference window. + """Trigger the feature monitoring job which computes and compares statistics on the detection and reference windows. !!! example ```python3 - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) - # fetch registered config by name + # Fetch registered config by name my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") - # trigger the job which computes statistics on detection and reference window + # Trigger the feature monitoring job once my_monitoring_config.run_job() ``` + !!! info - The job will be triggered asynchronously and the method will return immediately. + The feature monitoring job will be triggered asynchronously and the method will return immediately. + Calling this method does not affect the ongoing schedule. + # Raises `FeatureStoreException`: If the feature monitoring config has not been saved. # Returns @@ -489,14 +500,14 @@ def run_job(self): ) def get_job(self): - """Get the monitoring job which computes statistics on detection and reference window. + """Get the feature monitoring job which computes and compares statistics on the detection and reference windows. !!! example ```python3 - # fetch registered config by name via feature group or feature view + # Fetch registered config by name via feature group or feature view my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") - # get the job which computes statistics on detection and reference window + # Get the job which computes statistics on detection and reference window job = my_monitoring_config.get_job() - # print job history and ongoing executions + # Print job history and ongoing executions job.executions ``` # Raises @@ -515,14 +526,14 @@ def get_job(self): ) def delete(self): - """Deletes the feature monitoring configuration from the backend. + """Deletes the feature monitoring configuration. !!! 
example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) - # fetch registered config by name - my_monitoring_config = fg.get_feature_monitoring_config(name="my_monitoring_config") - # delete the feature monitoring config + # Fetch registered config by name + my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") + # Delete the feature monitoring config my_monitoring_config.delete() ``` # Raises @@ -536,14 +547,14 @@ def delete(self): self._feature_monitoring_config_engine.delete(config_id=self._id) def disable(self): - """Disables the spawning of monitoring job at time-interval controlled by the scheduler. + """Disables the schedule of the feature monitoring job. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) - # fetch registered config by name - my_monitoring_config = fg.get_feature_monitoring_config(name="my_monitoring_config") - # disable the feature monitoring config + # Fetch registered config by name + my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") + # Disable the feature monitoring config my_monitoring_config.disable() ``` # Raises @@ -552,15 +563,15 @@ def disable(self): self._update_schedule(enabled=False) def enable(self): - """Enables the spawning of monitoring job at time-interval controlled by the scheduler. + """Enables the schedule of the feature monitoring job. The scheduler can be configured via the `job_schedule` property. !!! example ```python - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) - # fetch registered config by name - my_monitoring_config = fg.get_feature_monitoring_config(name="my_monitoring_config") - # enable the feature monitoring config + # Fetch registered config by name + my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") + # Enable the feature monitoring config my_monitoring_config.enable() ``` # Raises @@ -590,14 +601,14 @@ def get_history( with_statistics: bool = True, ) -> List["FeatureMonitoringResult"]: """ - Fetch the history of the computed statistics for this configuration. + Fetch the history of the computed statistics and comparison results for this configuration. !!! example ```python3 - # fetch your feature group or feature view + # Fetch your feature group or feature view fg = fs.get_feature_group(name="my_feature_group", version=1) - # fetch registered config by name - my_monitoring_config = fg.get_feature_monitoring_config(name="my_monitoring_config") - # fetch the history of the computed statistics for this configuration + # Fetch registered config by name + my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config") + # Fetch the history of the computed statistics for this configuration history = my_monitoring_config.get_history( start_time="2021-01-01", end_time="2021-01-31", @@ -606,7 +617,7 @@ def get_history( # Args: start_time: The start time of the time range to fetch the history for. end_time: The end time of the time range to fetch the history for. - with_statistics: Whether to include the computed statistics in the result. + with_statistics: Whether to include the computed statistics in the results. 
# Raises `FeatureStoreException`: If the feature monitoring config has not been saved. """ @@ -624,32 +635,46 @@ def get_history( @property def id(self) -> Optional[int]: + """Id of the feature monitoring configuration.""" return self._id @property def feature_store_id(self) -> int: + """Id of the Feature Store.""" return self._feature_store_id @property def feature_group_id(self) -> Optional[int]: + """Id of the Feature Group to which this feature monitoring configuration is attached.""" return self._feature_group_id + @property + def feature_view_name(self) -> Optional[str]: + """Name of the Feature View to which this feature monitoring configuration is attached.""" + return self._feature_view_name + + @property + def feature_view_version(self) -> Optional[int]: + """Version of the Feature View to which this feature monitoring configuration is attached.""" + return self._feature_view_version + @property def feature_name(self) -> Optional[str]: """The name of the feature to monitor. If not set, all features of the - feature group or feature view are monitored, only available for statistics monitoring. - !!! note - This property is read-only after the feature monitoring config has been saved. + Feature Group or Feature View are monitored, only available for scheduled statistics. + + !!! info "This property is read-only" + """ return self._feature_name @property def name(self) -> str: """The name of the feature monitoring config. - A Feature Group or Feature View cannot have multiple feature monitoring configs with the same name. The name of - a feature monitoring config is limited to 63 characters. - !!! note - This property is read-only after the feature monitoring config has been saved. + A Feature Group or Feature View cannot have multiple feature monitoring configurations with the same name. The name of + a feature monitoring configuration is limited to 63 characters. + + !!! info "This property is read-only once the feature monitoring configuration has been saved." """ return self._name @@ -665,6 +690,7 @@ def name(self, name: str): @property def description(self) -> Optional[str]: + """Description of the feature monitoring configuration.""" return self._description @description.setter @@ -679,11 +705,12 @@ def description(self, description: Optional[str]): @property def job_name(self) -> Optional[str]: + """Name of the feature monitoring job.""" return self._job_name @property def enabled(self) -> bool: - """Controls whether or not this config is spawning new monitoring jobs. + """Controls whether or not this config is spawning new feature monitoring jobs. This field belongs to the scheduler configuration but is made transparent to the user for convenience. """ return self.job_schedule.enabled @@ -691,7 +718,7 @@ def enabled(self) -> bool: @enabled.setter def enabled(self, enabled: bool): """ - Controls whether or not this config is spawning new monitoring jobs. + Controls whether or not this config is spawning new feature monitoring jobs. This field belongs to the scheduler configuration but is made transparent to the user for convenience. """ self.job_schedule.enabled = enabled @@ -700,15 +727,18 @@ def enabled(self, enabled: bool): def feature_monitoring_type(self) -> Optional[str]: """The type of feature monitoring to perform. Used for internal validation. Options are: - - STATISTICS_COMPUTATION if no reference window (and therefore comparison config) is provided - - STATISTICS_COMPARISON if a reference window (and therefore comparison config) is provided. - !!! 
note - This property is read-only. + - STATISTICS_COMPUTATION if no reference window (and, therefore, comparison config) is provided + - STATISTICS_COMPARISON if a reference window (and, therefore, comparison config) is provided. + + !!! info "This property is read-only." """ return self._feature_monitoring_type @property def job_schedule(self) -> JobSchedule: + """Schedule of the feature monitoring job. + This field belongs to the job configuration but is made transparent to the user for convenience. + """ return self._job_schedule @job_schedule.setter @@ -722,6 +752,7 @@ def job_schedule(self, job_schedule: Union[JobSchedule, Dict[str, Any]]): @property def detection_window_config(self) -> mwc.MonitoringWindowConfig: + """Configuration for the detection window.""" return self._detection_window_config @detection_window_config.setter @@ -748,6 +779,7 @@ def detection_window_config( @property def reference_window_config(self) -> mwc.MonitoringWindowConfig: + """Configuration for the reference window.""" return self._reference_window_config @reference_window_config.setter @@ -787,6 +819,7 @@ def reference_window_config( def statistics_comparison_config( self, ) -> Optional[Dict[str, Any]]: + """Configuration for the comparison of detection and reference statistics.""" return self._statistics_comparison_config @statistics_comparison_config.setter diff --git a/python/hsfs/core/feature_monitoring_result.py b/python/hsfs/core/feature_monitoring_result.py index b31b05e369..d449c8ddb8 100644 --- a/python/hsfs/core/feature_monitoring_result.py +++ b/python/hsfs/core/feature_monitoring_result.py @@ -131,60 +131,79 @@ def __repr__(self) -> str: @property def id(self) -> Optional[int]: + """Id of the feature monitoring result.""" return self._id @property def config_id(self) -> int: + """Id of the feature monitoring configuration containing this result.""" return self._config_id @property def feature_store_id(self) -> int: + """Id of the Feature Store.""" return self._feature_store_id @property def detection_statistics_id(self) -> Optional[int]: + """Id of the feature descriptive statistics computed on the detection window.""" return self._detection_statistics_id @property def reference_statistics_id(self) -> Optional[int]: + """Id of the feature descriptive statistics computed on the reference window.""" return self._reference_statistics_id @property def detection_statistics(self) -> Optional[FeatureDescriptiveStatistics]: + """Feature descriptive statistics computed on the detection window.""" return self._detection_statistics @property def reference_statistics(self) -> Optional[FeatureDescriptiveStatistics]: + """Feature descriptive statistics computed on the reference window.""" return self._reference_statistics @property def execution_id(self) -> Optional[int]: + """Execution id of the feature monitoring job.""" return self._execution_id @property def monitoring_time(self) -> int: + """Time at which this feature monitoring result was created.""" return self._monitoring_time @property def difference(self) -> Optional[float]: + """Difference between detection and reference values. It can be relative or absolute difference, + depending on the statistics comparison configuration provided in `relative` parameter passed to `compare_on()` + when enabling feature monitoring. 
+ """ return self._difference @property def shift_detected(self) -> bool: + """Whether or not shift was detected in the detection window based on the computed statistics and the threshold provided in `compare_on()` + when enabling feature monitoring.""" return self._shift_detected @property def feature_name(self) -> str: + """Name of the feature being monitored.""" return self._feature_name @property def empty_detection_window(self) -> bool: + """Whether or not the detection window was empty in this feature monitoring run.""" return self._empty_detection_window @property def empty_reference_window(self) -> bool: + """Whether or not the reference window was empty in this feature monitoring run.""" return self._empty_reference_window @property def specific_value(self) -> Optional[float]: + """Specific value used as reference in the statistics comparison.""" return self._specific_value diff --git a/python/hsfs/core/monitoring_window_config.py b/python/hsfs/core/monitoring_window_config.py index 8c02d91e83..436167659f 100644 --- a/python/hsfs/core/monitoring_window_config.py +++ b/python/hsfs/core/monitoring_window_config.py @@ -186,10 +186,12 @@ def __repr__(self): @property def id(self) -> Optional[int]: + """Id of the window configuration.""" return self._id @property def window_config_type(self) -> WindowConfigType: + """Type of the window. It can be one of `ALL_TIME`, `ROLLING_TIME`, `TRAINING_DATASET` or `SPECIFIC_VALUE`.""" return self._window_config_type @window_config_type.setter @@ -217,10 +219,12 @@ def window_config_type(self, window_config_type: Union[WindowConfigType, str]): @property def time_offset(self) -> Optional[str]: + """The time offset from the current time to the start of the time window. Only used for windows of type `ROLLING_TIME`.""" return self._time_offset @property def window_length(self) -> Optional[str]: + """The length of the time window. Only used for windows of type `ROLLING_TIME`.""" return self._window_length @window_length.setter @@ -239,8 +243,7 @@ def window_length(self, window_length: Optional[str]): @property def training_dataset_version(self) -> Optional[int]: - """The version of the training dataset to use as reference. Only used for - TRAINING_DATASET window config type.""" + """The version of the training dataset to use as reference. Only used for windows of type `TRAINING_DATASET`.""" return self._training_dataset_version @training_dataset_version.setter @@ -256,6 +259,7 @@ def training_dataset_version(self, training_dataset_version: Optional[int]): @property def specific_value(self) -> Optional[float]: + """The specific value to use as reference. Only used for windows of type `SPECIFIC_VALUE`.""" return self._specific_value @specific_value.setter @@ -271,7 +275,7 @@ def specific_value(self, specific_value: Optional[float]): @property def row_percentage(self) -> Optional[float]: - """The percentage of rows to fetch and compute the statistics on. Only used for ROLLING_TIME and ALL_TIME.""" + """The percentage of rows to fetch and compute the statistics on. Only used for windows of type `ROLLING_TIME` and `ALL_TIME`.""" return self._row_percentage @row_percentage.setter diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 4b48fb0373..ea7fc0a4d9 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -541,6 +541,7 @@ def create_feature_group( !!!note "Event time data type restriction" The supported data types for the event time column are: `timestamp`, `date` and `bigint`. 
+ stream: Optionally, Define whether the feature group should support real time stream writing capabilities. Stream enabled Feature Groups have unified single API for writing streaming features transparently to both online and offline store. @@ -673,6 +674,7 @@ def get_or_create_feature_group( !!!note "Event time data type restriction" The supported data types for the event time column are: `timestamp`, `date` and `bigint`. + stream: Optionally, Define whether the feature group should support real time stream writing capabilities. Stream enabled Feature Groups have unified single API for writing streaming features transparently to both online and offline store. @@ -801,6 +803,7 @@ def create_on_demand_feature_group( !!!note "Event time data type restriction" The supported data types for the event time column are: `timestamp`, `date` and `bigint`. + expectation_suite: Optionally, attach an expectation suite to the feature group which dataframes should be validated against upon insertion. Defaults to `None`. @@ -937,8 +940,11 @@ def create_external_feature_group( event_time: Optionally, provide the name of the feature containing the event time for the features in this feature group. If event_time is set the feature group can be used for point-in-time joins. Defaults to `None`. - !!! note "Event time data type restriction" - The supported data types for the event time column are: `timestamp`, `date` and `bigint`. + + !!! note "Event time data type restriction" + The supported data types for the event time column are: `timestamp`, `date` and `bigint`. + + expectation_suite: Optionally, attach an expectation suite to the feature group which dataframes should be validated against upon insertion. Defaults to `None`. @@ -1082,8 +1088,11 @@ def get_or_create_spine_group( list of `Feature` objects. Defaults to empty list `[]` and will use the schema information of the DataFrame resulting by executing the provided query against the data source. - !!!note "Event time data type restriction" - The supported data types for the event time column are: `timestamp`, `date` and `bigint`. + + !!!note "Event time data type restriction" + The supported data types for the event time column are: `timestamp`, `date` and `bigint`. + + dataframe: DataFrame, RDD, Ndarray, list. Spine dataframe with primary key, event time and label column to use for point in time join when fetching features. 
diff --git a/python/hsfs/split_statistics.py b/python/hsfs/split_statistics.py index aeb78af01c..95f67957b9 100644 --- a/python/hsfs/split_statistics.py +++ b/python/hsfs/split_statistics.py @@ -36,9 +36,11 @@ def __init__( self._name = name self._feature_descriptive_statistics = feature_descriptive_statistics self._feature_descriptive_statistics = [ - FeatureDescriptiveStatistics.from_response_json(fds) - if isinstance(fds, dict) - else fds + ( + FeatureDescriptiveStatistics.from_response_json(fds) + if isinstance(fds, dict) + else fds + ) for fds in feature_descriptive_statistics ] @@ -60,8 +62,10 @@ def json(self): @property def name(self): + """Name of the training dataset split.""" return self._name @property def feature_descriptive_statistics(self): + """List of feature descriptive statistics.""" return self._feature_descriptive_statistics diff --git a/python/hsfs/statistics.py b/python/hsfs/statistics.py index 28cbf804c8..3262801ff4 100644 --- a/python/hsfs/statistics.py +++ b/python/hsfs/statistics.py @@ -150,11 +150,13 @@ def __repr__(self): return f"Statistics({self._computation_time!r})" @property - def computation_time(self): + def computation_time(self) -> int: + """Time at which the statistics were computed.""" return self._computation_time @property - def row_percentage(self) -> Optional[float]: + def row_percentage(self) -> float: + """Percentage of data on which statistics were computed.""" return self._row_percentage @row_percentage.setter @@ -170,37 +172,48 @@ def row_percentage(self, row_percentage: Optional[float]): raise TypeError("Row percentage must be a float between 0 and 1.") @property - def feature_descriptive_statistics(self): + def feature_descriptive_statistics( + self, + ) -> Optional[List[FeatureDescriptiveStatistics]]: + """List of feature descriptive statistics.""" return self._feature_descriptive_statistics @property - def feature_group_id(self): + def feature_group_id(self) -> Optional[int]: + """Id of the feature group on whose data the statistics were computed.""" return self._feature_group_id @property - def feature_view_name(self): + def feature_view_name(self) -> Optional[str]: + """Name of the feature view whose query was used to retrieve the data on which the statistics were computed.""" return self._feature_view_name @property - def feature_view_version(self): + def feature_view_version(self) -> Optional[int]: + """Version of the feature view whose query was used to retrieve the data on which the statistics were computed.""" return self._feature_view_version @property def window_start_commit_time(self): + """Start time of the window of data on which statistics were computed.""" return self._window_start_commit_time @property def window_end_commit_time(self): + """End time of the window of data on which statistics were computed.""" return self._window_end_commit_time @property - def training_dataset_version(self): + def training_dataset_version(self) -> Optional[int]: + """Version of the training dataset on which statistics were computed.""" return self._training_dataset_version @property - def split_statistics(self): + def split_statistics(self) -> Optional[List[SplitStatistics]]: + """List of statistics computed on each split of a training dataset.""" return self._split_statistics @property - def before_transformation(self): + def before_transformation(self) -> bool: + """Whether or not the statistics were computed on feature values before applying model-dependent transformations.""" return self._before_transformation
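
Taken together, the docstrings in this patch describe a builder-style feature monitoring workflow: create a configuration from a feature group or feature view, attach a detection window and a reference, define the comparison, save it, and later query the configuration and its results. The sketch below strings those calls together in the same way the docstring examples do. It is a minimal illustration, assuming `fs` is an already-obtained feature store handle; the keyword arguments passed to `create_feature_monitoring` (`name`, `feature_name`) and the concrete metric, threshold, and window values are illustrative assumptions, not verified signatures.

```python
# Minimal end-to-end sketch of the feature monitoring API documented in this patch.
# Assumption: `fs` is an existing feature store handle (connection boilerplate omitted,
# as in the docstring examples); the create_feature_monitoring keyword arguments and the
# metric/threshold values below are illustrative, not an exact signature.

# Fetch your feature group or feature view
fg = fs.get_feature_group(name="my_feature_group", version=1)

# Setup feature monitoring and a detection window
my_monitoring_config = fg.create_feature_monitoring(
    name="my_monitoring_config",   # assumed argument names
    feature_name="my_feature",
).with_detection_window(
    time_offset="1d",              # statistics on the last day of data
    row_percentage=0.2,            # computed on 20% of the rows
)

# Compare against a rolling reference window, then save the configuration
my_monitoring_config.with_reference_window(
    time_offset="1w",
    window_length="1d",
).compare_on(
    metric="mean",                 # illustrative metric and threshold
    threshold=0.2,
    relative=True,
).save()

# Trigger the feature monitoring job once; this does not affect the ongoing schedule
my_monitoring_config.run_job()

# Later: fetch the registered config back by name and inspect its result history
my_monitoring_config = fg.get_feature_monitoring_configs(name="my_monitoring_config")
for result in my_monitoring_config.get_history(
    start_time="2021-01-01",
    end_time="2021-01-31",
):
    print(result.feature_name, result.difference, result.shift_detected)

# Pause the schedule without deleting the configuration
my_monitoring_config.disable()
```

Each entry returned by `get_history()` is a `FeatureMonitoringResult`, so the detection and reference statistics documented above are available on it alongside the computed difference and the shift flag.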