From f3bc5e9fdc761ff9fa6f2728dc73499d55cdb174 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Mon, 12 Feb 2024 12:33:47 +0200 Subject: [PATCH 1/9] Initial implementations for raster and vector normality test CLI functions --- eis_toolkit/cli.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index f31ba0c6..f979ca99 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -267,6 +267,48 @@ class NodataHandling(str, Enum): # --- EXPLORATORY ANALYSES --- +# NORMALITY TEST RASTER +@app.command() +def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): + """Compute Shapiro-Wilk test for normality on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import normality_test + + typer.echo("Progress: 10%") + + with rasterio.open(input_raster) as raster: + data = raster.read() + typer.echo("Progress: 25%") + + results_dict = normality_test(data) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Normality test (raster) completed") + + +# NORMALITY TEST VECTOR +@app.command() +def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): + """Compute Shapiro-Wilk test for normality on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import normality_test + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + typer.echo("Progress: 25%") + + results_dict = normality_test(geodataframe, columns) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Normality test (vector) completed") + # DBSCAN @app.command() From 1c562025a41973f6c64b1ad3453102b637ef6eb8 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 10:42:10 +0200 Subject: 
[PATCH 2/9] Added CLI functions for normality tests. Adjusted toolkit func implementations --- eis_toolkit/cli.py | 16 +++++++++------- .../exploratory_analyses/normality_test.py | 17 ++++++++++------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index a9e62631..d660591f 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -276,9 +276,9 @@ class LocalMoranWeightType(str, Enum): # NORMALITY TEST RASTER @app.command() -def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): - """Compute Shapiro-Wilk test for normality on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import normality_test +def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], bands: Optional[List[int]] = None): + """Compute Shapiro-Wilk test for normality on the input raster data.""" + from eis_toolkit.exploratory_analyses.normality_test import normality_test_array typer.echo("Progress: 10%") @@ -286,7 +286,9 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): data = raster.read() typer.echo("Progress: 25%") - results_dict = normality_test(data) + if len(bands) == 0: + bands = None + results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata) typer.echo("Progress: 75%") @@ -299,15 +301,15 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): # NORMALITY TEST VECTOR @app.command() def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): - """Compute Shapiro-Wilk test for normality on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import normality_test + """Compute Shapiro-Wilk test for normality on the input vector data.""" + from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe typer.echo("Progress: 10%") geodataframe = gpd.read_file(input_vector) 
typer.echo("Progress: 25%") - results_dict = normality_test(geodataframe, columns) + results_dict = normality_test_dataframe(geodataframe, columns) typer.echo("Progress: 75%") diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py index 4034d2b1..d3507fe7 100644 --- a/eis_toolkit/exploratory_analyses/normality_test.py +++ b/eis_toolkit/exploratory_analyses/normality_test.py @@ -59,7 +59,8 @@ def normality_test_dataframe( for column in columns: if len(data[column]) > 5000: raise SampleSizeExceededException(f"Sample size for column '{column}' exceeds the limit of 5000 samples.") - statistics[column] = shapiro(data[column]) + stat, p_value = shapiro(data[column]) + statistics[column] = {"Statistic": stat, "p-value": p_value} return statistics @@ -67,7 +68,7 @@ def normality_test_dataframe( @beartype def normality_test_array( data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None -) -> Dict[int, Tuple[float, float]]: +) -> Dict[str, Tuple[float, float]]: """ Compute Shapiro-Wilk test for normality on the input Numpy array. 
@@ -94,14 +95,14 @@ def normality_test_array( if data.ndim == 1 or data.ndim == 2: prepared_data = np.expand_dims(data, axis=0) - bands = range(1) + bands = [1] elif data.ndim == 3: if bands is not None: - if not all(band < len(data) for band in bands): + if not all(band - 1 < len(data) for band in bands): raise InvalidRasterBandException("All selected bands were not found in the input array.") else: - bands = range(len(data)) + bands = range(1, len(data) + 1) prepared_data = data else: @@ -110,7 +111,8 @@ def normality_test_array( statistics = {} for band in bands: - flattened_data = prepared_data[band].ravel() + band_idx = band - 1 + flattened_data = prepared_data[band_idx].ravel() nan_mask = flattened_data == np.nan if nodata_value is not None: @@ -121,6 +123,7 @@ def normality_test_array( if len(masked_data) > 5000: raise SampleSizeExceededException(f"Sample size for band '{band}' exceeds the limit of 5000 samples.") - statistics[band] = shapiro(masked_data) + stat, p_value = shapiro(masked_data) + statistics[f"Band {band}"] = {"Statistic": stat, "p-value": p_value} return statistics From 6cd07031668b852ada4540d4a97e1907b613fe96 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 11:10:47 +0200 Subject: [PATCH 3/9] Added CLI function for chi-square test, adjusted toolkit func --- eis_toolkit/cli.py | 29 +++++++++++++++-- .../exploratory_analyses/statistical_tests.py | 31 ++++++++++--------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index d660591f..23c90010 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -300,7 +300,7 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], # NORMALITY TEST VECTOR @app.command() -def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): +def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: Optional[List[str]] = None): """Compute 
Shapiro-Wilk test for normality on the input vector data.""" from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe @@ -309,7 +309,7 @@ def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], geodataframe = gpd.read_file(input_vector) typer.echo("Progress: 25%") - results_dict = normality_test_dataframe(geodataframe, columns) + results_dict = normality_test_dataframe(data=geodataframe, columns=columns) typer.echo("Progress: 75%") @@ -319,6 +319,31 @@ def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], typer.echo("Normality test (vector) completed") +# CHI-SQUARE_TEST +@app.command() +def chi_square_test_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + target_column: str = typer.Option(), + columns: Optional[List[str]] = None, +): + """Perform a Chi-square test of independence between a target variable and one or more other variables.""" + from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + typer.echo("Progress: 25%") + + results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Chi-square test completed") + + # DBSCAN @app.command() def dbscan_cli( diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index c7f80117..7a64b3ae 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -9,22 +9,24 @@ @beartype def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: - """Compute Chi-square test for independence on the input data. 
+ """Perform a Chi-square test of independence between a target variable and one or more other variables. - It is assumed that the variables in the input data are independent and that they are categorical, i.e. strings, - booleans or integers, but not floats. + Input data should be categorical data. Continuous data or non-categorical data should be discretized or + binned before using this function, as Chi-square tests are not applicable to continuous variables directly. + + The test assumes that the observed frequencies in each category are independent. Args: - data: Dataframe containing the input data + data: Dataframe containing the input data. target_column: Variable against which independence of other variables is tested. columns: Variables that are tested against the variable in target_column. If None, every column is used. Returns: - Test statistics for each variable (except target_column). + Test statistics, p-value and degrees of freedom for each variable. Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: The target_column is not in input Dataframe or invalid column is provided. + EmptyDataFrameException: Input Dataframe is empty. + InvalidParameterValueException: Invalid column is input. 
""" if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") @@ -32,19 +34,18 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se if not check_columns_valid(data, [target_column]): raise InvalidParameterValueException("Target column not found in the Dataframe.") - if columns is not None: + if columns: invalid_columns = [column for column in columns if column not in data.columns] - if any(invalid_columns): - raise InvalidParameterValueException(f"The following variables are not in the dataframe: {invalid_columns}") + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") else: - columns = data.columns + columns = [col for col in data.columns if col != target_column] statistics = {} for column in columns: - if column != target_column: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = (chi_square, p_value, degrees_of_freedom) + contingency_table = pd.crosstab(data[target_column], data[column]) + chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) + statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} return statistics From 4ede1878aa753dd84f686293d656f34a19ad523a Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 12:12:49 +0200 Subject: [PATCH 4/9] Added CLI functions for covariance and correlation matrices, adjusted toolkit implementations --- eis_toolkit/cli.py | 70 ++++++++++++++++++- .../exploratory_analyses/statistical_tests.py | 37 ++++++++-- 2 files changed, 99 insertions(+), 8 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 23c90010..f5bdd5ee 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -207,6 +207,14 @@ class LocalMoranWeightType(str, Enum): knn = "knn" +class CorrelationMethod(str, 
Enum): + """Correlation methods available.""" + + pearson = "pearson" + kendall = "kendall" + spearman = "spearman" + + RESAMPLING_MAPPING = { "nearest": warp.Resampling.nearest, "bilinear": warp.Resampling.bilinear, @@ -331,7 +339,7 @@ def chi_square_test_cli( typer.echo("Progress: 10%") - geodataframe = gpd.read_file(input_vector) + geodataframe = gpd.read_file(input_vector) # Should we drop geometry columns? typer.echo("Progress: 25%") results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns) @@ -344,6 +352,66 @@ def chi_square_test_cli( typer.echo("Chi-square test completed") +# CORRELATION MATRIX +@app.command() +def correlation_matrix_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + output_file: Annotated[Path, OUTPUT_FILE_OPTION], + columns: Optional[List[str]] = None, + correlation_method: CorrelationMethod = CorrelationMethod.pearson, + min_periods: Optional[int] = None, +): + """Compute correlation matrix on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + typer.echo("Progress: 25%") + + output_df = correlation_matrix( + data=dataframe, columns=columns, correlation_method=correlation_method, min_periods=min_periods + ) + + typer.echo("Progress: 75%") + + output_df.to_csv(output_file) + typer.echo("Progress: 100%") + + typer.echo("Correlation matrix completed") + + +# COVARIANCE MATRIX +@app.command() +def covariance_matrix_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + output_file: Annotated[Path, OUTPUT_FILE_OPTION], + columns: Optional[List[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, +): + """Compute covariance matrix on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import covariance_matrix + + typer.echo("Progress: 10%") + 
+ geodataframe = gpd.read_file(input_vector) + dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + typer.echo("Progress: 25%") + + output_df = covariance_matrix( + data=dataframe, columns=columns, min_periods=min_periods, delta_degrees_of_freedom=delta_degrees_of_freedom + ) + + typer.echo("Progress: 75%") + + output_df.to_csv(output_file) + typer.echo("Progress: 100%") + + typer.echo("Covariance matrix completed") + + # DBSCAN @app.command() def dbscan_cli( diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index 7a64b3ae..8fe5721f 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -1,10 +1,11 @@ +import numpy as np import pandas as pd from beartype import beartype from beartype.typing import Literal, Optional, Sequence from scipy.stats import chi2_contingency from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException -from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe +from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe @beartype @@ -53,6 +54,7 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se @beartype def correlation_matrix( data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", min_periods: Optional[int] = None, ) -> pd.DataFrame: @@ -62,6 +64,7 @@ def correlation_matrix( Args: data: Dataframe containing the input data. + columns: Columns to include in the correlation matrix. If None, all numeric columns are used. correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. 
@@ -72,12 +75,20 @@ def correlation_matrix( Raises: EmptyDataFrameException: The input Dataframe is empty. InvalidParameterValueException: min_periods argument is used with method 'kendall'. - NonNumericDataException: The input data contain non-numeric data. + NonNumericDataException: The selected columns contain non-numeric data. """ if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") - if not check_columns_numeric(data, data.columns.to_list()): + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): raise NonNumericDataException("The input data contain non-numeric data.") if correlation_method == "kendall" and min_periods is not None: @@ -85,14 +96,17 @@ def correlation_matrix( "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." ) - matrix = data.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) + matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) return matrix @beartype def covariance_matrix( - data: pd.DataFrame, min_periods: Optional[int] = None, delta_degrees_of_freedom: int = 1 + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, ) -> pd.DataFrame: """Compute covariance matrix on the input data. @@ -100,6 +114,7 @@ def covariance_matrix( Args: data: Dataframe containing the input data. + columns: Columns to include in the covariance matrix. If None, all numeric columns are used. min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. 
delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. @@ -114,7 +129,15 @@ def covariance_matrix( if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") - if not check_columns_numeric(data, data.columns.to_list()): + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): raise NonNumericDataException("The input data contain non-numeric data.") if delta_degrees_of_freedom < 0: @@ -123,6 +146,6 @@ def covariance_matrix( if min_periods and min_periods < 0: raise InvalidParameterValueException("Min perioids must be non-negative.") - matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) + matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) return matrix From a4d367018046249e79da70e248f65cc6ecad580e Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 12:39:35 +0200 Subject: [PATCH 5/9] Update tests for statistical functions, typing fixes --- eis_toolkit/exploratory_analyses/normality_test.py | 6 +++--- .../exploratory_analyses/statistical_tests.py | 6 ++++-- tests/exploratory_analyses/normality_test_test.py | 14 +++++++------- .../exploratory_analyses/statistical_tests_test.py | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py index d3507fe7..8e8fcad0 100644 --- a/eis_toolkit/exploratory_analyses/normality_test.py +++ b/eis_toolkit/exploratory_analyses/normality_test.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Dict, 
Optional, Sequence, Tuple +from beartype.typing import Dict, Optional, Sequence from scipy.stats import shapiro from eis_toolkit.exceptions import ( @@ -20,7 +20,7 @@ @beartype def normality_test_dataframe( data: pd.DataFrame, columns: Optional[Sequence[str]] = None -) -> Dict[str, Tuple[float, float]]: +) -> Dict[str, Dict[str, float]]: """ Compute Shapiro-Wilk test for normality on the input DataFrame. @@ -68,7 +68,7 @@ def normality_test_dataframe( @beartype def normality_test_array( data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None -) -> Dict[str, Tuple[float, float]]: +) -> Dict[str, Dict[str, float]]: """ Compute Shapiro-Wilk test for normality on the input Numpy array. diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index 8fe5721f..518492fe 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Literal, Optional, Sequence +from beartype.typing import Dict, Literal, Optional, Sequence from scipy.stats import chi2_contingency from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException @@ -9,7 +9,9 @@ @beartype -def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: +def chi_square_test( + data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None +) -> Dict[str, Dict[str, float]]: """Perform a Chi-square test of independence between a target variable and one or more other variables. Input data should be categorical data. 
Continuous data or non-categorical data should be discretized or diff --git a/tests/exploratory_analyses/normality_test_test.py b/tests/exploratory_analyses/normality_test_test.py index c2012578..dcf3f5b4 100644 --- a/tests/exploratory_analyses/normality_test_test.py +++ b/tests/exploratory_analyses/normality_test_test.py @@ -24,35 +24,35 @@ def test_normality_test_dataframe(): """Test that returned normality statistics for DataFrame data are correct.""" output_statistics = normality_test_dataframe(data=DATA_DF, columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.82827, 0.13502), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), [0.82827, 0.13502], decimal=5) def test_normality_test_array(): """Test that returned normality statistics for Numpy array data are correct.""" # 3D array - output_statistics = normality_test_array(data=DATA_ARRAY, bands=[0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + output_statistics = normality_test_array(data=DATA_ARRAY, bands=[1]) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) # 2D array output_statistics = normality_test_array(data=DATA_ARRAY[0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) # 1D array output_statistics = normality_test_array(data=DATA_ARRAY[0][0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.9067, 0.41504), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.9067, 0.41504], decimal=5) def test_normality_test_dataframe_missing_data(): """Test that DataFrame input with missing data returns statistics correctly.""" df_with_nan = DATA_DF.replace(3, np.nan) output_statistics = normality_test_dataframe(data=df_with_nan, 
columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.62978, 0.00124), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), (0.62978, 0.00124), decimal=5) def test_normality_test_array_nodata(): """Test that Numpy array input with missing data returns statistics correctly.""" output_statistics = normality_test_array(data=DATA_ARRAY, nodata_value=3) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) def test_invalid_selection(): diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py index 71954c0f..03c68d7e 100644 --- a/tests/exploratory_analyses/statistical_tests_test.py +++ b/tests/exploratory_analyses/statistical_tests_test.py @@ -22,7 +22,7 @@ def test_chi_square_test(): """Test that returned statistics for independence are correct.""" output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal((output_statistics["f"]), (0.0, 1.0, 1)) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) def test_correlation_matrix_nan(): @@ -56,7 +56,7 @@ def test_correlation_matrix(): def test_correlation_matrix_non_numeric(): """Test that returned correlation matrix is correct.""" with pytest.raises(NonNumericDataException): - correlation_matrix(data=non_numeric_df) + correlation_matrix(data=non_numeric_df, columns=["a", "b"]) def test_covariance_matrix_nan(): From 62fe65430e7d74db5a9e3f410375eea55a52fef6 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:32:29 +0200 Subject: [PATCH 6/9] Separated correlation matrix, covariance matrix and chi2 tests into own modules --- eis_toolkit/cli.py | 6 +- .../exploratory_analyses/chi_square_test.py | 52 ++++++ 
.../correlation_matrix.py | 57 +++++++ .../exploratory_analyses/covariance_matrix.py | 57 +++++++ .../exploratory_analyses/statistical_tests.py | 153 ------------------ tests/exploratory_analyses/chi_square_test.py | 20 +++ .../correlation_matrix_test.py | 53 ++++++ .../covariance_matrix_test.py | 56 +++++++ .../statistical_tests_test.py | 117 -------------- 9 files changed, 298 insertions(+), 273 deletions(-) create mode 100644 eis_toolkit/exploratory_analyses/chi_square_test.py create mode 100644 eis_toolkit/exploratory_analyses/correlation_matrix.py create mode 100644 eis_toolkit/exploratory_analyses/covariance_matrix.py delete mode 100644 eis_toolkit/exploratory_analyses/statistical_tests.py create mode 100644 tests/exploratory_analyses/chi_square_test.py create mode 100644 tests/exploratory_analyses/correlation_matrix_test.py create mode 100644 tests/exploratory_analyses/covariance_matrix_test.py delete mode 100644 tests/exploratory_analyses/statistical_tests_test.py diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index f5bdd5ee..ad070e98 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -335,7 +335,7 @@ def chi_square_test_cli( columns: Optional[List[str]] = None, ): """Perform a Chi-square test of independence between a target variable and one or more other variables.""" - from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test + from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test typer.echo("Progress: 10%") @@ -362,7 +362,7 @@ def correlation_matrix_cli( min_periods: Optional[int] = None, ): """Compute correlation matrix on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix + from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix typer.echo("Progress: 10%") @@ -392,7 +392,7 @@ def covariance_matrix_cli( delta_degrees_of_freedom: int = 1, ): """Compute covariance matrix on the input data.""" - from 
eis_toolkit.exploratory_analyses.statistical_tests import covariance_matrix + from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix typer.echo("Progress: 10%") diff --git a/eis_toolkit/exploratory_analyses/chi_square_test.py b/eis_toolkit/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..cf82aa25 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/chi_square_test.py @@ -0,0 +1,52 @@ +import pandas as pd +from beartype import beartype +from beartype.typing import Dict, Optional, Sequence +from scipy.stats import chi2_contingency + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException +from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe + + +@beartype +def chi_square_test( + data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None +) -> Dict[str, Dict[str, float]]: + """Perform a Chi-square test of independence between a target variable and one or more other variables. + + Input data should be categorical data. Continuous data or non-categorical data should be discretized or + binned before using this function, as Chi-square tests are not applicable to continuous variables directly. + + The test assumes that the observed frequencies in each category are independent. + + Args: + data: Dataframe containing the input data. + target_column: Variable against which independence of other variables is tested. + columns: Variables that are tested against the variable in target_column. If None, every column is used. + + Returns: + Test statistics, p-value and degrees of freedom for each variable. + + Raises: + EmptyDataFrameException: Input Dataframe is empty. + InvalidParameterValueException: Invalid column is input. 
+ """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if not check_columns_valid(data, [target_column]): + raise InvalidParameterValueException("Target column not found in the Dataframe.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + else: + columns = [col for col in data.columns if col != target_column] + + statistics = {} + for column in columns: + contingency_table = pd.crosstab(data[target_column], data[column]) + chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) + statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} + + return statistics diff --git a/eis_toolkit/exploratory_analyses/correlation_matrix.py b/eis_toolkit/exploratory_analyses/correlation_matrix.py new file mode 100644 index 00000000..bdcb79f3 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/correlation_matrix.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Literal, Optional, Sequence + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException +from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe + + +@beartype +def correlation_matrix( + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", + min_periods: Optional[int] = None, +) -> pd.DataFrame: + """Compute correlation matrix on the input data. + + It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. + + Args: + data: Dataframe containing the input data. + columns: Columns to include in the correlation matrix. If None, all numeric columns are used. 
+ correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. + min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. + + Returns: + Dataframe containing matrix representing the correlation coefficient \ + between the corresponding pair of variables. + + Raises: + EmptyDataFrameException: The input Dataframe is empty. + InvalidParameterValueException: min_periods argument is used with method 'kendall'. + NonNumericDataException: The selected columns contain non-numeric data. + """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): + raise NonNumericDataException("The input data contain non-numeric data.") + + if correlation_method == "kendall" and min_periods is not None: + raise InvalidParameterValueException( + "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." 
+ ) + + matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) + + return matrix diff --git a/eis_toolkit/exploratory_analyses/covariance_matrix.py b/eis_toolkit/exploratory_analyses/covariance_matrix.py new file mode 100644 index 00000000..25c63850 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/covariance_matrix.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Optional, Sequence + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException +from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe + + +@beartype +def covariance_matrix( + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, +) -> pd.DataFrame: + """Compute covariance matrix on the input data. + + It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. + + Args: + data: Dataframe containing the input data. + columns: Columns to include in the covariance matrix. If None, all numeric columns are used. + min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. + delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. + + Returns: + Dataframe containing matrix representing the covariance between the corresponding pair of variables. + + Raises: + EmptyDataFrameException: The input Dataframe is empty. + InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. + NonNumericDataException: The input data contain non-numeric data. 
+    """
+    if check_empty_dataframe(data):
+        raise EmptyDataFrameException("The input Dataframe is empty.")
+
+    if columns:
+        invalid_columns = [column for column in columns if column not in data.columns]
+        if invalid_columns:
+            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
+        data_subset = data[columns]
+    else:
+        data_subset = data.select_dtypes(include=np.number)
+
+    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
+        raise NonNumericDataException("The input data contain non-numeric data.")
+
+    if delta_degrees_of_freedom < 0:
+        raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.")
+
+    if min_periods and min_periods < 0:
+        raise InvalidParameterValueException("Min periods must be non-negative.")
+
+    matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
+
+    return matrix
diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py
deleted file mode 100644
index 518492fe..00000000
--- a/eis_toolkit/exploratory_analyses/statistical_tests.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import numpy as np
-import pandas as pd
-from beartype import beartype
-from beartype.typing import Dict, Literal, Optional, Sequence
-from scipy.stats import chi2_contingency
-
-from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
-from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe
-
-
-@beartype
-def chi_square_test(
-    data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None
-) -> Dict[str, Dict[str, float]]:
-    """Perform a Chi-square test of independence between a target variable and one or more other variables.
-
-    Input data should be categorical data. 
Continuous data or non-categorical data should be discretized or - binned before using this function, as Chi-square tests are not applicable to continuous variables directly. - - The test assumes that the observed frequencies in each category are independent. - - Args: - data: Dataframe containing the input data. - target_column: Variable against which independence of other variables is tested. - columns: Variables that are tested against the variable in target_column. If None, every column is used. - - Returns: - Test statistics, p-value and degrees of freedom for each variable. - - Raises: - EmptyDataFrameException: Input Dataframe is empty. - InvalidParameterValueException: Invalid column is input. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if not check_columns_valid(data, [target_column]): - raise InvalidParameterValueException("Target column not found in the Dataframe.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - else: - columns = [col for col in data.columns if col != target_column] - - statistics = {} - for column in columns: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} - - return statistics - - -@beartype -def correlation_matrix( - data: pd.DataFrame, - columns: Optional[Sequence[str]] = None, - correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", - min_periods: Optional[int] = None, -) -> pd.DataFrame: - """Compute correlation matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. 
- - Args: - data: Dataframe containing the input data. - columns: Columns to include in the correlation matrix. If None, all numeric columns are used. - correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - - Returns: - Dataframe containing matrix representing the correlation coefficient \ - between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: min_periods argument is used with method 'kendall'. - NonNumericDataException: The selected columns contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - data_subset = data[columns] - else: - data_subset = data.select_dtypes(include=np.number) - - if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): - raise NonNumericDataException("The input data contain non-numeric data.") - - if correlation_method == "kendall" and min_periods is not None: - raise InvalidParameterValueException( - "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." - ) - - matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) - - return matrix - - -@beartype -def covariance_matrix( - data: pd.DataFrame, - columns: Optional[Sequence[str]] = None, - min_periods: Optional[int] = None, - delta_degrees_of_freedom: int = 1, -) -> pd.DataFrame: - """Compute covariance matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. 
- - Args: - data: Dataframe containing the input data. - columns: Columns to include in the covariance matrix. If None, all numeric columns are used. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. - - Returns: - Dataframe containing matrix representing the covariance between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. - NonNumericDataException: The input data contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - data_subset = data[columns] - else: - data_subset = data.select_dtypes(include=np.number) - - if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): - raise NonNumericDataException("The input data contain non-numeric data.") - - if delta_degrees_of_freedom < 0: - raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.") - - if min_periods and min_periods < 0: - raise InvalidParameterValueException("Min perioids must be non-negative.") - - matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) - - return matrix diff --git a/tests/exploratory_analyses/chi_square_test.py b/tests/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..83385d47 --- /dev/null +++ b/tests/exploratory_analyses/chi_square_test.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd +import pytest + +from eis_toolkit.exceptions import InvalidParameterValueException +from 
eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test + +DATA = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) + + +def test_chi_square_test(): + """Test that returned statistics for independence are correct.""" + output_statistics = chi_square_test(data=DATA, target_column="e", columns=["f"]) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) + + +def test_invalid_target_column(): + """Test that invalid target column raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + chi_square_test(data=DATA, target_column="invalid_column") diff --git a/tests/exploratory_analyses/correlation_matrix_test.py b/tests/exploratory_analyses/correlation_matrix_test.py new file mode 100644 index 00000000..903fa48c --- /dev/null +++ b/tests/exploratory_analyses/correlation_matrix_test.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest +from beartype.roar import BeartypeCallHintParamViolation + +from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException +from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix +from tests.exploratory_analyses.covariance_matrix_test import DF, DF_NON_NUMERIC, DF_WITH_NAN + + +def test_correlation_matrix_nan(): + """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" + expected_correlation_matrix = np.array( + [ + [1.000000, -0.577350, -1.000000, 1.000000], + [-0.577350, 1.000000, np.nan, -0.577350], + [-1.000000, np.nan, 1.000000, -1.000000], + [1.000000, -0.577350, -1.000000, 1.000000], + ] + ) + output_matrix = correlation_matrix(data=DF_WITH_NAN) + np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) + + +def test_correlation_matrix(): + """Test that returned correlation matrix is correct.""" + expected_correlation_matrix = np.array( + [ + [1.000000, -0.577350, -0.904534, 1.000000], + [-0.577350, 1.000000, 0.174078, 
-0.577350],
+            [-0.904534, 0.174078, 1.000000, -0.904534],
+            [1.000000, -0.577350, -0.904534, 1.000000],
+        ]
+    )
+    output_matrix = correlation_matrix(data=DF)
+    np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)
+
+
+def test_correlation_matrix_non_numeric():
+    """Test that non-numeric data raises the correct exception."""
+    with pytest.raises(NonNumericDataException):
+        correlation_matrix(data=DF_NON_NUMERIC, columns=["a", "b"])
+
+
+def test_invalid_correlation_method():
+    """Test that invalid correlation method raises the correct exception."""
+    with pytest.raises(BeartypeCallHintParamViolation):
+        correlation_matrix(data=DF, correlation_method="invalid_method")
+
+
+def test_min_periods_with_kendall():
+    """Test that min_periods with correlation_method 'kendall' raises the correct exception."""
+    with pytest.raises(InvalidParameterValueException):
+        correlation_matrix(data=DF, correlation_method="kendall", min_periods=1)
diff --git a/tests/exploratory_analyses/covariance_matrix_test.py b/tests/exploratory_analyses/covariance_matrix_test.py
new file mode 100644
index 00000000..be69dd64
--- /dev/null
+++ b/tests/exploratory_analyses/covariance_matrix_test.py
@@ -0,0 +1,56 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from eis_toolkit.exceptions import InvalidParameterValueException
+from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix
+
+DATA = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
+DF = pd.DataFrame(DATA, columns=["a", "b", "c", "d"])
+DF_NON_NUMERIC = pd.DataFrame(
+    data=np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]),
+    columns=["a", "b", "c", "d"],
+)
+DF_WITH_NAN = pd.DataFrame(
+    data=np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]]), columns=["a", "b", "c", "d"]
+)
+
+
+def test_covariance_matrix_nan():
+    """Test that returned covariance matrix is correct, when NaN present in the dataframe."""
+    expected_covariance_matrix = np.array(
+        [
+            [1.333333, -0.333333, -1.333333, 0.666667],
+            [-0.333333, 0.25, 0, -0.166667],
+            [-1.333333, 0, 1.333333, -0.666667],
+            [0.666667, -0.166667, -0.666667, 0.333333],
+        ]
+    )
+    output_matrix = covariance_matrix(data=DF_WITH_NAN)
+    np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)
+
+
+def test_covariance_matrix():
+    """Test that returned covariance matrix is correct."""
+    expected_covariance_matrix = np.array(
+        [
+            [1.333333, -0.333333, -1.000000, 0.666667],
+            [-0.333333, 0.250000, 0.083333, -0.166667],
+            [-1.000000, 0.083333, 0.916667, -0.500000],
+            [0.666667, -0.166667, -0.500000, 0.333333],
+        ]
+    )
+    output_matrix = covariance_matrix(data=DF)
+    np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)
+
+
+def test_covariance_matrix_negative_min_periods():
+    """Test that negative min_periods value raises the correct exception."""
+    with pytest.raises(InvalidParameterValueException):
+        covariance_matrix(data=DF, min_periods=-1)
+
+
+def test_invalid_ddof():
+    """Test that invalid delta degrees of freedom raises the correct exception."""
+    with pytest.raises(InvalidParameterValueException):
+        covariance_matrix(data=DF, delta_degrees_of_freedom=-1)
diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py
deleted file mode 100644
index 03c68d7e..00000000
--- a/tests/exploratory_analyses/statistical_tests_test.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import numpy as np
-import pandas as pd
-import pytest
-from beartype.roar import BeartypeCallHintParamViolation
-
-from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException
-from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test, correlation_matrix, covariance_matrix
-
-data = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
-missing_data = np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 
2], [0, 1, 2, 1]]) -non_numeric_data = np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]) -numeric_data = pd.DataFrame(data, columns=["a", "b", "c", "d"]) -non_numeric_df = pd.DataFrame(non_numeric_data, columns=["a", "b", "c", "d"]) -missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d"]) -categorical_data = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) -target_column = "e" -np.random.seed(42) -large_data = np.random.normal(size=5001) -large_df = pd.DataFrame(large_data, columns=["a"]) - - -def test_chi_square_test(): - """Test that returned statistics for independence are correct.""" - output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) - - -def test_correlation_matrix_nan(): - """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -1.000000, 1.000000], - [-0.577350, 1.000000, np.nan, -0.577350], - [-1.000000, np.nan, 1.000000, -1.000000], - [1.000000, -0.577350, -1.000000, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix(): - """Test that returned correlation matrix is correct.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -0.904534, 1.000000], - [-0.577350, 1.000000, 0.174078, -0.577350], - [-0.904534, 0.174078, 1.000000, -0.904534], - [1.000000, -0.577350, -0.904534, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix_non_numeric(): - """Test that returned correlation matrix is correct.""" - with pytest.raises(NonNumericDataException): - 
correlation_matrix(data=non_numeric_df, columns=["a", "b"]) - - -def test_covariance_matrix_nan(): - """Test that returned covariance matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.333333, -0.333333, -1.333333, 0.666667], - [-0.333333, 0.25, 0, -0.166667], - [-1.333333, 0, 1.333333, -0.666667], - [0.666667, -0.166667, -0.666667, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_covariance_matrix(): - """Test that returned covariance matrix is correct.""" - expected_covariance_matrix = np.array( - [ - [1.333333, -0.333333, -1.000000, 0.666667], - [-0.333333, 0.250000, 0.083333, -0.166667], - [-1.000000, 0.083333, 0.916667, -0.500000], - [0.666667, -0.166667, -0.500000, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix) - - -def test_covariance_matrix_negative_min_periods(): - """Test that negative min_periods value raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, min_periods=-1) - - -def test_invalid_target_column(): - """Test that invalid target column raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - chi_square_test(data=categorical_data, target_column="invalid_column") - - -def test_invalid_correlation_method(): - """Test that invalid correlation method raises the correct exception.""" - with pytest.raises(BeartypeCallHintParamViolation): - correlation_matrix(data=numeric_data, correlation_method="invalid_method") - - -def test_min_periods_with_kendall(): - """Test that min_periods with correlation_method 'kendall' raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - correlation_matrix(data=numeric_data, 
correlation_method="kendall", min_periods=1) - - -def test_invalid_ddof(): - """Test that invalid delta degrees of freedom raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, delta_degrees_of_freedom=-1) From 6093734fd3b2e4c181cbd59ed0a998c45e0fe8d7 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:37:28 +0200 Subject: [PATCH 7/9] Add docs --- docs/exploratory_analyses/chi_square_test.md | 3 +++ docs/exploratory_analyses/correlation_matrix.md | 0 docs/exploratory_analyses/covariance_matrix.md | 3 +++ docs/exploratory_analyses/statistical_testing.md | 3 --- 4 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 docs/exploratory_analyses/chi_square_test.md create mode 100644 docs/exploratory_analyses/correlation_matrix.md create mode 100644 docs/exploratory_analyses/covariance_matrix.md delete mode 100644 docs/exploratory_analyses/statistical_testing.md diff --git a/docs/exploratory_analyses/chi_square_test.md b/docs/exploratory_analyses/chi_square_test.md new file mode 100644 index 00000000..52e00339 --- /dev/null +++ b/docs/exploratory_analyses/chi_square_test.md @@ -0,0 +1,3 @@ +# Chi-square test + +::: eis_toolkit.exploratory_analyses.chi_square_test diff --git a/docs/exploratory_analyses/correlation_matrix.md b/docs/exploratory_analyses/correlation_matrix.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/exploratory_analyses/covariance_matrix.md b/docs/exploratory_analyses/covariance_matrix.md new file mode 100644 index 00000000..12385763 --- /dev/null +++ b/docs/exploratory_analyses/covariance_matrix.md @@ -0,0 +1,3 @@ +# Covariance matrix + +::: eis_toolkit.exploratory_analyses.covariance_matrix diff --git a/docs/exploratory_analyses/statistical_testing.md b/docs/exploratory_analyses/statistical_testing.md deleted file mode 100644 index 04df277a..00000000 --- a/docs/exploratory_analyses/statistical_testing.md +++ /dev/null @@ -1,3 +0,0 @@ 
-# Statistical (hypothesis) testing
-
-::: eis_toolkit.exploratory_analyses.statistical_tests

From 68e00628af14ed426e61ad378a02f7521d0a1403 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Thu, 29 Feb 2024 09:41:21 +0200
Subject: [PATCH 8/9] minor update

---
 docs/exploratory_analyses/correlation_matrix.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/exploratory_analyses/correlation_matrix.md b/docs/exploratory_analyses/correlation_matrix.md
index e69de29b..b86109e1 100644
--- a/docs/exploratory_analyses/correlation_matrix.md
+++ b/docs/exploratory_analyses/correlation_matrix.md
@@ -0,0 +1,3 @@
+# Correlation matrix
+
+::: eis_toolkit.exploratory_analyses.correlation_matrix

From 19c7229bfecbf99078ffc5079c431f079c37a132 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Mon, 4 Mar 2024 13:45:50 +0200
Subject: [PATCH 9/9] fix: normality test filters numeric columns instead of error, fixed case where CLI function does not compute anything if columns param is left empty

---
 eis_toolkit/cli.py                                 |  2 +-
 eis_toolkit/exploratory_analyses/normality_test.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index ad070e98..7c42f3b3 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -293,7 +293,7 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION],
     with rasterio.open(input_raster) as raster:
         data = raster.read()
         typer.echo("Progress: 25%")
-
+
     if len(bands) == 0:
         bands = None
     results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata)
diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py
index 8e8fcad0..26d0fb79 100644
--- a/eis_toolkit/exploratory_analyses/normality_test.py
+++ b/eis_toolkit/exploratory_analyses/normality_test.py
@@ -36,13 +36,13 @@ def normality_test_dataframe(
 
     Raises:
         EmptyDataException: The input data is empty.
InvalidColumnException: All selected columns were not found in the input data. - NonNumericDataException: Selected data or columns contains non-numeric data. + NonNumericDataException: Selected columns contain non-numeric data or no numeric columns were found. SampleSizeExceededException: Input data exceeds the maximum of 5000 samples. """ if check_empty_dataframe(data): raise EmptyDataException("The input Dataframe is empty.") - if columns is not None: + if columns is not None and columns != []: if not check_columns_valid(data, columns): raise InvalidColumnException("All selected columns were not found in the input DataFrame.") if not check_columns_numeric(data, columns): @@ -51,9 +51,9 @@ def normality_test_dataframe( data = data[columns].dropna() else: - if not check_columns_numeric(data, data.columns): - raise NonNumericDataException("The input data contain non-numeric data.") - columns = data.columns + columns = data.select_dtypes(include=[np.number]).columns + if len(columns) == 0: + raise NonNumericDataException("No numeric columns were found.") statistics = {} for column in columns: