class CorrelationMethod(str, Enum):
    """Enumeration of the correlation methods accepted by the correlation-matrix CLI."""

    pearson = "pearson"
    kendall = "kendall"
    spearman = "spearman"
"""Compute Shapiro-Wilk test for normality on the input raster data.""" + from eis_toolkit.exploratory_analyses.normality_test import normality_test_array + + typer.echo("Progress: 10%") + + with rasterio.open(input_raster) as raster: + data = raster.read() + typer.echo("Progress: 25%") + print(bands) + if len(bands) == 0: + bands = None + results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Normality test (raster) completed") + + +# NORMALITY TEST VECTOR +@app.command() +def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: Optional[List[str]] = None): + """Compute Shapiro-Wilk test for normality on the input vector data.""" + from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + typer.echo("Progress: 25%") + + results_dict = normality_test_dataframe(data=geodataframe, columns=columns) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Normality test (vector) completed") + + +# CHI-SQUARE_TEST +@app.command() +def chi_square_test_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + target_column: str = typer.Option(), + columns: Optional[List[str]] = None, +): + """Perform a Chi-square test of independence between a target variable and one or more other variables.""" + from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) # Should we drop geometry columns? 
# CHI-SQUARE TEST
@app.command()
def chi_square_test_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    target_column: str = typer.Option(),
    columns: Optional[List[str]] = None,
):
    """Perform a Chi-square test of independence between a target variable and one or more other variables."""
    from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test

    typer.echo("Progress: 10%")

    geodataframe = gpd.read_file(input_vector)
    # Drop the geometry column so that, when `columns` is omitted, the test is not
    # attempted on geometry objects (consistent with the matrix commands below).
    dataframe = pd.DataFrame(geodataframe.drop(columns="geometry"))
    typer.echo("Progress: 25%")

    results_dict = chi_square_test(data=dataframe, target_column=target_column, columns=columns)

    typer.echo("Progress: 75%")

    json_str = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo("Chi-square test completed")


# CORRELATION MATRIX
@app.command()
def correlation_matrix_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_file: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
    correlation_method: CorrelationMethod = CorrelationMethod.pearson,
    min_periods: Optional[int] = None,
):
    """Compute correlation matrix on the input data."""
    from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix

    typer.echo("Progress: 10%")

    geodataframe = gpd.read_file(input_vector)
    dataframe = pd.DataFrame(geodataframe.drop(columns="geometry"))
    typer.echo("Progress: 25%")

    output_df = correlation_matrix(
        data=dataframe, columns=columns, correlation_method=correlation_method, min_periods=min_periods
    )

    typer.echo("Progress: 75%")

    output_df.to_csv(output_file)
    typer.echo("Progress: 100%")

    typer.echo("Correlation matrix completed")


# COVARIANCE MATRIX
@app.command()
def covariance_matrix_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_file: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
    min_periods: Optional[int] = None,
    delta_degrees_of_freedom: int = 1,
):
    """Compute covariance matrix on the input data."""
    from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix

    typer.echo("Progress: 10%")

    geodataframe = gpd.read_file(input_vector)
    dataframe = pd.DataFrame(geodataframe.drop(columns="geometry"))
    typer.echo("Progress: 25%")

    output_df = covariance_matrix(
        data=dataframe, columns=columns, min_periods=min_periods, delta_degrees_of_freedom=delta_degrees_of_freedom
    )

    typer.echo("Progress: 75%")

    output_df.to_csv(output_file)
    typer.echo("Progress: 100%")

    typer.echo("Covariance matrix completed")
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Optional, Sequence
from scipy.stats import chi2_contingency

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException
from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe


@beartype
def chi_square_test(
    data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None
) -> Dict[str, Dict[str, float]]:
    """Perform a Chi-square test of independence between a target variable and one or more other variables.

    The input is expected to be categorical. Continuous or otherwise non-categorical data
    should be discretized/binned first, since the Chi-square test does not apply directly
    to continuous variables. The test assumes the observed frequencies in each category
    are independent.

    Args:
        data: Dataframe containing the input data.
        target_column: Variable against which independence of other variables is tested.
        columns: Variables that are tested against the variable in target_column. If None, every column is used.

    Returns:
        Test statistics, p-value and degrees of freedom for each variable.

    Raises:
        EmptyDataFrameException: Input Dataframe is empty.
        InvalidParameterValueException: Invalid column is input.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    if not check_columns_valid(data, [target_column]):
        raise InvalidParameterValueException("Target column not found in the Dataframe.")

    if columns:
        invalid_columns = [name for name in columns if name not in data.columns]
        if invalid_columns:
            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
        tested_columns = columns
    else:
        # Default: test every column except the target itself.
        tested_columns = [name for name in data.columns if name != target_column]

    statistics = {}
    for name in tested_columns:
        table = pd.crosstab(data[target_column], data[name])
        chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(table)
        statistics[name] = {
            "chi_square": chi_square,
            "p-value": p_value,
            "degrees_of_freedom": degrees_of_freedom,
        }

    return statistics
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe


@beartype
def correlation_matrix(
    data: pd.DataFrame,
    columns: Optional[Sequence[str]] = None,
    correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson",
    min_periods: Optional[int] = None,
) -> pd.DataFrame:
    """Compute correlation matrix on the input data.

    The data is expected to be numeric (integers or floats); NaN values are ignored
    in the computation.

    Args:
        data: Dataframe containing the input data.
        columns: Columns to include in the correlation matrix. If None, all numeric columns are used.
        correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'.
        min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.

    Returns:
        Dataframe whose entries are the correlation coefficients between the
        corresponding pairs of variables.

    Raises:
        EmptyDataFrameException: The input Dataframe is empty.
        InvalidParameterValueException: min_periods argument is used with method 'kendall'.
        NonNumericDataException: The selected columns contain non-numeric data.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    if columns:
        missing = [name for name in columns if name not in data.columns]
        if missing:
            raise InvalidParameterValueException(f"Invalid columns: {missing}")
        subset = data[columns]
    else:
        subset = data.select_dtypes(include=np.number)

    # Every selected column must carry a numeric dtype.
    if not all(np.issubdtype(dtype, np.number) for dtype in subset.dtypes):
        raise NonNumericDataException("The input data contain non-numeric data.")

    if correlation_method == "kendall" and min_periods is not None:
        raise InvalidParameterValueException(
            "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
        )

    return subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True)
+ """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): + raise NonNumericDataException("The input data contain non-numeric data.") + + if delta_degrees_of_freedom < 0: + raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.") + + if min_periods and min_periods < 0: + raise InvalidParameterValueException("Min perioids must be non-negative.") + + matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) + + return matrix diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py index 4034d2b1..26d0fb79 100644 --- a/eis_toolkit/exploratory_analyses/normality_test.py +++ b/eis_toolkit/exploratory_analyses/normality_test.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Dict, Optional, Sequence, Tuple +from beartype.typing import Dict, Optional, Sequence from scipy.stats import shapiro from eis_toolkit.exceptions import ( @@ -20,7 +20,7 @@ @beartype def normality_test_dataframe( data: pd.DataFrame, columns: Optional[Sequence[str]] = None -) -> Dict[str, Tuple[float, float]]: +) -> Dict[str, Dict[str, float]]: """ Compute Shapiro-Wilk test for normality on the input DataFrame. @@ -36,13 +36,13 @@ def normality_test_dataframe( Raises: EmptyDataException: The input data is empty. InvalidColumnException: All selected columns were not found in the input data. - NonNumericDataException: Selected data or columns contains non-numeric data. 
+ NonNumericDataException: Selected columns contain non-numeric data or no numeric columns were found. SampleSizeExceededException: Input data exceeds the maximum of 5000 samples. """ if check_empty_dataframe(data): raise EmptyDataException("The input Dataframe is empty.") - if columns is not None: + if columns is not None and columns != []: if not check_columns_valid(data, columns): raise InvalidColumnException("All selected columns were not found in the input DataFrame.") if not check_columns_numeric(data, columns): @@ -51,15 +51,16 @@ def normality_test_dataframe( data = data[columns].dropna() else: - if not check_columns_numeric(data, data.columns): - raise NonNumericDataException("The input data contain non-numeric data.") - columns = data.columns + columns = data.select_dtypes(include=[np.number]).columns + if len(columns) == 0: + raise NonNumericDataException("No numeric columns were found.") statistics = {} for column in columns: if len(data[column]) > 5000: raise SampleSizeExceededException(f"Sample size for column '{column}' exceeds the limit of 5000 samples.") - statistics[column] = shapiro(data[column]) + stat, p_value = shapiro(data[column]) + statistics[column] = {"Statistic": stat, "p-value": p_value} return statistics @@ -67,7 +68,7 @@ def normality_test_dataframe( @beartype def normality_test_array( data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None -) -> Dict[int, Tuple[float, float]]: +) -> Dict[str, Dict[str, float]]: """ Compute Shapiro-Wilk test for normality on the input Numpy array. 
@@ -94,14 +95,14 @@ def normality_test_array( if data.ndim == 1 or data.ndim == 2: prepared_data = np.expand_dims(data, axis=0) - bands = range(1) + bands = [1] elif data.ndim == 3: if bands is not None: - if not all(band < len(data) for band in bands): + if not all(band - 1 < len(data) for band in bands): raise InvalidRasterBandException("All selected bands were not found in the input array.") else: - bands = range(len(data)) + bands = range(1, len(data) + 1) prepared_data = data else: @@ -110,7 +111,8 @@ def normality_test_array( statistics = {} for band in bands: - flattened_data = prepared_data[band].ravel() + band_idx = band - 1 + flattened_data = prepared_data[band_idx].ravel() nan_mask = flattened_data == np.nan if nodata_value is not None: @@ -121,6 +123,7 @@ def normality_test_array( if len(masked_data) > 5000: raise SampleSizeExceededException(f"Sample size for band '{band}' exceeds the limit of 5000 samples.") - statistics[band] = shapiro(masked_data) + stat, p_value = shapiro(masked_data) + statistics[f"Band {band}"] = {"Statistic": stat, "p-value": p_value} return statistics diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py deleted file mode 100644 index c7f80117..00000000 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ /dev/null @@ -1,127 +0,0 @@ -import pandas as pd -from beartype import beartype -from beartype.typing import Literal, Optional, Sequence -from scipy.stats import chi2_contingency - -from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException -from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe - - -@beartype -def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: - """Compute Chi-square test for independence on the input data. 
- - It is assumed that the variables in the input data are independent and that they are categorical, i.e. strings, - booleans or integers, but not floats. - - Args: - data: Dataframe containing the input data - target_column: Variable against which independence of other variables is tested. - columns: Variables that are tested against the variable in target_column. If None, every column is used. - - Returns: - Test statistics for each variable (except target_column). - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: The target_column is not in input Dataframe or invalid column is provided. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if not check_columns_valid(data, [target_column]): - raise InvalidParameterValueException("Target column not found in the Dataframe.") - - if columns is not None: - invalid_columns = [column for column in columns if column not in data.columns] - if any(invalid_columns): - raise InvalidParameterValueException(f"The following variables are not in the dataframe: {invalid_columns}") - else: - columns = data.columns - - statistics = {} - for column in columns: - if column != target_column: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = (chi_square, p_value, degrees_of_freedom) - - return statistics - - -@beartype -def correlation_matrix( - data: pd.DataFrame, - correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", - min_periods: Optional[int] = None, -) -> pd.DataFrame: - """Compute correlation matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. - - Args: - data: Dataframe containing the input data. - correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. 
- min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - - Returns: - Dataframe containing matrix representing the correlation coefficient \ - between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: min_periods argument is used with method 'kendall'. - NonNumericDataException: The input data contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if not check_columns_numeric(data, data.columns.to_list()): - raise NonNumericDataException("The input data contain non-numeric data.") - - if correlation_method == "kendall" and min_periods is not None: - raise InvalidParameterValueException( - "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." - ) - - matrix = data.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) - - return matrix - - -@beartype -def covariance_matrix( - data: pd.DataFrame, min_periods: Optional[int] = None, delta_degrees_of_freedom: int = 1 -) -> pd.DataFrame: - """Compute covariance matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. - - Args: - data: Dataframe containing the input data. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. - - Returns: - Dataframe containing matrix representing the covariance between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. - NonNumericDataException: The input data contain non-numeric data. 
- """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if not check_columns_numeric(data, data.columns.to_list()): - raise NonNumericDataException("The input data contain non-numeric data.") - - if delta_degrees_of_freedom < 0: - raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.") - - if min_periods and min_periods < 0: - raise InvalidParameterValueException("Min perioids must be non-negative.") - - matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) - - return matrix diff --git a/tests/exploratory_analyses/chi_square_test.py b/tests/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..83385d47 --- /dev/null +++ b/tests/exploratory_analyses/chi_square_test.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd +import pytest + +from eis_toolkit.exceptions import InvalidParameterValueException +from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test + +DATA = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) + + +def test_chi_square_test(): + """Test that returned statistics for independence are correct.""" + output_statistics = chi_square_test(data=DATA, target_column="e", columns=["f"]) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) + + +def test_invalid_target_column(): + """Test that invalid target column raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + chi_square_test(data=DATA, target_column="invalid_column") diff --git a/tests/exploratory_analyses/correlation_matrix_test.py b/tests/exploratory_analyses/correlation_matrix_test.py new file mode 100644 index 00000000..903fa48c --- /dev/null +++ b/tests/exploratory_analyses/correlation_matrix_test.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest +from beartype.roar import BeartypeCallHintParamViolation + +from eis_toolkit.exceptions import 
import numpy as np
import pytest
from beartype.roar import BeartypeCallHintParamViolation

from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException
from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix
from tests.exploratory_analyses.covariance_matrix_test import DF, DF_NON_NUMERIC, DF_WITH_NAN


def test_correlation_matrix_nan():
    """Test that returned correlation matrix is correct, when NaN present in the dataframe."""
    expected_correlation_matrix = np.array(
        [
            [1.000000, -0.577350, -1.000000, 1.000000],
            [-0.577350, 1.000000, np.nan, -0.577350],
            [-1.000000, np.nan, 1.000000, -1.000000],
            [1.000000, -0.577350, -1.000000, 1.000000],
        ]
    )
    output_matrix = correlation_matrix(data=DF_WITH_NAN)
    np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)


def test_correlation_matrix():
    """Test that returned correlation matrix is correct."""
    expected_correlation_matrix = np.array(
        [
            [1.000000, -0.577350, -0.904534, 1.000000],
            [-0.577350, 1.000000, 0.174078, -0.577350],
            [-0.904534, 0.174078, 1.000000, -0.904534],
            [1.000000, -0.577350, -0.904534, 1.000000],
        ]
    )
    output_matrix = correlation_matrix(data=DF)
    np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)


def test_correlation_matrix_non_numeric():
    # Fixed copy-pasted docstring: this test checks the non-numeric error path,
    # not the matrix contents.
    """Test that selecting non-numeric columns raises the correct exception."""
    with pytest.raises(NonNumericDataException):
        correlation_matrix(data=DF_NON_NUMERIC, columns=["a", "b"])


def test_invalid_correlation_method():
    """Test that invalid correlation method raises the correct exception."""
    with pytest.raises(BeartypeCallHintParamViolation):
        correlation_matrix(data=DF, correlation_method="invalid_method")


def test_min_periods_with_kendall():
    """Test that min_periods with correlation_method 'kendall' raises the correct exception."""
    with pytest.raises(InvalidParameterValueException):
        correlation_matrix(data=DF, correlation_method="kendall", min_periods=1)
import numpy as np
import pandas as pd
import pytest

from eis_toolkit.exceptions import InvalidParameterValueException
from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix

# These fixtures are also imported by correlation_matrix_test.py — keep the names stable.
DATA = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
DF = pd.DataFrame(DATA, columns=["a", "b", "c", "d"])
DF_NON_NUMERIC = pd.DataFrame(
    data=np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]),
    columns=["a", "b", "c", "d"],
)
DF_WITH_NAN = pd.DataFrame(
    data=np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]]), columns=["a", "b", "c", "d"]
)


def test_covariance_matrix_nan():
    """Test that returned covariance matrix is correct, when NaN present in the dataframe."""
    # Renamed from `expected_correlation_matrix` — this is a covariance expectation.
    expected_covariance_matrix = np.array(
        [
            [1.333333, -0.333333, -1.333333, 0.666667],
            [-0.333333, 0.25, 0, -0.166667],
            [-1.333333, 0, 1.333333, -0.666667],
            [0.666667, -0.166667, -0.666667, 0.333333],
        ]
    )
    output_matrix = covariance_matrix(data=DF_WITH_NAN)
    np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)


def test_covariance_matrix():
    """Test that returned covariance matrix is correct."""
    expected_covariance_matrix = np.array(
        [
            [1.333333, -0.333333, -1.000000, 0.666667],
            [-0.333333, 0.250000, 0.083333, -0.166667],
            [-1.000000, 0.083333, 0.916667, -0.500000],
            [0.666667, -0.166667, -0.500000, 0.333333],
        ]
    )
    output_matrix = covariance_matrix(data=DF)
    np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)


def test_covariance_matrix_negative_min_periods():
    """Test that negative min_periods value raises the correct exception."""
    with pytest.raises(InvalidParameterValueException):
        covariance_matrix(data=DF, min_periods=-1)


def test_invalid_ddof():
    """Test that invalid delta degrees of freedom raises the correct exception."""
    with pytest.raises(InvalidParameterValueException):
        covariance_matrix(data=DF, delta_degrees_of_freedom=-1)
that DataFrame input with missing data returns statistics correctly.""" df_with_nan = DATA_DF.replace(3, np.nan) output_statistics = normality_test_dataframe(data=df_with_nan, columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.62978, 0.00124), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), (0.62978, 0.00124), decimal=5) def test_normality_test_array_nodata(): """Test that Numpy array input with missing data returns statistics correctly.""" output_statistics = normality_test_array(data=DATA_ARRAY, nodata_value=3) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) def test_invalid_selection(): diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py deleted file mode 100644 index 71954c0f..00000000 --- a/tests/exploratory_analyses/statistical_tests_test.py +++ /dev/null @@ -1,117 +0,0 @@ -import numpy as np -import pandas as pd -import pytest -from beartype.roar import BeartypeCallHintParamViolation - -from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException -from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test, correlation_matrix, covariance_matrix - -data = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]]) -missing_data = np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]]) -non_numeric_data = np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]) -numeric_data = pd.DataFrame(data, columns=["a", "b", "c", "d"]) -non_numeric_df = pd.DataFrame(non_numeric_data, columns=["a", "b", "c", "d"]) -missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d"]) -categorical_data = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) 
-target_column = "e" -np.random.seed(42) -large_data = np.random.normal(size=5001) -large_df = pd.DataFrame(large_data, columns=["a"]) - - -def test_chi_square_test(): - """Test that returned statistics for independence are correct.""" - output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal((output_statistics["f"]), (0.0, 1.0, 1)) - - -def test_correlation_matrix_nan(): - """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -1.000000, 1.000000], - [-0.577350, 1.000000, np.nan, -0.577350], - [-1.000000, np.nan, 1.000000, -1.000000], - [1.000000, -0.577350, -1.000000, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix(): - """Test that returned correlation matrix is correct.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -0.904534, 1.000000], - [-0.577350, 1.000000, 0.174078, -0.577350], - [-0.904534, 0.174078, 1.000000, -0.904534], - [1.000000, -0.577350, -0.904534, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix_non_numeric(): - """Test that returned correlation matrix is correct.""" - with pytest.raises(NonNumericDataException): - correlation_matrix(data=non_numeric_df) - - -def test_covariance_matrix_nan(): - """Test that returned covariance matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.333333, -0.333333, -1.333333, 0.666667], - [-0.333333, 0.25, 0, -0.166667], - [-1.333333, 0, 1.333333, -0.666667], - [0.666667, -0.166667, -0.666667, 0.333333], - ] - ) - output_matrix = 
covariance_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_covariance_matrix(): - """Test that returned covariance matrix is correct.""" - expected_covariance_matrix = np.array( - [ - [1.333333, -0.333333, -1.000000, 0.666667], - [-0.333333, 0.250000, 0.083333, -0.166667], - [-1.000000, 0.083333, 0.916667, -0.500000], - [0.666667, -0.166667, -0.500000, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix) - - -def test_covariance_matrix_negative_min_periods(): - """Test that negative min_periods value raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, min_periods=-1) - - -def test_invalid_target_column(): - """Test that invalid target column raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - chi_square_test(data=categorical_data, target_column="invalid_column") - - -def test_invalid_correlation_method(): - """Test that invalid correlation method raises the correct exception.""" - with pytest.raises(BeartypeCallHintParamViolation): - correlation_matrix(data=numeric_data, correlation_method="invalid_method") - - -def test_min_periods_with_kendall(): - """Test that min_periods with correlation_method 'kendall' raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - correlation_matrix(data=numeric_data, correlation_method="kendall", min_periods=1) - - -def test_invalid_ddof(): - """Test that invalid delta degrees of freedom raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, delta_degrees_of_freedom=-1)