Skip to content

Commit

Permalink
Merge pull request #337 from GispoCoding/321-add-cli-functions-for-missing-statistical-tests
Browse files Browse the repository at this point in the history

321 add CLI functions for missing statistical tests
  • Loading branch information
nmaarnio committed Mar 4, 2024
2 parents 5813e55 + 19c7229 commit 9e1639e
Show file tree
Hide file tree
Showing 15 changed files with 465 additions and 268 deletions.
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/chi_square_test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Chi-square test

::: eis_toolkit.exploratory_analyses.chi_square_test
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/correlation_matrix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Correlation matrix

::: eis_toolkit.exploratory_analyses.correlation_matrix
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/covariance_matrix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Covariance matrix

::: eis_toolkit.exploratory_analyses.covariance_matrix
3 changes: 0 additions & 3 deletions docs/exploratory_analyses/statistical_testing.md

This file was deleted.

137 changes: 137 additions & 0 deletions eis_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,14 @@ class LocalMoranWeightType(str, Enum):
knn = "knn"


class CorrelationMethod(str, Enum):
    """Correlation methods selectable from the command line.

    Subclasses str so Typer can parse the choice directly and the member can
    be passed onward wherever a plain method-name string is expected.
    """

    pearson = "pearson"
    kendall = "kendall"
    spearman = "spearman"


RESAMPLING_MAPPING = {
"nearest": warp.Resampling.nearest,
"bilinear": warp.Resampling.bilinear,
Expand Down Expand Up @@ -295,6 +303,135 @@ class LocalMoranWeightType(str, Enum):

# --- EXPLORATORY ANALYSES ---

# NORMALITY TEST RASTER
@app.command()
def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], bands: Optional[List[int]] = None):
    """Compute Shapiro-Wilk test for normality on the input raster data."""
    from eis_toolkit.exploratory_analyses.normality_test import normality_test_array

    typer.echo("Progress: 10%")

    with rasterio.open(input_raster) as raster:
        data = raster.read()
        typer.echo("Progress: 25%")
        # Typer yields an empty list when --bands is not given; the toolkit
        # expects None to mean "test all bands". The truthiness check also
        # covers an actual None default, which the previous len() call would
        # have crashed on. (Leftover debug print(bands) removed.)
        if not bands:
            bands = None
        results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata)

    typer.echo("Progress: 75%")

    json_str = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo("Normality test (raster) completed")


# NORMALITY TEST VECTOR
@app.command()
def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: Optional[List[str]] = None):
    """Compute Shapiro-Wilk test for normality on the input vector data."""
    from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe

    typer.echo("Progress: 10%")

    geodataframe = gpd.read_file(input_vector)
    typer.echo("Progress: 25%")

    # Typer yields an empty list when --columns is not given; normalize it to
    # None so every column is tested — same handling as the raster command.
    if not columns:
        columns = None

    results_dict = normality_test_dataframe(data=geodataframe, columns=columns)

    typer.echo("Progress: 75%")

    json_str = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo("Normality test (vector) completed")


# CHI-SQUARE_TEST
@app.command()
def chi_square_test_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    target_column: str = typer.Option(),
    columns: Optional[List[str]] = None,
):
    """Perform a Chi-square test of independence between a target variable and one or more other variables."""
    from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test

    typer.echo("Progress: 10%")

    # NOTE(review): the geometry column is kept here — confirm whether it
    # should be dropped before running the test.
    geodataframe = gpd.read_file(input_vector)
    typer.echo("Progress: 25%")

    results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns)
    typer.echo("Progress: 75%")

    results_json = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {results_json}")
    typer.echo("Chi-square test completed")


# CORRELATION MATRIX
@app.command()
def correlation_matrix_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_file: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
    correlation_method: CorrelationMethod = CorrelationMethod.pearson,
    min_periods: Optional[int] = None,
):
    """Compute correlation matrix on the input data."""
    from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix

    typer.echo("Progress: 10%")

    # Attribute table only — the geometry column cannot take part in correlations.
    dataframe = pd.DataFrame(gpd.read_file(input_vector).drop(columns="geometry"))
    typer.echo("Progress: 25%")

    result = correlation_matrix(
        data=dataframe,
        columns=columns,
        correlation_method=correlation_method,
        min_periods=min_periods,
    )
    typer.echo("Progress: 75%")

    result.to_csv(output_file)
    typer.echo("Progress: 100%")

    typer.echo("Correlation matrix completed")


# COVARIANCE MATRIX
@app.command()
def covariance_matrix_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_file: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
    min_periods: Optional[int] = None,
    delta_degrees_of_freedom: int = 1,
):
    """Compute covariance matrix on the input data."""
    from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix

    typer.echo("Progress: 10%")

    # Attribute table only — the geometry column cannot take part in covariances.
    dataframe = pd.DataFrame(gpd.read_file(input_vector).drop(columns="geometry"))
    typer.echo("Progress: 25%")

    result = covariance_matrix(
        data=dataframe,
        columns=columns,
        min_periods=min_periods,
        delta_degrees_of_freedom=delta_degrees_of_freedom,
    )
    typer.echo("Progress: 75%")

    result.to_csv(output_file)
    typer.echo("Progress: 100%")

    typer.echo("Covariance matrix completed")


# DBSCAN
@app.command()
Expand Down
52 changes: 52 additions & 0 deletions eis_toolkit/exploratory_analyses/chi_square_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Optional, Sequence
from scipy.stats import chi2_contingency

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException
from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe


@beartype
def chi_square_test(
    data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None
) -> Dict[str, Dict[str, float]]:
    """Perform a Chi-square test of independence between a target variable and one or more other variables.

    Input data should be categorical data. Continuous data or non-categorical data should be discretized or
    binned before using this function, as Chi-square tests are not applicable to continuous variables directly.
    The test assumes that the observed frequencies in each category are independent.

    Args:
        data: Dataframe containing the input data.
        target_column: Variable against which independence of other variables is tested.
        columns: Variables that are tested against the variable in target_column. If None, every column is used.

    Returns:
        Test statistics, p-value and degrees of freedom for each variable.

    Raises:
        EmptyDataFrameException: Input Dataframe is empty.
        InvalidParameterValueException: Invalid column is input.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    if not check_columns_valid(data, [target_column]):
        raise InvalidParameterValueException("Target column not found in the Dataframe.")

    # Determine which variables to test: the explicit selection (validated
    # against the frame) or everything except the target.
    if columns:
        missing = [name for name in columns if name not in data.columns]
        if missing:
            raise InvalidParameterValueException(f"Invalid columns: {missing}")
        test_columns = list(columns)
    else:
        test_columns = [name for name in data.columns if name != target_column]

    statistics = {}
    for name in test_columns:
        contingency_table = pd.crosstab(data[target_column], data[name])
        chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table)
        statistics[name] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom}

    return statistics
57 changes: 57 additions & 0 deletions eis_toolkit/exploratory_analyses/correlation_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe


@beartype
def correlation_matrix(
    data: pd.DataFrame,
    columns: Optional[Sequence[str]] = None,
    correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson",
    min_periods: Optional[int] = None,
) -> pd.DataFrame:
    """Compute correlation matrix on the input data.

    It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.

    Args:
        data: Dataframe containing the input data.
        columns: Columns to include in the correlation matrix. If None, all numeric columns are used.
        correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'.
        min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.

    Returns:
        Dataframe containing matrix representing the correlation coefficient \
            between the corresponding pair of variables.

    Raises:
        EmptyDataFrameException: The input Dataframe is empty.
        InvalidParameterValueException: min_periods argument is used with method 'kendall'.
        NonNumericDataException: The selected columns contain non-numeric data.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    # Select the working columns: an explicit, validated list, or every
    # numeric column of the frame.
    if columns:
        unknown = [name for name in columns if name not in data.columns]
        if unknown:
            raise InvalidParameterValueException(f"Invalid columns: {unknown}")
        subset = data[columns]
    else:
        subset = data.select_dtypes(include=np.number)

    if not subset.dtypes.apply(lambda dtype: np.issubdtype(dtype, np.number)).all():
        raise NonNumericDataException("The input data contain non-numeric data.")

    # pandas' kendall implementation ignores min_periods, so reject the combination.
    if min_periods is not None and correlation_method == "kendall":
        raise InvalidParameterValueException(
            "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
        )

    return subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True)
57 changes: 57 additions & 0 deletions eis_toolkit/exploratory_analyses/covariance_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Optional, Sequence

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe


@beartype
def covariance_matrix(
    data: pd.DataFrame,
    columns: Optional[Sequence[str]] = None,
    min_periods: Optional[int] = None,
    delta_degrees_of_freedom: int = 1,
) -> pd.DataFrame:
    """Compute covariance matrix on the input data.

    It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.

    Args:
        data: Dataframe containing the input data.
        columns: Columns to include in the covariance matrix. If None, all numeric columns are used.
        min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
        delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1.

    Returns:
        Dataframe containing matrix representing the covariance between the corresponding pair of variables.

    Raises:
        EmptyDataFrameException: The input Dataframe is empty.
        InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative.
        NonNumericDataException: The input data contain non-numeric data.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    if columns:
        invalid_columns = [column for column in columns if column not in data.columns]
        if invalid_columns:
            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
        data_subset = data[columns]
    else:
        data_subset = data.select_dtypes(include=np.number)

    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
        raise NonNumericDataException("The input data contain non-numeric data.")

    if delta_degrees_of_freedom < 0:
        raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.")

    # Explicit None check (instead of truthiness) so min_periods=0 is also
    # validated; error message typo "perioids" fixed.
    if min_periods is not None and min_periods < 0:
        raise InvalidParameterValueException("Min periods must be non-negative.")

    matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)

    return matrix
Loading

0 comments on commit 9e1639e

Please sign in to comment.