Skip to content

Commit

Permalink
Merge pull request #337 from GispoCoding/321-add-cli-functions-for-missing-statistical-tests
Browse files Browse the repository at this point in the history

321 add CLI functions for missing statistical tests
  • Loading branch information
nmaarnio committed Mar 4, 2024
2 parents 5813e55 + 19c7229 commit 9e1639e
Show file tree
Hide file tree
Showing 15 changed files with 465 additions and 268 deletions.
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/chi_square_test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Chi-square test

::: eis_toolkit.exploratory_analyses.chi_square_test
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/correlation_matrix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Correlation matrix

::: eis_toolkit.exploratory_analyses.correlation_matrix
3 changes: 3 additions & 0 deletions docs/exploratory_analyses/covariance_matrix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Covariance matrix

::: eis_toolkit.exploratory_analyses.covariance_matrix
3 changes: 0 additions & 3 deletions docs/exploratory_analyses/statistical_testing.md

This file was deleted.

137 changes: 137 additions & 0 deletions eis_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,14 @@ class LocalMoranWeightType(str, Enum):
knn = "knn"


class CorrelationMethod(str, Enum):
    """Correlation methods selectable from the command line.

    Subclasses str so Typer can parse the choice directly and the member can
    be passed onward wherever a plain method-name string is expected.
    """

    pearson = "pearson"
    kendall = "kendall"
    spearman = "spearman"


RESAMPLING_MAPPING = {
"nearest": warp.Resampling.nearest,
"bilinear": warp.Resampling.bilinear,
Expand Down Expand Up @@ -295,6 +303,135 @@ class LocalMoranWeightType(str, Enum):

# --- EXPLORATORY ANALYSES ---

# NORMALITY TEST RASTER
@app.command()
def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], bands: Optional[List[int]] = None):
    """Compute Shapiro-Wilk test for normality on the input raster data."""
    from eis_toolkit.exploratory_analyses.normality_test import normality_test_array

    typer.echo("Progress: 10%")

    with rasterio.open(input_raster) as raster:
        data = raster.read()
        typer.echo("Progress: 25%")
        # Typer yields an empty list when --bands is not given; the toolkit
        # expects None to mean "test all bands". The truthiness check also
        # covers an actual None default, which the previous len() call would
        # have crashed on. (Leftover debug print(bands) removed.)
        if not bands:
            bands = None
        results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata)

    typer.echo("Progress: 75%")

    json_str = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo("Normality test (raster) completed")


# NORMALITY TEST VECTOR
@app.command()
def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: Optional[List[str]] = None):
    """Compute Shapiro-Wilk test for normality on the input vector data."""
    from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe

    typer.echo("Progress: 10%")

    geodataframe = gpd.read_file(input_vector)
    typer.echo("Progress: 25%")

    # Typer yields an empty list when --columns is not given; normalize it to
    # None so every column is tested — same handling as the raster command.
    if not columns:
        columns = None

    results_dict = normality_test_dataframe(data=geodataframe, columns=columns)

    typer.echo("Progress: 75%")

    json_str = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {json_str}")
    typer.echo("Normality test (vector) completed")


# CHI-SQUARE_TEST
@app.command()
def chi_square_test_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    target_column: str = typer.Option(),
    columns: Optional[List[str]] = None,
):
    """Perform a Chi-square test of independence between a target variable and one or more other variables."""
    from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test

    typer.echo("Progress: 10%")

    # NOTE(review): the geometry column is kept here — confirm whether it
    # should be dropped before running the test.
    geodataframe = gpd.read_file(input_vector)
    typer.echo("Progress: 25%")

    results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns)
    typer.echo("Progress: 75%")

    results_json = json.dumps(results_dict)
    typer.echo("Progress: 100%")
    typer.echo(f"Results: {results_json}")
    typer.echo("Chi-square test completed")


# CORRELATION MATRIX
@app.command()
def correlation_matrix_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_file: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
    correlation_method: CorrelationMethod = CorrelationMethod.pearson,
    min_periods: Optional[int] = None,
):
    """Compute correlation matrix on the input data."""
    from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix

    typer.echo("Progress: 10%")

    # Attribute table only — the geometry column cannot take part in correlations.
    dataframe = pd.DataFrame(gpd.read_file(input_vector).drop(columns="geometry"))
    typer.echo("Progress: 25%")

    result = correlation_matrix(
        data=dataframe,
        columns=columns,
        correlation_method=correlation_method,
        min_periods=min_periods,
    )
    typer.echo("Progress: 75%")

    result.to_csv(output_file)
    typer.echo("Progress: 100%")

    typer.echo("Correlation matrix completed")


# COVARIANCE MATRIX
@app.command()
def covariance_matrix_cli(
    input_vector: Annotated[Path, INPUT_FILE_OPTION],
    output_file: Annotated[Path, OUTPUT_FILE_OPTION],
    columns: Optional[List[str]] = None,
    min_periods: Optional[int] = None,
    delta_degrees_of_freedom: int = 1,
):
    """Compute covariance matrix on the input data."""
    from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix

    typer.echo("Progress: 10%")

    # Attribute table only — the geometry column cannot take part in covariances.
    dataframe = pd.DataFrame(gpd.read_file(input_vector).drop(columns="geometry"))
    typer.echo("Progress: 25%")

    result = covariance_matrix(
        data=dataframe,
        columns=columns,
        min_periods=min_periods,
        delta_degrees_of_freedom=delta_degrees_of_freedom,
    )
    typer.echo("Progress: 75%")

    result.to_csv(output_file)
    typer.echo("Progress: 100%")

    typer.echo("Covariance matrix completed")


# DBSCAN
@app.command()
Expand Down
52 changes: 52 additions & 0 deletions eis_toolkit/exploratory_analyses/chi_square_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd
from beartype import beartype
from beartype.typing import Dict, Optional, Sequence
from scipy.stats import chi2_contingency

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException
from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe


@beartype
def chi_square_test(
    data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None
) -> Dict[str, Dict[str, float]]:
    """Perform a Chi-square test of independence between a target variable and one or more other variables.

    Input data should be categorical data. Continuous data or non-categorical data should be discretized or
    binned before using this function, as Chi-square tests are not applicable to continuous variables directly.
    The test assumes that the observed frequencies in each category are independent.

    Args:
        data: Dataframe containing the input data.
        target_column: Variable against which independence of other variables is tested.
        columns: Variables that are tested against the variable in target_column. If None, every column is used.

    Returns:
        Test statistics, p-value and degrees of freedom for each variable.

    Raises:
        EmptyDataFrameException: Input Dataframe is empty.
        InvalidParameterValueException: Invalid column is input.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    if not check_columns_valid(data, [target_column]):
        raise InvalidParameterValueException("Target column not found in the Dataframe.")

    # Determine which variables to test: the explicit selection (validated
    # against the frame) or everything except the target.
    if columns:
        missing = [name for name in columns if name not in data.columns]
        if missing:
            raise InvalidParameterValueException(f"Invalid columns: {missing}")
        test_columns = list(columns)
    else:
        test_columns = [name for name in data.columns if name != target_column]

    statistics = {}
    for name in test_columns:
        contingency_table = pd.crosstab(data[target_column], data[name])
        chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table)
        statistics[name] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom}

    return statistics
57 changes: 57 additions & 0 deletions eis_toolkit/exploratory_analyses/correlation_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe


@beartype
def correlation_matrix(
    data: pd.DataFrame,
    columns: Optional[Sequence[str]] = None,
    correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson",
    min_periods: Optional[int] = None,
) -> pd.DataFrame:
    """Compute correlation matrix on the input data.

    It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.

    Args:
        data: Dataframe containing the input data.
        columns: Columns to include in the correlation matrix. If None, all numeric columns are used.
        correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'.
        min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.

    Returns:
        Dataframe containing matrix representing the correlation coefficient \
            between the corresponding pair of variables.

    Raises:
        EmptyDataFrameException: The input Dataframe is empty.
        InvalidParameterValueException: min_periods argument is used with method 'kendall'.
        NonNumericDataException: The selected columns contain non-numeric data.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    # Select the working columns: an explicit, validated list, or every
    # numeric column of the frame.
    if columns:
        unknown = [name for name in columns if name not in data.columns]
        if unknown:
            raise InvalidParameterValueException(f"Invalid columns: {unknown}")
        subset = data[columns]
    else:
        subset = data.select_dtypes(include=np.number)

    if not subset.dtypes.apply(lambda dtype: np.issubdtype(dtype, np.number)).all():
        raise NonNumericDataException("The input data contain non-numeric data.")

    # pandas' kendall implementation ignores min_periods, so reject the combination.
    if min_periods is not None and correlation_method == "kendall":
        raise InvalidParameterValueException(
            "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
        )

    return subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True)
57 changes: 57 additions & 0 deletions eis_toolkit/exploratory_analyses/covariance_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Optional, Sequence

from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe


@beartype
def covariance_matrix(
    data: pd.DataFrame,
    columns: Optional[Sequence[str]] = None,
    min_periods: Optional[int] = None,
    delta_degrees_of_freedom: int = 1,
) -> pd.DataFrame:
    """Compute covariance matrix on the input data.

    It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations.

    Args:
        data: Dataframe containing the input data.
        columns: Columns to include in the covariance matrix. If None, all numeric columns are used.
        min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
        delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1.

    Returns:
        Dataframe containing matrix representing the covariance between the corresponding pair of variables.

    Raises:
        EmptyDataFrameException: The input Dataframe is empty.
        InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative.
        NonNumericDataException: The input data contain non-numeric data.
    """
    if check_empty_dataframe(data):
        raise EmptyDataFrameException("The input Dataframe is empty.")

    if columns:
        invalid_columns = [column for column in columns if column not in data.columns]
        if invalid_columns:
            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
        data_subset = data[columns]
    else:
        data_subset = data.select_dtypes(include=np.number)

    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
        raise NonNumericDataException("The input data contain non-numeric data.")

    if delta_degrees_of_freedom < 0:
        raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.")

    # Explicit None check (instead of truthiness) so min_periods=0 is also
    # validated; error message typo "perioids" fixed.
    if min_periods is not None and min_periods < 0:
        raise InvalidParameterValueException("Min periods must be non-negative.")

    matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)

    return matrix
Loading

0 comments on commit 9e1639e

Please sign in to comment.