From f3bc5e9fdc761ff9fa6f2728dc73499d55cdb174 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Mon, 12 Feb 2024 12:33:47 +0200 Subject: [PATCH 1/9] Initial implementations for raster and vector normality test CLI functions --- eis_toolkit/cli.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index f31ba0c6..f979ca99 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -267,6 +267,48 @@ class NodataHandling(str, Enum): # --- EXPLORATORY ANALYSES --- +# NORMALITY TEST RASTER +@app.command() +def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): + """Compute Shapiro-Wilk test for normality on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import normality_test + + typer.echo("Progress: 10%") + + with rasterio.open(input_raster) as raster: + data = raster.read() + typer.echo("Progress: 25%") + + results_dict = normality_test(data) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Normality test (raster) completed") + + +# NORMALITY TEST VECTOR +@app.command() +def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): + """Compute Shapiro-Wilk test for normality on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import normality_test + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + typer.echo("Progress: 25%") + + results_dict = normality_test(geodataframe, columns) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Normality test (vector) completed") + # DBSCAN @app.command() From 1c562025a41973f6c64b1ad3453102b637ef6eb8 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 10:42:10 +0200 Subject: 
[PATCH 2/9] Added CLI functions for normality tests. Adjusted toolkit func implementations --- eis_toolkit/cli.py | 16 +++++++++------- .../exploratory_analyses/normality_test.py | 17 ++++++++++------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index a9e62631..d660591f 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -276,9 +276,9 @@ class LocalMoranWeightType(str, Enum): # NORMALITY TEST RASTER @app.command() -def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): - """Compute Shapiro-Wilk test for normality on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import normality_test +def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], bands: Optional[List[int]] = None): + """Compute Shapiro-Wilk test for normality on the input raster data.""" + from eis_toolkit.exploratory_analyses.normality_test import normality_test_array typer.echo("Progress: 10%") @@ -286,7 +286,9 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): data = raster.read() typer.echo("Progress: 25%") - results_dict = normality_test(data) + if len(bands) == 0: + bands = None + results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata) typer.echo("Progress: 75%") @@ -299,15 +301,15 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]): # NORMALITY TEST VECTOR @app.command() def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): - """Compute Shapiro-Wilk test for normality on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import normality_test + """Compute Shapiro-Wilk test for normality on the input vector data.""" + from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe typer.echo("Progress: 10%") geodataframe = gpd.read_file(input_vector) 
typer.echo("Progress: 25%") - results_dict = normality_test(geodataframe, columns) + results_dict = normality_test_dataframe(geodataframe, columns) typer.echo("Progress: 75%") diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py index 4034d2b1..d3507fe7 100644 --- a/eis_toolkit/exploratory_analyses/normality_test.py +++ b/eis_toolkit/exploratory_analyses/normality_test.py @@ -59,7 +59,8 @@ def normality_test_dataframe( for column in columns: if len(data[column]) > 5000: raise SampleSizeExceededException(f"Sample size for column '{column}' exceeds the limit of 5000 samples.") - statistics[column] = shapiro(data[column]) + stat, p_value = shapiro(data[column]) + statistics[column] = {"Statistic": stat, "p-value": p_value} return statistics @@ -67,7 +68,7 @@ def normality_test_dataframe( @beartype def normality_test_array( data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None -) -> Dict[int, Tuple[float, float]]: +) -> Dict[str, Tuple[float, float]]: """ Compute Shapiro-Wilk test for normality on the input Numpy array. 
@@ -94,14 +95,14 @@ def normality_test_array( if data.ndim == 1 or data.ndim == 2: prepared_data = np.expand_dims(data, axis=0) - bands = range(1) + bands = [1] elif data.ndim == 3: if bands is not None: - if not all(band < len(data) for band in bands): + if not all(band - 1 < len(data) for band in bands): raise InvalidRasterBandException("All selected bands were not found in the input array.") else: - bands = range(len(data)) + bands = range(1, len(data) + 1) prepared_data = data else: @@ -110,7 +111,8 @@ def normality_test_array( statistics = {} for band in bands: - flattened_data = prepared_data[band].ravel() + band_idx = band - 1 + flattened_data = prepared_data[band_idx].ravel() nan_mask = flattened_data == np.nan if nodata_value is not None: @@ -121,6 +123,7 @@ def normality_test_array( if len(masked_data) > 5000: raise SampleSizeExceededException(f"Sample size for band '{band}' exceeds the limit of 5000 samples.") - statistics[band] = shapiro(masked_data) + stat, p_value = shapiro(masked_data) + statistics[f"Band {band}"] = {"Statistic": stat, "p-value": p_value} return statistics From 6cd07031668b852ada4540d4a97e1907b613fe96 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 11:10:47 +0200 Subject: [PATCH 3/9] Added CLI function for chi-square test, adjusted toolkit func --- eis_toolkit/cli.py | 29 +++++++++++++++-- .../exploratory_analyses/statistical_tests.py | 31 ++++++++++--------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index d660591f..23c90010 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -300,7 +300,7 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], # NORMALITY TEST VECTOR @app.command() -def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): +def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: Optional[List[str]] = None): """Compute 
Shapiro-Wilk test for normality on the input vector data.""" from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe @@ -309,7 +309,7 @@ def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], geodataframe = gpd.read_file(input_vector) typer.echo("Progress: 25%") - results_dict = normality_test_dataframe(geodataframe, columns) + results_dict = normality_test_dataframe(data=geodataframe, columns=columns) typer.echo("Progress: 75%") @@ -319,6 +319,31 @@ def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], typer.echo("Normality test (vector) completed") +# CHI-SQUARE_TEST +@app.command() +def chi_square_test_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + target_column: str = typer.Option(), + columns: Optional[List[str]] = None, +): + """Perform a Chi-square test of independence between a target variable and one or more other variables.""" + from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + typer.echo("Progress: 25%") + + results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Chi-square test completed") + + # DBSCAN @app.command() def dbscan_cli( diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index c7f80117..7a64b3ae 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -9,22 +9,24 @@ @beartype def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: - """Compute Chi-square test for independence on the input data. 
+ """Perform a Chi-square test of independence between a target variable and one or more other variables. - It is assumed that the variables in the input data are independent and that they are categorical, i.e. strings, - booleans or integers, but not floats. + Input data should be categorical data. Continuous data or non-categorical data should be discretized or + binned before using this function, as Chi-square tests are not applicable to continuous variables directly. + + The test assumes that the observed frequencies in each category are independent. Args: - data: Dataframe containing the input data + data: Dataframe containing the input data. target_column: Variable against which independence of other variables is tested. columns: Variables that are tested against the variable in target_column. If None, every column is used. Returns: - Test statistics for each variable (except target_column). + Test statistics, p-value and degrees of freedom for each variable. Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: The target_column is not in input Dataframe or invalid column is provided. + EmptyDataFrameException: Input Dataframe is empty. + InvalidParameterValueException: Invalid column is input. 
""" if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") @@ -32,19 +34,18 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se if not check_columns_valid(data, [target_column]): raise InvalidParameterValueException("Target column not found in the Dataframe.") - if columns is not None: + if columns: invalid_columns = [column for column in columns if column not in data.columns] - if any(invalid_columns): - raise InvalidParameterValueException(f"The following variables are not in the dataframe: {invalid_columns}") + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") else: - columns = data.columns + columns = [col for col in data.columns if col != target_column] statistics = {} for column in columns: - if column != target_column: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = (chi_square, p_value, degrees_of_freedom) + contingency_table = pd.crosstab(data[target_column], data[column]) + chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) + statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} return statistics From 4ede1878aa753dd84f686293d656f34a19ad523a Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 12:12:49 +0200 Subject: [PATCH 4/9] Added CLI functions for covariance and correlation matrices, adjusted toolkit implementations --- eis_toolkit/cli.py | 70 ++++++++++++++++++- .../exploratory_analyses/statistical_tests.py | 37 ++++++++-- 2 files changed, 99 insertions(+), 8 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 23c90010..f5bdd5ee 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -207,6 +207,14 @@ class LocalMoranWeightType(str, Enum): knn = "knn" +class CorrelationMethod(str, 
Enum): + """Correlation methods available.""" + + pearson = "pearson" + kendall = "kendall" + spearman = "spearman" + + RESAMPLING_MAPPING = { "nearest": warp.Resampling.nearest, "bilinear": warp.Resampling.bilinear, @@ -331,7 +339,7 @@ def chi_square_test_cli( typer.echo("Progress: 10%") - geodataframe = gpd.read_file(input_vector) + geodataframe = gpd.read_file(input_vector) # Should we drop geometry columns? typer.echo("Progress: 25%") results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns) @@ -344,6 +352,66 @@ def chi_square_test_cli( typer.echo("Chi-square test completed") +# CORRELATION MATRIX +@app.command() +def correlation_matrix_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + output_file: Annotated[Path, OUTPUT_FILE_OPTION], + columns: Optional[List[str]] = None, + correlation_method: CorrelationMethod = CorrelationMethod.pearson, + min_periods: Optional[int] = None, +): + """Compute correlation matrix on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + typer.echo("Progress: 25%") + + output_df = correlation_matrix( + data=dataframe, columns=columns, correlation_method=correlation_method, min_periods=min_periods + ) + + typer.echo("Progress: 75%") + + output_df.to_csv(output_file) + typer.echo("Progress: 100%") + + typer.echo("Correlation matrix completed") + + +# COVARIANCE MATRIX +@app.command() +def covariance_matrix_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + output_file: Annotated[Path, OUTPUT_FILE_OPTION], + columns: Optional[List[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, +): + """Compute covariance matrix on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import covariance_matrix + + typer.echo("Progress: 10%") + 
+ geodataframe = gpd.read_file(input_vector) + dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + typer.echo("Progress: 25%") + + output_df = covariance_matrix( + data=dataframe, columns=columns, min_periods=min_periods, delta_degrees_of_freedom=delta_degrees_of_freedom + ) + + typer.echo("Progress: 75%") + + output_df.to_csv(output_file) + typer.echo("Progress: 100%") + + typer.echo("Covariance matrix completed") + + # DBSCAN @app.command() def dbscan_cli( diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index 7a64b3ae..8fe5721f 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -1,10 +1,11 @@ +import numpy as np import pandas as pd from beartype import beartype from beartype.typing import Literal, Optional, Sequence from scipy.stats import chi2_contingency from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException -from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe +from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe @beartype @@ -53,6 +54,7 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se @beartype def correlation_matrix( data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", min_periods: Optional[int] = None, ) -> pd.DataFrame: @@ -62,6 +64,7 @@ def correlation_matrix( Args: data: Dataframe containing the input data. + columns: Columns to include in the correlation matrix. If None, all numeric columns are used. correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. 
@@ -72,12 +75,20 @@ def correlation_matrix( Raises: EmptyDataFrameException: The input Dataframe is empty. InvalidParameterValueException: min_periods argument is used with method 'kendall'. - NonNumericDataException: The input data contain non-numeric data. + NonNumericDataException: The selected columns contain non-numeric data. """ if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") - if not check_columns_numeric(data, data.columns.to_list()): + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): raise NonNumericDataException("The input data contain non-numeric data.") if correlation_method == "kendall" and min_periods is not None: @@ -85,14 +96,17 @@ def correlation_matrix( "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." ) - matrix = data.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) + matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) return matrix @beartype def covariance_matrix( - data: pd.DataFrame, min_periods: Optional[int] = None, delta_degrees_of_freedom: int = 1 + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, ) -> pd.DataFrame: """Compute covariance matrix on the input data. @@ -100,6 +114,7 @@ def covariance_matrix( Args: data: Dataframe containing the input data. + columns: Columns to include in the covariance matrix. If None, all numeric columns are used. min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. 
delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. @@ -114,7 +129,15 @@ def covariance_matrix( if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") - if not check_columns_numeric(data, data.columns.to_list()): + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): raise NonNumericDataException("The input data contain non-numeric data.") if delta_degrees_of_freedom < 0: @@ -123,6 +146,6 @@ def covariance_matrix( if min_periods and min_periods < 0: raise InvalidParameterValueException("Min perioids must be non-negative.") - matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) + matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) return matrix From a4d367018046249e79da70e248f65cc6ecad580e Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 12:39:35 +0200 Subject: [PATCH 5/9] Update tests for statistical functions, typing fixes --- eis_toolkit/exploratory_analyses/normality_test.py | 6 +++--- .../exploratory_analyses/statistical_tests.py | 6 ++++-- tests/exploratory_analyses/normality_test_test.py | 14 +++++++------- .../exploratory_analyses/statistical_tests_test.py | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py index d3507fe7..8e8fcad0 100644 --- a/eis_toolkit/exploratory_analyses/normality_test.py +++ b/eis_toolkit/exploratory_analyses/normality_test.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Dict, 
Optional, Sequence, Tuple +from beartype.typing import Dict, Optional, Sequence from scipy.stats import shapiro from eis_toolkit.exceptions import ( @@ -20,7 +20,7 @@ @beartype def normality_test_dataframe( data: pd.DataFrame, columns: Optional[Sequence[str]] = None -) -> Dict[str, Tuple[float, float]]: +) -> Dict[str, Dict[str, float]]: """ Compute Shapiro-Wilk test for normality on the input DataFrame. @@ -68,7 +68,7 @@ def normality_test_dataframe( @beartype def normality_test_array( data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None -) -> Dict[str, Tuple[float, float]]: +) -> Dict[str, Dict[str, float]]: """ Compute Shapiro-Wilk test for normality on the input Numpy array. diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index 8fe5721f..518492fe 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Literal, Optional, Sequence +from beartype.typing import Dict, Literal, Optional, Sequence from scipy.stats import chi2_contingency from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException @@ -9,7 +9,9 @@ @beartype -def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: +def chi_square_test( + data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None +) -> Dict[str, Dict[str, float]]: """Perform a Chi-square test of independence between a target variable and one or more other variables. Input data should be categorical data. 
Continuous data or non-categorical data should be discretized or diff --git a/tests/exploratory_analyses/normality_test_test.py b/tests/exploratory_analyses/normality_test_test.py index c2012578..dcf3f5b4 100644 --- a/tests/exploratory_analyses/normality_test_test.py +++ b/tests/exploratory_analyses/normality_test_test.py @@ -24,35 +24,35 @@ def test_normality_test_dataframe(): """Test that returned normality statistics for DataFrame data are correct.""" output_statistics = normality_test_dataframe(data=DATA_DF, columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.82827, 0.13502), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), [0.82827, 0.13502], decimal=5) def test_normality_test_array(): """Test that returned normality statistics for Numpy array data are correct.""" # 3D array - output_statistics = normality_test_array(data=DATA_ARRAY, bands=[0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + output_statistics = normality_test_array(data=DATA_ARRAY, bands=[1]) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) # 2D array output_statistics = normality_test_array(data=DATA_ARRAY[0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) # 1D array output_statistics = normality_test_array(data=DATA_ARRAY[0][0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.9067, 0.41504), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.9067, 0.41504], decimal=5) def test_normality_test_dataframe_missing_data(): """Test that DataFrame input with missing data returns statistics correctly.""" df_with_nan = DATA_DF.replace(3, np.nan) output_statistics = normality_test_dataframe(data=df_with_nan, 
columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.62978, 0.00124), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), (0.62978, 0.00124), decimal=5) def test_normality_test_array_nodata(): """Test that Numpy array input with missing data returns statistics correctly.""" output_statistics = normality_test_array(data=DATA_ARRAY, nodata_value=3) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) def test_invalid_selection(): diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py index 71954c0f..03c68d7e 100644 --- a/tests/exploratory_analyses/statistical_tests_test.py +++ b/tests/exploratory_analyses/statistical_tests_test.py @@ -22,7 +22,7 @@ def test_chi_square_test(): """Test that returned statistics for independence are correct.""" output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal((output_statistics["f"]), (0.0, 1.0, 1)) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) def test_correlation_matrix_nan(): @@ -56,7 +56,7 @@ def test_correlation_matrix(): def test_correlation_matrix_non_numeric(): """Test that returned correlation matrix is correct.""" with pytest.raises(NonNumericDataException): - correlation_matrix(data=non_numeric_df) + correlation_matrix(data=non_numeric_df, columns=["a", "b"]) def test_covariance_matrix_nan(): From 62fe65430e7d74db5a9e3f410375eea55a52fef6 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:32:29 +0200 Subject: [PATCH 6/9] Separated correlation matrix, covariance matrix and chi2 tests into own modules --- eis_toolkit/cli.py | 6 +- .../exploratory_analyses/chi_square_test.py | 52 ++++++ 
.../correlation_matrix.py | 57 +++++++ .../exploratory_analyses/covariance_matrix.py | 57 +++++++ .../exploratory_analyses/statistical_tests.py | 153 ------------------ tests/exploratory_analyses/chi_square_test.py | 20 +++ .../correlation_matrix_test.py | 53 ++++++ .../covariance_matrix_test.py | 56 +++++++ .../statistical_tests_test.py | 117 -------------- 9 files changed, 298 insertions(+), 273 deletions(-) create mode 100644 eis_toolkit/exploratory_analyses/chi_square_test.py create mode 100644 eis_toolkit/exploratory_analyses/correlation_matrix.py create mode 100644 eis_toolkit/exploratory_analyses/covariance_matrix.py delete mode 100644 eis_toolkit/exploratory_analyses/statistical_tests.py create mode 100644 tests/exploratory_analyses/chi_square_test.py create mode 100644 tests/exploratory_analyses/correlation_matrix_test.py create mode 100644 tests/exploratory_analyses/covariance_matrix_test.py delete mode 100644 tests/exploratory_analyses/statistical_tests_test.py diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index f5bdd5ee..ad070e98 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -335,7 +335,7 @@ def chi_square_test_cli( columns: Optional[List[str]] = None, ): """Perform a Chi-square test of independence between a target variable and one or more other variables.""" - from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test + from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test typer.echo("Progress: 10%") @@ -362,7 +362,7 @@ def correlation_matrix_cli( min_periods: Optional[int] = None, ): """Compute correlation matrix on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix + from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix typer.echo("Progress: 10%") @@ -392,7 +392,7 @@ def covariance_matrix_cli( delta_degrees_of_freedom: int = 1, ): """Compute covariance matrix on the input data.""" - from 
eis_toolkit.exploratory_analyses.statistical_tests import covariance_matrix + from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix typer.echo("Progress: 10%") diff --git a/eis_toolkit/exploratory_analyses/chi_square_test.py b/eis_toolkit/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..cf82aa25 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/chi_square_test.py @@ -0,0 +1,52 @@ +import pandas as pd +from beartype import beartype +from beartype.typing import Dict, Optional, Sequence +from scipy.stats import chi2_contingency + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException +from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe + + +@beartype +def chi_square_test( + data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None +) -> Dict[str, Dict[str, float]]: + """Perform a Chi-square test of independence between a target variable and one or more other variables. + + Input data should be categorical data. Continuous data or non-categorical data should be discretized or + binned before using this function, as Chi-square tests are not applicable to continuous variables directly. + + The test assumes that the observed frequencies in each category are independent. + + Args: + data: Dataframe containing the input data. + target_column: Variable against which independence of other variables is tested. + columns: Variables that are tested against the variable in target_column. If None, every column is used. + + Returns: + Test statistics, p-value and degrees of freedom for each variable. + + Raises: + EmptyDataFrameException: Input Dataframe is empty. + InvalidParameterValueException: Invalid column is input. 
+ """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if not check_columns_valid(data, [target_column]): + raise InvalidParameterValueException("Target column not found in the Dataframe.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + else: + columns = [col for col in data.columns if col != target_column] + + statistics = {} + for column in columns: + contingency_table = pd.crosstab(data[target_column], data[column]) + chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) + statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} + + return statistics diff --git a/eis_toolkit/exploratory_analyses/correlation_matrix.py b/eis_toolkit/exploratory_analyses/correlation_matrix.py new file mode 100644 index 00000000..bdcb79f3 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/correlation_matrix.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Literal, Optional, Sequence + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException +from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe + + +@beartype +def correlation_matrix( + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", + min_periods: Optional[int] = None, +) -> pd.DataFrame: + """Compute correlation matrix on the input data. + + It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. + + Args: + data: Dataframe containing the input data. + columns: Columns to include in the correlation matrix. If None, all numeric columns are used. 
+ correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. + min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. + + Returns: + Dataframe containing matrix representing the correlation coefficient \ + between the corresponding pair of variables. + + Raises: + EmptyDataFrameException: The input Dataframe is empty. + InvalidParameterValueException: min_periods argument is used with method 'kendall'. + NonNumericDataException: The selected columns contain non-numeric data. + """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): + raise NonNumericDataException("The input data contain non-numeric data.") + + if correlation_method == "kendall" and min_periods is not None: + raise InvalidParameterValueException( + "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." 
+ ) + + matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) + + return matrix diff --git a/eis_toolkit/exploratory_analyses/covariance_matrix.py b/eis_toolkit/exploratory_analyses/covariance_matrix.py new file mode 100644 index 00000000..25c63850 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/covariance_matrix.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Optional, Sequence + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException +from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe + + +@beartype +def covariance_matrix( + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, +) -> pd.DataFrame: + """Compute covariance matrix on the input data. + + It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. + + Args: + data: Dataframe containing the input data. + columns: Columns to include in the covariance matrix. If None, all numeric columns are used. + min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. + delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. + + Returns: + Dataframe containing matrix representing the covariance between the corresponding pair of variables. + + Raises: + EmptyDataFrameException: The input Dataframe is empty. + InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. + NonNumericDataException: The input data contain non-numeric data. 
+    """
+    if check_empty_dataframe(data):
+        raise EmptyDataFrameException("The input Dataframe is empty.")
+
+    if columns:
+        invalid_columns = [column for column in columns if column not in data.columns]
+        if invalid_columns:
+            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
+        data_subset = data[columns]
+    else:
+        data_subset = data.select_dtypes(include=np.number)
+
+    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
+        raise NonNumericDataException("The input data contain non-numeric data.")
+
+    if delta_degrees_of_freedom < 0:
+        raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.")
+
+    if min_periods and min_periods < 0:
+        raise InvalidParameterValueException("Min periods must be non-negative.")
+
+    matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
+
+    return matrix
diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py
deleted file mode 100644
index 518492fe..00000000
--- a/eis_toolkit/exploratory_analyses/statistical_tests.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import numpy as np
-import pandas as pd
-from beartype import beartype
-from beartype.typing import Dict, Literal, Optional, Sequence
-from scipy.stats import chi2_contingency
-
-from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
-from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe
-
-
-@beartype
-def chi_square_test(
-    data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None
-) -> Dict[str, Dict[str, float]]:
-    """Perform a Chi-square test of independence between a target variable and one or more other variables.
-
-    Input data should be categorical data. 
Continuous data or non-categorical data should be discretized or - binned before using this function, as Chi-square tests are not applicable to continuous variables directly. - - The test assumes that the observed frequencies in each category are independent. - - Args: - data: Dataframe containing the input data. - target_column: Variable against which independence of other variables is tested. - columns: Variables that are tested against the variable in target_column. If None, every column is used. - - Returns: - Test statistics, p-value and degrees of freedom for each variable. - - Raises: - EmptyDataFrameException: Input Dataframe is empty. - InvalidParameterValueException: Invalid column is input. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if not check_columns_valid(data, [target_column]): - raise InvalidParameterValueException("Target column not found in the Dataframe.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - else: - columns = [col for col in data.columns if col != target_column] - - statistics = {} - for column in columns: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} - - return statistics - - -@beartype -def correlation_matrix( - data: pd.DataFrame, - columns: Optional[Sequence[str]] = None, - correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", - min_periods: Optional[int] = None, -) -> pd.DataFrame: - """Compute correlation matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. 
- - Args: - data: Dataframe containing the input data. - columns: Columns to include in the correlation matrix. If None, all numeric columns are used. - correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - - Returns: - Dataframe containing matrix representing the correlation coefficient \ - between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: min_periods argument is used with method 'kendall'. - NonNumericDataException: The selected columns contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - data_subset = data[columns] - else: - data_subset = data.select_dtypes(include=np.number) - - if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): - raise NonNumericDataException("The input data contain non-numeric data.") - - if correlation_method == "kendall" and min_periods is not None: - raise InvalidParameterValueException( - "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." - ) - - matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) - - return matrix - - -@beartype -def covariance_matrix( - data: pd.DataFrame, - columns: Optional[Sequence[str]] = None, - min_periods: Optional[int] = None, - delta_degrees_of_freedom: int = 1, -) -> pd.DataFrame: - """Compute covariance matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. 
- - Args: - data: Dataframe containing the input data. - columns: Columns to include in the covariance matrix. If None, all numeric columns are used. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. - - Returns: - Dataframe containing matrix representing the covariance between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. - NonNumericDataException: The input data contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - data_subset = data[columns] - else: - data_subset = data.select_dtypes(include=np.number) - - if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): - raise NonNumericDataException("The input data contain non-numeric data.") - - if delta_degrees_of_freedom < 0: - raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.") - - if min_periods and min_periods < 0: - raise InvalidParameterValueException("Min perioids must be non-negative.") - - matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) - - return matrix diff --git a/tests/exploratory_analyses/chi_square_test.py b/tests/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..83385d47 --- /dev/null +++ b/tests/exploratory_analyses/chi_square_test.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd +import pytest + +from eis_toolkit.exceptions import InvalidParameterValueException +from 
eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test + +DATA = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) + + +def test_chi_square_test(): + """Test that returned statistics for independence are correct.""" + output_statistics = chi_square_test(data=DATA, target_column="e", columns=["f"]) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) + + +def test_invalid_target_column(): + """Test that invalid target column raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + chi_square_test(data=DATA, target_column="invalid_column") diff --git a/tests/exploratory_analyses/correlation_matrix_test.py b/tests/exploratory_analyses/correlation_matrix_test.py new file mode 100644 index 00000000..903fa48c --- /dev/null +++ b/tests/exploratory_analyses/correlation_matrix_test.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest +from beartype.roar import BeartypeCallHintParamViolation + +from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException +from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix +from tests.exploratory_analyses.covariance_matrix_test import DF, DF_NON_NUMERIC, DF_WITH_NAN + + +def test_correlation_matrix_nan(): + """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" + expected_correlation_matrix = np.array( + [ + [1.000000, -0.577350, -1.000000, 1.000000], + [-0.577350, 1.000000, np.nan, -0.577350], + [-1.000000, np.nan, 1.000000, -1.000000], + [1.000000, -0.577350, -1.000000, 1.000000], + ] + ) + output_matrix = correlation_matrix(data=DF_WITH_NAN) + np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) + + +def test_correlation_matrix(): + """Test that returned correlation matrix is correct.""" + expected_correlation_matrix = np.array( + [ + [1.000000, -0.577350, -0.904534, 1.000000], + [-0.577350, 1.000000, 0.174078, 
-0.577350],
+            [-0.904534, 0.174078, 1.000000, -0.904534],
+            [1.000000, -0.577350, -0.904534, 1.000000],
+        ]
+    )
+    output_matrix = correlation_matrix(data=DF)
+    np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix)
+
+
+def test_correlation_matrix_non_numeric():
+    """Test that non-numeric data raises the correct exception."""
+    with pytest.raises(NonNumericDataException):
+        correlation_matrix(data=DF_NON_NUMERIC, columns=["a", "b"])
+
+
+def test_invalid_correlation_method():
+    """Test that invalid correlation method raises the correct exception."""
+    with pytest.raises(BeartypeCallHintParamViolation):
+        correlation_matrix(data=DF, correlation_method="invalid_method")
+
+
+def test_min_periods_with_kendall():
+    """Test that min_periods with correlation_method 'kendall' raises the correct exception."""
+    with pytest.raises(InvalidParameterValueException):
+        correlation_matrix(data=DF, correlation_method="kendall", min_periods=1)
diff --git a/tests/exploratory_analyses/covariance_matrix_test.py b/tests/exploratory_analyses/covariance_matrix_test.py
new file mode 100644
index 00000000..be69dd64
--- /dev/null
+++ b/tests/exploratory_analyses/covariance_matrix_test.py
@@ -0,0 +1,56 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from eis_toolkit.exceptions import InvalidParameterValueException
+from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix
+
+DATA = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
+DF = pd.DataFrame(DATA, columns=["a", "b", "c", "d"])
+DF_NON_NUMERIC = pd.DataFrame(
+    data=np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]),
+    columns=["a", "b", "c", "d"],
+)
+DF_WITH_NAN = pd.DataFrame(
+    data=np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]]), columns=["a", "b", "c", "d"]
+)
+
+
+def test_covariance_matrix_nan():
+    """Test that returned covariance matrix is correct, when NaN present in the dataframe."""
+    expected_covariance_matrix = np.array(
+        [
+            [1.333333, -0.333333, -1.333333, 0.666667],
+            [-0.333333, 0.25, 0, -0.166667],
+            [-1.333333, 0, 1.333333, -0.666667],
+            [0.666667, -0.166667, -0.666667, 0.333333],
+        ]
+    )
+    output_matrix = covariance_matrix(data=DF_WITH_NAN)
+    np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)
+
+
+def test_covariance_matrix():
+    """Test that returned covariance matrix is correct."""
+    expected_covariance_matrix = np.array(
+        [
+            [1.333333, -0.333333, -1.000000, 0.666667],
+            [-0.333333, 0.250000, 0.083333, -0.166667],
+            [-1.000000, 0.083333, 0.916667, -0.500000],
+            [0.666667, -0.166667, -0.500000, 0.333333],
+        ]
+    )
+    output_matrix = covariance_matrix(data=DF)
+    np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix)
+
+
+def test_covariance_matrix_negative_min_periods():
+    """Test that negative min_periods value raises the correct exception."""
+    with pytest.raises(InvalidParameterValueException):
+        covariance_matrix(data=DF, min_periods=-1)
+
+
+def test_invalid_ddof():
+    """Test that invalid delta degrees of freedom raises the correct exception."""
+    with pytest.raises(InvalidParameterValueException):
+        covariance_matrix(data=DF, delta_degrees_of_freedom=-1)
diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py
deleted file mode 100644
index 03c68d7e..00000000
--- a/tests/exploratory_analyses/statistical_tests_test.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import numpy as np
-import pandas as pd
-import pytest
-from beartype.roar import BeartypeCallHintParamViolation
-
-from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException
-from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test, correlation_matrix, covariance_matrix
-
-data = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]])
-missing_data = np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 
2], [0, 1, 2, 1]]) -non_numeric_data = np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]) -numeric_data = pd.DataFrame(data, columns=["a", "b", "c", "d"]) -non_numeric_df = pd.DataFrame(non_numeric_data, columns=["a", "b", "c", "d"]) -missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d"]) -categorical_data = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) -target_column = "e" -np.random.seed(42) -large_data = np.random.normal(size=5001) -large_df = pd.DataFrame(large_data, columns=["a"]) - - -def test_chi_square_test(): - """Test that returned statistics for independence are correct.""" - output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) - - -def test_correlation_matrix_nan(): - """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -1.000000, 1.000000], - [-0.577350, 1.000000, np.nan, -0.577350], - [-1.000000, np.nan, 1.000000, -1.000000], - [1.000000, -0.577350, -1.000000, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix(): - """Test that returned correlation matrix is correct.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -0.904534, 1.000000], - [-0.577350, 1.000000, 0.174078, -0.577350], - [-0.904534, 0.174078, 1.000000, -0.904534], - [1.000000, -0.577350, -0.904534, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix_non_numeric(): - """Test that returned correlation matrix is correct.""" - with pytest.raises(NonNumericDataException): - 
correlation_matrix(data=non_numeric_df, columns=["a", "b"]) - - -def test_covariance_matrix_nan(): - """Test that returned covariance matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.333333, -0.333333, -1.333333, 0.666667], - [-0.333333, 0.25, 0, -0.166667], - [-1.333333, 0, 1.333333, -0.666667], - [0.666667, -0.166667, -0.666667, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_covariance_matrix(): - """Test that returned covariance matrix is correct.""" - expected_covariance_matrix = np.array( - [ - [1.333333, -0.333333, -1.000000, 0.666667], - [-0.333333, 0.250000, 0.083333, -0.166667], - [-1.000000, 0.083333, 0.916667, -0.500000], - [0.666667, -0.166667, -0.500000, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix) - - -def test_covariance_matrix_negative_min_periods(): - """Test that negative min_periods value raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, min_periods=-1) - - -def test_invalid_target_column(): - """Test that invalid target column raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - chi_square_test(data=categorical_data, target_column="invalid_column") - - -def test_invalid_correlation_method(): - """Test that invalid correlation method raises the correct exception.""" - with pytest.raises(BeartypeCallHintParamViolation): - correlation_matrix(data=numeric_data, correlation_method="invalid_method") - - -def test_min_periods_with_kendall(): - """Test that min_periods with correlation_method 'kendall' raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - correlation_matrix(data=numeric_data, 
correlation_method="kendall", min_periods=1) - - -def test_invalid_ddof(): - """Test that invalid delta degrees of freedom raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, delta_degrees_of_freedom=-1) From 6093734fd3b2e4c181cbd59ed0a998c45e0fe8d7 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:37:28 +0200 Subject: [PATCH 7/9] Add docs --- docs/exploratory_analyses/chi_square_test.md | 3 +++ docs/exploratory_analyses/correlation_matrix.md | 0 docs/exploratory_analyses/covariance_matrix.md | 3 +++ docs/exploratory_analyses/statistical_testing.md | 3 --- 4 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 docs/exploratory_analyses/chi_square_test.md create mode 100644 docs/exploratory_analyses/correlation_matrix.md create mode 100644 docs/exploratory_analyses/covariance_matrix.md delete mode 100644 docs/exploratory_analyses/statistical_testing.md diff --git a/docs/exploratory_analyses/chi_square_test.md b/docs/exploratory_analyses/chi_square_test.md new file mode 100644 index 00000000..52e00339 --- /dev/null +++ b/docs/exploratory_analyses/chi_square_test.md @@ -0,0 +1,3 @@ +# Chi-square test + +::: eis_toolkit.exploratory_analyses.chi_square_test diff --git a/docs/exploratory_analyses/correlation_matrix.md b/docs/exploratory_analyses/correlation_matrix.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/exploratory_analyses/covariance_matrix.md b/docs/exploratory_analyses/covariance_matrix.md new file mode 100644 index 00000000..12385763 --- /dev/null +++ b/docs/exploratory_analyses/covariance_matrix.md @@ -0,0 +1,3 @@ +# Covariance matrix + +::: eis_toolkit.exploratory_analyses.covariance_matrix diff --git a/docs/exploratory_analyses/statistical_testing.md b/docs/exploratory_analyses/statistical_testing.md deleted file mode 100644 index 04df277a..00000000 --- a/docs/exploratory_analyses/statistical_testing.md +++ /dev/null @@ -1,3 +0,0 @@ 
-# Statistical (hypothesis) testing
-
-::: eis_toolkit.exploratory_analyses.statistical_tests

From 68e00628af14ed426e61ad378a02f7521d0a1403 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Thu, 29 Feb 2024 09:41:21 +0200
Subject: [PATCH 8/9] minor update

---
 docs/exploratory_analyses/correlation_matrix.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/exploratory_analyses/correlation_matrix.md b/docs/exploratory_analyses/correlation_matrix.md
index e69de29b..b86109e1 100644
--- a/docs/exploratory_analyses/correlation_matrix.md
+++ b/docs/exploratory_analyses/correlation_matrix.md
@@ -0,0 +1,3 @@
+# Correlation matrix
+
+::: eis_toolkit.exploratory_analyses.correlation_matrix

From 19c7229bfecbf99078ffc5079c431f079c37a132 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Mon, 4 Mar 2024 13:45:50 +0200
Subject: [PATCH 9/9] fix: normality test filters numeric columns instead of error, fixed case where CLI function does not compute anything if columns param is left empty

---
 eis_toolkit/cli.py                                 |  2 +-
 eis_toolkit/exploratory_analyses/normality_test.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index ad070e98..7c42f3b3 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -293,7 +293,7 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION],
     with rasterio.open(input_raster) as raster:
         data = raster.read()
         typer.echo("Progress: 25%")
-
+
     if len(bands) == 0:
         bands = None
     results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata)
diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py
index 8e8fcad0..26d0fb79 100644
--- a/eis_toolkit/exploratory_analyses/normality_test.py
+++ b/eis_toolkit/exploratory_analyses/normality_test.py
@@ -36,13 +36,13 @@ def normality_test_dataframe(
 
     Raises:
         EmptyDataException: The input data is empty.
InvalidColumnException: All selected columns were not found in the input data. - NonNumericDataException: Selected data or columns contains non-numeric data. + NonNumericDataException: Selected columns contain non-numeric data or no numeric columns were found. SampleSizeExceededException: Input data exceeds the maximum of 5000 samples. """ if check_empty_dataframe(data): raise EmptyDataException("The input Dataframe is empty.") - if columns is not None: + if columns is not None and columns != []: if not check_columns_valid(data, columns): raise InvalidColumnException("All selected columns were not found in the input DataFrame.") if not check_columns_numeric(data, columns): @@ -51,9 +51,9 @@ def normality_test_dataframe( data = data[columns].dropna() else: - if not check_columns_numeric(data, data.columns): - raise NonNumericDataException("The input data contain non-numeric data.") - columns = data.columns + columns = data.select_dtypes(include=[np.number]).columns + if len(columns) == 0: + raise NonNumericDataException("No numeric columns were found.") statistics = {} for column in columns: