Merge remote-tracking branch 'origin/main' into katys/update-ruff-config

bhklab · Dec 17, 2024 · 5dbfc73 · 5dbfc73
2 parents 7dda5ed + 9b734be
commit 5dbfc73
Show file tree

Hide file tree

Showing 14 changed files with 3,930 additions and 782 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,29 @@
 # CHANGELOG
 
 
+## v1.27.0 (2024-12-17)
+
+### Features
+
+- Add analysis functions ([#92](https://github.com/bhklab/readii/pull/92),
+  [`a57182c`](https://github.com/bhklab/readii/commit/a57182cb32f3bcd3600eaed53ad60ac145fe9d6f))
+
+Includes correlation calculations and plotting those correlations as a heatmap and histogram
+
+- **New Features** - Updated version number to 1.26.0 with new dependencies: `numpy`, `seaborn`, and
+  `pandas`. - Introduced a new module for analyzing READII outputs with several correlation
+  functions. - Added visualization functions for correlation data: heatmap and histogram. - New
+  validation function for DataFrame dimensions added. - Expanded platform support to include
+  `osx-64` and `win-64`.
+
+- **Bug Fixes** - Enhanced error handling in correlation calculations and plot saving processes. -
+  Simplified exception handling in feature loading functions.
+
+- **Documentation** - Improved docstrings for new functions and modules for better usability.
+
+- **Chores** - Expanded linting configuration for broader coverage of Python files.
+
+
 ## v1.26.0 (2024-12-16)
 
 ### Features

diff --git a/config/ruff.toml b/config/ruff.toml
@@ -7,6 +7,7 @@ include = [
   "src/readii/cli/**/*.py",
   "src/readii/negative_controls_refactor/**.py",
   "src/readii/io/**/**.py",
+  "src/readii/analyze/**.py"
 ]
 
 # extend-exclude is used to exclude directories from the flake8 checks

diff --git a/notebooks/nifti_writer_example.ipynb b/notebooks/nifti_writer_example.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -26,7 +26,6 @@
     "import subprocess\n",
     "import SimpleITK as sitk\n",
     "import pandas as pd\n",
-    "import uuid\n",
     "import random\n",
     "import sys\n",
     "from readii.utils import logger"
@@ -41,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -210,7 +209,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.12.8"
   }
  },
  "nbformat": 4,

diff --git a/pixi.lock b/pixi.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "readii"
-version = "1.26.0"
+version = "1.27.0"
 description = "A package to extract radiomic features!"
 authors = [{ name = "Katy Scott", email = "[email protected]" }]
 
@@ -13,6 +13,9 @@ dependencies = [
   "pydicom>=2.3.1",
   "pyradiomics-bhklab>=3.1.4,<4", 
   "orcestra-downloader>=0.9.0,<1",
+  "numpy==1.26.4.*",
+  "seaborn>=0.13.2,<0.14",
+  "pandas>=2.2.3,<3"
 ]
 requires-python = ">=3.10, <3.13"
 

diff --git a/src/readii/__init__.py b/src/readii/__init__.py
@@ -1,4 +1,4 @@
 # read version from installed package
 from importlib.metadata import version
-__version__ = "1.26.0"
+__version__ = "1.27.0"
 
diff --git a/src/readii/analyze/__init__.py b/src/readii/analyze/__init__.py
@@ -0,0 +1,17 @@
+"""Module to perform analysis on READII outputs."""
+from .correlation import (
+    getCrossCorrelationMatrix,
+    getFeatureCorrelations,
+    getHorizontalSelfCorrelations,
+    getVerticalSelfCorrelations,
+)
+from .plot_correlation import plotCorrelationHeatmap, plotCorrelationHistogram
+
+# Public API of the analyze subpackage: the names exported by
+# `from readii.analyze import *`. Each entry is imported above from
+# `.correlation` or `.plot_correlation`.
+__all__ = [
+    'getFeatureCorrelations',
+    'getVerticalSelfCorrelations',
+    'getHorizontalSelfCorrelations',
+    'getCrossCorrelationMatrix',
+    'plotCorrelationHeatmap',
+    'plotCorrelationHistogram'
+]
diff --git a/src/readii/analyze/correlation.py b/src/readii/analyze/correlation.py
@@ -0,0 +1,165 @@
+import pandas as pd
+
+from readii.data.select import validateDataframeSubsetSelection
+from readii.utils import logger
+
+
+def getFeatureCorrelations(vertical_features:pd.DataFrame,
+                           horizontal_features:pd.DataFrame,
+                           method:str = "pearson",
+                           vertical_feature_name:str = '_vertical',
+                           horizontal_feature_name:str = '_horizontal') -> pd.DataFrame:
+    """Calculate correlation between two sets of features.
+
+    Parameters
+    ----------
+    vertical_features : pd.DataFrame
+        Dataframe containing features to calculate correlations with. Index must be the same as the index of the horizontal_features dataframe.
+    horizontal_features : pd.DataFrame
+        Dataframe containing features to calculate correlations with. Index must be the same as the index of the vertical_features dataframe.
+    method : str
+        Method to use for calculating correlations. Default is "pearson".
+    vertical_feature_name : str
+        Name of the vertical features to use as suffix in correlation dataframe. Default is "_vertical".
+    horizontal_feature_name : str
+        Name of the horizontal features to use as suffix in correlation dataframe. Default is "_horizontal".
+    
+    Returns
+    -------
+    correlation_matrix : pd.DataFrame
+        Dataframe containing correlation values.
+    """
+    # Check that features are dataframes
+    if not isinstance(vertical_features, pd.DataFrame):
+        msg = "vertical_features must be a pandas DataFrame"
+        logger.exception(msg)
+        raise TypeError()
+    if not isinstance(horizontal_features, pd.DataFrame):
+        msg = "horizontal_features must be a pandas DataFrame"
+        logger.exception(msg)
+        raise TypeError()
+
+    # Check for empty DataFrames  
+    if vertical_features.empty or horizontal_features.empty:  
+        msg = "Cannot calculate correlations with empty DataFrames"  
+        logger.exception(msg)  
+        raise ValueError(msg)
+
+    if method not in ["pearson", "spearman", "kendall"]:
+        msg = "Correlation method must be one of 'pearson', 'spearman', or 'kendall'."
+        logger.exception(msg)
+        raise ValueError()
+
+    if not vertical_features.index.equals(horizontal_features.index):
+        msg = "Vertical and horizontal features must have the same index to calculate correlation. Set the index to the intersection of patient IDs."
+        logger.exception(msg)
+        raise ValueError()
+
+    # Add _ to beginnging of feature names if they don't start with _ so they can be used as suffixes
+    if not vertical_feature_name.startswith("_"): 
+        vertical_feature_name = f"_{vertical_feature_name}"
+    if not horizontal_feature_name.startswith("_"): 
+        horizontal_feature_name = f"_{horizontal_feature_name}"
+
+    # Join the features into one dataframe
+    # Use inner join to keep only the rows that have a value in both vertical and horizontal features
+    features_to_correlate = vertical_features.join(horizontal_features, 
+                                                   how='inner', 
+                                                   lsuffix=vertical_feature_name, 
+                                                   rsuffix=horizontal_feature_name) 
+
+    try:
+        # Calculate correlation between vertical features and horizontal features
+        correlation_matrix = features_to_correlate.corr(method=method)
+    except Exception as e:
+        msg = f"Error calculating correlation matrix: {e}"
+        logger.exception(msg)
+        raise e
+
+    return correlation_matrix
+
+
+
+def getVerticalSelfCorrelations(correlation_matrix:pd.DataFrame,
+                                num_vertical_features:int) -> pd.DataFrame:
+    """Get the vertical (y-axis) self correlations from a correlation matrix. Gets the top left quadrant of the correlation matrix.
+
+    Parameters
+    ----------
+    correlation_matrix : pd.DataFrame
+        Dataframe containing the correlation matrix to get the vertical self correlations from.
+    num_vertical_features : int
+        Number of vertical features in the correlation matrix.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe containing the vertical self correlations from the correlation matrix.    
+    """
+    try: 
+        validateDataframeSubsetSelection(correlation_matrix, num_vertical_features, num_vertical_features)
+    except ValueError as e:
+        msg = "Number of vertical features provided is greater than the number of rows or columns in the correlation matrix."
+        logger.exception(msg)
+        raise e
+
+    # Get the correlation matrix for vertical vs vertical - this is the top left corner of the matrix
+    return correlation_matrix.iloc[0:num_vertical_features, 0:num_vertical_features]
+
+
+
+def getHorizontalSelfCorrelations(correlation_matrix:pd.DataFrame,
+                                  num_horizontal_features:int) -> pd.DataFrame:
+    """Get the horizontal (x-axis) self correlations from a correlation matrix. Gets the bottom right quadrant of the correlation matrix.
+
+    Parameters
+    ----------
+    correlation_matrix : pd.DataFrame
+        Dataframe containing the correlation matrix to get the horizontal self correlations from.
+    num_horizontal_features : int
+        Number of horizontal features in the correlation matrix.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe containing the horizontal self correlations from the correlation matrix.
+    """
+    try: 
+        validateDataframeSubsetSelection(correlation_matrix, num_horizontal_features, num_horizontal_features)
+    except ValueError as e: 
+        msg = "Number of horizontalfeatures provided is greater than the number of rows or columns in the correlation matrix."
+        logger.exception(msg)
+        raise e
+
+    # Get the index of the start of the horizontal correlations
+    start_of_horizontal_correlations = len(correlation_matrix.columns) - num_horizontal_features
+
+    # Get the correlation matrix for horizontal vs horizontal - this is the bottom right corner of the matrix
+    return correlation_matrix.iloc[start_of_horizontal_correlations:, start_of_horizontal_correlations:]
+
+
+
+def getCrossCorrelationMatrix(correlation_matrix:pd.DataFrame,
+                              num_vertical_features:int) -> pd.DataFrame:
+    """Get the cross correlation matrix subsection for a correlation matrix. Gets the top right quadrant of the correlation matrix so vertical and horizontal features are correctly labeled.
+
+    Parameters
+    ----------
+    correlation_matrix : pd.DataFrame
+        Dataframe containing the correlation matrix to get the cross correlation matrix subsection from.
+    num_vertical_features : int
+        Number of vertical features in the correlation matrix.
+    
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe containing the cross correlations from the correlation matrix.
+    """
+    try:
+        validateDataframeSubsetSelection(correlation_matrix, num_vertical_features, num_vertical_features)
+    except ValueError as e:
+        msg = "Number of vertical features provided is greater than the number of rows or columns in the correlation matrix."
+        logger.exception(msg)
+        raise e
+
+    return correlation_matrix.iloc[0:num_vertical_features, num_vertical_features:]