feat: add io/loaders module #83

Merged
merged 19 commits on Dec 13, 2024
Changes from all commits

Commits (19)
fe6a77b
feat: add io/loaders module with features, general, and images specif…
strixy16 Dec 11, 2024
9010f03
test: test the loadImageDatasetConfig function
strixy16 Dec 12, 2024
08f8151
feat: add READII config files for the two test datasets
strixy16 Dec 12, 2024
1d3d96a
tests: add radiomic feature extraction output test for 4D lung so out…
strixy16 Dec 12, 2024
35d4dbc
feat: test radiomic feature output for 4D Lung
strixy16 Dec 12, 2024
c29d791
test: add ct to seg match file for 4D Lung so tests work
strixy16 Dec 12, 2024
186851f
refactor: improve import in test_general
strixy16 Dec 12, 2024
42f1370
chore: ignore test outputs in new data structure
strixy16 Dec 12, 2024
b2c61e4
chore: remove output feature file for 4D-lung
strixy16 Dec 12, 2024
4745b40
Merge remote-tracking branch 'origin/main' into katys/add-io
strixy16 Dec 12, 2024
89bc370
Refactor file loading with custom error handling and pathlib support …
jjjermiah Dec 13, 2024
8386e71
refactor: apply suggest updates for error handling, add logger
strixy16 Dec 13, 2024
e1aa3d8
refactor: made expected image types a fixture
strixy16 Dec 13, 2024
9b87aae
build: readii updated to 1.20.0
strixy16 Dec 13, 2024
229663b
fix: update output paths for ct_to_seg files for new data arrangement
strixy16 Dec 13, 2024
f4cc548
feat: update test output ignore to have procdata along with results f…
strixy16 Dec 13, 2024
0c7c45a
Merge branch 'main' into katys/add-io
jjjermiah Dec 13, 2024
11def04
chore: update lockfile
jjjermiah Dec 13, 2024
94111ba
feat: update test fixtures to copy metadata files to new paths, and r…
jjjermiah Dec 13, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -146,6 +146,8 @@ dmypy.json

# Test outputs
tests/output/*
tests/*/procdata/*
tests/*/results/*

# pixi environments
.pixi
56 changes: 28 additions & 28 deletions pixi.lock

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions src/readii/io/loaders/__init__.py
@@ -0,0 +1,12 @@
"""Module for loading different data types for the READII pipeline."""

from .features import loadFeatureFilesFromImageTypes
from .general import loadFileToDataFrame, loadImageDatasetConfig
from .images import getImageTypesFromDirectory

__all__ = [
"loadFeatureFilesFromImageTypes",
"loadFileToDataFrame",
"loadImageDatasetConfig",
"getImageTypesFromDirectory"
]
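
For orientation, a minimal usage sketch of the exported API (the dataset name and config directory here are hypothetical):

from readii.io.loaders import (
    getImageTypesFromDirectory,
    loadFeatureFilesFromImageTypes,
    loadFileToDataFrame,
    loadImageDatasetConfig,
)

# Expects <config_dir>/<dataset_name>.yaml to exist
config = loadImageDatasetConfig("MyDataset", "config")
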
85 changes: 85 additions & 0 deletions src/readii/io/loaders/features.py
@@ -0,0 +1,85 @@
import os
import pandas as pd

from typing import Optional, Dict

from readii.io.loaders.general import loadFileToDataFrame

from readii.utils import logger


def loadFeatureFilesFromImageTypes(extracted_feature_dir: str,
                                   image_types: list,
                                   drop_labels: Optional[bool] = True,
                                   labels_to_drop: Optional[list] = None) -> Dict[str, pd.DataFrame]:
"""Function to load in all the extracted imaging feature sets from a directory and return them as a dictionary of dataframes.

Parameters
----------
extracted_feature_dir : str
Path to the directory containing the extracted feature csv files
    image_types : list
        List of image types to load in.
drop_labels : bool, optional
Whether to drop the labels from the dataframes. Use when loading labelled data from data_setup_for_modeling.ipynb. The default is True.
labels_to_drop : list, optional
List of labels to drop from the dataframes. The default is ["patient_ID","survival_time_in_years","survival_event_binary"] based on code
in data_setup_for_modeling.ipynb.

Returns
-------
feature_sets : dict
Dictionary of dataframes containing the extracted radiomics features.
"""
# Set default labels to drop if not specified
if labels_to_drop is None:
labels_to_drop = ["patient_ID","survival_time_in_years","survival_event_binary"]

# Initialize dictionary to store the feature sets
feature_sets = {}

# Check if the passed in extracted feature directory exists
if not os.path.isdir(extracted_feature_dir):
raise FileNotFoundError(f"Extracted feature directory {extracted_feature_dir} does not exist.")

feature_file_list = os.listdir(extracted_feature_dir)

# Loop through all the files in the directory
for image_type in image_types:
        # Extract the image type feature csv file from the feature directory
        matching_files = [file for file in feature_file_list if (image_type in file) and (file.endswith(".csv"))]
        if not matching_files:
            logger.warning(f"No {image_type} feature csv files found in {extracted_feature_dir}")
            # Skip to the next image type
            continue

        image_type_feature_file = matching_files[0]
        # Remove the image type file from the list so it is not matched again for another image type
        feature_file_list.remove(image_type_feature_file)


# Get the full path to the feature file
feature_file_path = os.path.join(extracted_feature_dir, image_type_feature_file)

# Load the feature data into a pandas dataframe
raw_feature_data = loadFileToDataFrame(feature_file_path)

try:
# Drop the labels from the dataframe if specified
if drop_labels:
# Data is now only extracted features
raw_feature_data.drop(labels_to_drop, axis=1, inplace=True)
except KeyError as e:
logger.warning(f"{feature_file_path} does not have the labels {labels_to_drop} to drop.")
# Skip to the next image type
continue

# Save the dataframe to the feature_sets dictionary
feature_sets[image_type] = raw_feature_data

# After processing all image types, check if any feature sets were loaded
if not feature_sets:
raise ValueError(f"No valid feature sets were loaded from {extracted_feature_dir}")

return feature_sets
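
A short usage sketch for the loader above (the feature directory and image types are illustrative; drop_labels=False keeps all columns, while the default True drops the patient_ID/survival label columns):

from readii.io.loaders.features import loadFeatureFilesFromImageTypes

# Returns a dict mapping each requested image type to its feature dataframe;
# image types with no matching CSV in the directory are skipped with a warning.
feature_sets = loadFeatureFilesFromImageTypes(
    extracted_feature_dir="results/features",
    image_types=["original", "shuffled_full"],
    drop_labels=False,
)
original_features = feature_sets["original"]
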
99 changes: 99 additions & 0 deletions src/readii/io/loaders/general.py
@@ -0,0 +1,99 @@
from pathlib import Path

import pandas as pd
import yaml


class ConfigError(Exception):
"""Base class for errors in the config module."""

pass

class DataFrameLoadError(Exception):
"""Custom exception for DataFrame loading errors."""

pass

def loadImageDatasetConfig(dataset_name: str, config_dir_path: str | Path) -> dict:
"""Load the configuration file for a given dataset.

Expects the configuration file to be named <dataset_name>.yaml.

Parameters
----------
dataset_name : str
Name of the dataset to load the configuration file for.
config_dir_path : str or pathlib.Path
Path to the directory containing the configuration files.

Returns
-------
dict
Dictionary containing the configuration settings for the dataset.

Examples
--------
>>> config = loadImageDatasetConfig("NSCLC_Radiogenomics", "config")
"""
config_dir_path = Path(config_dir_path)
config_file_path = config_dir_path / f"{dataset_name}.yaml"

if not config_file_path.exists():
msg = f"Config file {config_file_path} does not exist."
raise FileNotFoundError(msg)

try:
with config_file_path.open("r") as f:
config = yaml.safe_load(f)
except yaml.YAMLError as ye:
raise ConfigError("Invalid YAML in config file") from ye

if not config:
raise ConfigError("Config file is empty or invalid")

return config



def loadFileToDataFrame(file_path: str | Path) -> pd.DataFrame:
"""Load data from a csv or xlsx file into a pandas dataframe.

Parameters
----------
file_path : str or pathlib.Path
Path to the data file.

Returns
-------
pd.DataFrame
Dataframe containing the data from the file.
"""
    if not file_path:
        msg = "A file path must be provided"
        raise ValueError(msg)
    file_path = Path(file_path)

if not file_path.exists():
msg = f"File {file_path} does not exist"
raise FileNotFoundError(msg)

# Get the file extension
file_extension = file_path.suffix

try:
if file_extension == '.xlsx':
df = pd.read_excel(file_path)
elif file_extension == '.csv':
df = pd.read_csv(file_path)
else:
msg = f"Unsupported file format {file_extension}. Please provide a .csv or .xlsx file."
raise ValueError(msg)

Comment on lines +82 to +90
💡 Codebase verification

Based on the gathered information, I can now provide the final response:

Add validation for potential Excel file format exploits

The current implementation only handles basic file extension checks and empty file errors, but lacks specific Excel format validation. Add the following security measures:

  • Use openpyxl engine explicitly with read_only=True to prevent formula execution
  • Add file size limits to prevent memory exhaustion attacks
  • Validate the Excel file structure before processing

Example locations that need attention:

  • src/readii/io/loaders/general.py:84: Update pd.read_excel() call with security parameters
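
For illustration, a minimal sketch of what the suggested hardening could look like inside loadFileToDataFrame (the size cap and the explicit openpyxl engine are assumptions, not part of this PR):

from pathlib import Path

import pandas as pd

from readii.io.loaders.general import DataFrameLoadError

# Assumed cap to guard against memory exhaustion from oversized spreadsheets
MAX_EXCEL_BYTES = 50 * 1024 * 1024

def _readExcelSafely(file_path: Path) -> pd.DataFrame:
    if file_path.stat().st_size > MAX_EXCEL_BYTES:
        msg = f"Excel file {file_path} exceeds the {MAX_EXCEL_BYTES} byte limit"
        raise DataFrameLoadError(msg)
    # Pin the engine so parsing does not depend on whichever backend pandas selects
    return pd.read_excel(file_path, engine="openpyxl")
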
🔗 Analysis chain

Add validation for Excel file format.

The current implementation doesn't validate Excel file formats, which could lead to security issues.

Let's verify the Excel file handling in the codebase:


Let me gather more context about the Excel file handling implementation.


Let me check for any additional error handling or validation mechanisms in the codebase.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Search for Excel file handling patterns
rg -l "\.xlsx?" .

# Check for potential Excel-related security measures
ast-grep --pattern 'pd.read_excel($$$)' 

Length of output: 199


Script:

#!/bin/bash
# Check for any Excel-related validation or security measures
rg -A 5 -B 5 "\.xlsx" .

# Look for any Excel-related error handling
rg -A 5 -B 5 "read_excel" .

# Check for any custom Excel validation functions
ast-grep --pattern 'def $_($$$) {
  $$$
  xlsx
  $$$
}'

Length of output: 2634


Script:

#!/bin/bash
# Check for custom exceptions and error handling
rg -A 10 "class.*Error" .

# Look for any validation utilities
rg -A 5 "validate|check|verify" .

# Check the full context of the file loading function
ast-grep --pattern 'def load_$_($$$) {
  $$$
}'

Length of output: 18408

except pd.errors.EmptyDataError as e:
raise DataFrameLoadError("File is empty") from e

except (pd.errors.ParserError, ValueError) as e:
raise DataFrameLoadError("Error parsing file") from e

if df.empty:
raise DataFrameLoadError("Dataframe is empty")
return df
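
A brief usage sketch for loadFileToDataFrame (the file path is illustrative; only .csv and .xlsx extensions are accepted):

from readii.io.loaders.general import loadFileToDataFrame

# Raises FileNotFoundError if the file does not exist, and DataFrameLoadError
# if the file is empty, cannot be parsed, or has an unsupported extension.
clinical_data = loadFileToDataFrame("rawdata/clinical_metadata.xlsx")
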
52 changes: 52 additions & 0 deletions src/readii/io/loaders/images.py
@@ -0,0 +1,52 @@
from pathlib import Path
from typing import Union

def getImageTypesFromDirectory(raw_data_dir: Union[Path, str],
                               feature_file_prefix: str = "",
                               feature_file_suffix: str = ".csv") -> list:
    """Get a list of image types from a directory containing image feature files.

Parameters
----------
    raw_data_dir : str or pathlib.Path
Path to the directory containing the image feature files.
feature_file_prefix : str, optional
Prefix to remove from the feature file name. The default is "".
feature_file_suffix : str, optional
Suffix to remove from the feature file name. The default is ".csv".

Returns
-------
list
List of image types from the image feature files.
"""
# Check if raw_data_dir is a string or a Path object, convert to Path object if it is a string
if isinstance(raw_data_dir, str):
raw_data_dir = Path(raw_data_dir)

# Check if the directory exists
if not raw_data_dir.exists():
raise FileNotFoundError(f"Directory {raw_data_dir} does not exist.")

# Check if the directory is a directory
if not raw_data_dir.is_dir():
raise NotADirectoryError(f"Path {raw_data_dir} is not a directory.")

# Check that directory contains files with the specified prefix and suffix
if not any(raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}")):
raise FileNotFoundError(f"No files with prefix {feature_file_prefix} and suffix {feature_file_suffix} found in directory {raw_data_dir}.")

# Initialize an empty list to store the image types
image_types = []

    # Get list of file names with the specified prefix and suffix in the directory
for file in raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}"):
file_name = file.name

# Remove the prefix and suffix from the file name
image_type = file_name.removeprefix(feature_file_prefix).removesuffix(feature_file_suffix)

# Add the image type to the list
image_types.append(image_type)

return image_types
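
A usage sketch for the helper above (the directory and file names are illustrative):

from readii.io.loaders.images import getImageTypesFromDirectory

# For files named like radiomicfeatures_original_4D-Lung.csv and
# radiomicfeatures_shuffled_full_4D-Lung.csv, stripping the prefix and suffix
# yields ["original_4D-Lung", "shuffled_full_4D-Lung"].
image_types = getImageTypesFromDirectory(
    "tests/4D-Lung/results/features",
    feature_file_prefix="radiomicfeatures_",
    feature_file_suffix=".csv",
)
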
20 changes: 20 additions & 0 deletions tests/4D-Lung/4D-Lung.yaml
@@ -0,0 +1,20 @@
# Config file for 4D-Lung for READII
dataset_name: 4D-Lung

### CLINICAL VARIABLE INFORMATION ###
# Event values should be in the order [Alive_value, Dead_value]
outcome_variables:
time_label:
event_label:
convert_to_years: False
event_value_mapping:

exclusion_variables:

train_test_split:
split: False
split_variable:
impute:


image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
20 changes: 20 additions & 0 deletions tests/NSCLC_Radiogenomics/NSCLC_Radiogenomics.yaml
@@ -0,0 +1,20 @@
# Config file for NSCLC_Radiogenomics for READII
dataset_name: NSCLC_Radiogenomics

### CLINICAL VARIABLE INFORMATION ###
# Event values should be in the order [Alive_value, Dead_value]
outcome_variables:
time_label: ""
event_label: "Survival Status"
convert_to_years: False
event_value_mapping: {'Alive': 0, 'Dead': 1}

exclusion_variables:

train_test_split:
split: False
split_variable:
impute:


image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
29 changes: 29 additions & 0 deletions tests/io/loaders/test_general.py
@@ -0,0 +1,29 @@
from readii.io.loaders.general import loadImageDatasetConfig
import pytest

@pytest.fixture
def nsclcConfigDirPath():
return "tests/NSCLC_Radiogenomics"

@pytest.fixture
def lung4DConfigDirPath():
return "tests/4D-Lung"

@pytest.fixture
def expected_image_types():
return ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]


def test_NSCLC_loadImageDatasetConfig(nsclcConfigDirPath, expected_image_types):
config = loadImageDatasetConfig("NSCLC_Radiogenomics", nsclcConfigDirPath)
assert config["dataset_name"] == "NSCLC_Radiogenomics"
assert config["image_types"] == expected_image_types
assert config["outcome_variables"]["event_label"] == "Survival Status"
assert config["outcome_variables"]["event_value_mapping"] == {'Alive': 0, 'Dead': 1}

def test_lung4D_loadImageDatasetConfig(lung4DConfigDirPath, expected_image_types):
config = loadImageDatasetConfig("4D-Lung", lung4DConfigDirPath)
assert config["dataset_name"] == "4D-Lung"
assert config["image_types"] == expected_image_types
assert config["outcome_variables"]["event_label"] is None
assert config["outcome_variables"]["event_value_mapping"] is None
2 changes: 2 additions & 0 deletions tests/output/ct_to_seg_match_list_4D-Lung.csv
@@ -0,0 +1,2 @@
patient_ID,study_CT,study_description_CT,series_CT,series_description_CT,subseries_CT,modality_CT,instances_CT,instance_uid_CT,reference_ct_CT,reference_rs_CT,reference_pl_CT,reference_frame_CT,folder_CT,orientation_CT,orientation_type_CT,MR_repetition_time_CT,MR_echo_time_CT,MR_scan_sequence_CT,MR_magnetic_field_strength_CT,MR_imaged_nucleus_CT,file_path_CT,series_seg,subseries_seg,modality_seg,instances_seg,instance_uid_seg,reference_ct_seg,reference_rs_seg,reference_pl_seg,reference_frame_seg,folder_seg,orientation_seg,orientation_type_seg,MR_repetition_time_seg,MR_echo_time_seg,MR_scan_sequence_seg,MR_magnetic_field_strength_seg,MR_imaged_nucleus_seg,file_path_seg,edge_type
113_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.324605948863389564556891313296,p4,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,"P4^P113^S303^I10349, Gated, 40.0%B",default,CT,99,1.3.6.1.4.1.14519.5.2.1.6834.5010.249506064276270740866733345688,,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543,"[1, 0, 0, 0, 1, 0]",,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543/1-81.dcm,2.25.186899387610254289948150314209581209847.35,default,RTSTRUCT,1,1.3.6.1.4.1.14519.5.2.1.6834.5010.815153834456695039602326691312,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,,,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,2
36 changes: 31 additions & 5 deletions tests/test_feature_extraction.py
Expand Up @@ -13,6 +13,8 @@
import collections
import pandas as pd
import os
import shutil
from pathlib import Path

@pytest.fixture
def nsclcCTImage():
@@ -44,7 +46,21 @@ def pyradiomicsParamFilePath():

@pytest.fixture
def nsclcMetadataPath():
return "tests/output/ct_to_seg_match_list_NSCLC_Radiogenomics.csv"
oldpath = Path("tests/output/ct_to_seg_match_list_NSCLC_Radiogenomics.csv")
newpath = Path("tests/NSCLC_Radiogenomics/procdata/ct_to_seg_match_list_NSCLC_Radiogenomics.csv")
newpath.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(oldpath, newpath)
yield newpath.as_posix()
newpath.unlink()

@pytest.fixture
def lung4DMetadataPath():
oldpath = Path("tests/output/ct_to_seg_match_list_4D-Lung.csv")
newpath = Path("tests/4D-Lung/procdata/ct_to_seg_match_list_4D-Lung.csv")
newpath.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(oldpath, newpath)
yield newpath.as_posix()
newpath.unlink()


def test_singleRadiomicFeatureExtraction_SEG(nsclcCTImage, nsclcSEGImage, pyradiomicsParamFilePath):
@@ -108,11 +124,21 @@ def test_radiomicFeatureExtraction(nsclcMetadataPath):
"Volume feature is incorrect"


def test_radiomicFeatureExtraction_output(nsclcMetadataPath):
"""Test output creation from radiomic feature extraction"""
def test_NSCLC_radiomicFeatureExtraction_output(nsclcMetadataPath):
"""Test output creation from radiomic feature extraction for SEG dataset"""
actual = radiomicFeatureExtraction(nsclcMetadataPath,
imageDirPath = "tests/",
roiNames = None,
outputDirPath = "tests/output/")
expected_path = "tests/output/features/radiomicfeatures_original_NSCLC_Radiogenomics.csv"
outputDirPath = "tests/NSCLC_Radiogenomics/results/")
expected_path = "tests/NSCLC_Radiogenomics/results/features/radiomicfeatures_original_NSCLC_Radiogenomics.csv"
assert os.path.exists(expected_path)


def test_4DLung_radiomicFeatureExtraction_output(lung4DMetadataPath):
"""Test output creation from radiomic feature extraction for RTSTRUCT dataset"""
actual = radiomicFeatureExtraction(lung4DMetadataPath,
imageDirPath = "tests/",
roiNames = "Tumor_c40",
outputDirPath = "tests/4D-Lung/results/")
expected_path = "tests/4D-Lung/results/features/radiomicfeatures_original_4D-Lung.csv"
assert os.path.exists(expected_path)