Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REF] OASIS-to-BIDS : improve code quality for function create_sessions_dict #1325

Merged
merged 7 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,10 @@ def _create_sessions_tsv(

sessions_dict = create_sessions_dict(
clinical_data_dir=clinical_data_dir,
bids_dir=bids_dir,
clinical_specifications_folder=Path(__file__).parents[1] / "specifications",
bids_ids=bids_ids,
)

# TODO: once tested, move this into create_sessions_dict because it is specific to OASIS1
for bids_id in bids_ids:
sessions_dict[bids_id]["M000"]["diagnosis"] = (
"AD" if sessions_dict[bids_id]["M000"]["diagnosis"] > 0 else "CN"
)
write_sessions_tsv(bids_dir, sessions_dict)

return sessions_dict
Expand Down
91 changes: 27 additions & 64 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
import os
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

from clinica.iotools.bids_utils import StudyName, get_bids_sess_list
from clinica.utils.stream import cprint
from clinica.iotools.bids_utils import StudyName, bids_id_factory

__all__ = ["create_sessions_dict", "write_sessions_tsv"]


def create_sessions_dict(
clinical_data_dir: Path,
bids_dir: Path,
clinical_specifications_folder: Path,
AliceJoubert marked this conversation as resolved.
Show resolved Hide resolved
bids_ids: Iterable[str],
) -> dict:
Expand All @@ -24,11 +20,8 @@ def create_sessions_dict(
clinical_data_dir : Path
The path to the input folder.

bids_dir : Path
The path to the BIDS directory.

clinical_specifications_folder : Path
The path to the clinical file.
The path to the clinical file folder.

bids_ids : list of str
The list of bids ids.
Expand All @@ -39,63 +32,33 @@ def create_sessions_dict(
Session dict.
"""

location = f"{StudyName.OASIS.value} location"
sessions = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")
sessions_fields = sessions[StudyName.OASIS.value]
field_location = sessions[location]
sessions_fields_bids = sessions["BIDS CLINICA"]
fields_dataset = []
fields_bids = []
study = StudyName.OASIS.value
location = f"{study} location"
spec = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")[
[study, location, "BIDS CLINICA"]
].dropna()
sessions_dict = {}

for i in range(0, len(sessions_fields)):
if not pd.isnull(sessions_fields[i]):
fields_bids.append(sessions_fields_bids[i])
fields_dataset.append(sessions_fields[i])

for i in range(0, len(sessions_fields)):
# If the i-th field is available
if not pd.isnull(sessions_fields[i]):
# Load the file
tmp = field_location[i].split("/")
location = tmp[0]
sheet = tmp[1] if len(tmp) > 1 else 0
file_to_read_path = clinical_data_dir / location
file_ext = os.path.splitext(location)[1]
if file_ext == ".xlsx":
file_to_read = pd.read_excel(file_to_read_path, sheet_name=sheet)
elif file_ext == ".csv":
file_to_read = pd.read_csv(file_to_read_path)
else:
raise ValueError(
f"Unknown file extension {file_ext}. Expecting either .xlsx or .csv."
)

for r in range(0, len(file_to_read.values)):
# Extracts the subject ids columns from the dataframe
subj_id = file_to_read.iloc[r]["ID"]
if hasattr(subj_id, "dtype"):
if subj_id.dtype == np.int64:
subj_id = str(subj_id)
# Build the alphanumeric subject id: insert "IS" after the first three characters and drop the separator at index 4
subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])

# Extract the corresponding BIDS id and create the output file if it doesn't exist
subj_bids = [s for s in bids_ids if subj_id_alpha in s]
if subj_bids:
subj_bids = subj_bids[0]
subj_dir = bids_dir / subj_bids
session_names = get_bids_sess_list(subj_dir)
for s in session_names:
s_name = s.replace("ses-", "")
NicolasGensollen marked this conversation as resolved.
Show resolved Hide resolved
row = file_to_read.iloc[r]
if subj_bids not in sessions_dict:
sessions_dict.update({subj_bids: {}})
if s_name not in sessions_dict[subj_bids].keys():
sessions_dict[subj_bids].update({s_name: {"session_id": s}})
(sessions_dict[subj_bids][s_name]).update(
{sessions_fields_bids[i]: row[sessions_fields[i]]}
)
for loc in spec[location].unique():
file = pd.read_excel(clinical_data_dir / loc)
file["BIDS ID"] = file.ID.apply(
lambda x: bids_id_factory(StudyName.OASIS).from_original_study_id(x)
)
file.set_index("BIDS ID", drop=True, inplace=True)
sessions_df = pd.DataFrame()
for _, row in spec[spec[location] == loc].iterrows():
sessions_df[row["BIDS CLINICA"]] = file[row[[study]]]

sessions_df = sessions_df.loc[bids_ids]
sessions_df["diagnosis"] = sessions_df["diagnosis"].apply(
lambda x: "AD" if x > 0 else "CN"
)
sessions_df["session_id"] = "ses-M000"

for bids_id, row in sessions_df.iterrows():
sessions_dict.update(
{bids_id: {"M000": {label: value for label, value in row.items()}}}
)

return sessions_dict

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def _build_clinical_data(clinical_data_path: Path) -> None:
"Delay": [float("nan"), float("nan")],
}
)
df.to_csv(clinical_data_path / "oasis_cross-sectional.csv", index=False)

# todo : future with excel
df.to_excel(
clinical_data_path / "oasis_cross-sectional-5708aa0a98d82080.xlsx", index=False
)


@pytest.fixture
Expand All @@ -57,9 +57,9 @@ def _build_spec_sessions_success(sessions_path_success: Path) -> None:
"ADNI": [np.nan, np.nan, np.nan, "foo"],
"OASIS": ["CDR", "MMSE", "CDR", np.nan],
"OASIS location": [
"oasis_cross-sectional.csv",
"oasis_cross-sectional.csv",
"oasis_cross-sectional.csv",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
"oasis_cross-sectional-5708aa0a98d82080.xlsx",
np.nan,
],
}
Expand Down Expand Up @@ -109,21 +109,15 @@ def expected() -> dict:
"session_id": "ses-M000",
"cdr_global": 0,
"MMS": 29,
"diagnosis": 0,
},
"M006": {
"session_id": "ses-M006",
"cdr_global": 0,
"MMS": 29,
"diagnosis": 0,
"diagnosis": "CN",
},
},
"sub-OASIS10002": {
"M000": {
"session_id": "ses-M000",
"cdr_global": 0,
"MMS": 29,
"diagnosis": 0,
"diagnosis": "CN",
}
},
}
Expand All @@ -134,15 +128,11 @@ def expected() -> dict:
def test_create_sessions_dict_success(
tmp_path,
clinical_data_path: Path,
bids_dir: Path,
sessions_path_success: Path,
expected: dict,
):
# todo : how does it handle nan inside excel/csv ? verify with excel

result = create_sessions_dict(
clinical_data_path,
bids_dir,
sessions_path_success,
["sub-OASIS10001", "sub-OASIS10002"],
)
Expand All @@ -153,16 +143,12 @@ def test_create_sessions_dict_success(
def test_create_sessions_dict_error(
tmp_path,
clinical_data_path: Path,
bids_dir: Path,
sessions_path_error: Path,
expected: dict,
):
# todo : how does it handle nan inside excel/csv ? verify with excel

with pytest.raises(FileNotFoundError):
create_sessions_dict(
clinical_data_path,
bids_dir,
sessions_path_error,
["sub-OASIS10001", "sub-OASIS10002"],
)
Expand All @@ -177,7 +163,6 @@ def test_write_sessions_tsv(
):
sessions = create_sessions_dict(
clinical_data_path,
bids_dir,
sessions_path_success,
["sub-OASIS10001", "sub-OASIS10002"],
)
Expand Down
Loading