Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REF] OASIS-to-BIDS writing sessions tsv files now use simpler code with data frames #1336

Merged
merged 3 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,23 +102,24 @@ def _create_participants_tsv(
encoding="utf-8",
)

@staticmethod
def _create_sessions_tsv(
self,
clinical_data_dir: Path,
bids_dir: Path,
bids_ids: list[str],
) -> None:
from .oasis_to_bids_utils import create_sessions_dict, write_sessions_tsv
from .oasis_to_bids_utils import create_sessions_df, write_sessions_tsv

sessions_dict = create_sessions_dict(
sessions_df = create_sessions_df(
clinical_data_dir=clinical_data_dir,
clinical_specifications_folder=Path(__file__).parents[1] / "specifications",
bids_ids=bids_ids,
)

write_sessions_tsv(bids_dir, sessions_dict)
write_sessions_tsv(bids_dir, sessions_df)

def _create_scans_tsv(self, bids_dir: Path) -> None:
@staticmethod
def _create_scans_tsv(bids_dir: Path) -> None:
from .oasis_to_bids_utils import write_scans_tsv

write_scans_tsv(bids_dir)
Expand Down
110 changes: 58 additions & 52 deletions clinica/iotools/converters/oasis_to_bids/oasis_to_bids_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,31 @@
from pathlib import Path
from typing import Iterable
from typing import Iterable, Union

import pandas as pd
from jinja2.utils import missing
AliceJoubert marked this conversation as resolved.
Show resolved Hide resolved

from clinica.iotools.bids_utils import StudyName, bids_id_factory
from clinica.utils.stream import cprint

__all__ = ["create_sessions_dict", "write_sessions_tsv", "write_scans_tsv"]
__all__ = ["create_sessions_df", "write_sessions_tsv", "write_scans_tsv"]


def create_sessions_dict(
def _convert_cdr_to_diagnosis(cdr: Union[int, str]) -> str:
    """Convert a global CDR score into a diagnosis label.

    Parameters
    ----------
    cdr : int or str
        The CDR value read from the clinical data, or the "n/a" placeholder
        used for subjects without clinical data.

    Returns
    -------
    str :
        "CN" if the CDR is 0, "AD" if it is strictly positive, and "n/a"
        for anything else (the "n/a" placeholder, NaN, negative values or
        any other non-numeric value).
    """
    # TODO: add unit tests.
    # Local import: keeps this helper independent of the module import header.
    from numbers import Real

    if isinstance(cdr, Real):
        if cdr == 0:
            return "CN"
        if cdr > 0:
            return "AD"
    # NaN fails both comparisons above, so a missing score is reported as
    # "n/a" instead of being silently labelled "AD".
    return "n/a"


def create_sessions_df(
clinical_data_dir: Path,
clinical_specifications_folder: Path,
bids_ids: Iterable[str],
) -> dict:
"""Extract the information regarding the sessions and store them in a dictionary (session M000 only).
) -> pd.DataFrame:
"""Extract the information regarding sessions M000 and store them in a dataframe.

Parameters
----------
Expand All @@ -24,85 +36,79 @@ def create_sessions_dict(
The path to the clinical file folder.

bids_ids : list of str
The list of bids ids.
The list of bids ids which are in the BIDS directory.

Returns
-------
dict :
Session dict.
pd.DataFrame :
Sessions dataframe indexed by BIDS ID, with one row per subject (session M000).
"""

study = StudyName.OASIS.value
location = f"{study} location"
spec = pd.read_csv(clinical_specifications_folder / "sessions.tsv", sep="\t")[
[study, location, "BIDS CLINICA"]
].dropna()
sessions_dict = {}

for loc in spec[location].unique():
file = pd.read_excel(clinical_data_dir / loc)
file["BIDS ID"] = file.ID.apply(
lambda x: bids_id_factory(StudyName.OASIS).from_original_study_id(x)
)
file.set_index("BIDS ID", drop=True, inplace=True)
sessions_df = pd.DataFrame()
for _, row in spec[spec[location] == loc].iterrows():
sessions_df[row["BIDS CLINICA"]] = file[row[[study]]]

sessions_df = sessions_df.loc[bids_ids]
sessions_df["diagnosis"] = sessions_df["diagnosis"].apply(
lambda x: "AD" if x > 0 else "CN"
sessions_df = pd.DataFrame()
if len(spec[location].unique()) == 1:
loc = spec[location].unique()[0]
else:
raise ValueError(
f"OASIS1 metadata is supposed to be contained in only 1 file, {len(spec[location].unique())} were detected : {spec[location].unique()}"
)
sessions_df["session_id"] = "ses-M000"

for bids_id, row in sessions_df.iterrows():
sessions_dict.update(
{bids_id: {"M000": {label: value for label, value in row.items()}}}
)
file = pd.read_excel(clinical_data_dir / loc)
file["BIDS ID"] = file.ID.apply(
lambda x: bids_id_factory(StudyName.OASIS).from_original_study_id(x)
)
file.set_index("BIDS ID", drop=True, inplace=True)

for _, row in spec[spec[location] == loc].iterrows():
sessions_df[row["BIDS CLINICA"]] = file[row[[study]]]

return sessions_dict
missing_subjects = set(bids_ids) - set(sessions_df.index)
for ms in missing_subjects:
sessions_df.loc[ms] = ["n/a" for _ in sessions_df.columns]

sessions_df = sessions_df.loc[bids_ids]

def write_sessions_tsv(bids_dir: Path, sessions_dict: dict) -> None:
"""Create <participant_id>_sessions.tsv files.
sessions_df["diagnosis"] = sessions_df["diagnosis"].apply(
lambda x: _convert_cdr_to_diagnosis(x)
)

Basically writes the content of the function
`clinica.iotools.bids_utils.create_sessions_dict` in several TSV files
following the BIDS specification.
sessions_df.insert(loc=0, column="session_id", value="ses-M000")

return sessions_df


def write_sessions_tsv(bids_dir: Path, sessions_df: pd.DataFrame) -> None:
"""Write the output of `clinica.iotools.converters.oasis_to_bids.oasis_to_bids_utils.create_sessions_df`
to one `<participant_id>_sessions.tsv` file per subject, following the BIDS specification.

Parameters
----------
bids_dir : Path
The path to the BIDS directory.

sessions_dict : dict
Dictionary containing sessions metadata.
sessions_df : DataFrame
Contains sessions metadata.

.. note::
This is the output of the function
`clinica.iotools.bids_utils.create_sessions_dict`.
`clinica.iotools.converters.oasis_to_bids.oasis_to_bids_utils.create_sessions_df`.

See also
--------
create_sessions_dict
create_sessions_df
"""
for subject_path in bids_dir.glob("sub-*"):
if subject_path.name in sessions_dict:
session_df = pd.DataFrame.from_dict(
sessions_dict[subject_path.name], orient="index"
)
cols = session_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
session_df = session_df[cols]
else:
print(f"No session data available for {subject_path}")
session_df = pd.DataFrame(columns=["session_id"])
session_df["session_id"] = pd.Series("M000")
session_df = session_df.set_index("session_id").fillna("n/a")
session_df.to_csv(
subject_path / f"{subject_path.name}_sessions.tsv",
for subject, data in sessions_df.iterrows():
session_path = bids_dir / subject
data.to_frame().T.to_csv(
session_path / f"{subject}_sessions.tsv",
sep="\t",
encoding="utf8",
index=False,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pandas.testing import assert_frame_equal

from clinica.iotools.converters.oasis_to_bids.oasis_to_bids_utils import (
create_sessions_dict,
create_sessions_df,
write_scans_tsv,
write_sessions_tsv,
)
Expand Down Expand Up @@ -104,52 +104,76 @@ def _build_bids_dir(bids_dir: Path) -> None:


@pytest.fixture
def expected() -> dict:
def expected() -> pd.DataFrame:
expected = {
"sub-OASIS10001": {
"M000": {
"session_id": "ses-M000",
"cdr_global": 0,
"MMS": 29,
"diagnosis": "CN",
},
"session_id": "ses-M000",
"cdr_global": 0,
"MMS": 29,
"diagnosis": "CN",
},
"sub-OASIS10002": {
"M000": {
"session_id": "ses-M000",
"cdr_global": 0,
"MMS": 29,
"diagnosis": "CN",
}
"session_id": "ses-M000",
"cdr_global": 0,
"MMS": 29,
"diagnosis": "CN",
},
}

expected = pd.DataFrame.from_dict(expected).T
expected.index.names = ["BIDS ID"]

return expected


def test_create_sessions_dict_success(
def test_create_sessions_df_success(
tmp_path,
clinical_data_path: Path,
sessions_path_success: Path,
expected: dict,
expected: pd.DataFrame,
):
result = create_sessions_dict(
result = create_sessions_df(
clinical_data_path,
sessions_path_success,
["sub-OASIS10001", "sub-OASIS10002"],
)
assert_frame_equal(expected, result, check_like=True, check_dtype=False)


def test_create_sessions_df_missing_clinical_data(
tmp_path,
clinical_data_path: Path,
sessions_path_success: Path,
expected: pd.DataFrame,
):
result = create_sessions_df(
clinical_data_path,
sessions_path_success,
["sub-OASIS10001", "sub-OASIS10002", "sub-OASIS10004"],
)
missing_line = pd.DataFrame.from_dict(
{
"sub-OASIS10004": {
"session_id": "ses-M000",
"diagnosis": "n/a",
"cdr_global": "n/a",
"MMS": "n/a",
}
}
).T
missing_line.index.names = ["BIDS ID"]

assert result == expected
expected = pd.concat([expected, missing_line])
assert_frame_equal(expected, result, check_like=True, check_dtype=False)


def test_create_sessions_dict_error(
def test_create_sessions_df_file_not_found(
tmp_path,
clinical_data_path: Path,
sessions_path_error: Path,
expected: dict,
):
with pytest.raises(FileNotFoundError):
create_sessions_dict(
create_sessions_df(
clinical_data_path,
sessions_path_error,
["sub-OASIS10001", "sub-OASIS10002"],
Expand All @@ -161,22 +185,21 @@ def test_write_sessions_tsv(
clinical_data_path: Path,
bids_dir: Path,
sessions_path_success: Path,
expected: dict,
expected: pd.DataFrame,
):
sessions = create_sessions_dict(
sessions = create_sessions_df(
clinical_data_path,
sessions_path_success,
["sub-OASIS10001", "sub-OASIS10002"],
)
write_sessions_tsv(tmp_path / "BIDS", sessions)
sessions_files = list((tmp_path / "BIDS").rglob("*.tsv"))
write_sessions_tsv(bids_dir, sessions)
sessions_files = list(bids_dir.rglob("*.tsv"))

assert len(sessions_files) == 2
for file in sessions_files:
assert_frame_equal(
pd.read_csv(file, sep="\t").set_index("session_id", drop=False),
pd.DataFrame(expected[file.parent.name]).T.set_index(
"session_id", drop=False
),
pd.read_csv(file, sep="\t").reset_index(drop=True),
expected.loc[[file.parent.name]].reset_index(drop=True),
check_like=True,
check_dtype=False,
)
Expand Down
Loading