[ENH] Add derivatives command and pipeline-catalog submodule (#349)
* add examples of nipoppy proc status files

* add skeleton derivatives command

* add basic smoke test of derivatives cmd

* load TSV and check for missing IDs in derivatives cmd

* generalize util to load a tabular file

* Test added for load_tabular

* Added nipoppy pipeline catalogue as submodule

* Add loading of pipeline names and versions

* validate pipeline names & versions and store expected col names in a dict

* update help text & docstrings

* refactor unique subject check to generic util

* refactor pipeline name / version validation

* add example proc status file w/ subjects not in the synthetic dataset
- participant_id col of existing examples also updated to match Nipoppy

* check that proc status subs are in pheno-containing JSONLD

* refactor out jsonld validation & move IO utils into new module
- prevents circular import errors since some utils require models.py, which in turn requires IO utils

* switch to typer echo statement for model validation error

* factor out context extraction

* add logic to add completed pipelines to existing or new imaging sessions

* create utility for extracting imaging sessions from a JSONLD

* create util for creating completed pipelines

* handle missing BIDS sessions

* refine smoke test and add test using pheno-bids JSONLD

* refactor out custom session ID

* refactor out jsonld subject extraction

* create list of namespaces & update tests to catch outdated @context

* regenerate context in each cmd to ensure they are up-to-date

* add short option for overwrite to error msg

* update test data README

* handle jsonld loading together with dataset parsing

* create global vars for known pipelines + vers

* handle error for mismatched subs in separate func

* update bagel bids to add metadata to existing sessions

* update derivatives cmd to add to existing custom ses

* make get_imaging_session_instances a shared util

* update tests of bids command
alyssadai authored Oct 8, 2024
1 parent ac4b2c7 commit 826bc93
Showing 21 changed files with 1,147 additions and 212 deletions.
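The headline change is a new `bagel derivatives` subcommand (see bagel/cli.py below). As a quick orientation to how it is invoked, here is a minimal smoke test in the spirit of the tests this commit adds, using typer's CliRunner; the input file names are hypothetical stand-ins for a Nipoppy processing status TSV and a JSONLD produced by `bagel pheno`:

import json
from pathlib import Path

from typer.testing import CliRunner

from bagel.cli import bagel

runner = CliRunner()


def test_derivatives_smoke(tmp_path: Path):
    # "proc_status.tsv" and "example_synthetic.jsonld" are hypothetical inputs:
    # a Nipoppy processing status file and a JSONLD created by `bagel pheno`.
    output_path = tmp_path / "pheno_derivatives.jsonld"
    result = runner.invoke(
        bagel,
        [
            "derivatives",
            "--tabular", "proc_status.tsv",
            "--jsonld-path", "example_synthetic.jsonld",
            "--output", str(output_path),
            "--overwrite",
        ],
    )
    assert result.exit_code == 0
    # The output should be a JSON-LD document with a regenerated @context
    assert "@context" in json.loads(output_path.read_text())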
.gitmodules (3 changes: 3 additions & 0 deletions)
@@ -4,3 +4,6 @@
 [submodule "neurobagel_examples"]
 	path = neurobagel_examples
 	url = https://github.com/neurobagel/neurobagel_examples.git
+[submodule "pipeline-catalog"]
+	path = pipeline-catalog
+	url = https://github.com/nipoppy/pipeline-catalog.git
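The new pipeline-catalog submodule (from Nipoppy) is what lets the CLI recognize pipeline names and versions. Below is a rough sketch of how the catalog might be loaded into the global known-pipelines mapping described in the commit messages; the file location, JSON structure, and helper are assumptions for illustration, not the actual bagel code:

import json
from pathlib import Path

# Assumed location and structure of the catalog; the real submodule layout may differ.
PIPELINE_CATALOG = Path(__file__).parent / "pipeline-catalog" / "processing" / "processing.json"


def load_known_pipelines(catalog_path: Path) -> dict:
    """Map each recognized pipeline name to its list of known versions."""
    catalog = json.loads(catalog_path.read_text())
    return {pipeline["name"]: pipeline["versions"] for pipeline in catalog}


# Global lookup used to validate the processing status file, e.g.
# {"fmriprep": ["20.2.7", "23.1.3"], "freesurfer": ["6.0.1", "7.3.2"]}
KNOWN_PIPELINES = load_known_pipelines(PIPELINE_CATALOG)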
bagel/bids_utils.py (13 changes: 0 additions & 13 deletions)
@@ -24,19 +24,6 @@ def get_bids_subjects_simple(bids_dir: Path) -> list:
     return bids_subject_list
 
 
-def check_unique_bids_subjects(pheno_subjects: list, bids_subjects: list):
-    """Raises informative error if subject IDs exist that are found only in the BIDS directory."""
-    unique_bids_subjects = set(bids_subjects).difference(pheno_subjects)
-    if len(unique_bids_subjects) > 0:
-        raise LookupError(
-            "The specified BIDS directory contains subject IDs not found in "
-            "the provided phenotypic json-ld file:\n"
-            f"{unique_bids_subjects}\n"
-            "Subject IDs are case sensitive. "
-            "Please check that the specified BIDS and phenotypic datasets match."
-        )
-
-
 def create_acquisitions(
     layout: BIDSLayout,
     bids_sub_id: str,
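The deleted check_unique_bids_subjects was generalized into confirm_subs_match_pheno_data in bagel/utility.py, so the same mismatch check can serve both the BIDS directory and the processing status file (see its call sites in cli.py below). A plausible sketch of the generalized helper, based on the deleted function; the exact error wording may differ:

def confirm_subs_match_pheno_data(
    subjects: list, subject_source_for_err: str, pheno_subjects: list
):
    """Raise an informative error if subject IDs exist that are not found in the phenotypic JSONLD."""
    unique_subjects = set(subjects).difference(pheno_subjects)
    if len(unique_subjects) > 0:
        raise LookupError(
            f"The specified {subject_source_for_err} contains subject IDs not found in "
            "the provided json-ld file:\n"
            f"{unique_subjects}\n"
            "Subject IDs are case sensitive. "
            "Please check that the inputs match."
        )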
bagel/cli.py (252 changes: 201 additions & 51 deletions)
@@ -3,12 +3,23 @@
 
 import typer
 from bids import BIDSLayout
-from pydantic import ValidationError
 
 import bagel.bids_utils as butil
+import bagel.derivatives_utils as dutil
+import bagel.file_utils as futil
 import bagel.pheno_utils as putil
 from bagel import mappings, models
-from bagel.utility import check_overwrite, load_json
+from bagel.derivatives_utils import PROC_STATUS_COLS
+from bagel.utility import (
+    confirm_subs_match_pheno_data,
+    extract_and_validate_jsonld_dataset,
+    generate_context,
+    get_imaging_session_instances,
+    get_subject_instances,
+)
+
+# TODO: Coordinate with Nipoppy about what we want to name this
+CUSTOM_SESSION_LABEL = "ses-nb01"
 
 bagel = typer.Typer(
     help="""
@@ -84,14 +95,14 @@ def pheno(
     graph data model for the provided phenotypic file in the .jsonld format.
     You can upload this .jsonld file to the Neurobagel graph.
     """
-    # Check if output file already exists
-    check_overwrite(output, overwrite)
+    futil.check_overwrite(output, overwrite)
 
-    data_dictionary = load_json(dictionary)
-    pheno_df = putil.load_pheno(pheno)
+    data_dictionary = futil.load_json(dictionary)
+    pheno_df = futil.load_tabular(pheno)
     putil.validate_inputs(data_dictionary, pheno_df)
 
-    # Display validated input paths to user
+    # NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement.
+    # It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces)
     space = 25
     print(
         "Processing phenotypic annotations:\n"
@@ -119,12 +130,12 @@ def pheno(
         for session_row_idx, session_row in _sub_pheno.iterrows():
             # If there is no session column, we create a session with a custom label "ses-nb01" to assign each subject's phenotypic data to
             if session_column is None:
-                session_name = "ses-nb01"  # TODO: Should we make this more obscure to avoid potential overlap with actual session names?
+                session_label = CUSTOM_SESSION_LABEL
             else:
                 # NOTE: We take the name from the first session column - we don't know how to handle multiple session columns yet
-                session_name = session_row[session_column[0]]
+                session_label = session_row[session_column[0]]
 
-            session = models.PhenotypicSession(hasLabel=str(session_name))
+            session = models.PhenotypicSession(hasLabel=str(session_label))
             _ses_pheno = session_row
 
             if "sex" in column_mapping.keys():
@@ -185,7 +196,7 @@
         hasSamples=subject_list,
     )
 
-    context = putil.generate_context()
+    context = generate_context()
     # We can't just exclude_unset here because the identifier and schemaKey
     # for each instance are created as default values and so technically are never set
     # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
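generate_context was moved out of pheno_utils into the shared utility module, and per the commit messages it now derives the @context from a single list of namespaces so that every command regenerates an up-to-date context at write time. A self-contained sketch of that idea; the namespace type, registry contents, and URLs here are illustrative assumptions rather than the actual bagel/mappings.py code:

from collections import namedtuple

Namespace = namedtuple("Namespace", ["prefix", "url"])

# Illustrative subset; the real registry lives in bagel/mappings.py.
ALL_NAMESPACES = [
    Namespace("nb", "http://neurobagel.org/vocab/"),
    Namespace("snomed", "http://purl.bioontology.org/ontology/SNOMEDCT/"),
]


def generate_context() -> dict:
    """Build the JSON-LD @context fresh from the namespace registry."""
    return {"@context": {ns.prefix: ns.url for ns in ALL_NAMESPACES}}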
@@ -203,7 +214,8 @@ def bids(
         ...,
         "--jsonld-path",
         "-p",  # for pheno
-        help="The path to the .jsonld file containing the phenotypic data for your dataset, created by the bagel pheno command.",
+        help="The path to the .jsonld file containing the phenotypic data for your dataset, created by the bagel pheno command. "
+        "This file may optionally also include the processing pipeline metadata for the dataset (created by the bagel derivatives command).",
         exists=True,
         file_okay=True,
         dir_okay=False,
@@ -236,55 +248,47 @@
     ),
 ):
     """
-    Extract imaging metadata from a valid BIDS dataset and combine them
-    with phenotypic metadata (.jsonld) created by the bagel pheno command.
+    Extract imaging metadata from a valid BIDS dataset and integrate it with
+    subjects' harmonized phenotypic data (from the bagel pheno command) and, optionally,
+    processing pipeline metadata (from the bagel derivatives command) in a single .jsonld file.
     NOTE: Must be run AFTER the pheno command.
     This command will create a valid, subject-level instance of the Neurobagel
     graph data model for the combined metadata in the .jsonld format.
     You can upload this .jsonld file to the Neurobagel graph.
     """
-    # Check if output file already exists
-    check_overwrite(output, overwrite)
+    futil.check_overwrite(output, overwrite)
 
-    space = 32
+    space = 51
     print(
         "Running initial checks of inputs...\n"
-        f"  {'Phenotypic .jsonld to augment:' : <{space}} {jsonld_path}\n"
+        f"  {'Existing subject graph data to augment (.jsonld):' : <{space}} {jsonld_path}\n"
        f"  {'BIDS dataset directory:' : <{space}} {bids_dir}"
     )
 
-    jsonld = load_json(jsonld_path)
-    # Strip and store context to be added back later, since it's not part of
-    # (and can't be easily added) to the existing data model
-    context = {"@context": jsonld.pop("@context")}
-    try:
-        pheno_dataset = models.Dataset.parse_obj(jsonld)
-    except ValidationError as err:
-        print(err)
+    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
 
-    pheno_subject_dict = {
-        pheno_subject.hasLabel: pheno_subject
-        for pheno_subject in getattr(pheno_dataset, "hasSamples")
-    }
+    existing_subs_dict = get_subject_instances(jsonld_dataset)
 
     # TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
-    butil.check_unique_bids_subjects(
-        pheno_subjects=pheno_subject_dict.keys(),
-        bids_subjects=butil.get_bids_subjects_simple(bids_dir),
+    confirm_subs_match_pheno_data(
+        subjects=butil.get_bids_subjects_simple(bids_dir),
+        subject_source_for_err="BIDS directory",
+        pheno_subjects=existing_subs_dict.keys(),
     )
 
     print("Initial checks of inputs passed.\n")
 
     print("Parsing and validating BIDS dataset. This may take a while...")
     layout = BIDSLayout(bids_dir, validate=True)
     print("BIDS parsing completed.\n")
 
-    print(
-        "Merging subject-level BIDS metadata with the phenotypic annotations...\n"
-    )
+    print("Merging BIDS metadata with existing subject annotations...\n")
     for bids_sub_id in layout.get_subjects():
-        pheno_subject = pheno_subject_dict.get(f"sub-{bids_sub_id}")
-        session_list = []
+        existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}")
+        existing_sessions_dict = get_imaging_session_instances(
+            existing_subject
+        )
 
         bids_sessions = layout.get_sessions(subject=bids_sub_id)
         if not bids_sessions:
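extract_and_validate_jsonld_dataset and get_subject_instances replace the inline JSONLD parsing removed above; per the commit messages, model validation errors now go through a typer echo rather than a bare print. A plausible reconstruction from the deleted lines (the exit behavior on a validation failure is an assumption):

from pathlib import Path

import typer
from pydantic import ValidationError

import bagel.file_utils as futil
from bagel import models


def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
    """Load a .jsonld file and parse it into a Dataset model instance."""
    jsonld = futil.load_json(file_path)
    # The @context is not part of the data model; it is regenerated before saving
    jsonld.pop("@context")
    try:
        return models.Dataset.parse_obj(jsonld)
    except ValidationError as err:
        typer.echo(err, err=True)
        raise typer.Exit(code=1)


def get_subject_instances(dataset: models.Dataset) -> dict:
    """Map subject labels to the corresponding subject model instances."""
    return {subject.hasLabel: subject for subject in dataset.hasSamples}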
@@ -294,42 +298,188 @@
 
         # For some reason .get_sessions() doesn't always follow alphanumeric order
         # By default (without sorting) the session lists look like ["02", "01"] per subject
-        for session in sorted(bids_sessions):
+        for session_id in sorted(bids_sessions):
             image_list = butil.create_acquisitions(
                 layout=layout,
                 bids_sub_id=bids_sub_id,
-                session=session,
+                session=session_id,
             )
 
             # If subject's session has no image files, a Session object is not added
             if not image_list:
                 continue
 
             # TODO: Currently if a subject has BIDS data but no "ses-" directories (e.g., only 1 session),
-            # we create a session for that subject with a custom label "ses-nb01" to be added to the graph
-            # so the API can still find the session-level information.
+            # we create a session for that subject with a custom label "ses-nb01" to be added to the graph.
             # However, we still provide the BIDS SUBJECT directory as the session path, instead of making up a path.
             # This should be revisited in the future as for these cases the resulting dataset object is not
             # an exact representation of what's on disk.
-            session_label = "nb01" if session is None else session
+            # Here, we also need to add back "ses" prefix because pybids stripped it
+            session_label = (
+                CUSTOM_SESSION_LABEL
+                if session_id is None
+                else f"ses-{session_id}"
+            )
             session_path = butil.get_session_path(
                 layout=layout,
                 bids_dir=bids_dir,
                 bids_sub_id=bids_sub_id,
-                session=session,
+                session=session_id,
             )
 
-            session_list.append(
-                # Add back "ses" prefix because pybids stripped it
-                models.ImagingSession(
-                    hasLabel="ses-" + session_label,
-                    hasFilePath=session_path,
-                    hasAcquisition=image_list,
-                )
-            )
-
-        pheno_subject.hasSession += session_list
+            # If a custom Neurobagel-created session already exists (if `bagel derivatives` was run first),
+            # we add to that session when there is no session layer in the BIDS directory
+            if session_label in existing_sessions_dict:
+                existing_img_session = existing_sessions_dict.get(
+                    session_label
+                )
+                existing_img_session.hasAcquisition = image_list
+                existing_img_session.hasFilePath = session_path
+            else:
+                new_imaging_session = models.ImagingSession(
+                    hasLabel=session_label,
+                    hasFilePath=session_path,
+                    hasAcquisition=image_list,
+                )
+                existing_subject.hasSession.append(new_imaging_session)
 
-    merged_dataset = {**context, **pheno_dataset.dict(exclude_none=True)}
+    context = generate_context()
+    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}
 
     with open(output, "w") as f:
         f.write(json.dumps(merged_dataset, indent=2))
 
     print(f"Saved output to: {output}")
+
+
+@bagel.command()
+def derivatives(
+    tabular: Path = typer.Option(
+        ...,
+        "--tabular",
+        "-t",
+        help="The path to a .tsv containing subject-level processing pipeline status info. Expected to comply with the Nipoppy processing status file schema.",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True,
+    ),
+    # TODO: Remove _path?
+    jsonld_path: Path = typer.Option(
+        ...,
+        "--jsonld-path",
+        "-p",  # for pheno
+        help="The path to a .jsonld file containing the phenotypic data for your dataset, created by the bagel pheno command. This JSONLD may optionally also include the BIDS metadata for the dataset (created by the bagel bids command).",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True,
+    ),
+    output: Path = typer.Option(
+        "pheno_derivatives.jsonld",
+        "--output",
+        "-o",
+        help="The path for the output .jsonld file.",
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True,
+    ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        "-f",
+        help="Overwrite output file if it already exists.",
+    ),
+):
+    """
+    Extract subject processing pipeline and derivative metadata from a tabular processing status file and
+    integrate them in a single .jsonld with subjects' harmonized phenotypic data (from the bagel pheno command) and optionally,
+    BIDS metadata (from the bagel bids command).
+    NOTE: Must be run AFTER the pheno command.
+    This command will create a valid, subject-level instance of the Neurobagel
+    graph data model for the combined metadata in the .jsonld format.
+    You can upload this .jsonld file to the Neurobagel graph.
+    """
+    futil.check_overwrite(output, overwrite)
+
+    space = 51
+    print(
+        "Processing subject-level derivative metadata...\n"
+        f"  {'Existing subject graph data to augment (.jsonld):' : <{space}}{jsonld_path}\n"
+        f"  {'Processing status file (.tsv):' : <{space}}{tabular}"
+    )
+
+    status_df = futil.load_tabular(tabular, input_type="processing status")
+
+    # We don't allow empty values in the participant ID column
+    if row_indices := putil.get_rows_with_empty_strings(
+        status_df, [PROC_STATUS_COLS["participant"]]
+    ):
+        raise LookupError(
+            f"Your processing status file contains missing values in the column '{PROC_STATUS_COLS['participant']}'. "
+            "Please ensure that every row has a non-empty participant id. "
+            f"We found missing values in the following rows (first row is zero): {row_indices}."
+        )
+
+    pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique()
+    dutil.check_pipelines_are_recognized(pipelines)
+
+    # TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together?
+    for pipeline in pipelines:
+        versions = status_df[
+            status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline
+        ][PROC_STATUS_COLS["pipeline_version"]].unique()
+
+        dutil.check_pipeline_versions_are_recognized(pipeline, versions)
+
+    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
+
+    existing_subs_dict = get_subject_instances(jsonld_dataset)
+
+    confirm_subs_match_pheno_data(
+        subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
+        subject_source_for_err="processing status file",
+        pheno_subjects=existing_subs_dict.keys(),
+    )
+
+    # Create sub-dataframes for each subject
+    for subject, sub_proc_df in status_df.groupby(
+        PROC_STATUS_COLS["participant"]
+    ):
+        existing_subject = existing_subs_dict.get(subject)
+
+        # Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
+        existing_sessions_dict = get_imaging_session_instances(
+            existing_subject
+        )
+
+        for session_label, sub_ses_proc_df in sub_proc_df.groupby(
+            PROC_STATUS_COLS["session"]
+        ):
+            completed_pipelines = dutil.create_completed_pipelines(
+                sub_ses_proc_df
+            )
+
+            if not completed_pipelines:
+                continue
+
+            session_label = (
+                CUSTOM_SESSION_LABEL if session_label == "" else session_label
+            )
+            if session_label in existing_sessions_dict:
+                existing_img_session = existing_sessions_dict.get(
+                    session_label
+                )
+                existing_img_session.hasCompletedPipeline = completed_pipelines
+            else:
+                new_img_session = models.ImagingSession(
+                    hasLabel=session_label,
+                    hasCompletedPipeline=completed_pipelines,
+                )
+                existing_subject.hasSession.append(new_img_session)
+
+    context = generate_context()
+    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}
+
+    with open(output, "w") as f:
+        f.write(json.dumps(merged_dataset, indent=2))
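Two helpers used above but not shown in this diff: get_imaging_session_instances (shared between the bids and derivatives commands) and dutil.create_completed_pipelines. Sketches consistent with their call sites; the Subject type annotation, the status column name and its success value, and the CompletedPipeline field names are assumptions based on the Nipoppy schema rather than code from this commit:

import pandas as pd

from bagel import models
from bagel.derivatives_utils import PROC_STATUS_COLS


def get_imaging_session_instances(subject: models.Subject) -> dict:
    """Map session labels to the subject's existing imaging sessions, skipping phenotypic sessions."""
    return {
        session.hasLabel: session
        for session in subject.hasSession
        if isinstance(session, models.ImagingSession)
    }


def create_completed_pipelines(sub_ses_proc_df: pd.DataFrame) -> list:
    """Create a CompletedPipeline for each pipeline name-version pair that ran successfully
    for a given subject-session."""
    completed_pipelines = []
    for (name, version), pipeline_df in sub_ses_proc_df.groupby(
        [
            PROC_STATUS_COLS["pipeline_name"],
            PROC_STATUS_COLS["pipeline_version"],
        ]
    ):
        # "pipeline_complete" / "SUCCESS" are assumed from the Nipoppy processing status schema
        if (pipeline_df["pipeline_complete"] == "SUCCESS").all():
            completed_pipelines.append(
                models.CompletedPipeline(
                    hasPipelineName=name, hasPipelineVersion=version
                )
            )
    return completed_pipelines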
