From 826bc93105502931e3e66189b1888410a7bb2397 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Tue, 8 Oct 2024 11:41:02 -0400 Subject: [PATCH] [ENH] Add derivatives command and `pipeline-catalog` submodule (#349) * add examples of nipoppy proc status files * add skeleton derivatives command * add basic smoke test of derivatives cmd * load TSV and check for missing IDs in derivatives cmd * generalize util to load a tabular file * Test added for load_tabular * Added nipoppy pipeline catalogue as submodule * Add loading of pipeline names and versions * validate pipeline names & versions and store expected col names in a dict * update help text & docstrings * refactor unique subject check to generic util * refactor pipeline name / version validation * add example proc status file w/ subjects not in the synthetic dataset - participant_id col of existing examples also updated to match Nipoppy * check that proc status subs are in pheno-containing JSONLD * refactor out jsonld validation & move IO utils into new module - prevents circular import errors since some utils require models.py, which in turn requires IO utils * switch to typer echo statement for model validation error * factor out context extraction * add logic to add completed pipelines to existing or new imaging sessions * create utility for extracting imaging sessions from a JSONLD * create util for creating completed pipelines * handle missing BIDS sessions * refine smoke test and add test using pheno-bids JSONLD * refactor out custom session ID * refactor out jsonld subject extraction * create list of namespaces & update tests to catch outdated @context * regenerate context in each cmd to ensure they are up-to-date * add short option for overwrite to error msg * update test data README * handle jsonld loading tgt with dataset parsing * create global vars for known pipelines + vers * handle error for mismatched subs in separate func * update bagel bids to add metadata to existing sessions * update derivatives cmd to add to existing custom ses * make get_imaging_session_instances a shared util * update tests of bids command --- .gitmodules | 3 + bagel/bids_utils.py | 13 - bagel/cli.py | 252 ++++++++++++--- bagel/derivatives_utils.py | 77 +++++ bagel/file_utils.py | 96 ++++++ bagel/mappings.py | 42 +++ bagel/pheno_utils.py | 77 +---- bagel/tests/data/README.md | 16 +- .../data/proc_status_missing_sessions.tsv | 8 + .../data/proc_status_no_bids_sessions.tsv | 9 + bagel/tests/data/proc_status_synthetic.csv | 9 + bagel/tests/data/proc_status_synthetic.tsv | 9 + .../data/proc_status_synthetic_incomplete.tsv | 9 + .../data/proc_status_unique_sessions.tsv | 9 + bagel/tests/data/proc_status_unique_subs.tsv | 7 + bagel/tests/test_cli_bids.py | 78 ++++- bagel/tests/test_cli_derivatives.py | 207 ++++++++++++ bagel/tests/test_cli_pheno.py | 2 +- bagel/tests/test_utility.py | 306 ++++++++++++++++-- bagel/utility.py | 129 ++++++-- pipeline-catalog | 1 + 21 files changed, 1147 insertions(+), 212 deletions(-) create mode 100644 bagel/derivatives_utils.py create mode 100644 bagel/file_utils.py create mode 100644 bagel/tests/data/proc_status_missing_sessions.tsv create mode 100644 bagel/tests/data/proc_status_no_bids_sessions.tsv create mode 100644 bagel/tests/data/proc_status_synthetic.csv create mode 100644 bagel/tests/data/proc_status_synthetic.tsv create mode 100644 bagel/tests/data/proc_status_synthetic_incomplete.tsv create mode 100644 bagel/tests/data/proc_status_unique_sessions.tsv create mode 100644 bagel/tests/data/proc_status_unique_subs.tsv 
create mode 100644 bagel/tests/test_cli_derivatives.py create mode 160000 pipeline-catalog diff --git a/.gitmodules b/.gitmodules index da8ac28..cd2afe2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "neurobagel_examples"] path = neurobagel_examples url = https://github.com/neurobagel/neurobagel_examples.git +[submodule "pipeline-catalog"] + path = pipeline-catalog + url = https://github.com/nipoppy/pipeline-catalog.git diff --git a/bagel/bids_utils.py b/bagel/bids_utils.py index f73b598..5c26c13 100644 --- a/bagel/bids_utils.py +++ b/bagel/bids_utils.py @@ -24,19 +24,6 @@ def get_bids_subjects_simple(bids_dir: Path) -> list: return bids_subject_list -def check_unique_bids_subjects(pheno_subjects: list, bids_subjects: list): - """Raises informative error if subject IDs exist that are found only in the BIDS directory.""" - unique_bids_subjects = set(bids_subjects).difference(pheno_subjects) - if len(unique_bids_subjects) > 0: - raise LookupError( - "The specified BIDS directory contains subject IDs not found in " - "the provided phenotypic json-ld file:\n" - f"{unique_bids_subjects}\n" - "Subject IDs are case sensitive. " - "Please check that the specified BIDS and phenotypic datasets match." - ) - - def create_acquisitions( layout: BIDSLayout, bids_sub_id: str, diff --git a/bagel/cli.py b/bagel/cli.py index aa88901..fb1bc0a 100644 --- a/bagel/cli.py +++ b/bagel/cli.py @@ -3,12 +3,23 @@ import typer from bids import BIDSLayout -from pydantic import ValidationError import bagel.bids_utils as butil +import bagel.derivatives_utils as dutil +import bagel.file_utils as futil import bagel.pheno_utils as putil from bagel import mappings, models -from bagel.utility import check_overwrite, load_json +from bagel.derivatives_utils import PROC_STATUS_COLS +from bagel.utility import ( + confirm_subs_match_pheno_data, + extract_and_validate_jsonld_dataset, + generate_context, + get_imaging_session_instances, + get_subject_instances, +) + +# TODO: Coordinate with Nipoppy about what we want to name this +CUSTOM_SESSION_LABEL = "ses-nb01" bagel = typer.Typer( help=""" @@ -84,14 +95,14 @@ def pheno( graph data model for the provided phenotypic file in the .jsonld format. You can upload this .jsonld file to the Neurobagel graph. """ - # Check if output file already exists - check_overwrite(output, overwrite) + futil.check_overwrite(output, overwrite) - data_dictionary = load_json(dictionary) - pheno_df = putil.load_pheno(pheno) + data_dictionary = futil.load_json(dictionary) + pheno_df = futil.load_tabular(pheno) putil.validate_inputs(data_dictionary, pheno_df) - # Display validated input paths to user + # NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement. + # It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces) space = 25 print( "Processing phenotypic annotations:\n" @@ -119,12 +130,12 @@ def pheno( for session_row_idx, session_row in _sub_pheno.iterrows(): # If there is no session column, we create a session with a custom label "ses-nb01" to assign each subject's phenotypic data to if session_column is None: - session_name = "ses-nb01" # TODO: Should we make this more obscure to avoid potential overlap with actual session names? 
+ session_label = CUSTOM_SESSION_LABEL else: # NOTE: We take the name from the first session column - we don't know how to handle multiple session columns yet - session_name = session_row[session_column[0]] + session_label = session_row[session_column[0]] - session = models.PhenotypicSession(hasLabel=str(session_name)) + session = models.PhenotypicSession(hasLabel=str(session_label)) _ses_pheno = session_row if "sex" in column_mapping.keys(): @@ -185,7 +196,7 @@ def pheno( hasSamples=subject_list, ) - context = putil.generate_context() + context = generate_context() # We can't just exclude_unset here because the identifier and schemaKey # for each instance are created as default values and so technically are never set # TODO: we should revisit this because there may be reasons to have None be meaningful in the future @@ -203,7 +214,8 @@ def bids( ..., "--jsonld-path", "-p", # for pheno - help="The path to the .jsonld file containing the phenotypic data for your dataset, created by the bagel pheno command.", + help="The path to the .jsonld file containing the phenotypic data for your dataset, created by the bagel pheno command. " + "This file may optionally also include the processing pipeline metadata for the dataset (created by the bagel derivatives command).", exists=True, file_okay=True, dir_okay=False, @@ -236,55 +248,47 @@ def bids( ), ): """ - Extract imaging metadata from a valid BIDS dataset and combine them - with phenotypic metadata (.jsonld) created by the bagel pheno command. + Extract imaging metadata from a valid BIDS dataset and integrate it with + subjects' harmonized phenotypic data (from the bagel pheno command) and, optionally, + processing pipeline metadata (from the bagel derivatives command) in a single .jsonld file. NOTE: Must be run AFTER the pheno command. This command will create a valid, subject-level instance of the Neurobagel graph data model for the combined metadata in the .jsonld format. You can upload this .jsonld file to the Neurobagel graph. """ - # Check if output file already exists - check_overwrite(output, overwrite) + futil.check_overwrite(output, overwrite) - space = 32 + space = 51 print( "Running initial checks of inputs...\n" - f" {'Phenotypic .jsonld to augment:' : <{space}} {jsonld_path}\n" + f" {'Existing subject graph data to augment (.jsonld):' : <{space}} {jsonld_path}\n" f" {'BIDS dataset directory:' : <{space}} {bids_dir}" ) - jsonld = load_json(jsonld_path) - # Strip and store context to be added back later, since it's not part of - # (and can't be easily added) to the existing data model - context = {"@context": jsonld.pop("@context")} - try: - pheno_dataset = models.Dataset.parse_obj(jsonld) - except ValidationError as err: - print(err) + jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path) - pheno_subject_dict = { - pheno_subject.hasLabel: pheno_subject - for pheno_subject in getattr(pheno_dataset, "hasSamples") - } + existing_subs_dict = get_subject_instances(jsonld_dataset) # TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved - butil.check_unique_bids_subjects( - pheno_subjects=pheno_subject_dict.keys(), - bids_subjects=butil.get_bids_subjects_simple(bids_dir), + confirm_subs_match_pheno_data( + subjects=butil.get_bids_subjects_simple(bids_dir), + subject_source_for_err="BIDS directory", + pheno_subjects=existing_subs_dict.keys(), ) + print("Initial checks of inputs passed.\n") print("Parsing and validating BIDS dataset. 
This may take a while...") layout = BIDSLayout(bids_dir, validate=True) print("BIDS parsing completed.\n") - print( - "Merging subject-level BIDS metadata with the phenotypic annotations...\n" - ) + print("Merging BIDS metadata with existing subject annotations...\n") for bids_sub_id in layout.get_subjects(): - pheno_subject = pheno_subject_dict.get(f"sub-{bids_sub_id}") - session_list = [] + existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}") + existing_sessions_dict = get_imaging_session_instances( + existing_subject + ) bids_sessions = layout.get_sessions(subject=bids_sub_id) if not bids_sessions: @@ -294,42 +298,188 @@ def bids( # For some reason .get_sessions() doesn't always follow alphanumeric order # By default (without sorting) the session lists look like ["02", "01"] per subject - for session in sorted(bids_sessions): + for session_id in sorted(bids_sessions): image_list = butil.create_acquisitions( layout=layout, bids_sub_id=bids_sub_id, - session=session, + session=session_id, ) - # If subject's session has no image files, a Session object is not added if not image_list: continue # TODO: Currently if a subject has BIDS data but no "ses-" directories (e.g., only 1 session), - # we create a session for that subject with a custom label "ses-nb01" to be added to the graph - # so the API can still find the session-level information. + # we create a session for that subject with a custom label "ses-nb01" to be added to the graph. + # However, we still provide the BIDS SUBJECT directory as the session path, instead of making up a path. # This should be revisited in the future as for these cases the resulting dataset object is not # an exact representation of what's on disk. - session_label = "nb01" if session is None else session + # Here, we also need to add back "ses" prefix because pybids stripped it + session_label = ( + CUSTOM_SESSION_LABEL + if session_id is None + else f"ses-{session_id}" + ) session_path = butil.get_session_path( layout=layout, bids_dir=bids_dir, bids_sub_id=bids_sub_id, - session=session, + session=session_id, ) - session_list.append( - # Add back "ses" prefix because pybids stripped it - models.ImagingSession( - hasLabel="ses-" + session_label, + # If a custom Neurobagel-created session already exists (if `bagel derivatives` was run first), + # we add to that session when there is no session layer in the BIDS directory + if session_label in existing_sessions_dict: + existing_img_session = existing_sessions_dict.get( + session_label + ) + existing_img_session.hasAcquisition = image_list + existing_img_session.hasFilePath = session_path + else: + new_imaging_session = models.ImagingSession( + hasLabel=session_label, hasFilePath=session_path, hasAcquisition=image_list, ) + existing_subject.hasSession.append(new_imaging_session) + + context = generate_context() + merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)} + + with open(output, "w") as f: + f.write(json.dumps(merged_dataset, indent=2)) + + print(f"Saved output to: {output}") + + +@bagel.command() +def derivatives( + tabular: Path = typer.Option( + ..., + "--tabular", + "-t", + help="The path to a .tsv containing subject-level processing pipeline status info. Expected to comply with the Nipoppy processing status file schema.", + exists=True, + file_okay=True, + dir_okay=False, + resolve_path=True, + ), + # TODO: Remove _path? 
+    jsonld_path: Path = typer.Option(
+        ...,
+        "--jsonld-path",
+        "-p",  # for pheno
+        help="The path to a .jsonld file containing the phenotypic data for your dataset, created by the bagel pheno command. This JSONLD may optionally also include the BIDS metadata for the dataset (created by the bagel bids command).",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True,
+    ),
+    output: Path = typer.Option(
+        "pheno_derivatives.jsonld",
+        "--output",
+        "-o",
+        help="The path for the output .jsonld file.",
+        file_okay=True,
+        dir_okay=False,
+        resolve_path=True,
+    ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        "-f",
+        help="Overwrite output file if it already exists.",
+    ),
+):
+    """
+    Extract subject processing pipeline and derivative metadata from a tabular processing status file and
+    integrate it into a single .jsonld with subjects' harmonized phenotypic data (from the bagel pheno command) and,
+    optionally, BIDS metadata (from the bagel bids command).
+    NOTE: Must be run AFTER the pheno command.
+
+    This command will create a valid, subject-level instance of the Neurobagel
+    graph data model for the combined metadata in the .jsonld format.
+    You can upload this .jsonld file to the Neurobagel graph.
+    """
+    futil.check_overwrite(output, overwrite)
+
+    space = 51
+    print(
+        "Processing subject-level derivative metadata...\n"
+        f"  {'Existing subject graph data to augment (.jsonld):' : <{space}}{jsonld_path}\n"
+        f"  {'Processing status file (.tsv):' : <{space}}{tabular}"
+    )
+
+    status_df = futil.load_tabular(tabular, input_type="processing status")
+
+    # We don't allow empty values in the participant ID column
+    if row_indices := putil.get_rows_with_empty_strings(
+        status_df, [PROC_STATUS_COLS["participant"]]
+    ):
+        raise LookupError(
+            f"Your processing status file contains missing values in the column '{PROC_STATUS_COLS['participant']}'. "
+            "Please ensure that every row has a non-empty participant ID. "
+            f"We found missing values in the following rows (the first row is index 0): {row_indices}."
+        )
+
+    pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique()
+    dutil.check_pipelines_are_recognized(pipelines)
+
+    # TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together?
+    for pipeline in pipelines:
+        versions = status_df[
+            status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline
+        ][PROC_STATUS_COLS["pipeline_version"]].unique()
+
+        dutil.check_pipeline_versions_are_recognized(pipeline, versions)
+
+    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
+
+    existing_subs_dict = get_subject_instances(jsonld_dataset)
+
+    confirm_subs_match_pheno_data(
+        subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
+        subject_source_for_err="processing status file",
+        pheno_subjects=existing_subs_dict.keys(),
+    )
+
+    # Create sub-dataframes for each subject
+    for subject, sub_proc_df in status_df.groupby(
+        PROC_STATUS_COLS["participant"]
+    ):
+        existing_subject = existing_subs_dict.get(subject)
+
+        # Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
+        existing_sessions_dict = get_imaging_session_instances(
+            existing_subject
+        )
+
+        for session_label, sub_ses_proc_df in sub_proc_df.groupby(
+            PROC_STATUS_COLS["session"]
+        ):
+            completed_pipelines = dutil.create_completed_pipelines(
+                sub_ses_proc_df
             )
-            pheno_subject.hasSession += session_list
+            if not completed_pipelines:
+                continue
+
+            session_label = (
+                CUSTOM_SESSION_LABEL if session_label == "" else session_label
+            )
+            if session_label in existing_sessions_dict:
+                existing_img_session = existing_sessions_dict.get(
+                    session_label
+                )
+                existing_img_session.hasCompletedPipeline = completed_pipelines
+            else:
+                new_img_session = models.ImagingSession(
+                    hasLabel=session_label,
+                    hasCompletedPipeline=completed_pipelines,
+                )
+                existing_subject.hasSession.append(new_img_session)

-    merged_dataset = {**context, **pheno_dataset.dict(exclude_none=True)}
+    context = generate_context()
+    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}

     with open(output, "w") as f:
         f.write(json.dumps(merged_dataset, indent=2))
diff --git a/bagel/derivatives_utils.py b/bagel/derivatives_utils.py
new file mode 100644
index 0000000..0e64eca
--- /dev/null
+++ b/bagel/derivatives_utils.py
@@ -0,0 +1,77 @@
+from typing import Iterable
+
+import pandas as pd
+
+from bagel import mappings, models
+
+# Shorthands for expected column names in a Nipoppy processing status file
+# TODO: While there are multiple session ID columns in a Nipoppy processing status file,
+# we only look at `bids_session` right now. We should revisit this after the schema is finalized,
+# to see if any other logic is needed to avoid issues with session ID discrepancies across columns.
+PROC_STATUS_COLS = {
+    "participant": "bids_participant",
+    "session": "bids_session",
+    "pipeline_name": "pipeline_name",
+    "pipeline_version": "pipeline_version",
+    "status": "status",
+}
+
+
+def check_pipelines_are_recognized(pipelines: Iterable[str]):
+    """Check that all pipelines in the processing status file are supported by Nipoppy."""
+    unrecognized_pipelines = list(
+        set(pipelines).difference(mappings.KNOWN_PIPELINE_URIS)
+    )
+    if len(unrecognized_pipelines) > 0:
+        raise LookupError(
+            f"The processing status file contains unrecognized pipelines in the column '{PROC_STATUS_COLS['pipeline_name']}': "
+            f"{unrecognized_pipelines}. "
+            f"Allowed pipeline names are the following pipelines supported natively in Nipoppy (https://github.com/nipoppy/pipeline-catalog): \n"
+            f"{mappings.KNOWN_PIPELINE_URIS}"
+        )
+
+
+def check_pipeline_versions_are_recognized(
+    pipeline: str, versions: Iterable[str]
+):
+    """
+    Check that all pipeline versions in the processing status file are supported by Nipoppy.
+ Assumes that the input pipeline name is recognized. + """ + unrecognized_versions = list( + set(versions).difference(mappings.KNOWN_PIPELINE_VERSIONS[pipeline]) + ) + if len(unrecognized_versions) > 0: + raise LookupError( + f"The processing status file contains unrecognized {pipeline} versions in the column '{PROC_STATUS_COLS['pipeline_version']}': {unrecognized_versions}. " + f"Allowed {pipeline} versions are the following versions supported natively in Nipoppy (https://github.com/nipoppy/pipeline-catalog): \n" + f"{mappings.KNOWN_PIPELINE_VERSIONS[pipeline]}" + ) + + +def create_completed_pipelines(session_proc_df: pd.DataFrame) -> list: + """ + Create a list of CompletedPipeline objects for a single subject-session based on the completion status + info of pipelines for that session from the processing status dataframe. + """ + completed_pipelines = [] + for (pipeline, version), session_pipe_df in session_proc_df.groupby( + [ + PROC_STATUS_COLS["pipeline_name"], + PROC_STATUS_COLS["pipeline_version"], + ] + ): + # Check that all pipeline steps have succeeded + if ( + session_pipe_df[PROC_STATUS_COLS["status"]].str.lower() + == "success" + ).all(): + completed_pipeline = models.CompletedPipeline( + hasPipelineName=models.Pipeline( + identifier=mappings.KNOWN_PIPELINE_URIS[pipeline] + ), + hasPipelineVersion=version, + ) + completed_pipelines.append(completed_pipeline) + + return completed_pipelines diff --git a/bagel/file_utils.py b/bagel/file_utils.py new file mode 100644 index 0000000..385af12 --- /dev/null +++ b/bagel/file_utils.py @@ -0,0 +1,96 @@ +import json +from pathlib import Path +from typing import Optional + +import pandas as pd +import typer + + +def file_encoding_error_message(input_p: Path) -> str: + """Return a message for when a file cannot be read due to encoding issues.""" + return typer.style( + f"Failed to decode the input file {input_p}. " + "Please ensure that both your phenotypic .tsv file and .json data dictionary have UTF-8 encoding.\n" + "Tip: Need help converting your file? Try a tool like iconv (http://linux.die.net/man/1/iconv) or https://www.freeformatter.com/convert-file-encoding.html.", + fg=typer.colors.RED, + ) + + +def load_tabular( + input_p: Path, input_type: str = "phenotypic" +) -> Optional[pd.DataFrame]: + """Load a .tsv pheno file and do some basic validation of the file type.""" + if input_p.suffix == ".tsv": + try: + tabular_df = pd.read_csv( + input_p, + sep="\t", + keep_default_na=False, + dtype=str, + encoding="utf-8", + ) + except UnicodeDecodeError as e: + # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed + typer.echo( + file_encoding_error_message(input_p), + err=True, + ) + raise typer.Exit(code=1) from e + + if tabular_df.shape[1] > 1: + return tabular_df + + # If we have only one column, but splitting by ',' gives us several elements + # then there is a good chance the user accidentally renamed a .csv into .tsv + # and we should give them some extra info with our error message to fix this. + note_misnamed_csv = ( + f"Note that your {input_type} input file also looks like a .csv file " + "as it contains several ',' commas. It is possible that " + "you have accidentally renamed a .csv file as a .tsv." + ) + raise ValueError( + f"Your {input_type} input file {input_p} has only one column " + f"and is therefore not valid as a Neurobagel {input_type} file. " + f" Please provide a valid .tsv {input_type} file!" 
+ f"\n\n{note_misnamed_csv if len(tabular_df.columns[0].split(',')) > 1 else ''}" + ) + + raise ValueError( + f"Your ({input_p}) is not a .tsv file." + f" Please provide a valid .tsv {input_type} file!" + ) + + +def load_json(input_p: Path) -> dict: + """Load a user-specified json type file.""" + try: + with open(input_p, "r", encoding="utf-8") as f: + return json.load(f) + except UnicodeDecodeError as e: + # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed + typer.echo( + file_encoding_error_message(input_p), + err=True, + ) + raise typer.Exit(code=1) from e + except json.JSONDecodeError as e: + typer.echo( + typer.style( + f"The provided data dictionary {input_p} is not valid JSON. " + "Please provide a valid JSON file.", + fg=typer.colors.RED, + ), + err=True, + ) + raise typer.Exit(code=1) from e + + +def check_overwrite(output: Path, overwrite: bool): + """Exit program gracefully if an output file already exists but --overwrite has not been set.""" + if output.exists() and not overwrite: + raise typer.Exit( + typer.style( + f"Output file {output} already exists. Use --overwrite or -f to overwrite.", + fg=typer.colors.RED, + ) + ) diff --git a/bagel/mappings.py b/bagel/mappings.py index 8014fed..b99e624 100644 --- a/bagel/mappings.py +++ b/bagel/mappings.py @@ -1,4 +1,7 @@ from collections import namedtuple +from pathlib import Path + +import bagel.file_utils as futil Namespace = namedtuple("Namespace", ["pf", "url"]) COGATLAS = Namespace("cogatlas", "https://www.cognitiveatlas.org/task/id/") @@ -6,6 +9,11 @@ NCIT = Namespace("ncit", "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#") NIDM = Namespace("nidm", "http://purl.org/nidash/nidm#") SNOMED = Namespace("snomed", "http://purl.bioontology.org/ontology/SNOMEDCT/") +NP = Namespace( + "np", "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/" +) +# Store all supported amespaces in a list for easy iteration & testing +ALL_NAMESPACES = [COGATLAS, NB, NCIT, NIDM, SNOMED, NP] BIDS = { "anat": NIDM.pf + ":Anatomical", @@ -29,3 +37,37 @@ "healthy_control": NCIT.pf + ":C94342", "assessment_tool": NB.pf + ":Assessment", } + +PROCESSING_PIPELINE_PATH = ( + Path(__file__).parents[1] / "pipeline-catalog" / "processing" +) + + +def get_pipeline_uris() -> dict: + """ + Load files from the pipeline-catalog and return a dictionary of pipeline names + and their URIs in the Nipoppy namespace. + """ + output_dict = {} + for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"): + pipe = futil.load_json(pipe_file) + output_dict[pipe["name"]] = f"{NP.pf}:{pipe['name']}" + + return output_dict + + +def get_pipeline_versions() -> dict: + """ + Load files from the pipeline-catalog and return a dictionary of pipeline names + and corresponding supported versions in the Nipoppy namespace. 
+ """ + output_dict = {} + for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"): + pipe = futil.load_json(pipe_file) + output_dict[pipe["name"]] = pipe["versions"] + + return output_dict + + +KNOWN_PIPELINE_URIS = get_pipeline_uris() +KNOWN_PIPELINE_VERSIONS = get_pipeline_versions() diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index 26142af..6b28ddb 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -2,20 +2,16 @@ import warnings from collections import defaultdict -from pathlib import Path from typing import Optional, Union import isodate import jsonschema import pandas as pd import pydantic -import typer -from pandas import DataFrame from typer import BadParameter -from bagel import dictionary_models, mappings, models -from bagel.mappings import COGATLAS, NB, NCIT, NIDM, SNOMED -from bagel.utility import file_encoding_error_message +from bagel import dictionary_models, mappings +from bagel.mappings import NB DICTIONARY_SCHEMA = dictionary_models.DataDictionary.schema() @@ -41,75 +37,6 @@ def validate_portal_uri(portal: str) -> Optional[str]: return portal -def load_pheno(input_p: Path) -> pd.DataFrame | None: - """Load a .tsv pheno file and do some basic validation.""" - if input_p.suffix == ".tsv": - try: - pheno_df: DataFrame = pd.read_csv( - input_p, - sep="\t", - keep_default_na=False, - dtype=str, - encoding="utf-8", - ) - except UnicodeDecodeError as e: - # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed - typer.echo( - file_encoding_error_message(input_p), - err=True, - ) - raise typer.Exit(code=1) from e - - if pheno_df.shape[1] > 1: - return pheno_df - - # If we have only one column, but splitting by ',' gives us several elements - # then there is a good chance the user accidentally renamed a .csv into .tsv - # and we should give them some extra info with our error message to fix this. - note_misnamed_csv = ( - "Note that your phenotypic input file also looks like a .csv file " - "as it contains several ',' commas. It is possible that " - "you have accidentally renamed a .csv file as a .tsv." - ) - raise ValueError( - f"Your phenotypic input file {input_p} has only one column " - f"and is therefore not valid as a Neurobagel phenotypic file. " - " Please provide a valid .tsv pheno file!" - f"\n\n{note_misnamed_csv if len(pheno_df.columns[0].split(',')) > 1 else ''}" - ) - - raise ValueError( - f"Your ({input_p}) is not a .tsv file." - " Please provide a valid .tsv pheno file!" - ) - - -def generate_context(): - # Direct copy of the dandi-schema context generation function - # https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py - field_preamble = { - namespace.pf: namespace.url - for namespace in [NB, SNOMED, NIDM, COGATLAS, NCIT] - } - fields = {} - for val in dir(models): - klass = getattr(models, val) - if not isinstance(klass, pydantic.main.ModelMetaclass): - continue - fields[klass.__name__] = f"{NB.pf}:{klass.__name__}" - for name, field in klass.__fields__.items(): - if name == "schemaKey": - fields[name] = "@type" - elif name == "identifier": - fields[name] = "@id" - elif name not in fields: - fields[name] = {"@id": f"{NB.pf}:{name}"} - - field_preamble.update(**fields) - - return {"@context": field_preamble} - - def get_columns_about(data_dict: dict, concept: str) -> list: """ Returns all column names that have been annotated as "IsAbout" the desired concept. 
diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
index 37b039d..073c90d 100644
--- a/bagel/tests/data/README.md
+++ b/bagel/tests/data/README.md
@@ -1,6 +1,6 @@
 # Test data
 
-Example inputs to the CLI
+## Example inputs to the `bagel pheno` CLI command
 
 | Example name | `.tsv` | `.json` | Expect |
 | ----- | ----- | ----- | ----- |
@@ -31,5 +31,19 @@
 `* this is expected to fail until we enable multiple participant_ID handling`.
 
+## Example inputs to the `bagel derivatives` command
+Designed to work with `.jsonld` files from the [Neurobagel reference example dataset](https://github.com/neurobagel/neurobagel_examples).
+
+| Example file `proc_status`... | Description | Expected result |
+| ----- | ----- | ----- |
+| _synthetic.tsv | Captures a subset of subject-sessions represented in the BIDS examples synthetic dataset | Pass |
+| _synthetic.csv | Same as proc_status_synthetic.tsv, but saved as a CSV file | Fail |
+| _unique_subs.tsv | Includes subjects not found in the phenotypic dataset | Fail |
+| _synthetic_incomplete.tsv | Has a missing value in the `bids_participant` column | Fail |
+| _unique_sessions.tsv | Includes a unique subject-session (`sub-01`, `ses-03`) not found in the synthetic dataset | Pass |
+| _missing_sessions.tsv | One subject (`sub-02`) is missing all session labels | Pass |
+| _no_bids_sessions.tsv | Has session labels in all rows of `session_id`, but no values in the `bids_session` column | Pass |
+
+
 ## Example expected CLI outputs
 You can find example expected CLI outputs [here](https://github.com/neurobagel/neurobagel_examples).
\ No newline at end of file
diff --git a/bagel/tests/data/proc_status_missing_sessions.tsv b/bagel/tests/data/proc_status_missing_sessions.tsv
new file mode 100644
index 0000000..47bb84f
--- /dev/null
+++ b/bagel/tests/data/proc_status_missing_sessions.tsv
@@ -0,0 +1,8 @@
+participant_id	bids_participant	session_id	bids_session	pipeline_name	pipeline_version	pipeline_step	status
+01	sub-01	01	ses-01	fmriprep	20.2.7	step1	SUCCESS
+01	sub-01	01	ses-01	fmriprep	20.2.7	step2	SUCCESS
+01	sub-01	01	ses-01	fmriprep	23.1.3	default	SUCCESS
+01	sub-01	01	ses-01	freesurfer	7.3.2	default	SUCCESS
+01	sub-01	02	ses-02	freesurfer	7.3.2	default	SUCCESS
+02	sub-02			fmriprep	23.1.3	default	SUCCESS
+02	sub-02			freesurfer	7.3.2	default	SUCCESS
diff --git a/bagel/tests/data/proc_status_no_bids_sessions.tsv b/bagel/tests/data/proc_status_no_bids_sessions.tsv
new file mode 100644
index 0000000..b5ac516
--- /dev/null
+++ b/bagel/tests/data/proc_status_no_bids_sessions.tsv
@@ -0,0 +1,9 @@
+participant_id	bids_participant	session_id	bids_session	pipeline_name	pipeline_version	pipeline_step	status
+01	sub-01	01		fmriprep	20.2.7	step1	SUCCESS
+01	sub-01	01		fmriprep	20.2.7	step2	SUCCESS
+01	sub-01	01		fmriprep	23.1.3	default	SUCCESS
+01	sub-01	01		freesurfer	7.3.2	default	SUCCESS
+01	sub-01	02		freesurfer	7.3.2	default	SUCCESS
+02	sub-02	01		fmriprep	23.1.3	default	SUCCESS
+02	sub-02	01		freesurfer	7.3.2	default	SUCCESS
+02	sub-02	02		freesurfer	7.3.2	default	SUCCESS
diff --git a/bagel/tests/data/proc_status_synthetic.csv b/bagel/tests/data/proc_status_synthetic.csv
new file mode 100644
index 0000000..da47d84
--- /dev/null
+++ b/bagel/tests/data/proc_status_synthetic.csv
@@ -0,0 +1,9 @@
+participant_id,bids_participant,session_id,bids_session,pipeline_name,pipeline_version,pipeline_step,status
+01,sub-01,1,ses-01,fmriprep,20.2.7,step1,FAIL
+01,sub-01,1,ses-01,fmriprep,20.2.7,step2,INCOMPLETE
+01,sub-01,1,ses-01,fmriprep,23.1.3,default,SUCCESS
+01,sub-01,1,ses-01,freesurfer,7.3.2,default,SUCCESS +01,sub-01,2,ses-02,freesurfer,7.3.2,default,UNAVAILABLE +02,sub-02,1,ses-01,fmriprep,23.1.3,default,SUCCESS +02,sub-02,1,ses-01,freesurfer,7.3.2,default,SUCCESS +02,sub-02,2,ses-02,freesurfer,7.3.2,default,SUCCESS diff --git a/bagel/tests/data/proc_status_synthetic.tsv b/bagel/tests/data/proc_status_synthetic.tsv new file mode 100644 index 0000000..e2bc680 --- /dev/null +++ b/bagel/tests/data/proc_status_synthetic.tsv @@ -0,0 +1,9 @@ +participant_id bids_participant session_id bids_session pipeline_name pipeline_version pipeline_step status +01 sub-01 01 ses-01 fmriprep 20.2.7 step1 FAIL +01 sub-01 01 ses-01 fmriprep 20.2.7 step2 INCOMPLETE +01 sub-01 01 ses-01 fmriprep 23.1.3 default SUCCESS +01 sub-01 01 ses-01 freesurfer 7.3.2 default SUCCESS +01 sub-01 02 ses-02 freesurfer 7.3.2 default UNAVAILABLE +02 sub-02 01 ses-01 fmriprep 23.1.3 default SUCCESS +02 sub-02 01 ses-01 freesurfer 7.3.2 default SUCCESS +02 sub-02 02 ses-02 freesurfer 7.3.2 default SUCCESS diff --git a/bagel/tests/data/proc_status_synthetic_incomplete.tsv b/bagel/tests/data/proc_status_synthetic_incomplete.tsv new file mode 100644 index 0000000..c0308c5 --- /dev/null +++ b/bagel/tests/data/proc_status_synthetic_incomplete.tsv @@ -0,0 +1,9 @@ +participant_id bids_participant session_id bids_session pipeline_name pipeline_version pipeline_step status +01 sub-01 01 ses-01 fmriprep 20.2.7 step1 FAIL +01 sub-01 01 ses-01 fmriprep 20.2.7 step2 INCOMPLETE +01 sub-01 01 ses-01 fmriprep 23.1.3 default SUCCESS +01 sub-01 01 ses-01 freesurfer 7.3.2 default SUCCESS +01 sub-01 02 ses-02 freesurfer 7.3.2 default UNAVAILABLE +02 01 ses-01 fmriprep 23.1.3 default SUCCESS +02 sub-02 01 ses-01 freesurfer 7.3.2 default SUCCESS +02 sub-02 02 ses-02 freesurfer 7.3.2 default SUCCESS diff --git a/bagel/tests/data/proc_status_unique_sessions.tsv b/bagel/tests/data/proc_status_unique_sessions.tsv new file mode 100644 index 0000000..f12db1c --- /dev/null +++ b/bagel/tests/data/proc_status_unique_sessions.tsv @@ -0,0 +1,9 @@ +participant_id bids_participant session_id bids_session pipeline_name pipeline_version pipeline_step status +01 sub-01 01 ses-01 fmriprep 20.2.7 step1 FAIL +01 sub-01 01 ses-01 fmriprep 20.2.7 step2 INCOMPLETE +01 sub-01 03 ses-03 fmriprep 23.1.3 default SUCCESS +01 sub-01 01 ses-01 freesurfer 7.3.2 default SUCCESS +01 sub-01 02 ses-02 fmriprep 23.1.3 default UNAVAILABLE +02 sub-02 01 ses-01 freesurfer 7.3.2 default SUCCESS +02 sub-02 01 ses-01 freesurfer 7.3.2 default SUCCESS +02 sub-02 02 ses-02 freesurfer 7.3.2 default SUCCESS diff --git a/bagel/tests/data/proc_status_unique_subs.tsv b/bagel/tests/data/proc_status_unique_subs.tsv new file mode 100644 index 0000000..4eeb356 --- /dev/null +++ b/bagel/tests/data/proc_status_unique_subs.tsv @@ -0,0 +1,7 @@ +participant_id bids_participant session_id bids_session pipeline_name pipeline_version pipeline_step status +pd1 sub-pd1 01 ses-01 fmriprep 20.2.7 step1 FAIL +pd1 sub-pd1 01 ses-01 fmriprep 20.2.7 step2 INCOMPLETE +pd2 sub-pd2 01 ses-01 fmriprep 23.1.3 default SUCCESS +pd1 sub-pd1 01 ses-01 freesurfer 7.3.2 default SUCCESS +pd2 sub-pd2 01 ses-01 freesurfer 7.3.2 default SUCCESS +pd2 sub-pd2 02 ses-02 freesurfer 7.3.2 default SUCCESS diff --git a/bagel/tests/test_cli_bids.py b/bagel/tests/test_cli_bids.py index 2a40051..71388ab 100644 --- a/bagel/tests/test_cli_bids.py +++ b/bagel/tests/test_cli_bids.py @@ -36,14 +36,14 @@ def test_bids_valid_inputs_run_successfully( ).exists(), "The pheno_bids.jsonld output was not 
created." -def test_bids_sessions_have_correct_labels( +def test_imaging_sessions_have_expected_labels( runner, test_data_upload_path, bids_synthetic, default_pheno_bids_output_path, load_test_json, ): - """Check that sessions added to pheno_bids.jsonld have the expected labels.""" + """Check that the imaging sessions in the JSONLD output have the expected session labels.""" runner.invoke( bagel, [ @@ -57,21 +57,77 @@ def test_bids_sessions_have_correct_labels( ], ) - pheno_bids = load_test_json(default_pheno_bids_output_path) - for sub in pheno_bids["hasSamples"]: + output = load_test_json(default_pheno_bids_output_path) + + for sub in output["hasSamples"]: + # all subjects in the BIDS synthetic dataset are expected to have 4 sessions total assert 4 == len(sub["hasSession"]) - imaging_session = [ - ses + imaging_ses_labels = [ + ses["hasLabel"] for ses in sub["hasSession"] if ses["schemaKey"] == "ImagingSession" ] - assert 2 == len(imaging_session) - # We also need to make sure that we do not have duplicate imaging session labels - assert set(["ses-01", "ses-02"]) == set( - [ses["hasLabel"] for ses in imaging_session] - ) + assert sorted(imaging_ses_labels) == ["ses-01", "ses-02"] + + +@pytest.mark.parametrize( + "jsonld_path,expected_sessions_with_acq_and_pipe_metadata", + [ + ("example_synthetic.jsonld", 0), + ( + "pheno-derivatives-output/example_synthetic_pheno-derivatives.jsonld", + 3, + ), + ], +) +def test_imaging_sessions_have_expected_metadata( + runner, + test_data_upload_path, + bids_synthetic, + default_pheno_bids_output_path, + load_test_json, + jsonld_path, + expected_sessions_with_acq_and_pipe_metadata, +): + """ + Check that the JSONLD output contains the expected total number of imaging sessions with + acquisition and completed pipeline metadata, based on whether a phenotypic-only JSONLD or + JSONLD with both phenotypic and processing pipeline metadata is provided as input. + """ + runner.invoke( + bagel, + [ + "bids", + "--jsonld-path", + test_data_upload_path / jsonld_path, + "--bids-dir", + bids_synthetic, + "--output", + default_pheno_bids_output_path, + ], + ) + + output = load_test_json(default_pheno_bids_output_path) + + sessions_with_acq_metadata = [] + sessions_with_acq_and_pipe_metadata = [] + for sub in output["hasSamples"]: + for ses in sub["hasSession"]: + if ( + ses["schemaKey"] == "ImagingSession" + and ses.get("hasAcquisition") is not None + ): + sessions_with_acq_metadata.append(ses) + if ses.get("hasCompletedPipeline") is not None: + sessions_with_acq_and_pipe_metadata.append(ses) + + assert len(sessions_with_acq_metadata) == 10 + assert ( + len(sessions_with_acq_and_pipe_metadata) + == expected_sessions_with_acq_and_pipe_metadata + ) def test_bids_data_with_sessions_have_correct_paths( diff --git a/bagel/tests/test_cli_derivatives.py b/bagel/tests/test_cli_derivatives.py new file mode 100644 index 0000000..b6b3088 --- /dev/null +++ b/bagel/tests/test_cli_derivatives.py @@ -0,0 +1,207 @@ +from collections import defaultdict + +import pytest + +from bagel.cli import bagel + + +@pytest.fixture(scope="function") +def default_derivatives_output_path(tmp_path): + "Return temporary derivatives command output filepath that uses the default filename." 
+ return tmp_path / "pheno_derivatives.jsonld" + + +@pytest.mark.parametrize( + "valid_proc_status_file,num_expected_imaging_sessions", + [ + ("proc_status_synthetic.tsv", {"sub-01": 1, "sub-02": 2}), + ("proc_status_unique_sessions.tsv", {"sub-01": 2, "sub-02": 2}), + ], +) +def test_derivatives_cmd_with_valid_TSV_and_pheno_jsonlds_is_successful( + runner, + test_data, + test_data_upload_path, + default_derivatives_output_path, + load_test_json, + valid_proc_status_file, + num_expected_imaging_sessions, +): + """ + Test that when a valid processing status TSV and bagel pheno-created JSONLD are supplied as inputs, + the bagel derivatives command successfully creates an output JSONLD where subjects have the expected number + of imaging sessions created based on pipeline completion data. + """ + result = runner.invoke( + bagel, + [ + "derivatives", + "-t", + test_data / valid_proc_status_file, + "-p", + test_data_upload_path / "example_synthetic.jsonld", + "-o", + default_derivatives_output_path, + ], + ) + + assert result.exit_code == 0, f"Errored out. STDOUT: {result.output}" + + output = load_test_json(default_derivatives_output_path) + num_created_imaging_sessions = defaultdict(int) + for sub in output["hasSamples"]: + for ses in sub["hasSession"]: + if ses["schemaKey"] == "ImagingSession": + num_created_imaging_sessions[sub["hasLabel"]] += 1 + + assert num_created_imaging_sessions == num_expected_imaging_sessions + + +def test_pipeline_info_added_to_existing_imaging_sessions( + runner, + test_data, + test_data_upload_path, + default_derivatives_output_path, + load_test_json, +): + """ + Test that when a valid processing status TSV and bagel pheno- and bagel bids-created JSONLD + are supplied as inputs, the bagel derivatives command successfully creates an output JSONLD + where pipeline completion info is correctly added to existing or newly created imaging sessions. + """ + result = runner.invoke( + bagel, + [ + "derivatives", + "-t", + test_data / "proc_status_unique_sessions.tsv", + "-p", + test_data_upload_path + / "pheno-bids-output" + / "example_synthetic_pheno-bids.jsonld", + "-o", + default_derivatives_output_path, + ], + ) + + assert result.exit_code == 0, f"Errored out. 
STDOUT: {result.output}" + output = load_test_json(default_derivatives_output_path) + + # Only consider sub-01 for simplicity + ses_with_raw_imaging = [] + ses_with_derivatives = [] + for sub01_ses in output["hasSamples"][0]["hasSession"]: + if sub01_ses["schemaKey"] == "ImagingSession": + if sub01_ses.get("hasAcquisition") is not None: + ses_with_raw_imaging.append(sub01_ses["hasLabel"]) + if sub01_ses.get("hasCompletedPipeline") is not None: + ses_with_derivatives.append(sub01_ses["hasLabel"]) + + assert sorted(ses_with_raw_imaging) == ["ses-01", "ses-02"] + assert sorted(ses_with_derivatives) == ["ses-01", "ses-03"] + + +@pytest.mark.parametrize( + "example,expected_message,expected_error", + [ + ( + "proc_status_synthetic_incomplete.tsv", + ["missing", "status"], + LookupError, + ), + ( + "proc_status_synthetic.csv", + ["processing status", "not a .tsv file"], + ValueError, + ), + ( + "proc_status_unique_subs.tsv", + ["processing status file", "subject IDs not found"], + LookupError, + ), + ], +) +def test_derivatives_invalid_inputs_fail( + runner, + test_data, + test_data_upload_path, + default_derivatives_output_path, + example, + expected_message, + expected_error, +): + """Assure that we handle expected user errors in the input files for the bagel derivatives command gracefully.""" + with pytest.raises(expected_error) as e: + runner.invoke( + bagel, + [ + "derivatives", + "-t", + test_data / example, + "-p", + test_data_upload_path / "example_synthetic.jsonld", + "-o", + default_derivatives_output_path, + ], + catch_exceptions=False, + ) + + for substring in expected_message: + assert substring in str(e.value) + + assert ( + not default_derivatives_output_path.exists() + ), "The JSONLD output was created despite inputs being invalid." + + +@pytest.mark.parametrize( + "proc_status_file,completed_pipes_for_missing_ses_sub", + [ + ("proc_status_missing_sessions.tsv", {"sub-02": 2}), + # TODO: Revisit this example once the updated Nipoppy proc status file schema is available + # This example assumes that + # 1. It is possible to have a subject with missing values in bids_session but not in session_id + # 2. Duplicate entries of pipeline name, version, and step for an apparent subject-session based on bids_participant and bids_session + # (i.e., the two columns Neurobagel looks at) are allowed (see rows 8 and 9) + ("proc_status_no_bids_sessions.tsv", {"sub-01": 3, "sub-02": 2}), + ], +) +def test_custom_imaging_sessions_created_for_missing_session_labels( + runner, + test_data, + test_data_upload_path, + default_derivatives_output_path, + load_test_json, + proc_status_file, + completed_pipes_for_missing_ses_sub, +): + """Test that pipeline records for a subject with missing session labels are aggregated into a custom, Neurobagel-created session.""" + result = runner.invoke( + bagel, + [ + "derivatives", + "-t", + test_data / proc_status_file, + "-p", + test_data_upload_path / "example_synthetic.jsonld", + "-o", + default_derivatives_output_path, + ], + ) + assert result.exit_code == 0, f"Errored out. 
STDOUT: {result.output}" + + output = load_test_json(default_derivatives_output_path) + + custom_ses_completed_pipes = {} + for sub in output["hasSamples"]: + for ses in sub["hasSession"]: + if ( + ses["schemaKey"] == "ImagingSession" + and ses["hasLabel"] == "ses-nb01" + ): + custom_ses_completed_pipes[sub["hasLabel"]] = len( + ses["hasCompletedPipeline"] + ) + + # Note: order of items does not matter for dict comparison + assert custom_ses_completed_pipes == completed_pipes_for_missing_ses_sub diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 73ae236..8e08774 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -353,7 +353,7 @@ def test_providing_csv_file_raises_error( catch_exceptions=False, ) - assert "Please provide a valid .tsv pheno file" in str(e.value) + assert "Please provide a valid .tsv phenotypic file" in str(e.value) def test_that_output_file_contains_dataset_level_attributes( diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py index a76e93b..59dbc65 100644 --- a/bagel/tests/test_utility.py +++ b/bagel/tests/test_utility.py @@ -1,5 +1,4 @@ from collections import Counter -from contextlib import nullcontext as does_not_raise from pathlib import Path import pandas as pd @@ -8,15 +7,22 @@ from bids import BIDSLayout import bagel.bids_utils as butil +import bagel.derivatives_utils as dutil +import bagel.file_utils as futil import bagel.pheno_utils as putil -from bagel import mappings -from bagel.utility import load_json +from bagel import mappings, models +from bagel.utility import ( + generate_context, + get_imaging_session_instances, + get_subject_instances, + get_subs_missing_from_pheno_data, +) @pytest.fixture def get_test_context(): """Generate an @context dictionary to test against.""" - return putil.generate_context() + return generate_context() @pytest.fixture @@ -352,25 +358,29 @@ def test_invalid_age_heuristic(): assert "unrecognized age transformation: nb:birthyear" in str(e.value) +# TODO: See if we can remove this test: it's a little hard to maintain and +# essentially replicates the logic of the function being tested +# Instead, see test_all_used_namespaces_have_urls and test_used_namespaces_in_context @pytest.mark.parametrize( "model, attributes", [ ("Bagel", ["identifier"]), - ("Sex", ["identifier", "schemaKey"]), - ("Diagnosis", ["identifier", "schemaKey"]), - ("SubjectGroup", ["identifier", "schemaKey"]), - ("Assessment", ["identifier", "schemaKey"]), - ("Image", ["identifier", "schemaKey"]), + ("ControlledTerm", ["identifier", "schemaKey"]), + ("Sex", ["schemaKey"]), + ("Diagnosis", ["schemaKey"]), + ("SubjectGroup", ["schemaKey"]), + ("Assessment", ["schemaKey"]), + ("Image", ["schemaKey"]), ("Acquisition", ["hasContrastType", "schemaKey"]), + ("Pipeline", ["schemaKey"]), ( - "Session", - ["hasLabel", "hasFilePath", "hasAcquisition", "schemaKey"], + "CompletedPipeline", + ["hasPipelineVersion", "hasPipelineName", "schemaKey"], ), + ("Session", ["hasLabel"]), ( - "Subject", + "PhenotypicSession", [ - "hasLabel", - "hasSession", "hasAge", "hasSex", "isSubjectGroup", @@ -379,6 +389,23 @@ def test_invalid_age_heuristic(): "schemaKey", ], ), + ( + "ImagingSession", + [ + "hasFilePath", + "hasAcquisition", + "hasCompletedPipeline", + "schemaKey", + ], + ), + ( + "Subject", + [ + "hasLabel", + "hasSession", + "schemaKey", + ], + ), ( "Dataset", [ @@ -414,34 +441,37 @@ def test_get_bids_subjects_simple(bids_path, bids_dir): @pytest.mark.parametrize( - "bids_list, expectation", + "bids_list, 
expected_bids_exclusive_subs", [ - (["sub-01", "sub-02", "sub-03"], does_not_raise()), + (["sub-01", "sub-02", "sub-03"], []), ( ["sub-01", "sub-02", "sub-03", "sub-04", "sub-05"], - pytest.raises(LookupError), + ["sub-04", "sub-05"], ), ( ["sub-cbm001", "sub-cbm002", "sub-cbm003"], - pytest.raises(LookupError), + ["sub-cbm001", "sub-cbm002", "sub-cbm003"], ), ( ["sub-pd123", "sub-pd234"], - pytest.raises(LookupError), + ["sub-pd123", "sub-pd234"], ), ], ) -def test_check_unique_bids_subjects_err(bids_list, expectation): +def test_get_subjects_missing_from_pheno_data( + bids_list, expected_bids_exclusive_subs +): """ - Given a list of BIDS subject IDs, raise an error or not depending on - whether all IDs are found in the phenotypic subject list. + Given a list of BIDS subject IDs, test that IDs not found in the phenotypic subject list are returned. """ pheno_list = ["sub-01", "sub-02", "sub-03", "sub-PD123", "sub-PD234"] + bids_exclusive_subs = get_subs_missing_from_pheno_data( + pheno_subjects=pheno_list, subjects=bids_list + ) - with expectation: - butil.check_unique_bids_subjects( - pheno_subjects=pheno_list, bids_subjects=bids_list - ) + # We sort the list for comparison since the order of the missing subjects is not guaranteed + # due to using set operations + assert sorted(bids_exclusive_subs) == expected_bids_exclusive_subs @pytest.mark.parametrize( @@ -534,7 +564,7 @@ def test_failed_json_reading_raises_informative_error( ): """Test that when there is an issue reading an input JSON file, the CLI exits with an informative error message.""" with pytest.raises(typer.Exit): - load_json(test_data / unreadable_json) + futil.load_json(test_data / unreadable_json) captured = capsys.readouterr() assert expected_message in captured.err @@ -543,7 +573,227 @@ def test_failed_json_reading_raises_informative_error( def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys): """Test that given an input phenotypic TSV with an unsupported encoding, the CLI exits with an informative error message.""" with pytest.raises(typer.Exit): - putil.load_pheno(test_data / "example_iso88591.tsv") + futil.load_tabular(test_data / "example_iso88591.tsv") captured = capsys.readouterr() assert "Failed to decode the input file" in captured.err + + +def test_get_subject_instances(): + """Test that subjects are correctly extracted from a Neurobagel dataset instance.""" + dataset = models.Dataset( + hasLabel="test_dataset", + hasSamples=[ + models.Subject( + hasLabel="sub-01", + hasSession=[ + models.PhenotypicSession( + hasLabel="ses-01", + hasAge=26, + ), + ], + ), + models.Subject( + hasLabel="sub-02", + hasSession=[ + models.PhenotypicSession( + hasLabel="ses-01", + hasAge=30, + ), + ], + ), + ], + ) + + subjects = get_subject_instances(dataset) + assert list(subjects.keys()) == ["sub-01", "sub-02"] + + +def test_pipeline_uris_are_loaded(): + """Test that pipeline URIs are loaded from the pipeline-catalog submodule.""" + + pipeline_dict = mappings.get_pipeline_uris() + assert all( + ((mappings.NP.pf in pipe_uri) and (" " not in pipe_uri)) + for pipe_uri in pipeline_dict.values() + ) + + +def test_pipeline_versions_are_loaded(): + """Test that pipeline versions are loaded from the pipeline-catalog submodule.""" + + pipeline_dict = mappings.get_pipeline_versions() + assert all( + isinstance(pipe_versions, list) and len(pipe_versions) > 0 + for pipe_versions in pipeline_dict.values() + ) + + +@pytest.mark.parametrize( + "pipelines, unrecog_pipelines", + [ + (["fmriprep", "pipeline1"], 
["pipeline1"]), + (["pipelineA", "pipelineB"], ["pipelineA", "pipelineB"]), + ], +) +def test_unrecognized_pipeline_names_raise_error(pipelines, unrecog_pipelines): + """Test that pipeline names not found in the pipeline catalog raise an informative error.""" + with pytest.raises(LookupError) as e: + dutil.check_pipelines_are_recognized(pipelines) + + assert all( + substr in str(e.value) + for substr in ["unrecognized pipelines"] + unrecog_pipelines + ) + + +@pytest.mark.parametrize( + "fmriprep_versions, unrecog_versions", + [ + (["20.2.7", "vA.B"], ["vA.B"]), + (["C.D.E", "F.G.H"], ["C.D.E", "F.G.H"]), + ], +) +def test_unrecognized_pipeline_versions_raise_error( + fmriprep_versions, unrecog_versions +): + """Test that versions of a pipeline not found in the pipeline catalog raise an informative error.""" + with pytest.raises(LookupError) as e: + dutil.check_pipeline_versions_are_recognized( + "fmriprep", fmriprep_versions + ) + + assert all( + substr in str(e.value) + for substr in ["unrecognized fmriprep versions"] + unrecog_versions + ) + + +def test_get_imaging_session_instances(): + """Test that get_imaging_session_instances() correctly returns existing imaging sessions for a given subject.""" + example_subject_jsonld = { + "identifier": "nb:34ec1e2d-9a81-4a50-bcd0-eb22c88d11e1", + "hasLabel": "sub-01", + "hasSession": [ + { + "identifier": "nb:85c7473c-6122-4999-ad3b-5cd57a883c87", + "hasLabel": "ses-01", + "hasAge": 34.1, + "hasSex": { + "identifier": "snomed:248152002", + "schemaKey": "Sex", + }, + "schemaKey": "PhenotypicSession", + }, + { + "identifier": "nb:eb57d0c1-fb96-4c04-8c16-1f29f7f40db4", + "hasLabel": "ses-02", + "hasAge": 35.3, + "hasSex": { + "identifier": "snomed:248152002", + "schemaKey": "Sex", + }, + "schemaKey": "PhenotypicSession", + }, + { + "identifier": "nb:e67fd08b-9bf9-4ed8-b4cc-d0142cd27789", + "hasLabel": "ses-im01", + "hasFilePath": "/data/neurobagel/bagel-cli/bids-examples/synthetic/sub-01/ses-01", + "hasAcquisition": [ + { + "identifier": "nb:5dc2e11e-4f7a-4b0e-9488-843f0a607f4b", + "hasContrastType": { + "identifier": "nidm:T1Weighted", + "schemaKey": "Image", + }, + "schemaKey": "Acquisition", + }, + ], + "schemaKey": "ImagingSession", + }, + ], + "schemaKey": "Subject", + } + example_subject = models.Subject(**example_subject_jsonld) + imaging_sessions = get_imaging_session_instances(example_subject) + + assert list(imaging_sessions.keys()) == ["ses-im01"] + + +def test_create_completed_pipelines(): + """ + Test that completed pipelines for a subject-session are accurately identified, + where a completed pipeline is one meeting the condition that *all* steps of that pipeline + that were run for the session are marked with a status of "SUCCESS". 
+ """ + sub_ses_data = [ + [ + "01", + "sub-01", + "01", + "ses-01", + "fmriprep", + "20.2.7", + "step1", + "SUCCESS", + ], + [ + "01", + "sub-01", + "01", + "ses-01", + "fmriprep", + "20.2.7", + "step2", + "FAIL", + ], + [ + "01", + "sub-01", + "01", + "ses-01", + "fmriprep", + "23.1.3", + "default", + "SUCCESS", + ], + ] + example_ses_proc_df = pd.DataFrame.from_records( + columns=[ + "participant_id", + "bids_participant", + "session_id", + "bids_session", + "pipeline_name", + "pipeline_version", + "pipeline_step", + "status", + ], + data=sub_ses_data, + ) + completed_pipelines = dutil.create_completed_pipelines(example_ses_proc_df) + + assert len(completed_pipelines) == 1 + assert ( + completed_pipelines[0].hasPipelineName.identifier + == f"{mappings.NP.pf}:fmriprep" + ) + assert completed_pipelines[0].hasPipelineVersion == "23.1.3" + + +def test_used_namespaces_in_context(test_data_upload_path, load_test_json): + """ + Test that all namespaces used internally by the CLI for JSONLD dataset creation are defined + in the @context of reference example .jsonld files. + """ + # Fetch all .jsonld files to avoid having to add a test parameter whenever we add a new JSONLD + example_jsonld_files = list(test_data_upload_path.rglob("*.jsonld")) + for jsonld in example_jsonld_files: + jsonld_context = load_test_json(test_data_upload_path / jsonld)[ + "@context" + ] + + for ns in mappings.ALL_NAMESPACES: + assert ( + ns.pf in jsonld_context.keys() + ), f"The namespace '{ns.pf}' was not found in the @context of {jsonld}." diff --git a/bagel/utility.py b/bagel/utility.py index 0665f13..4d7a723 100644 --- a/bagel/utility.py +++ b/bagel/utility.py @@ -1,49 +1,114 @@ -import json from pathlib import Path +from typing import Iterable +import pydantic import typer +from pydantic import ValidationError +import bagel.file_utils as futil +from bagel import models +from bagel.mappings import ALL_NAMESPACES, NB -def file_encoding_error_message(input_p: Path) -> str: - """Return a message for when a file cannot be read due to encoding issues.""" - return typer.style( - f"Failed to decode the input file {input_p}. " - "Please ensure that both your phenotypic .tsv file and .json data dictionary have UTF-8 encoding.\n" - "Tip: Need help converting your file? 
Try a tool like iconv (http://linux.die.net/man/1/iconv) or https://www.freeformatter.com/convert-file-encoding.html.", - fg=typer.colors.RED, + +def generate_context(): + # Direct copy of the dandi-schema context generation function + # https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py + field_preamble = { + namespace.pf: namespace.url for namespace in ALL_NAMESPACES + } + fields = {} + for val in dir(models): + klass = getattr(models, val) + if not isinstance(klass, pydantic.main.ModelMetaclass): + continue + fields[klass.__name__] = f"{NB.pf}:{klass.__name__}" + for name, field in klass.__fields__.items(): + if name == "schemaKey": + fields[name] = "@type" + elif name == "identifier": + fields[name] = "@id" + elif name not in fields: + fields[name] = {"@id": f"{NB.pf}:{name}"} + + field_preamble.update(**fields) + + return {"@context": field_preamble} + + +def get_subs_missing_from_pheno_data( + subjects: Iterable, pheno_subjects: Iterable +) -> list: + """Check a list of subject IDs and return any not found in the provided phenotypic subject list.""" + return list(set(subjects).difference(pheno_subjects)) + + +def confirm_subs_match_pheno_data( + subjects: Iterable, subject_source_for_err: str, pheno_subjects: Iterable +): + """ + Return an error if not all subjects in the subject list are found in the provided phenotypic subject list. + """ + missing_subs = get_subs_missing_from_pheno_data( + subjects=subjects, + pheno_subjects=pheno_subjects, ) + if len(missing_subs) > 0: + raise LookupError( + f"The specified {subject_source_for_err} contains subject IDs not found in " + "the provided json-ld file:\n" + f"{missing_subs}\n" + "Subject IDs are case sensitive. " + f"Please check that the {subject_source_for_err} corresponds to the dataset in the provided .jsonld." + ) -def load_json(input_p: Path) -> dict: - """Load a user-specified json type file.""" + +def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset: + """ + Strip the context from a user-provided JSONLD and validate the remaining contents + against the data model for a Neurobagel dataset. + """ + jsonld = futil.load_json(file_path) + jsonld.pop("@context") try: - with open(input_p, "r", encoding="utf-8") as f: - return json.load(f) - except UnicodeDecodeError as e: - # TODO: Refactor once https://github.com/neurobagel/bagel-cli/issues/218 is addressed - typer.echo( - file_encoding_error_message(input_p), - err=True, - ) - raise typer.Exit(code=1) from e - except json.JSONDecodeError as e: + jsonld_dataset = models.Dataset.parse_obj(jsonld) + except ValidationError as err: typer.echo( typer.style( - f"The provided data dictionary {input_p} is not valid JSON. " - "Please provide a valid JSON file.", + ( + f"Error: {file_path} is not a valid Neurobagel JSONLD dataset. " + "Please ensure to provide a valid JSONLD file generated by Neurobagel CLI commands.\n" + f"Validation errors: {str(err)}" + ), fg=typer.colors.RED, ), err=True, ) - raise typer.Exit(code=1) from e + raise typer.Exit(code=1) from err + return jsonld_dataset -def check_overwrite(output: Path, overwrite: bool): - """Exit program gracefully if an output file already exists but --overwrite has not been set.""" - if output.exists() and not overwrite: - raise typer.Exit( - typer.style( - f"Output file {output} already exists. 
Use --overwrite to overwrite.", - fg=typer.colors.RED, - ) - ) + +def get_subject_instances(dataset: models.Dataset) -> dict: + """ + Return a dictionary of subjects for a given Neurobagel dataset from JSONLD data, + where keys are subject labels and values are the subject objects. + """ + return { + subject.hasLabel: subject for subject in getattr(dataset, "hasSamples") + } + + +def get_imaging_session_instances( + jsonld_subject: models.Subject, +) -> dict: + """ + Return a dictionary of imaging sessions for a given subject from JSONLD data, + where the keys are the session labels and values are the session objects. + """ + jsonld_sub_sessions_dict = {} + for jsonld_sub_ses in getattr(jsonld_subject, "hasSession"): + if jsonld_sub_ses.schemaKey == "ImagingSession": + jsonld_sub_sessions_dict[jsonld_sub_ses.hasLabel] = jsonld_sub_ses + + return jsonld_sub_sessions_dict diff --git a/pipeline-catalog b/pipeline-catalog new file mode 160000 index 0000000..65167b3 --- /dev/null +++ b/pipeline-catalog @@ -0,0 +1 @@ +Subproject commit 65167b3ef114a444d678a777bc0e78cffbdd0a3e