diff --git a/bagel/dictionary_models.py b/bagel/dictionary_models.py index 0e6d12b..79bd960 100644 --- a/bagel/dictionary_models.py +++ b/bagel/dictionary_models.py @@ -75,7 +75,7 @@ class Column(BaseModel): alias="Description", ) annotations: Union[CategoricalNeurobagel, ContinuousNeurobagel] = Field( - ..., description="Semantic annotations", alias="Annotations" + None, description="Semantic annotations", alias="Annotations" ) diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index da337ba..35eec9b 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -80,8 +80,16 @@ def get_columns_about(data_dict: dict, concept: str) -> list: """ return [ col - for col, annotations in data_dict.items() - if annotations["Annotations"]["IsAbout"]["TermURL"] == concept + for col, content in get_annotated_columns(data_dict) + if content["Annotations"]["IsAbout"]["TermURL"] == concept + ] + + +def get_annotated_columns(data_dict: dict) -> list: + return [ + (col, content) + for col, content in data_dict.items() + if "Annotations" in content ] @@ -103,7 +111,7 @@ def map_tools_to_columns(data_dict: dict) -> dict: are mapped to it. """ out_dict = defaultdict(list) - for col, content in data_dict.items(): + for col, content in get_annotated_columns(data_dict): part_of = content["Annotations"].get("IsPartOf") if part_of is not None: out_dict[part_of.get("TermURL")].append(col) @@ -195,10 +203,10 @@ def get_transformed_values( # TODO: Check all columns and then return list of offending columns' names def categorical_cols_have_bids_levels(data_dict: dict) -> bool: - for col, attrs in data_dict.items(): + for col, content in get_annotated_columns(data_dict): if ( is_column_categorical(col, data_dict) - and attrs.get("Levels") is None + and content.get("Levels") is None ): return False @@ -211,12 +219,12 @@ def get_mismatched_categorical_levels(data_dict: dict) -> list: for the "Levels" key between the column's BIDS and Neurobagel annotations. 
""" mismatched_cols = [] - for col, attrs in data_dict.items(): + for col, content in get_annotated_columns(data_dict): if is_column_categorical(col, data_dict): - known_levels = list(attrs["Annotations"]["Levels"].keys()) + attrs[ - "Annotations" - ].get("MissingValues", []) - if set(attrs.get("Levels", {}).keys()).difference(known_levels): + known_levels = list( + content["Annotations"]["Levels"].keys() + ) + content["Annotations"].get("MissingValues", []) + if set(content.get("Levels", {}).keys()).difference(known_levels): mismatched_cols.append(col) return mismatched_cols @@ -239,7 +247,12 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool: """ Determines whether the provided data dictionary and phenotypic file make sense together """ - return all([key in pheno_df.columns for key in data_dict.keys()]) + return all( + [ + col in pheno_df.columns + for col, _ in get_annotated_columns(data_dict) + ] + ) def find_undefined_cat_col_values( @@ -251,11 +264,11 @@ def find_undefined_cat_col_values( dictionary entry. """ all_undefined_values = {} - for col, attr in data_dict.items(): + for col, content in get_annotated_columns(data_dict): if is_column_categorical(col, data_dict): - known_values = list(attr["Annotations"]["Levels"].keys()) + attr[ - "Annotations" - ].get("MissingValues", []) + known_values = list( + content["Annotations"]["Levels"].keys() + ) + content["Annotations"].get("MissingValues", []) unknown_values = [] for value in pheno_df[col].unique(): if value not in known_values: @@ -275,9 +288,9 @@ def find_unused_missing_values( file column. 
""" all_unused_missing_vals = {} - for col, attr in data_dict.items(): + for col, content in get_annotated_columns(data_dict): unused_missing_vals = [] - for missing_val in attr["Annotations"].get("MissingValues", []): + for missing_val in content["Annotations"].get("MissingValues", []): if missing_val not in pheno_df[col].unique(): unused_missing_vals.append(missing_val) if unused_missing_vals: @@ -308,6 +321,23 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: "Make sure that each annotated column contains an 'Annotations' key." ) from e + if get_annotated_columns(data_dict) == []: + raise LookupError( + "The provided data dictionary must contain at least one column with Neurobagel annotations." + ) + + if ( + len( + get_columns_about( + data_dict, concept=mappings.NEUROBAGEL["participant"] + ) + ) + == 0 + ): + raise LookupError( + "The provided data dictionary must contain at least one column annotated as being about participant ID." + ) + # TODO: remove this validation when we start handling multiple participant and / or session ID columns if ( len( diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index a14268f..970abb3 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -3,7 +3,7 @@ Example inputs to the CLI | Example name | `.tsv` | `.json` | Expect | -|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|--------------------| +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | ------------------ | | 1 | invalid, non-unique combinations of `participant` and 
`session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail | | 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass | | 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail | @@ -19,6 +19,8 @@ Example inputs to the CLI | 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail | | 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning | | 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning | +| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column that has no Neurobagel annotations | Pass | +| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail | `* this is expected to fail until we enable multiple participant_ID handling`.
diff --git a/bagel/tests/data/example14.json b/bagel/tests/data/example14.json new file mode 100644 index 0000000..6fb0352 --- /dev/null +++ b/bagel/tests/data/example14.json @@ -0,0 +1,82 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + } + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + } + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "purl:NCIT_C94342", + "Label": "Healthy Control" + } + } + } + }, + "sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "participant_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:iso8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + }, + "someOtherColumn": { + "Description": "This is cool in BIDS, but not in Neurobagel" + } +} \ No newline at end of file diff --git a/bagel/tests/data/example14.tsv b/bagel/tests/data/example14.tsv new file mode 100644 index 0000000..705ad3b --- /dev/null +++ b/bagel/tests/data/example14.tsv @@ -0,0 +1,5 @@ +participant_id session_id group sex participant_age +sub-01 ses-01 PAT M "P20Y6M" +sub-01 ses-02 PAT M "P20Y8M" +sub-02 ses-01 
CTRL F "P25Y8M" +sub-02 ses-02 CTRL F "P26Y4M" diff --git a/bagel/tests/data/example15.json b/bagel/tests/data/example15.json new file mode 100644 index 0000000..7326a66 --- /dev/null +++ b/bagel/tests/data/example15.json @@ -0,0 +1,73 @@ +{ + "participant_id": { + "Description": "A participant ID" + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + } + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "purl:NCIT_C94342", + "Label": "Healthy Control" + } + } + } + }, + "sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "participant_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:iso8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example15.tsv b/bagel/tests/data/example15.tsv new file mode 100644 index 0000000..705ad3b --- /dev/null +++ b/bagel/tests/data/example15.tsv @@ -0,0 +1,5 @@ +participant_id session_id group sex participant_age +sub-01 ses-01 PAT M "P20Y6M" +sub-01 ses-02 PAT M "P20Y8M" +sub-02 ses-01 CTRL F "P25Y8M" +sub-02 ses-02 CTRL F "P26Y4M" diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 39d316d..f0186c9 100644 --- 
a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -11,6 +11,7 @@ "example6", "example12", "example13", + "example14", "example_synthetic", ], ) @@ -41,7 +42,11 @@ def test_pheno_valid_inputs_run_successfully( @pytest.mark.parametrize( "example,expected_exception,expected_message", [ - ("example3", ValueError, ["not a valid Neurobagel data dictionary"]), + ( + "example3", + LookupError, + ["must contain at least one column with Neurobagel annotations"], + ), ( "example_invalid", ValueError, @@ -62,6 +67,13 @@ def test_pheno_valid_inputs_run_successfully( LookupError, ["missing values in participant or session id"], ), + ( + "example15", + LookupError, + [ + "must contain at least one column annotated as being about participant ID" + ], + ), ], ) def test_invalid_inputs_are_handled_gracefully( diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py index 3aee174..efb29e4 100644 --- a/bagel/tests/test_utility.py +++ b/bagel/tests/test_utility.py @@ -20,7 +20,7 @@ def get_test_context(): def test_get_columns_that_are_about_concept(test_data, load_test_json): """Test that matching annotated columns are returned as a list, and that empty list is returned if nothing matches""" - data_dict = load_test_json(test_data / "example1.json") + data_dict = load_test_json(test_data / "example14.json") assert ["participant_id"] == putil.get_columns_about( data_dict, concept=mappings.NEUROBAGEL["participant"] @@ -30,6 +30,26 @@ def test_get_columns_that_are_about_concept(test_data, load_test_json): ) +def test_get_columns_with_annotations(): + example = { + "someOtherColumn": { + "Description": "This is cool in BIDS, but not in Neurobagel" + }, + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + } + }, + }, + } + result = putil.get_annotated_columns(example)[0] + assert result[0] == "participant_id" + assert result[1] == 
example["participant_id"] + + def test_map_categories_to_columns(test_data, load_test_json): """Test that inverse mapping of concepts to columns is correctly created""" data_dict = load_test_json(test_data / "example2.json")