Skip to content

Commit

Permalink
[FIX] Schema: allow for data dictionaries with incomplete Neurobagel …
Browse files Browse the repository at this point in the history
…annotations (#163)

* skip columns without annotations

- required now that the schema makes annotations optional

* Introduce get_annotated_columns

* Refactor get_annotated_columns to return column name and content

* Refactor utils to use only annotated columns

* Update test data description

* add check + test that no annotated columns in data dict raises error

* add check + test that >=1 column about participant ID

* add example with participant ID column lacking neurobagel annotations

---------

Co-authored-by: Alyssa Dai <[email protected]>
  • Loading branch information
surchs and alyssadai authored Jun 15, 2023
1 parent 0ed0e4d commit 4f655c9
Show file tree
Hide file tree
Showing 9 changed files with 250 additions and 21 deletions.
2 changes: 1 addition & 1 deletion bagel/dictionary_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class Column(BaseModel):
alias="Description",
)
annotations: Union[CategoricalNeurobagel, ContinuousNeurobagel] = Field(
..., description="Semantic annotations", alias="Annotations"
None, description="Semantic annotations", alias="Annotations"
)


Expand Down
64 changes: 47 additions & 17 deletions bagel/pheno_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,16 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
"""
return [
col
for col, annotations in data_dict.items()
if annotations["Annotations"]["IsAbout"]["TermURL"] == concept
for col, content in get_annotated_columns(data_dict)
if content["Annotations"]["IsAbout"]["TermURL"] == concept
]


def get_annotated_columns(data_dict: dict) -> list:
    """
    Returns the columns of a data dictionary that have Neurobagel annotations.

    A column counts as annotated when its entry in the data dictionary is a
    dict that contains an "Annotations" key. Columns without that key
    (e.g. columns that are only described in BIDS terms) are skipped.

    Parameters
    ----------
    data_dict : dict
        Data dictionary mapping column names to their content.

    Returns
    -------
    list
        A list of (column name, column content) tuples, in the order the
        columns appear in the data dictionary. Empty if no column is annotated.
    """
    return [
        (col, content)
        for col, content in data_dict.items()
        # The isinstance guard keeps malformed entries from breaking the lookup:
        # `in` raises TypeError on None/numbers and does a substring match on
        # strings, which would wrongly report a plain string as annotated.
        if isinstance(content, dict) and "Annotations" in content
    ]


Expand All @@ -103,7 +111,7 @@ def map_tools_to_columns(data_dict: dict) -> dict:
are mapped to it.
"""
out_dict = defaultdict(list)
for col, content in data_dict.items():
for col, content in get_annotated_columns(data_dict):
part_of = content["Annotations"].get("IsPartOf")
if part_of is not None:
out_dict[part_of.get("TermURL")].append(col)
Expand Down Expand Up @@ -195,10 +203,10 @@ def get_transformed_values(

# TODO: Check all columns and then return list of offending columns' names
def categorical_cols_have_bids_levels(data_dict: dict) -> bool:
for col, attrs in data_dict.items():
for col, content in get_annotated_columns(data_dict):
if (
is_column_categorical(col, data_dict)
and attrs.get("Levels") is None
and content.get("Levels") is None
):
return False

Expand All @@ -211,12 +219,12 @@ def get_mismatched_categorical_levels(data_dict: dict) -> list:
for the "Levels" key between the column's BIDS and Neurobagel annotations.
"""
mismatched_cols = []
for col, attrs in data_dict.items():
for col, content in get_annotated_columns(data_dict):
if is_column_categorical(col, data_dict):
known_levels = list(attrs["Annotations"]["Levels"].keys()) + attrs[
"Annotations"
].get("MissingValues", [])
if set(attrs.get("Levels", {}).keys()).difference(known_levels):
known_levels = list(
content["Annotations"]["Levels"].keys()
) + content["Annotations"].get("MissingValues", [])
if set(content.get("Levels", {}).keys()).difference(known_levels):
mismatched_cols.append(col)

return mismatched_cols
Expand All @@ -239,7 +247,12 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
"""
Determines whether the provided data dictionary and phenotypic file make sense together
"""
return all([key in pheno_df.columns for key in data_dict.keys()])
return all(
[
col in pheno_df.columns
for col, _ in get_annotated_columns(data_dict)
]
)


def find_undefined_cat_col_values(
Expand All @@ -251,11 +264,11 @@ def find_undefined_cat_col_values(
dictionary entry.
"""
all_undefined_values = {}
for col, attr in data_dict.items():
for col, content in get_annotated_columns(data_dict):
if is_column_categorical(col, data_dict):
known_values = list(attr["Annotations"]["Levels"].keys()) + attr[
"Annotations"
].get("MissingValues", [])
known_values = list(
content["Annotations"]["Levels"].keys()
) + content["Annotations"].get("MissingValues", [])
unknown_values = []
for value in pheno_df[col].unique():
if value not in known_values:
Expand All @@ -275,9 +288,9 @@ def find_unused_missing_values(
file column.
"""
all_unused_missing_vals = {}
for col, attr in data_dict.items():
for col, content in get_annotated_columns(data_dict):
unused_missing_vals = []
for missing_val in attr["Annotations"].get("MissingValues", []):
for missing_val in content["Annotations"].get("MissingValues", []):
if missing_val not in pheno_df[col].unique():
unused_missing_vals.append(missing_val)
if unused_missing_vals:
Expand Down Expand Up @@ -308,6 +321,23 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
"Make sure that each annotated column contains an 'Annotations' key."
) from e

if get_annotated_columns(data_dict) == []:
raise LookupError(
"The provided data dictionary must contain at least one column with Neurobagel annotations."
)

if (
len(
get_columns_about(
data_dict, concept=mappings.NEUROBAGEL["participant"]
)
)
== 0
):
raise LookupError(
"The provided data dictionary must contain at least one column annotated as being about participant ID."
)

# TODO: remove this validation when we start handling multiple participant and / or session ID columns
if (
len(
Expand Down
4 changes: 3 additions & 1 deletion bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Example inputs to the CLI

| Example name | `.tsv` | `.json` | Expect |
|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|--------------------|
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | ------------------ |
| 1 | invalid, non-unique combinations of `participant` and `session` IDs | valid, has `IsAbout` annotations for `participant` and `session` ID columns | fail |
| 2 | valid, unique `participant` and `session` IDs | same as example 1 | pass |
| 3 | same as example 2 | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key | fail |
Expand All @@ -19,6 +19,8 @@ Example inputs to the CLI
| 11 | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns | valid, based on example 6 | fail |
| 12 | Valid, same as example 2 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column | Pass, with warning |
| 13 | Valid, same as example_synthetic | Valid, based on example_synthetic but with mismatched levels for group column | Pass, with warning |
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column that lacks Neurobagel annotations | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |

`* this is expected to fail until we enable multiple participant_ID handling`.

Expand Down
82 changes: 82 additions & 0 deletions bagel/tests/data/example14.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:iso8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
},
"someOtherColumn": {
"Description": "This is cool in BIDS, but not in Neurobagel"
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example14.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id session_id group sex participant_age
sub-01 ses-01 PAT M "P20Y6M"
sub-01 ses-02 PAT M "P20Y8M"
sub-02 ses-01 CTRL F "P25Y8M"
sub-02 ses-02 CTRL F "P26Y4M"
73 changes: 73 additions & 0 deletions bagel/tests/data/example15.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"participant_id": {
"Description": "A participant ID"
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:iso8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example15.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id session_id group sex participant_age
sub-01 ses-01 PAT M "P20Y6M"
sub-01 ses-02 PAT M "P20Y8M"
sub-02 ses-01 CTRL F "P25Y8M"
sub-02 ses-02 CTRL F "P26Y4M"
14 changes: 13 additions & 1 deletion bagel/tests/test_cli_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"example6",
"example12",
"example13",
"example14",
"example_synthetic",
],
)
Expand Down Expand Up @@ -41,7 +42,11 @@ def test_pheno_valid_inputs_run_successfully(
@pytest.mark.parametrize(
"example,expected_exception,expected_message",
[
("example3", ValueError, ["not a valid Neurobagel data dictionary"]),
(
"example3",
LookupError,
["must contain at least one column with Neurobagel annotations"],
),
(
"example_invalid",
ValueError,
Expand All @@ -62,6 +67,13 @@ def test_pheno_valid_inputs_run_successfully(
LookupError,
["missing values in participant or session id"],
),
(
"example15",
LookupError,
[
"must contain at least one column annotated as being about participant ID"
],
),
],
)
def test_invalid_inputs_are_handled_gracefully(
Expand Down
22 changes: 21 additions & 1 deletion bagel/tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_test_context():
def test_get_columns_that_are_about_concept(test_data, load_test_json):
"""Test that matching annotated columns are returned as a list,
and that empty list is returned if nothing matches"""
data_dict = load_test_json(test_data / "example1.json")
data_dict = load_test_json(test_data / "example14.json")

assert ["participant_id"] == putil.get_columns_about(
data_dict, concept=mappings.NEUROBAGEL["participant"]
Expand All @@ -30,6 +30,26 @@ def test_get_columns_that_are_about_concept(test_data, load_test_json):
)


def test_get_columns_with_annotations():
    """Test that only columns with an "Annotations" key are returned,
    each as a pair of column name and column content"""
    example = {
        "someOtherColumn": {
            "Description": "This is cool in BIDS, but not in Neurobagel"
        },
        "participant_id": {
            "Description": "A participant ID",
            "Annotations": {
                "IsAbout": {
                    "TermURL": "nb:ParticipantID",
                    "Label": "Unique participant identifier",
                }
            },
        },
    }
    result = putil.get_annotated_columns(example)

    # Check the length as well as the first entry, so that an unannotated
    # column leaking into the result cannot go unnoticed.
    assert len(result) == 1
    col, content = result[0]
    assert col == "participant_id"
    assert content == example["participant_id"]


def test_map_categories_to_columns(test_data, load_test_json):
"""Test that inverse mapping of concepts to columns is correctly created"""
data_dict = load_test_json(test_data / "example2.json")
Expand Down

0 comments on commit 4f655c9

Please sign in to comment.