[FIX] Schema: allow for data dictionaries with incomplete Neurobagel annotations (#163)

* skip columns without annotations - required now that the schema makes annotations optional
* Introduce get_annotated_columns
* Refactor get_annotated_columns to return column name and content
* Refactor utils to use only annotated columns
* Update test data description
* add check + test that no annotated columns in data dict raises error
* add check + test that >=1 column about participant ID
* add example with participant ID column lacking neurobagel annotations

---------

Co-authored-by: Alyssa Dai <[email protected]>
neurobagel · Jun 15, 2023 · 4f655c9 · 4f655c9
1 parent 0ed0e4d
commit 4f655c9
Show file tree

Hide file tree

Showing 9 changed files with 250 additions and 21 deletions.
diff --git a/bagel/dictionary_models.py b/bagel/dictionary_models.py
@@ -75,7 +75,7 @@ class Column(BaseModel):
         alias="Description",
     )
     annotations: Union[CategoricalNeurobagel, ContinuousNeurobagel] = Field(
-        ..., description="Semantic annotations", alias="Annotations"
+        None, description="Semantic annotations", alias="Annotations"
     )
 
 

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
@@ -80,8 +80,16 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
     """
     return [
         col
-        for col, annotations in data_dict.items()
-        if annotations["Annotations"]["IsAbout"]["TermURL"] == concept
+        for col, content in get_annotated_columns(data_dict)
+        if content["Annotations"]["IsAbout"]["TermURL"] == concept
+    ]
+
+
+def get_annotated_columns(data_dict: dict) -> list:
+    return [
+        (col, content)
+        for col, content in data_dict.items()
+        if "Annotations" in content
     ]
 
 
@@ -103,7 +111,7 @@ def map_tools_to_columns(data_dict: dict) -> dict:
     are mapped to it.
     """
     out_dict = defaultdict(list)
-    for col, content in data_dict.items():
+    for col, content in get_annotated_columns(data_dict):
         part_of = content["Annotations"].get("IsPartOf")
         if part_of is not None:
             out_dict[part_of.get("TermURL")].append(col)
@@ -195,10 +203,10 @@ def get_transformed_values(
 
 # TODO: Check all columns and then return list of offending columns' names
 def categorical_cols_have_bids_levels(data_dict: dict) -> bool:
-    for col, attrs in data_dict.items():
+    for col, content in get_annotated_columns(data_dict):
         if (
             is_column_categorical(col, data_dict)
-            and attrs.get("Levels") is None
+            and content.get("Levels") is None
         ):
             return False
 
@@ -211,12 +219,12 @@ def get_mismatched_categorical_levels(data_dict: dict) -> list:
     for the "Levels" key between the column's BIDS and Neurobagel annotations.
     """
     mismatched_cols = []
-    for col, attrs in data_dict.items():
+    for col, content in get_annotated_columns(data_dict):
         if is_column_categorical(col, data_dict):
-            known_levels = list(attrs["Annotations"]["Levels"].keys()) + attrs[
-                "Annotations"
-            ].get("MissingValues", [])
-            if set(attrs.get("Levels", {}).keys()).difference(known_levels):
+            known_levels = list(
+                content["Annotations"]["Levels"].keys()
+            ) + content["Annotations"].get("MissingValues", [])
+            if set(content.get("Levels", {}).keys()).difference(known_levels):
                 mismatched_cols.append(col)
 
     return mismatched_cols
@@ -239,7 +247,12 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
     """
     Determines whether the provided data dictionary and phenotypic file make sense together
     """
-    return all([key in pheno_df.columns for key in data_dict.keys()])
+    return all(
+        [
+            col in pheno_df.columns
+            for col, _ in get_annotated_columns(data_dict)
+        ]
+    )
 
 
 def find_undefined_cat_col_values(
@@ -251,11 +264,11 @@ def find_undefined_cat_col_values(
     dictionary entry.
     """
     all_undefined_values = {}
-    for col, attr in data_dict.items():
+    for col, content in get_annotated_columns(data_dict):
         if is_column_categorical(col, data_dict):
-            known_values = list(attr["Annotations"]["Levels"].keys()) + attr[
-                "Annotations"
-            ].get("MissingValues", [])
+            known_values = list(
+                content["Annotations"]["Levels"].keys()
+            ) + content["Annotations"].get("MissingValues", [])
             unknown_values = []
             for value in pheno_df[col].unique():
                 if value not in known_values:
@@ -275,9 +288,9 @@ def find_unused_missing_values(
     file column.
     """
     all_unused_missing_vals = {}
-    for col, attr in data_dict.items():
+    for col, content in get_annotated_columns(data_dict):
         unused_missing_vals = []
-        for missing_val in attr["Annotations"].get("MissingValues", []):
+        for missing_val in content["Annotations"].get("MissingValues", []):
             if missing_val not in pheno_df[col].unique():
                 unused_missing_vals.append(missing_val)
         if unused_missing_vals:
@@ -308,6 +321,23 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
             "Make sure that each annotated column contains an 'Annotations' key."
         ) from e
 
+    if get_annotated_columns(data_dict) == []:
+        raise LookupError(
+            "The provided data dictionary must contain at least one column with Neurobagel annotations."
+        )
+
+    if (
+        len(
+            get_columns_about(
+                data_dict, concept=mappings.NEUROBAGEL["participant"]
+            )
+        )
+        == 0
+    ):
+        raise LookupError(
+            "The provided data dictionary must contain at least one column annotated as being about participant ID."
+        )
+
     # TODO: remove this validation when we start handling multiple participant and / or session ID columns
     if (
         len(

diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
@@ -3,7 +3,7 @@
 Example inputs to the CLI
 
 | Example name | `.tsv`                                                                                                                                                                   | `.json`                                                                              | Expect             |
-|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|--------------------|
+| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ | ------------------ |
 | 1            | invalid, non-unique combinations of `participant` and `session` IDs                                                                                                      | valid, has `IsAbout` annotations for `participant` and `session` ID columns          | fail               |
 | 2            | valid, unique `participant` and `session` IDs                                                                                                                            | same as example 1                                                                    | pass               |
 | 3            | same as example 2                                                                                                                                                        | valid BIDS data dictionary, BUT: does not contain Neurobagel `"Annotations"` key     | fail               |
@@ -19,6 +19,8 @@ Example inputs to the CLI
 | 11           | invalid, ex 6 with missing entries in `participant_id` and `session_id` columns                                                                                          | valid, based on example 6                                                            | fail               |
 | 12           | Valid, same as example 2                                                                                                                                                 | Valid, based on example 2 but missing BIDS "Levels" attribute for group column       | Pass, with warning |
 | 13           | Valid, same as example_synthetic                                                                                                                                         | Valid, based on example_synthetic but with mismatched levels for group column        | Pass, with warning |
+| 14           | Valid, same as example 2                                                                                                                                                 | Valid, based on example 2, but with an extra column without Neurobagel annotations   | Pass               |
+| 15           | Valid, same as example 2                                                                                                                                                 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations  | Fail               |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
 

diff --git a/bagel/tests/data/example14.json b/bagel/tests/data/example14.json
@@ -0,0 +1,82 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      }
+    }
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      }
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PAT": "Patient",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PAT": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "purl:NCIT_C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+  "sex": {
+    "Description": "Sex variable",
+    "Levels": {
+      "M": "Male",
+      "F": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "M": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "F": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "participant_age": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:iso8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  },
+  "someOtherColumn": {
+    "Description": "This is cool in BIDS, but not in Neurobagel"
+  }
+}
diff --git a/bagel/tests/data/example14.tsv b/bagel/tests/data/example14.tsv
@@ -0,0 +1,5 @@
+participant_id	session_id	group	sex	participant_age
+sub-01	ses-01	PAT	M	"P20Y6M"
+sub-01	ses-02	PAT	M	"P20Y8M"
+sub-02	ses-01	CTRL	F	"P25Y8M"
+sub-02	ses-02	CTRL	F	"P26Y4M"
diff --git a/bagel/tests/data/example15.json b/bagel/tests/data/example15.json
@@ -0,0 +1,73 @@
+{
+  "participant_id": {
+    "Description": "A participant ID"
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      }
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PAT": "Patient",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PAT": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "purl:NCIT_C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+  "sex": {
+    "Description": "Sex variable",
+    "Levels": {
+      "M": "Male",
+      "F": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "M": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "F": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "participant_age": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:iso8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  }
+}
diff --git a/bagel/tests/data/example15.tsv b/bagel/tests/data/example15.tsv
@@ -0,0 +1,5 @@
+participant_id	session_id	group	sex	participant_age
+sub-01	ses-01	PAT	M	"P20Y6M"
+sub-01	ses-02	PAT	M	"P20Y8M"
+sub-02	ses-01	CTRL	F	"P25Y8M"
+sub-02	ses-02	CTRL	F	"P26Y4M"
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
@@ -11,6 +11,7 @@
         "example6",
         "example12",
         "example13",
+        "example14",
         "example_synthetic",
     ],
 )
@@ -41,7 +42,11 @@ def test_pheno_valid_inputs_run_successfully(
 @pytest.mark.parametrize(
     "example,expected_exception,expected_message",
     [
-        ("example3", ValueError, ["not a valid Neurobagel data dictionary"]),
+        (
+            "example3",
+            LookupError,
+            ["must contain at least one column with Neurobagel annotations"],
+        ),
         (
             "example_invalid",
             ValueError,
@@ -62,6 +67,13 @@ def test_pheno_valid_inputs_run_successfully(
             LookupError,
             ["missing values in participant or session id"],
         ),
+        (
+            "example15",
+            LookupError,
+            [
+                "must contain at least one column annotated as being about participant ID"
+            ],
+        ),
     ],
 )
 def test_invalid_inputs_are_handled_gracefully(

diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py
@@ -20,7 +20,7 @@ def get_test_context():
 def test_get_columns_that_are_about_concept(test_data, load_test_json):
     """Test that matching annotated columns are returned as a list,
     and that empty list is returned if nothing matches"""
-    data_dict = load_test_json(test_data / "example1.json")
+    data_dict = load_test_json(test_data / "example14.json")
 
     assert ["participant_id"] == putil.get_columns_about(
         data_dict, concept=mappings.NEUROBAGEL["participant"]
@@ -30,6 +30,26 @@ def test_get_columns_that_are_about_concept(test_data, load_test_json):
     )
 
 
+def test_get_columns_with_annotations():
+    example = {
+        "someOtherColumn": {
+            "Description": "This is cool in BIDS, but not in Neurobagel"
+        },
+        "participant_id": {
+            "Description": "A participant ID",
+            "Annotations": {
+                "IsAbout": {
+                    "TermURL": "nb:ParticipantID",
+                    "Label": "Unique participant identifier",
+                }
+            },
+        },
+    }
+    result = putil.get_annotated_columns(example)[0]
+    assert result[0] == "participant_id"
+    assert result[1] == example["participant_id"]
+
+
 def test_map_categories_to_columns(test_data, load_test_json):
     """Test that inverse mapping of concepts to columns is correctly created"""
     data_dict = load_test_json(test_data / "example2.json")