[ENH] Allow participants.tsv to contain a superset of subject directories and subjects listed in phenotype files (#2044)

effigies · web-flow · commit fa2b5d85f7e0 · 2025-02-14T09:30:27.000-05:00
* Update src/schema/objects/files.yaml The participants schema description now contains the comprehensive superset rule from #914. * Update src/schema/objects/files.yaml Committing the good suggestion. Co-authored-by: Chris Markiewicz <effigies@gmail.com> * Update src/schema/objects/files.yaml * doc(schema): Update intersects() to return the intersection if non-empty * feat(schema): Require participants.tsv to be a superset of sub_dirs/participants * schema: Improve error messages --------- Co-authored-by: Chris Markiewicz <effigies@gmail.com> Co-authored-by: Chris Markiewicz <markiewicz@stanford.edu>
diff --git a/src/schema/README.md b/src/schema/README.md
@@ -259,20 +259,20 @@ The following operators should be defined by an interpreter:
 
 The following functions should be defined by an interpreter:
 
-| Function                                        | Definition                                                                                                                                | Example                                                | Note                                                                           |
-| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------------------------------ |
-| `count(arg: array, val: any) -> int`            | Number of elements in an array equal to `val`                                                                                             | `count(columns.type, "EEG")`                           | The number of times "EEG" appears in the column "type" of the current TSV file |
-| `exists(arg: str \| array, rule: str) -> int`   | Count of files in an array that exist in the dataset. String is array with length 1. See following section for the meanings of rules.     | `exists(sidecar.IntendedFor, "subject")`               | True if all files in `IntendedFor` exist, relative to the subject directory.   |
-| `index(arg: array, val: any) -> int`            | Index of first element in an array equal to `val`, `null` if not found                                                                    | `index(["i", "j", "k"], axis)`                         | The number, from 0-2 corresponding to the string `axis`                        |
-| `intersects(a: array, b: array) -> bool`        | `true` if arguments contain any shared elements                                                                                           | `intersects(dataset.modalities, ["pet", "mri"])`       | True if either PET or MRI data is found in dataset                             |
-| `allequal(a: array, b: array) -> bool`          | `true` if arrays have the same length and paired elements are equal                                                                       | `intersects(dataset.modalities, ["pet", "mri"])`       | True if either PET or MRI data is found in dataset                             |
-| `length(arg: array) -> int`                     | Number of elements in an array                                                                                                            | `length(columns.onset) > 0`                            | True if there is at least one value in the onset column                        |
-| `match(arg: str, pattern: str) -> bool`         | `true` if `arg` matches the regular expression `pattern` (anywhere in string)                                                             | `match(extension, ".gz$")`                             | True if the file extension ends with `.gz`                                     |
-| `max(arg: array) -> number`                     | The largest non-`n/a` value in an array                                                                                                   | `max(columns.onset)`                                   | The time of the last onset in an events.tsv file                               |
-| `min(arg: array) -> number`                     | The smallest non-`n/a` value in an array                                                                                                  | `min(sidecar.SliceTiming) == 0`                        | A check that the onset of the first slice is 0s                                |
-| `sorted(arg: array, method: str) -> array`      | The sorted values of the input array; defaults to type-determined sort. If method is "lexical", or "numeric" use lexical or numeric sort. | `sorted(sidecar.VolumeTiming) == sidecar.VolumeTiming` | True if `sidecar.VolumeTiming` is sorted                                       |
-| `substr(arg: str, start: int, end: int) -> str` | The portion of the input string spanning from start position to end position                                                              | `substr(path, 0, length(path) - 3)`                    | `path` with the last three characters dropped                                  |
-| `type(arg: Any) -> str`                         | The name of the type, including `"array"`, `"object"`, `"null"`                                                                           | `type(datatypes)`                                      | Returns `"array"`                                                              |
+| Function                                          | Definition                                                                                                                                | Example                                                | Note                                                                           |
+| ------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------------------------------ |
+| `count(arg: array, val: any) -> int`              | Number of elements in an array equal to `val`                                                                                             | `count(columns.type, "EEG")`                           | The number of times "EEG" appears in the column "type" of the current TSV file |
+| `exists(arg: str \| array, rule: str) -> int`     | Count of files in an array that exist in the dataset. String is array with length 1. See following section for the meanings of rules.     | `exists(sidecar.IntendedFor, "subject")`               | Number of files in `IntendedFor` that exist, relative to the subject directory |
+| `index(arg: array, val: any) -> int`              | Index of first element in an array equal to `val`, `null` if not found                                                                    | `index(["i", "j", "k"], axis)`                         | The number, from 0-2 corresponding to the string `axis`                        |
+| `intersects(a: array, b: array) -> array \| bool` | The intersection of arrays `a` and `b`, or `false` if there are no shared values.                                                         | `intersects(dataset.modalities, ["pet", "mri"])`       | Non-empty array if either PET or MRI data is found in dataset, otherwise false |
+| `allequal(a: array, b: array) -> bool`            | `true` if arrays have the same length and paired elements are equal                                                                       | `allequal(columns.onset, sorted(columns.onset))`       | True if the onset column is in sorted order                                    |
+| `length(arg: array) -> int`                       | Number of elements in an array                                                                                                            | `length(columns.onset) > 0`                            | True if there is at least one value in the onset column                        |
+| `match(arg: str, pattern: str) -> bool`           | `true` if `arg` matches the regular expression `pattern` (anywhere in string)                                                             | `match(extension, ".gz$")`                             | True if the file extension ends with `.gz`                                     |
+| `max(arg: array) -> number`                       | The largest non-`n/a` value in an array                                                                                                   | `max(columns.onset)`                                   | The time of the last onset in an events.tsv file                               |
+| `min(arg: array) -> number`                       | The smallest non-`n/a` value in an array                                                                                                  | `min(sidecar.SliceTiming) == 0`                        | A check that the onset of the first slice is 0s                                |
+| `sorted(arg: array, method: str) -> array`        | The sorted values of the input array; defaults to type-determined sort. If method is "lexical", or "numeric" use lexical or numeric sort. | `sorted(sidecar.VolumeTiming) == sidecar.VolumeTiming` | True if `sidecar.VolumeTiming` is sorted                                       |
+| `substr(arg: str, start: int, end: int) -> str`   | The portion of the input string spanning from start position to end position                                                              | `substr(path, 0, length(path) - 3)`                    | `path` with the last three characters dropped                                  |
+| `type(arg: Any) -> str`                           | The name of the type, including `"array"`, `"object"`, `"null"`                                                                           | `type(datatypes)`                                      | Returns `"array"`                                                              |
 
 #### The `exists()` function
 
diff --git a/src/schema/meta/expression_tests.yaml b/src/schema/meta/expression_tests.yaml
@@ -93,7 +93,7 @@
 - expression: type(true)
   result: 'boolean'
 - expression: intersects([1], [1, 2])
-  result: true
+  result: [1]
 - expression: intersects([1], [])
   result: false
 - expression: length([1, 2, 3])
diff --git a/src/schema/objects/files.yaml b/src/schema/objects/files.yaml
@@ -75,6 +75,10 @@ participants:
     followed by a list of optional columns describing participants.
     Each participant MUST be described by one and only one row.
 
+    The `participant_id` entries MUST be a superset of all subject directories
+    and all `participant_id` entries found among phenotypic and assessment data
+    in the `phenotype/` directory.
+
     Commonly used *optional* columns in `participants.tsv` files are `age`, `sex`,
     `handedness`, `strain`, and `strain_rrid`.
 
diff --git a/src/schema/rules/checks/dataset.yaml b/src/schema/rules/checks/dataset.yaml
@@ -18,26 +18,35 @@ ParticipantIDMismatch:
   issue:
     code: PARTICIPANT_ID_MISMATCH
     message: |
-      Participant labels found in this dataset did not match the values in participant_id column
-      found in the participants.tsv file.
+      One or more subject directories found in this dataset are not listed in
+      the participant_id column of the participants.tsv file.
     level: error
   selectors:
     - path == '/participants.tsv'
   checks:
-    - allequal(sorted(columns.participant_id), sorted(dataset.subjects.sub_dirs))
+    - |
+      allequal(
+        sorted(intersects(columns.participant_id, dataset.subjects.sub_dirs)),
+        sorted(dataset.subjects.sub_dirs)
+      )
 
 # 51
 PhenotypeSubjectsMissing:
   issue:
     code: PHENOTYPE_SUBJECTS_MISSING
     message: |
-      A phenotype/ .tsv file lists subjects that were not found in the dataset.
+      A phenotype/ .tsv file lists subjects that were not found in
+      the participant_id column found in the participants.tsv file.
     level: error
   selectors:
-    - path == '/dataset_description.json'
+    - path == '/participants.tsv'
     - type(dataset.subjects.phenotype) != 'null'
   checks:
-    - allequal(sorted(dataset.subjects.phenotype), sorted(dataset.subjects.sub_dirs))
+    - |
+      allequal(
+        sorted(intersects(columns.participant_id, dataset.subjects.phenotype)),
+        sorted(dataset.subjects.phenotype)
+      )
 
 # 214
 SamplesTSVMissing: