From ec9c4ea58bfa124f1d5432989dd33b76f4178ba9 Mon Sep 17 00:00:00 2001
From: Gab-D-G <gabriel.desrosiers-gregoire@mail.mcgill.ca>
Date: Tue, 26 Sep 2023 21:03:35 -0400
Subject: [PATCH] New --bids_filter parameter for providing custom BIDS specs

---
 docs/running_the_software.md          | 22 ++++++---
 rabies/parser.py                      | 22 +++++++++
 rabies/preprocess_pkg/bold_main_wf.py |  5 +-
 rabies/preprocess_pkg/main_wf.py      | 31 ++++++++----
 rabies/preprocess_pkg/utils.py        | 71 ++++++++++++++-------------
 5 files changed, 95 insertions(+), 56 deletions(-)
diff --git a/docs/running_the_software.md b/docs/running_the_software.md
index 54c6bd4..7217875 100644
--- a/docs/running_the_software.md
+++ b/docs/running_the_software.md
@@ -2,14 +2,7 @@
 
 ## Input data in BIDS standard
 
-The input dataset must be organized according to the [BIDS data structure](https://bids.neuroimaging.io/){cite}`Gorgolewski2016-zm`. RABIES will iterate through subjects and search for all available functional scans with suffix 'bold'. Anatomical scans are not necessary (`--bold_only` runs preprocessing with only functional scans), but can improve registration quality. If anatomical scans are used for preprocessing, each functional scan will be matched to one corresponding anatomical scan with suffix `T1w` or `T2w` of the same subject/session. Extra files which don't have a functional or structural suffix will be ignored.
-<br/>
-<br/>
-Mandatory BIDS specifications are:
-* `sub-{subject ID}` and `ses-{session ID}` for both functional and anatomical images
-* `bold` suffix for functional images
-* `T1w` or `T2w` for anatomical images
-* `run-{run #}` is necessary for functional images if there are multiple scans per session
+The input dataset must be organized according to the [BIDS data structure](https://bids.neuroimaging.io/){cite}`Gorgolewski2016-zm`. RABIES will iterate through all subjects found to contain a functional file (see section on BIDS filters below), and will also iterate according to sessions and runs found within each subject if available. If anatomical scans are used for preprocessing (i.e. not using `--bold_only`), each functional scan will be matched to one corresponding anatomical scan of the same subject/session (using BIDS filters for the anatomical image, see below).
 
 ### Directory structure for an example dataset
 * Our [example dataset](http://doi.org/10.5281/zenodo.3937697) has the following BIDS structure:
@@ -65,6 +58,19 @@ Mandatory BIDS specifications are:
 </body>
 </html>
 
+### BIDS filters to identify functional and structural images
+By default, RABIES will use the 'bold' or 'cbv' suffix to identify functional scans and the 'T1w' or 'T2w' suffix for structural scans. Files which don't match the BIDS filters are ignored. The BIDS filters can be customized with the `--bids_filter` parameter during the preprocessing stage, which is useful, for instance, when the default filters are not specific enough to select the right set of scans. The custom BIDS filter must be formatted as a JSON file with the functional filter under 'func' and the structural filter under 'anat'. Do not specify `subject` or `session` under 'anat'; these are managed through the functional filter only. See the example below, which corresponds to the default parameters:
+```json
+{
+    "func": {
+        "suffix":["bold","cbv"]
+    },
+    "anat": {
+        "suffix":["T1w","T2w"]
+    }
+}
+```
+
 ## Command Line Interface
 
 RABIES is executed using a command line interface, within a terminal. The software is divided into three main processing stages: preprocessing, confound correction and analysis. Accordingly, the command line interface allows for three different mode of execution, corresponding to the processing stages. So first, when executing the software, one of the processing stage must be selected. Below you can find the general --help message printed with `rabies --help`, which provides a summary of each processing stage together with options for parallel processing and memory management. Then, the --help associated to each processing stage, i.e. `preprocess`, `confound_correction` and `analysis`, describes in more detail the various parameters available to adapt image processing according to the user needs. Click on the corresponding --help to expand:
diff --git a/rabies/parser.py b/rabies/parser.py
index d080b4b..517790b 100644
--- a/rabies/parser.py
+++ b/rabies/parser.py
@@ -179,6 +179,17 @@ def get_parser():
             "the output path to drop outputs from major preprocessing steps.\n"
             "\n"
         )
+    preprocess.add_argument(
+        "--bids_filter", 
+        default={"func":{"suffix":["bold","cbv"]},"anat":{"suffix":["T1w","T2w"]}},
+        help=
+            "Allows to provide additional BIDS specifications (found within the input BIDS directory) \n"
+            "for selected a subset of functional and/or anatomical images. Takes as input a JSON file \n"
+            "containing the set of parameters for functional image under 'func' and under 'anat' for the \n"
+            "anatomical image. See online documentation for an example. \n"
+            "(default: %(default)s)\n"
+            "\n"
+        )
     preprocess.add_argument(
         "--bold_only", dest='bold_only', action='store_true',
         help=
@@ -1000,6 +1011,17 @@ def read_parser(parser, args):
         opts = parser.parse_args(args)
 
     if opts.rabies_stage == 'preprocess':
+        if not type(opts.bids_filter) is dict:
+            # read as a json file
+            import json
+            opts.bids_filter = json.load(open(opts.bids_filter))
+
+        if 'anat' in list(opts.bids_filter.keys()):
+            if 'subject' in list(opts.bids_filter['anat'].keys()):
+                raise ValueError("Don't provide 'subject' specifications with the structural image using --bids_filter. Manage this parameter with the functional image only.")
+            if 'session' in list(opts.bids_filter['anat'].keys()):
+                raise ValueError("Don't provide 'session' specifications with the structural image --bids_filter. Manage this parameter with the functional image only.")
+
         opts.anat_inho_cor = parse_argument(opt=opts.anat_inho_cor, 
             key_value_pairs = {'method':['Rigid', 'Affine', 'SyN', 'no_reg', 'N4_reg', 'disable'], 
                 'otsu_thresh':['0','1','2','3','4'], 'multiotsu':['true', 'false']},
diff --git a/rabies/preprocess_pkg/bold_main_wf.py b/rabies/preprocess_pkg/bold_main_wf.py
index a6af86b..511dc49 100644
--- a/rabies/preprocess_pkg/bold_main_wf.py
+++ b/rabies/preprocess_pkg/bold_main_wf.py
@@ -10,7 +10,7 @@
 from nipype.interfaces.utility import Function
 
 
-def init_bold_main_wf(opts, output_folder, bold_scan_list, inho_cor_only=False, name='bold_main_wf'):
+def init_bold_main_wf(opts, output_folder, number_functional_scans, inho_cor_only=False, name='bold_main_wf'):
     """
     This workflow controls the functional preprocessing stages of the pipeline when both
     functional and anatomical images are provided.
@@ -146,8 +146,7 @@ def init_bold_main_wf(opts, output_folder, bold_scan_list, inho_cor_only=False,
 
         bold_reference_wf = init_bold_reference_wf(opts=opts)
 
-        num_scan = len(bold_scan_list)
-        num_procs = min(opts.local_threads, num_scan)
+        num_procs = min(opts.local_threads, number_functional_scans)
         inho_cor_wf = init_inho_correction_wf(opts=opts, image_type='EPI', output_folder=output_folder, num_procs=num_procs, name="bold_inho_cor_wf")
 
         if opts.isotropic_HMC:
diff --git a/rabies/preprocess_pkg/main_wf.py b/rabies/preprocess_pkg/main_wf.py
index 8c673b6..fb6f747 100644
--- a/rabies/preprocess_pkg/main_wf.py
+++ b/rabies/preprocess_pkg/main_wf.py
@@ -141,8 +141,19 @@ def init_main_wf(data_dir_path, output_folder, opts, name='main_wf'):
         log.warning(f"The BIDS compliance failed: {e} \n\nRABIES will run anyway; double-check that the right files were picked up for processing.\n")
         layout = bids.layout.BIDSLayout(data_dir_path, validate=False)
 
-    split_name, scan_info, run_iter, scan_list, bold_scan_list = prep_bids_iter(
-        layout, opts.bold_only, inclusion_list=opts.inclusion_ids, exclusion_list=opts.exclusion_ids)
+    split_name, scan_info, run_iter, structural_scan_list, number_functional_scans = prep_bids_iter(
+        layout, opts.bids_filter, opts.bold_only, inclusion_list=opts.inclusion_ids, exclusion_list=opts.exclusion_ids)
+    '''***details on outputs from prep_bids_iter:
+    split_name: a list of strings, providing a sensible name to distinguish each iterable, 
+        and also necessary to link up the run iterables with a specific session later.
+    scan_info: a list of dictionary including the subject ID and session # for a given 
+        iterable from split_name
+    run_iter: a list of dictionary, where the keys correspond to a session split from 
+        split_name, and the value is a list of runs for that split. This manages iterables
+        for runs.
+    structural_scan_list: the set of structural scans; used for resample_template and managing # threads
+    number_functional_scans: the number of functional scans; used for managing # threads
+    '''
 
     # setting up all iterables
     main_split = pe.Node(niu.IdentityInterface(fields=['split_name', 'scan_info']),
@@ -151,8 +162,8 @@ def init_main_wf(data_dir_path, output_folder, opts, name='main_wf'):
                             ('scan_info', scan_info)]
     main_split.synchronize = True
 
-    bold_selectfiles = pe.Node(BIDSDataGraber(bids_dir=data_dir_path, suffix=[
-                               'bold', 'cbv']), name='bold_selectfiles')
+    bold_selectfiles = pe.Node(BIDSDataGraber(bids_dir=data_dir_path, bids_filter=opts.bids_filter['func']),
+                               name='bold_selectfiles')
 
     # node to conver input image to consistent RAS orientation
     bold_convert_to_RAS_node = pe.Node(Function(input_names=['img_file'],
@@ -190,11 +201,11 @@ def init_main_wf(data_dir_path, output_folder, opts, name='main_wf'):
     resample_template_node.inputs.template_file = str(opts.anat_template)
     resample_template_node.inputs.mask_file = str(opts.brain_mask)
     resample_template_node.inputs.spacing = opts.anatomical_resampling
-    resample_template_node.inputs.file_list = scan_list
+    resample_template_node.inputs.file_list = structural_scan_list
     resample_template_node.inputs.rabies_data_type = opts.data_type
 
     # calculate the number of scans that will be registered
-    num_scan = len(scan_list)
+    num_scan = len(structural_scan_list)
     num_procs = min(opts.local_threads, num_scan)
 
     EPI_target_buffer = pe.Node(niu.IdentityInterface(fields=['EPI_template', 'EPI_mask']),
@@ -202,7 +213,7 @@ def init_main_wf(data_dir_path, output_folder, opts, name='main_wf'):
 
     commonspace_reg_wf = init_commonspace_reg_wf(opts=opts, commonspace_masking=opts.commonspace_reg['masking'], brain_extraction=opts.commonspace_reg['brain_extraction'], template_reg=opts.commonspace_reg['template_registration'], fast_commonspace=opts.commonspace_reg['fast_commonspace'], output_folder=output_folder, transforms_datasink=transforms_datasink, num_procs=num_procs, output_datasinks=True, joinsource_list=['main_split'], name='commonspace_reg_wf')
 
-    bold_main_wf = init_bold_main_wf(opts=opts, output_folder=output_folder, bold_scan_list=bold_scan_list)
+    bold_main_wf = init_bold_main_wf(opts=opts, output_folder=output_folder, number_functional_scans=number_functional_scans)
 
     # organizing visual QC outputs
     template_diagnosis = pe.Node(Function(input_names=['anat_template', 'opts', 'out_dir', 'figure_format'],
@@ -313,8 +324,8 @@ def init_main_wf(data_dir_path, output_folder, opts, name='main_wf'):
         run_split.itersource = ('main_split', 'split_name')
         run_split.iterables = [('run', run_iter)]
 
-        anat_selectfiles = pe.Node(BIDSDataGraber(bids_dir=data_dir_path, suffix=[
-                                   'T2w', 'T1w']), name='anat_selectfiles')
+        anat_selectfiles = pe.Node(BIDSDataGraber(bids_dir=data_dir_path, bids_filter=opts.bids_filter['anat']),
+                                   name='anat_selectfiles')
         anat_selectfiles.inputs.run = None
 
         anat_convert_to_RAS_node = pe.Node(Function(input_names=['img_file'],
@@ -416,7 +427,7 @@ def init_main_wf(data_dir_path, output_folder, opts, name='main_wf'):
 
     else:
         inho_cor_bold_main_wf = init_bold_main_wf(
-            output_folder=output_folder, bold_scan_list=bold_scan_list, inho_cor_only=True, name='inho_cor_bold_main_wf', opts=opts)
+            output_folder=output_folder, number_functional_scans=number_functional_scans, inho_cor_only=True, name='inho_cor_bold_main_wf', opts=opts)
 
         workflow.connect([
             (resample_template_node, inho_cor_bold_main_wf, [
diff --git a/rabies/preprocess_pkg/utils.py b/rabies/preprocess_pkg/utils.py
index 461c613..7eb6c47 100644
--- a/rabies/preprocess_pkg/utils.py
+++ b/rabies/preprocess_pkg/utils.py
@@ -7,7 +7,7 @@
     File, BaseInterface
 )
 
-def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_list=['none']):
+def prep_bids_iter(layout, bids_filter, bold_only=False, inclusion_list=['all'], exclusion_list=['none']):
     '''
     This function takes as input a BIDSLayout, and generates iteration lists
     for managing the workflow's iterables depending on whether --bold_only is
@@ -17,7 +17,7 @@ def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_li
 
     scan_info = []
     split_name = []
-    scan_list = []
+    structural_scan_list = []
     run_iter = {}
     bold_scan_list = []
 
@@ -25,17 +25,21 @@ def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_li
     if len(subject_list) == 0:
         raise ValueError(
             "No subject information could be retrieved from the BIDS directory. The 'sub-' specification is mandatory.")
-    if not bold_only:
-        anat_bids = layout.get(subject=subject_list, suffix=[
-                               'T2w', 'T1w'], extension=['nii', 'nii.gz'])
-        if len(anat_bids) == 0:
-            raise ValueError(
-                "No anatomical file with the suffix 'T2w' or 'T1w' were found among the BIDS directory.")
-    bold_bids = layout.get(subject=subject_list, suffix=[
-                           'bold'], extension=['nii', 'nii.gz'])
+
+    if not 'subject' in list(bids_filter['func'].keys()):
+        # enforce that only files with subject ID are read
+        bids_filter['func']['subject']=subject_list
+
+    # create the list for all functional images; this is applying all filters from bids_filter
+    bold_bids = layout.get(extension=['nii', 'nii.gz'], **bids_filter['func'])
     if len(bold_bids) == 0:
         raise ValueError(
-            "No functional file with the suffix 'bold' were found among the BIDS directory.")
+            f"No functional file were found respecting the functional BIDS spec: {bids_filter['func']}")
+    
+    # remove subject, session and run; these are used later to target single files
+    bids_filter['func'].pop('subject', None)
+    bids_filter['func'].pop('session', None)
+    bids_filter['func'].pop('run', None)
 
     # filter inclusion/exclusion lists
     from rabies.utils import filter_scan_inclusion, filter_scan_exclusion
@@ -68,8 +72,8 @@ def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_li
         if ses not in list(bold_dict[sub].keys()):
             bold_dict[sub][ses] = {}
 
-        bold_list = layout.get(subject=sub, session=ses, run=run, suffix=[
-                               'bold'], extension=['nii', 'nii.gz'], return_type='filename')
+        bold_list = layout.get(subject=sub, session=ses, run=run, 
+                               extension=['nii', 'nii.gz'], return_type='filename',**bids_filter['func'])
         bold_dict[sub][ses][run] = bold_list
 
     # if not bold_only, then the bold_list and run_iter will be a dictionary with keys being the anat filename
@@ -77,16 +81,16 @@ def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_li
     for sub in list(bold_dict.keys()):
         for ses in list(bold_dict[sub].keys()):
             if not bold_only:
-                anat_list = layout.get(subject=sub, session=ses, suffix=[
-                                       'T2w', 'T1w'], extension=['nii', 'nii.gz'], return_type='filename')
+                anat_list = layout.get(subject=sub, session=ses,
+                                       extension=['nii', 'nii.gz'], return_type='filename',**bids_filter['anat'])
                 if len(anat_list) == 0:
                     raise ValueError(
-                        f'Missing an anatomical image for sub {sub} and ses- {ses}')
+                        f'Missing an anatomical image for sub {sub} and ses- {ses}, and the following BIDS specs: {bids_filter["anat"]}')
                 if len(anat_list) > 1:
                     raise ValueError(
                         f'Duplicate was found for the anatomical file sub- {sub}, ses- {ses}: {str(anat_list)}')
                 file = anat_list[0]
-                scan_list.append(file)
+                structural_scan_list.append(file)
                 filename_template = pathlib.Path(file).name.rsplit(".nii")[0]
                 split_name.append(filename_template)
                 scan_info.append({'subject_id': sub, 'session': ses})
@@ -100,7 +104,7 @@ def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_li
                 file = bold_list[0]
                 bold_scan_list.append(file)
                 if bold_only:
-                    scan_list.append(file)
+                    structural_scan_list.append(file)
                     filename_template = pathlib.Path(
                         file).name.rsplit(".nii")[0]
                     split_name.append(filename_template)
@@ -109,14 +113,15 @@ def prep_bids_iter(layout, bold_only=False, inclusion_list=['all'], exclusion_li
                 else:
                     run_iter[filename_template].append(run)
 
-    return split_name, scan_info, run_iter, scan_list, bold_scan_list
+    number_functional_scans = len(bold_scan_list)
+    return split_name, scan_info, run_iter, structural_scan_list, number_functional_scans
 
 
 class BIDSDataGraberInputSpec(BaseInterfaceInputSpec):
     bids_dir = traits.Str(exists=True, mandatory=True,
                           desc="BIDS data directory")
-    suffix = traits.List(exists=True, mandatory=True,
-                         desc="Suffix to search for")
+    bids_filter = traits.Dict(exists=True, mandatory=True,
+                         desc="BIDS specs")
     scan_info = traits.Dict(exists=True, mandatory=True,
                             desc="Info required to find the scan")
     run = traits.Any(exists=True, desc="Run number")
@@ -137,31 +142,27 @@ class BIDSDataGraber(BaseInterface):
     output_spec = BIDSDataGraberOutputSpec
 
     def _run_interface(self, runtime):
-        subject_id = self.inputs.scan_info['subject_id']
-        session = self.inputs.scan_info['session']
         if 'run' in (self.inputs.scan_info.keys()):
             run = self.inputs.scan_info['run']
         else:
             run = self.inputs.run
 
+        bids_filter = self.inputs.bids_filter.copy()
+        bids_filter['subject'] = self.inputs.scan_info['subject_id']
+        bids_filter['session'] = self.inputs.scan_info['session']
+        if not run is None:
+            bids_filter['run'] = run
+
         from bids.layout import BIDSLayout
         layout = BIDSLayout(self.inputs.bids_dir, validate=False)
         try:
-            if run is None: # if there is no run spec to search, don't include it in the search
-                file_list = layout.get(subject=subject_id, session=session, extension=[
-                                  'nii', 'nii.gz'], suffix=self.inputs.suffix, return_type='filename')
-            else:
-                file_list = layout.get(subject=subject_id, session=session, run=run, extension=[
-                                  'nii', 'nii.gz'], suffix=self.inputs.suffix, return_type='filename')
+            file_list = layout.get(extension=['nii', 'nii.gz'], return_type='filename', **bids_filter)
             if len(file_list) > 1:
-                raise ValueError(f'Provided BIDS spec lead to duplicates: \
-                    {str(self.inputs.suffix)} sub-{subject_id} ses-{session} run-{str(run)}')
+                raise ValueError(f'Provided BIDS spec lead to duplicates: {bids_filter}')
             elif len(file_list)==0:
-                raise ValueError(f'No file for found corresponding to the following BIDS spec: \
-                    {str(self.inputs.suffix)} sub-{subject_id} ses-{session} run-{str(run)}')
+                raise ValueError(f'No file for found corresponding to the following BIDS spec: {bids_filter}')
         except:
-            raise ValueError(f'Error with BIDS spec: \
-                    {str(self.inputs.suffix)} sub-{subject_id} ses-{session} run-{str(run)}')
+            raise ValueError(f'Error with BIDS spec: {bids_filter}')
 
         setattr(self, 'out_file', file_list[0])