
Commit

Draft curation code.
tsalo committed Sep 19, 2024
1 parent 7b2c4a2 commit c8f471d
Showing 15 changed files with 461 additions and 1,509 deletions.
31 changes: 0 additions & 31 deletions .github/workflows/lint_r.yml

This file was deleted.

8 changes: 6 additions & 2 deletions .vscode/settings.json
@@ -1,4 +1,8 @@
  {
-     "python.linting.flake8Enabled": true,
-     "python.linting.enabled": true
+     "[python]": {
+         "editor.rulers": [99]
+     },
+     "flake8.args": [
+         "--max-line-length=100"
+     ],
  }
18 changes: 18 additions & 0 deletions 00_download_data.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# Flywheel project name
project="bbl/MEBOLD"

# List any subjects you want to download here
subjects="ID1 ID2"

# Include a path to your flywheel API token here
token=$(</cbica/home/salot/tokens/flywheel.txt)
fw login "$token"

# Navigate to the folder to which you want to download the data
cd /cbica/projects/mebold/sourcedata || exit

for subject in $subjects; do
    fw download --yes --zip "fw://${project}/${subject}"
done
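
For illustration, a minimal post-download sanity check (hypothetical, not one of the committed scripts), assuming the Flywheel zips land under sourcedata/<subject>_<session>/ in the layout that 01_unzip_dicoms.py globs for; the subject IDs are the same placeholders used above.

# check_downloads.py (illustrative sketch)
from glob import glob

subjects = ["ID1", "ID2"]  # placeholder subject IDs, as in 00_download_data.sh
for subject in subjects:
    # 01_unzip_dicoms.py expects zips at sourcedata/<subject>_<session>/*/*/*.dicom.zip
    zips = glob(f"/cbica/projects/mebold/sourcedata/{subject}_*/*/*/*.dicom.zip")
    status = "OK" if zips else "MISSING"
    print(f"{subject}: {len(zips)} zip(s) [{status}]")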
13 changes: 13 additions & 0 deletions 01_unzip_dicoms.py
@@ -0,0 +1,13 @@
"""Expand dicom zip files in order to run heudiconv."""

import os
import zipfile
from glob import glob

if __name__ == "__main__":
    zip_files = sorted(glob("/cbica/projects/mebold/sourcedata/*_*/*/*/*.dicom.zip"))
    for zip_file in zip_files:
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(os.path.dirname(zip_file))

        os.remove(zip_file)
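
A dry-run variant (illustrative only) that reports what each zip contains before running the destructive extract-and-delete loop above; it uses the same glob pattern as 01_unzip_dicoms.py.

import zipfile
from glob import glob

zip_files = sorted(glob("/cbica/projects/mebold/sourcedata/*_*/*/*/*.dicom.zip"))
for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        # Count members without extracting anything or removing the zip.
        print(f"{zip_file}: {len(zip_ref.namelist())} files")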
16 changes: 16 additions & 0 deletions 02_run_heudiconv.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# Loop over subjects and run heudiconv on each.
# Make sure to activate the conda environment with heudiconv installed before running this.

declare -a subjects=("ID1" "ID2")
for sub in "${subjects[@]}"
do
    echo "$sub"
    heudiconv \
        -f reproin \
        -o /cbica/projects/mebold/dset \
        -d "/cbica/projects/mebold/sourcedata/{subject}_{session}/*/*/*/*.dcm" \
        -s "$sub" \
        -ss 1 \
        --bids
done
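
heudiconv substitutes {subject} and {session} into the -d template itself before searching for DICOMs; a small sketch of the expansion for the placeholder subject ID1 with -ss 1:

# Illustrative expansion of the heudiconv -d template (values are placeholders).
template = "/cbica/projects/mebold/sourcedata/{subject}_{session}/*/*/*/*.dcm"
print(template.format(subject="ID1", session="1"))
# /cbica/projects/mebold/sourcedata/ID1_1/*/*/*/*.dcm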
145 changes: 145 additions & 0 deletions 03_fix_bids.py
@@ -0,0 +1,145 @@
"""Fix BIDS files after heudiconv conversion.
This script should deal with steps 1-6 below.
The necessary steps are:
1. Deal with duplicates.
2. Rename multi-echo magnitude BOLD files to part-mag_bold.
3. Rename phase files to part-phase_bold.
4. Split out noRF noise scans from multi-echo BOLD scans.
- Also copy the JSON.
5. Copy first echo of each multi-echo field map without echo entity.
6. Update filenames in the scans.tsv files.
7. Remove events files.
"""

import os
import shutil
from glob import glob

import nibabel as nb
import pandas as pd

# Number of EPI noise scans to split out of the BOLD scans.
N_NOISE_VOLS = 3

FULL_RUN_LENGTHS = (240, 204, 200)


if __name__ == "__main__":
    dset_dir = "/cbica/projects/mebold/dset/"
    subject_dirs = sorted(glob(os.path.join(dset_dir, "sub-*")))
    for subject_dir in subject_dirs:
        sub_id = os.path.basename(subject_dir)
        session_dirs = sorted(glob(os.path.join(subject_dir, "ses-*")))
        for session_dir in session_dirs:
            ses_id = os.path.basename(session_dir)
            anat_dir = os.path.join(session_dir, "anat")
            fmap_dir = os.path.join(session_dir, "fmap")
            func_dir = os.path.join(session_dir, "func")

            # Remove empty events files created by heudiconv
            events_files = sorted(glob(os.path.join(func_dir, "*_events.tsv")))
            for events_file in events_files:
                os.remove(events_file)

            # Load scans file
            scans_file = os.path.join(session_dir, f"{sub_id}_{ses_id}_scans.tsv")
            assert os.path.isfile(scans_file), f"Scans file DNE: {scans_file}"
            scans_df = pd.read_table(scans_file)

            # Heudiconv's reproin heuristic currently (as of v1.2.0) names magnitude and phase
            # files as _bold and _phase, respectively.
            # The better way to do it is to call them part-mag_bold and part-phase_bold.
            mag_files = sorted(glob(os.path.join(func_dir, "*echo-*_bold.*")))
            for mag_file in mag_files:
                if "part-" in mag_file:
                    print(f"Skipping {mag_file}")
                    continue

                new_mag_file = mag_file.replace("_bold.", "_part-mag_bold.")
                os.rename(mag_file, new_mag_file)

                mag_filename = os.path.join("func", os.path.basename(mag_file))
                new_mag_filename = os.path.join("func", os.path.basename(new_mag_file))

                # Replace the filename in the scans.tsv file
                scans_df = scans_df.replace({"filename": {mag_filename: new_mag_filename}})

            # Rename phase files from _phase to _part-phase_bold.
            phase_files = sorted(glob(os.path.join(func_dir, "*_phase.*")))
            for phase_file in phase_files:
                new_phase_file = phase_file.replace("_phase.", "_part-phase_bold.")
                os.rename(phase_file, new_phase_file)

                phase_filename = os.path.join("func", os.path.basename(phase_file))
                new_phase_filename = os.path.join("func", os.path.basename(new_phase_file))

                # Replace the filename in the scans.tsv file
                scans_df = scans_df.replace({"filename": {phase_filename: new_phase_filename}})

            # Split out noise scans from all multi-echo BOLD files.
            # There is no metadata to distinguish noise scans from BOLD scans,
            # so we need to hardcode the number of noise scans to split out.
            # In order to handle partial scans where the last N volumes aren't noise scans,
            # we also need to hardcode valid scan lengths.
            me_bolds = sorted(glob(os.path.join(func_dir, "*acq-MBME*_bold.nii.gz")))
            for me_bold in me_bolds:
                noise_scan = me_bold.replace("_bold.nii.gz", "_noRF.nii.gz")
                if os.path.isfile(noise_scan):
                    print(f"File exists: {os.path.basename(noise_scan)}")
                    continue

                img = nb.load(me_bold)
                n_vols = img.shape[-1]
                if n_vols not in FULL_RUN_LENGTHS:
                    print(f"File is a partial scan: {os.path.basename(me_bold)}")
                    continue

                noise_img = img.slicer[..., -N_NOISE_VOLS:]
                bold_img = img.slicer[..., :-N_NOISE_VOLS]

                # Overwrite the BOLD scan
                os.remove(me_bold)
                bold_img.to_filename(me_bold)
                noise_img.to_filename(noise_scan)

                # Copy the JSON as well
                shutil.copyfile(
                    me_bold.replace(".nii.gz", ".json"),
                    noise_scan.replace(".nii.gz", ".json"),
                )

                # Add noise scans to scans DataFrame
                i_row = len(scans_df.index)
                me_bold_fname = os.path.join("func", os.path.basename(me_bold))
                noise_fname = os.path.join("func", os.path.basename(noise_scan))
                scans_df.loc[i_row] = scans_df.loc[scans_df["filename"] == me_bold_fname].iloc[0]
                scans_df.loc[i_row, "filename"] = noise_fname

            # In this protocol, we have multi-echo field maps.
            # In practice, multi-echo field maps aren't useful, so we just grab the first echo's
            # data and label it as a single-echo field map.
            # Copy first echo's sbref of multi-echo field maps without echo entity.
            me_fmaps = sorted(glob(os.path.join(fmap_dir, "*_acq-ME*_echo-1_sbref.*")))
            for me_fmap in me_fmaps:
                out_fmap = me_fmap.replace("_echo-1_", "_").replace("_sbref", "_epi")
                if os.path.isfile(out_fmap):
                    print(f"File exists: {os.path.basename(out_fmap)}")
                    continue

                me_fmap_fname = os.path.join("fmap", os.path.basename(me_fmap))
                out_fmap_fname = os.path.join("fmap", os.path.basename(out_fmap))
                shutil.copyfile(me_fmap, out_fmap)
                if me_fmap.endswith(".nii.gz"):
                    i_row = len(scans_df.index)
                    scans_df.loc[i_row] = scans_df.loc[
                        scans_df["filename"] == me_fmap_fname
                    ].iloc[0]
                    scans_df.loc[i_row, "filename"] = out_fmap_fname

            # Save out the modified scans.tsv file.
            scans_df = scans_df.sort_values(by=["acq_time", "filename"])
            os.remove(scans_file)
            scans_df.to_csv(scans_file, sep="\t", na_rep="n/a", index=False)
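
A self-contained sketch of the noise-split logic from step 4, run on a synthetic 4D image so it works without the real dataset; the slicer calls mirror the ones in 03_fix_bids.py, and the array shape is made up for illustration.

import nibabel as nb
import numpy as np

N_NOISE_VOLS = 3

# Synthetic "multi-echo BOLD" run with 240 volumes, one of the FULL_RUN_LENGTHS values.
data = np.random.random((4, 4, 4, 240)).astype(np.float32)
img = nb.Nifti1Image(data, affine=np.eye(4))

# The last N_NOISE_VOLS volumes are the noRF noise scans; everything before them is BOLD.
noise_img = img.slicer[..., -N_NOISE_VOLS:]
bold_img = img.slicer[..., :-N_NOISE_VOLS]

print(bold_img.shape)   # (4, 4, 4, 237)
print(noise_img.shape)  # (4, 4, 4, 3)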
15 changes: 15 additions & 0 deletions 04_reface_t1ws.sh
@@ -0,0 +1,15 @@
#!/bin/bash
# Reface T1w images using afni_refacer_run.
module load afni/2022_05_03

t1w_files=$(find /cbica/projects/mebold/dset/sub-*/ses-*/anat/*T1w.nii.gz)
for t1w_file in $t1w_files
do
    echo "$t1w_file"
    @afni_refacer_run \
        -input "${t1w_file}" \
        -mode_reface \
        -no_images \
        -overwrite \
        -prefix "${t1w_file}"
done
55 changes: 55 additions & 0 deletions 05_anonymize_acqtimes.py
@@ -0,0 +1,55 @@
"""Anonymize acquisition datetimes for a dataset.
Anonymize acquisition datetimes for a dataset. Works for both longitudinal
and cross-sectional studies. The time of day is preserved, but the first
scan is set to January 1st, 1800. In a longitudinal study, each session is
anonymized relative to the first session, so that time between sessions is
preserved.
Overwrites scan tsv files in dataset. Only run this *after* data collection
is complete for the study, especially if it's longitudinal.
"""

import os
from glob import glob

import pandas as pd
from dateutil import parser

if __name__ == "__main__":
    dset_dir = "/cbica/projects/mebold/dset"

    bl_dt = parser.parse("1800-01-01")

    subject_dirs = sorted(glob(os.path.join(dset_dir, "sub-*")))
    for subject_dir in subject_dirs:
        sub_id = os.path.basename(subject_dir)
        print(f"Processing {sub_id}")

        scans_files = sorted(glob(os.path.join(subject_dir, "ses-*/*_scans.tsv")))

        for i_ses, scans_file in enumerate(scans_files):
            ses_dir = os.path.dirname(scans_file)
            ses_name = os.path.basename(ses_dir)
            print(f"\t{ses_name}")

            df = pd.read_table(scans_file)
            if i_ses == 0:
                # Anonymize in terms of first scan for subject.
                first_scan = df["acq_time"].min()
                first_dt = parser.parse(first_scan.split("T")[0])
                diff = first_dt - bl_dt

            acq_times = df["acq_time"].apply(parser.parse)
            acq_times = (acq_times - diff).astype(str)
            df["acq_time"] = acq_times
            df["acq_time"] = df["acq_time"].str.replace(" ", "T")

            os.remove(scans_file)
            df.to_csv(
                scans_file,
                sep="\t",
                lineterminator="\n",
                na_rep="n/a",
                index=False,
            )
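
A worked sketch of the shift applied above, using made-up acquisition times for one subject: the offset comes from the date of the first scan, so clock times and the interval between sessions survive anonymization.

from dateutil import parser

bl_dt = parser.parse("1800-01-01")

# Hypothetical acquisition times: session 1 and a session 30 days later.
acq_times = ["2024-03-05T09:30:00", "2024-04-04T14:15:00"]

# The offset uses only the *date* of the first scan, so the time of day is preserved.
diff = parser.parse(acq_times[0].split("T")[0]) - bl_dt

for acq_time in acq_times:
    print(f"{acq_time} -> {(parser.parse(acq_time) - diff).isoformat()}")
# 2024-03-05T09:30:00 -> 1800-01-01T09:30:00
# 2024-04-04T14:15:00 -> 1800-01-31T14:15:00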
28 changes: 28 additions & 0 deletions 06_clean_jsons.py
@@ -0,0 +1,28 @@
"""Remove unneeded fields from bottom-level JSON files."""

import json
import os
from glob import glob

if __name__ == "__main__":
    dset_dir = "/cbica/projects/mebold/dset/"
    drop_keys = [
        "AcquisitionTime",
        "CogAtlasID",
        "InstitutionAddress",
        "TaskName",
        "ImageComments",
    ]

    json_files = sorted(glob(os.path.join(dset_dir, "sub-*/ses-*/*/*.json")))
    for json_file in json_files:
        with open(json_file, "r") as fo:
            json_data = json.load(fo)

        for drop_key in drop_keys:
            if drop_key in json_data.keys():
                json_data.pop(drop_key)

        os.remove(json_file)
        with open(json_file, "w") as fo:
            json.dump(json_data, fo, indent=4, sort_keys=True)
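
A small before/after sketch of the effect on a single hypothetical sidecar; the metadata values are made up for illustration. Only keys listed in drop_keys are removed, and everything else passes through untouched.

# Hypothetical sidecar contents (illustrative values).
sidecar = {
    "AcquisitionTime": "09:30:00.000000",    # in drop_keys -> removed
    "EchoTime": 0.0132,                      # kept
    "InstitutionAddress": "123 Example St",  # in drop_keys -> removed
    "RepetitionTime": 1.761,                 # kept
}
drop_keys = ["AcquisitionTime", "CogAtlasID", "InstitutionAddress", "TaskName", "ImageComments"]
cleaned = {key: value for key, value in sidecar.items() if key not in drop_keys}
print(cleaned)  # {'EchoTime': 0.0132, 'RepetitionTime': 1.761}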
