Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge Alistair's fixes with Isaac's #80

Open
wants to merge 3 commits into
base: corpus-3-release-fixes
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion src/b2aiprep/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from streamlit.web.bootstrap import run

from b2aiprep.prepare.bids_like_data import redcap_to_bids
from b2aiprep.prepare.prepare import prepare_bids_like_data
from b2aiprep.prepare.prepare import prepare_bids_like_data, validate_bids_data


@click.group()
Expand Down Expand Up @@ -119,6 +119,28 @@ def prepbidslikedata(
)


@main.command()
@click.argument("bids_dir_path", type=click.Path())
@click.argument("fix", type=bool)
def validate(
    bids_dir_path,
    fix,
):
    """Validates the BIDS-like directory structure.

    bids_dir_path: path to the BIDS-like directory to validate\n
    fix: whether to attempt to fix issues found during validation
    """
    validate_bids_data(
        bids_dir_path=Path(bids_dir_path),
        fix=fix,
    )

@main.command()
@click.argument("filename", type=click.Path(exists=True))
@click.option("-s", "--subject", type=str, default=None)
Expand Down
82 changes: 52 additions & 30 deletions src/b2aiprep/prepare/bids_like_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,48 +81,69 @@ def load_redcap_csv(file_path):
return None


def validate_redcap_df_column_names(df: DataFrame) -> None:
    """RedCap allows two distinct export formats: raw data or with labels.
    The raw data format exports column names as coded entries, e.g. "record_id".
    This would be ideal, but it modifies the values of the data export. To avoid this,
    the dataset is exported with labels, e.g. "Record ID". Afterward, the dataset
    has the header manually modified to match the coded entries.

    This cannot be done with the data dictionary, as the data dictionary dimensions
    do not match the dataset dimensions. The raw data must also be exported and the header
    should be copied over to the label data.

    This function verifies the headers are exported correctly.

    Parameters
    ----------
    df : DataFrame
        The DataFrame whose column headers are validated.

    Raises
    ------
    ValueError
        If the columns in the DataFrame do not match the expected columns
        for the Bridge2AI voice data.
    """

    # this column mapping is derived from the data dictionary, and is a subset
    # of the columns exported from redcap.
    b2ai_resources = files("b2aiprep").joinpath("prepare").joinpath("resources")
    column_mapping: dict = json.loads(b2ai_resources.joinpath("column_mapping.json").read_text())

    # the mapping is {"coded_entry": "Coded Entry"}: keys are coded headers,
    # values are label headers.
    overlap_with_label = set(df.columns.tolist()).intersection(set(column_mapping.values()))
    overlap_with_coded = set(df.columns.tolist()).intersection(set(column_mapping.keys()))

    # all columns are coded: nothing to do
    if len(overlap_with_coded) == df.shape[1]:
        return

    if len(overlap_with_coded) == 0:
        raise ValueError(
            (
                "Dataframe has no coded headers. Please modify the source data to have "
                "coded labels instead."
            )
        )

    # if we have more than half of the columns as label headers, we assume the data is
    # exported with labels
    if len(overlap_with_label) > (df.shape[1] * 0.5):
        raise ValueError(
            (
                "Dataframe has label headers rather than coded headers. Please modify the source data to have "
                "coded labels instead."
            )
        )

    # raise a warning about the labels - unclear why there would be a mix
    _LOGGER.warning(
        (
            f"Dataframe has a mix of label and coded headers: {len(overlap_with_coded)} coded and "
            f"{len(overlap_with_label)} label. Downstream processing expects only coded labels."
        )
    )


def get_df_of_repeat_instrument(df: DataFrame, instrument: Instrument) -> pd.DataFrame:
"""Filters rows and columns of a RedCap dataframe to correspond to a specific repeat instrument.
Expand Down Expand Up @@ -471,9 +492,10 @@ def redcap_to_bids(
# It is possible for each column in the dataframe to have one of two names:
# 1. coded column names ("record_id")
# 2. text column names ("Record ID")
# for simplicity, we always map columns to coded columns before processing,
# that way we only ever need to manually subselect using one version of the column name
df = update_redcap_df_column_names(df)
# we require coded column names for downstream processing. it is not trivial to map
# them one to one, as the text column names are not unique (e.g. there are ~6 columns
# with the name "Strain").
validate_redcap_df_column_names(df)

construct_tsv_from_json( # construct participants.tsv
df=df,
Expand Down
Loading