diff --git a/src/b2aiprep/demographics.py b/src/b2aiprep/demographics.py
index ecd1edd..36345ec 100644
--- a/src/b2aiprep/demographics.py
+++ b/src/b2aiprep/demographics.py
@@ -1,14 +1,29 @@
+from pathlib import Path
+from typing import Dict, List, Optional
 import pandas as pd
 from pandas import DataFrame
 from enum import Enum
 import json
 import pickle
+from importlib.resources import files
+import torch
+
+_AUDIO_TASKS = (
+    'Animal fluency', 'Audio Check', 'Breath Sounds',
+    'Cape V sentences', 'Caterpillar Passage', 'Cinderella Story',
+    'Diadochokinesis', 'Free Speech', 'Glides', 'Loudness',
+    'Maximum phonation time', 'Picture description',
+    'Productive Vocabulary', 'Prolonged vowel', 'Rainbow Passage',
+    'Random Item Generation', 'Respiration and cough', 'Story recall',
+    'Voluntary Cough', 'Word-color Stroop',
+)
 
 
 class RepeatInstrument(Enum):
     SESSION = 'Session'
     ACOUSTIC_TASK = 'Acoustic Task'
     RECORDING = 'Recording'
+    PARTICIPANT = 'Participant'
     GENERIC_DEMOGRAPHICS = 'Q Generic Demographics'
     GENERIC_CONFOUNDERS = 'Q Generic Confounders'
     GENERIC_VOICE_PERCEPTION = 'Q Generic Voice Perception'
@@ -39,62 +54,93 @@ def load_csv_file(file_path):
         print('Error parsing CSV file.')
         return None
 
-def pickle_new_data_columns(category: str, column_names: list):
-
-    out_file = f'{category}.pkl'
-    with open(out_file, "wb") as f:
-        pickle.dump(column_names, f)
-    print(f'New column names saved to: {out_file}')
-    return
-
-def load_data_columns():
-    data_columns = {}
-    with open('participant_columns.pkl', "rb") as f:
-        data_columns['participant_columns'] = pickle.load(f)
-
-    with open('demographics_columns.pkl', "rb") as f:
-        data_columns['demographics_columns'] = pickle.load(f)
-
-    with open('confounders_columns.pkl', "rb") as f:
-        data_columns['confounders_columns'] = pickle.load(f)
-
-    with open('phq9_columns.pkl', "rb") as f:
-        data_columns['phq9_columns'] = pickle.load(f)
-
-    with open('gad7_columns.pkl', "rb") as f:
-        data_columns['gad7_columns'] = pickle.load(f)
+def load_data_columns() -> Dict[str, List[str]]:
+    b2ai_resources = files("b2aiprep")
+    columns = json.loads(b2ai_resources.joinpath("resources", "columns.json").read_text())
+    confounders_columns = json.loads(b2ai_resources.joinpath("resources", "confounders.json").read_text())
+    demographics_columns = json.loads(b2ai_resources.joinpath("resources", "demographics.json").read_text())
+    gad7_columns = json.loads(b2ai_resources.joinpath("resources", "gad7.json").read_text())
+    participant_columns = json.loads(b2ai_resources.joinpath("resources", "participant.json").read_text())
+    phq9_columns = json.loads(b2ai_resources.joinpath("resources", "phq9.json").read_text())
+
+    data_columns = {
+        'columns': columns,
+        'participant_columns': participant_columns,
+        'demographics_columns': demographics_columns,
+        'confounders_columns': confounders_columns,
+        'phq9_columns': phq9_columns,
+        'gad7_columns': gad7_columns
+    }
 
     return data_columns
 
-def get_columns_of_repeat_instrument(df: DataFrame, repeat_instrument: RepeatInstrument):
-    columns = pd.Index(['record_id'])
-
+def get_columns_of_repeat_instrument(repeat_instrument: RepeatInstrument) -> List[str]:
     data_columns = load_data_columns()
-    participant_columns=data_columns['participant_columns']
-    demographics_columns=data_columns['demographics_columns']
-    confounders_columns=data_columns['confounders_columns']
-    phq9_columns=data_columns['phq9_columns']
-    gad7_columns=data_columns['gad7_columns']
-
-    if repeat_instrument == RepeatInstrument.SESSION:
-        columns = columns.append(df.columns[pd.Series(df.columns).str.startswith('session_')])
-    elif repeat_instrument == RepeatInstrument.ACOUSTIC_TASK:
-        columns = columns.append(df.columns[pd.Series(df.columns).str.startswith('acoustic_task_')])
-    elif repeat_instrument == RepeatInstrument.RECORDING:
-        columns = columns.append(df.columns[pd.Series(df.columns).str.startswith('recording_')])
-    elif repeat_instrument == RepeatInstrument.GENERIC_DEMOGRAPHICS:
-        columns = columns.append(pd.Index(demographics_columns))
-    elif repeat_instrument == RepeatInstrument.GENERIC_CONFOUNDERS:
-        columns = columns.append(pd.Index(confounders_columns))
-    elif repeat_instrument == RepeatInstrument.GENERIC_PHQ9_DEPRESSION:
-        columns = columns.append(pd.Index(phq9_columns))
-    elif repeat_instrument == RepeatInstrument.GENERIC_GAD7_ANXIETY:
-        columns = columns.append(pd.Index(gad7_columns))
+    repeat_instrument_columns = {
+        RepeatInstrument.PARTICIPANT: 'participant_columns',
+        RepeatInstrument.GENERIC_DEMOGRAPHICS: 'demographics_columns',
+        RepeatInstrument.GENERIC_CONFOUNDERS: 'confounders_columns',
+        RepeatInstrument.GENERIC_PHQ9_DEPRESSION: 'phq9_columns',
+        RepeatInstrument.GENERIC_GAD7_ANXIETY: 'gad7_columns'
+    }
+
+    repeat_instrument_prefix_mapping = {
+        RepeatInstrument.SESSION: 'session_',
+        RepeatInstrument.ACOUSTIC_TASK: 'acoustic_task_',
+        RepeatInstrument.RECORDING: 'recording_',
+    }
+    if repeat_instrument in repeat_instrument_columns:
+        columns = data_columns[repeat_instrument_columns[repeat_instrument]]
+    elif repeat_instrument in repeat_instrument_prefix_mapping:
+        columns = [c for c in data_columns['columns'] if c.startswith(repeat_instrument_prefix_mapping[repeat_instrument])]
     else:
         # add rest of columns for other repeat instruments
         raise NotImplementedError('Repeat Instrument not implemented.')
+
+    if 'record_id' not in columns:
+        columns.insert(0, 'record_id')
     return columns
 
-def get_df_of_repeat_instrument(df: DataFrame, repeat_instrument: RepeatInstrument):
-    return df[df['redcap_repeat_instrument'] == repeat_instrument.value][get_columns_of_repeat_instrument(df, repeat_instrument)]
+def get_df_of_repeat_instrument(df: DataFrame, repeat_instrument: RepeatInstrument) -> pd.DataFrame:
+    columns = get_columns_of_repeat_instrument(repeat_instrument)
+    if repeat_instrument in (RepeatInstrument.GENERIC_DEMOGRAPHICS, RepeatInstrument.PARTICIPANT):
+        idx = df['redcap_repeat_instrument'].isnull()
+    else:
+        idx = df['redcap_repeat_instrument'] == repeat_instrument.value
+    return df.loc[idx, columns].copy()
+
+def get_recordings_for_acoustic_task(df: pd.DataFrame, acoustic_task: str) -> pd.DataFrame:
+    if acoustic_task not in _AUDIO_TASKS:
+        raise ValueError(f'Unrecognized acoustic task: {acoustic_task}. Options: {_AUDIO_TASKS}')
+
+    acoustic_tasks_df = get_df_of_repeat_instrument(df, RepeatInstrument.ACOUSTIC_TASK)
+    recordings_df = get_df_of_repeat_instrument(df, RepeatInstrument.RECORDING)
+
+    idx = acoustic_tasks_df['acoustic_task_name'] == acoustic_task
+
+    dff = recordings_df.merge(
+        acoustic_tasks_df.loc[idx, ['acoustic_task_id']],
+        left_on='recording_acoustic_task_id',
+        right_on='acoustic_task_id',
+        how='inner'
+    )
+    return dff
+
+def load_features_for_recordings(df: pd.DataFrame, data_path: Path, feature: Optional[str] = None) -> Dict[str, torch.Tensor]:
+    output = {}
+    feature_options = (
+        'specgram', 'melfilterbank', 'mfcc', 'opensmile',
+        'sample_rate', 'checksum', 'transcription'
+    )
+    if feature is not None:
+        if feature not in feature_options:
+            raise ValueError(f'Unrecognized feature {feature}. Options: {feature_options}')
+
+    for recording_id in df['recording_id'].unique():
+        output[recording_id] = torch.load(data_path / f"{recording_id}_features.pt")
+
+        # if requested, we subselect to the given feature
+        if feature is not None:
+            output[recording_id] = output[recording_id][feature]
+    return output
\ No newline at end of file
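
Example usage of the refactored helpers (a minimal sketch, not part of the diff: the CSV path 'bridge2ai_voice_data.csv' and the 'features' directory are hypothetical, and per-recording files are assumed to follow the "<recording_id>_features.pt" naming that load_features_for_recordings expects):

    from pathlib import Path

    import pandas as pd

    from b2aiprep.demographics import (
        RepeatInstrument,
        get_df_of_repeat_instrument,
        get_recordings_for_acoustic_task,
        load_features_for_recordings,
    )

    # one flat REDCap export; rows are distinguished by redcap_repeat_instrument
    df = pd.read_csv('bridge2ai_voice_data.csv', low_memory=False)

    # demographics rows carry a null redcap_repeat_instrument, which the
    # PARTICIPANT/GENERIC_DEMOGRAPHICS branch selects via isnull()
    demographics = get_df_of_repeat_instrument(df, RepeatInstrument.GENERIC_DEMOGRAPHICS)

    # recordings for one acoustic task, found by the inner merge on acoustic_task_id
    recordings = get_recordings_for_acoustic_task(df, 'Rainbow Passage')

    # load a single feature (here MFCCs) for each of those recordings
    mfccs = load_features_for_recordings(recordings, Path('features'), feature='mfcc')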