Skip to content

Commit

Permalink
Merge pull request #40 from alistairewj/helper_functions
Browse files Browse the repository at this point in the history
Add functions which make it easier to load/manage the datasets
  • Loading branch information
satra authored Apr 18, 2024
2 parents 1ba6cb9 + 672a0f6 commit 61d5ef7
Showing 1 changed file with 95 additions and 49 deletions.
144 changes: 95 additions & 49 deletions src/b2aiprep/demographics.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from pandas import DataFrame
from enum import Enum
import json
import pickle
from importlib.resources import files

import torch

# Canonical names of the acoustic tasks that appear in the dataset's
# 'acoustic_task_name' column; used to validate the `acoustic_task`
# argument of get_recordings_for_acoustic_task().
_AUDIO_TASKS = (
    'Animal fluency', 'Audio Check', 'Breath Sounds',
    'Cape V sentences', 'Caterpillar Passage', 'Cinderella Story',
    'Diadochokinesis', 'Free Speech', 'Glides', 'Loudness',
    'Maximum phonation time', 'Picture description',
    'Productive Vocabulary', 'Prolonged vowel', 'Rainbow Passage',
    'Random Item Generation', 'Respiration and cough', 'Story recall',
    'Voluntary Cough', 'Word-color Stroop',
)

class RepeatInstrument(Enum):
SESSION = 'Session'
ACOUSTIC_TASK = 'Acoustic Task'
RECORDING = 'Recording'
PARTICIPANT = 'Participant'
GENERIC_DEMOGRAPHICS = 'Q Generic Demographics'
GENERIC_CONFOUNDERS = 'Q Generic Confounders'
GENERIC_VOICE_PERCEPTION = 'Q Generic Voice Perception'
Expand Down Expand Up @@ -39,62 +54,93 @@ def load_csv_file(file_path):
print('Error parsing CSV file.')
return None

def pickle_new_data_columns(category: str, column_names: list):
    """Serialize a list of column names to '<category>.pkl'.

    Args:
        category: basename for the output pickle file, written to the
            current working directory.
        column_names: column names to serialize.
    """
    out_file = f'{category}.pkl'
    with open(out_file, "wb") as fp:
        pickle.dump(column_names, fp)
    print(f'New column names saved to: {out_file}')

def load_data_columns():
    """Load questionnaire column-name lists from pickle files in the working directory.

    NOTE(review): this is the pre-refactor implementation shown as removed in
    the diff; the JSON-resource-based load_data_columns defined next in the
    file supersedes it.
    """
    # presumably these .pkl files were produced earlier by
    # pickle_new_data_columns() — TODO confirm they exist before loading
    data_columns = {}
    with open('participant_columns.pkl', "rb") as f:
        data_columns['participant_columns'] = pickle.load(f)

    with open('demographics_columns.pkl', "rb") as f:
        data_columns['demographics_columns'] = pickle.load(f)

    with open('confounders_columns.pkl', "rb") as f:
        data_columns['confounders_columns'] = pickle.load(f)

    with open('phq9_columns.pkl', "rb") as f:
        data_columns['phq9_columns'] = pickle.load(f)

    with open('gad7_columns.pkl', "rb") as f:
        data_columns['gad7_columns'] = pickle.load(f)
    # NOTE(review): no return statement is visible here — the diff view likely
    # shared the trailing `return data_columns` line with the new implementation.
def load_data_columns() -> Dict[str, List[str]]:
    """Load the column-name lists bundled as JSON resources with b2aiprep.

    Returns:
        Dict with key 'columns' (the full export column list) plus one
        '<name>_columns' entry per questionnaire/instrument, each mapping
        to the list of column names parsed from resources/<name>.json.
    """
    b2ai_resources = files("b2aiprep").joinpath("resources")

    data_columns = {
        'columns': json.loads(b2ai_resources.joinpath("columns.json").read_text()),
    }
    # Each instrument's column list lives in its own JSON resource file;
    # loading them in a loop avoids the previous copy-pasted read_text calls.
    for name in ('participant', 'demographics', 'confounders', 'phq9', 'gad7'):
        data_columns[f'{name}_columns'] = json.loads(
            b2ai_resources.joinpath(f"{name}.json").read_text()
        )

    return data_columns

def get_columns_of_repeat_instrument(repeat_instrument: RepeatInstrument) -> List[str]:
    """Return the export column names belonging to one repeat instrument.

    Args:
        repeat_instrument: the instrument whose columns are requested.

    Returns:
        List of column names, always beginning with 'record_id'.

    Raises:
        NotImplementedError: if the instrument has no column mapping yet.
    """
    data_columns = load_data_columns()

    # Instruments whose columns are enumerated in packaged JSON resources.
    instrument_to_columns_key = {
        RepeatInstrument.PARTICIPANT: 'participant_columns',
        RepeatInstrument.GENERIC_DEMOGRAPHICS: 'demographics_columns',
        RepeatInstrument.GENERIC_CONFOUNDERS: 'confounders_columns',
        RepeatInstrument.GENERIC_PHQ9_DEPRESSION: 'phq9_columns',
        RepeatInstrument.GENERIC_GAD7_ANXIETY: 'gad7_columns',
    }
    # Instruments whose columns are identified by a shared name prefix
    # within the full column list.
    instrument_to_prefix = {
        RepeatInstrument.SESSION: 'session_',
        RepeatInstrument.ACOUSTIC_TASK: 'acoustic_task_',
        RepeatInstrument.RECORDING: 'recording_',
    }

    if repeat_instrument in instrument_to_columns_key:
        columns = data_columns[instrument_to_columns_key[repeat_instrument]]
    elif repeat_instrument in instrument_to_prefix:
        prefix = instrument_to_prefix[repeat_instrument]
        columns = [c for c in data_columns['columns'] if c.startswith(prefix)]
    else:
        # add rest of columns for other repeat instruments
        raise NotImplementedError('Repeat Instrument not implemented.')

    # every per-instrument frame must carry the subject identifier
    if 'record_id' not in columns:
        columns.insert(0, 'record_id')
    return columns

def get_df_of_repeat_instrument(df: DataFrame, repeat_instrument: RepeatInstrument) -> pd.DataFrame:
    """Subset a RedCap export to the rows and columns of one repeat instrument.

    Args:
        df: full RedCap export with a 'redcap_repeat_instrument' column.
        repeat_instrument: instrument whose rows/columns are requested.

    Returns:
        A copy of the matching rows restricted to that instrument's columns.
    """
    columns = get_columns_of_repeat_instrument(repeat_instrument)
    # Participant and demographics rows carry a null repeat-instrument label
    # in the export, so they cannot be matched by value.
    if repeat_instrument in (RepeatInstrument.GENERIC_DEMOGRAPHICS, RepeatInstrument.PARTICIPANT):
        idx = df['redcap_repeat_instrument'].isnull()
    else:
        idx = df['redcap_repeat_instrument'] == repeat_instrument.value
    return df.loc[idx, columns].copy()

def get_recordings_for_acoustic_task(df: pd.DataFrame, acoustic_task: str) -> pd.DataFrame:
    """Return the recording rows that belong to a single named acoustic task.

    Args:
        df: full RedCap export DataFrame.
        acoustic_task: one of the task names listed in _AUDIO_TASKS.

    Returns:
        Recording rows inner-joined to the matching acoustic-task rows.

    Raises:
        ValueError: if acoustic_task is not a recognized task name.
    """
    if acoustic_task not in _AUDIO_TASKS:
        raise ValueError(f'Unrecognized {acoustic_task}. Options: {_AUDIO_TASKS}')

    tasks_df = get_df_of_repeat_instrument(df, RepeatInstrument.ACOUSTIC_TASK)
    recordings_df = get_df_of_repeat_instrument(df, RepeatInstrument.RECORDING)

    matching_task_ids = tasks_df.loc[
        tasks_df['acoustic_task_name'] == acoustic_task, ['acoustic_task_id']
    ]
    return recordings_df.merge(
        matching_task_ids,
        left_on='recording_acoustic_task_id',
        right_on='acoustic_task_id',
        how='inner',
    )

def load_features_for_recordings(df: pd.DataFrame, data_path: Path, feature: Optional[str] = None) -> Dict[str, torch.Tensor]:
    """Load the saved feature file for every recording referenced in df.

    Args:
        df: DataFrame with a 'recording_id' column.
        data_path: directory containing '<recording_id>_features.pt' files.
        feature: optional single feature to extract from each loaded file;
            must be one of the recognized feature names below.

    Returns:
        Mapping of recording_id to the loaded features (or to just the
        requested feature when `feature` is given).

    Raises:
        ValueError: if `feature` is not a recognized option.
    """
    feature_options = (
        'specgram', 'melfilterbank', 'mfcc', 'opensmile',
        'sample_rate', 'checksum', 'transcription'
    )
    # validate up front so we fail before loading any files
    if feature is not None and feature not in feature_options:
        raise ValueError(f'Unrecognized feature {feature}. Options: {feature_options}')

    output = {}
    for recording_id in df['recording_id'].unique():
        features = torch.load(data_path / f"{recording_id}_features.pt")
        # subselect the single requested feature when one was asked for
        output[recording_id] = features if feature is None else features[feature]
    return output

0 comments on commit 61d5ef7

Please sign in to comment.