Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functions which make it easier to load/manage the datasets #40

Merged
merged 1 commit into from
Apr 18, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 95 additions & 49 deletions src/b2aiprep/demographics.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from pandas import DataFrame
from enum import Enum
import json
import pickle
from importlib.resources import files

import torch

# Canonical names of the acoustic tasks that have audio recordings in the
# dataset. Used to validate user-supplied task names (see
# get_recordings_for_acoustic_task); kept sorted alphabetically.
_AUDIO_TASKS = (
    'Animal fluency', 'Audio Check', 'Breath Sounds',
    'Cape V sentences', 'Caterpillar Passage', 'Cinderella Story',
    'Diadochokinesis', 'Free Speech', 'Glides', 'Loudness',
    'Maximum phonation time', 'Picture description',
    'Productive Vocabulary', 'Prolonged vowel', 'Rainbow Passage',
    'Random Item Generation', 'Respiration and cough', 'Story recall',
    'Voluntary Cough', 'Word-color Stroop',
)

class RepeatInstrument(Enum):
SESSION = 'Session'
ACOUSTIC_TASK = 'Acoustic Task'
RECORDING = 'Recording'
PARTICIPANT = 'Participant'
GENERIC_DEMOGRAPHICS = 'Q Generic Demographics'
GENERIC_CONFOUNDERS = 'Q Generic Confounders'
GENERIC_VOICE_PERCEPTION = 'Q Generic Voice Perception'
Expand Down Expand Up @@ -39,62 +54,93 @@ def load_csv_file(file_path):
print('Error parsing CSV file.')
return None

def pickle_new_data_columns(category: str, column_names: list):
    """Persist a list of column names as ``<category>.pkl`` in the CWD.

    Args:
        category: Basename for the output pickle file.
        column_names: Column names to serialize.
    """
    out_file = f'{category}.pkl'
    Path(out_file).write_bytes(pickle.dumps(column_names))
    print(f'New column names saved to: {out_file}')

def load_data_columns() -> Dict[str, List[str]]:
    """Load the column-name lists bundled as JSON resources with b2aiprep.

    Each ``resources/<name>.json`` file holds a JSON array of REDCap column
    names for one instrument group.

    Returns:
        Dict mapping ``'columns'`` (the full export column list) and each
        ``'<group>_columns'`` key to its list of column names.

    NOTE(review): requires the b2aiprep package (and its resource files) to
    be importable — not usable from a bare checkout without installation.
    """
    b2ai_resources = files("b2aiprep").joinpath("resources")

    def _load(name: str) -> List[str]:
        # Every resource file is a plain JSON array of strings.
        return json.loads(b2ai_resources.joinpath(f"{name}.json").read_text())

    # Key order mirrors the original implementation.
    return {
        'columns': _load("columns"),
        'participant_columns': _load("participant"),
        'demographics_columns': _load("demographics"),
        'confounders_columns': _load("confounders"),
        'phq9_columns': _load("phq9"),
        'gad7_columns': _load("gad7"),
    }

def get_columns_of_repeat_instrument(repeat_instrument: RepeatInstrument) -> List[str]:
    """Return the REDCap column names belonging to one repeat instrument.

    Questionnaire-style instruments have an explicit column list in the
    packaged resources; session/task/recording instruments are identified
    by a shared column-name prefix. ``'record_id'`` is always included
    (first) so the result can subset the merged export DataFrame.

    Args:
        repeat_instrument: The instrument whose columns are requested.

    Returns:
        List of column names, starting with ``'record_id'``.

    Raises:
        NotImplementedError: if no column mapping exists for the instrument.
    """
    data_columns = load_data_columns()

    # Instruments whose columns are enumerated in a bundled resource file.
    instrument_to_columns_key = {
        RepeatInstrument.PARTICIPANT: 'participant_columns',
        RepeatInstrument.GENERIC_DEMOGRAPHICS: 'demographics_columns',
        RepeatInstrument.GENERIC_CONFOUNDERS: 'confounders_columns',
        RepeatInstrument.GENERIC_PHQ9_DEPRESSION: 'phq9_columns',
        RepeatInstrument.GENERIC_GAD7_ANXIETY: 'gad7_columns',
    }
    # Instruments whose columns share a common name prefix in the export.
    instrument_to_prefix = {
        RepeatInstrument.SESSION: 'session_',
        RepeatInstrument.ACOUSTIC_TASK: 'acoustic_task_',
        RepeatInstrument.RECORDING: 'recording_',
    }

    if repeat_instrument in instrument_to_columns_key:
        columns = data_columns[instrument_to_columns_key[repeat_instrument]]
    elif repeat_instrument in instrument_to_prefix:
        prefix = instrument_to_prefix[repeat_instrument]
        columns = [c for c in data_columns['columns'] if c.startswith(prefix)]
    else:
        # add rest of columns for other repeat instruments
        raise NotImplementedError('Repeat Instrument not implemented.')

    if 'record_id' not in columns:
        columns.insert(0, 'record_id')
    return columns

def get_df_of_repeat_instrument(df: DataFrame, repeat_instrument: RepeatInstrument) -> pd.DataFrame:
    """Subset *df* to the rows and columns of a single repeat instrument.

    Participant-level instruments (PARTICIPANT, GENERIC_DEMOGRAPHICS) are
    stored on rows where ``redcap_repeat_instrument`` is null; all other
    instruments are matched against the instrument's display name.

    Args:
        df: The full merged REDCap export.
        repeat_instrument: Instrument to extract.

    Returns:
        A copy of the matching rows restricted to the instrument's columns.
    """
    columns = get_columns_of_repeat_instrument(repeat_instrument)
    if repeat_instrument in (RepeatInstrument.GENERIC_DEMOGRAPHICS, RepeatInstrument.PARTICIPANT):
        idx = df['redcap_repeat_instrument'].isnull()
    else:
        idx = df['redcap_repeat_instrument'] == repeat_instrument.value
    # .copy() so downstream mutation of the subset cannot warn/alias into df.
    return df.loc[idx, columns].copy()

def get_recordings_for_acoustic_task(df: pd.DataFrame, acoustic_task: str) -> pd.DataFrame:
    """Return the recording rows associated with one acoustic task.

    Args:
        df: The full merged REDCap export.
        acoustic_task: One of the task names in ``_AUDIO_TASKS``.

    Returns:
        Recording rows inner-joined to the task rows for *acoustic_task*.

    Raises:
        ValueError: if *acoustic_task* is not a known task name.
    """
    if acoustic_task not in _AUDIO_TASKS:
        # Fixed message: previously read "Unrecognized <task>." without
        # saying what kind of value was unrecognized.
        raise ValueError(f'Unrecognized acoustic task: {acoustic_task}. Options: {_AUDIO_TASKS}')

    acoustic_tasks_df = get_df_of_repeat_instrument(df, RepeatInstrument.ACOUSTIC_TASK)
    recordings_df = get_df_of_repeat_instrument(df, RepeatInstrument.RECORDING)

    idx = acoustic_tasks_df['acoustic_task_name'] == acoustic_task

    # Inner join keeps only recordings whose task id matches the chosen task.
    dff = recordings_df.merge(
        acoustic_tasks_df.loc[idx, ['acoustic_task_id']],
        left_on='recording_acoustic_task_id',
        right_on='acoustic_task_id',
        how='inner'
    )
    return dff

def load_features_for_recordings(df: pd.DataFrame, data_path: Path, feature: Optional[str] = None) -> Dict[str, torch.Tensor]:
    """Load saved feature tensors for every recording referenced in *df*.

    Expects one ``<recording_id>_features.pt`` file per unique recording id
    under *data_path*. When *feature* is given, only that entry of each
    saved feature dict is returned.

    Args:
        df: DataFrame with a ``recording_id`` column.
        data_path: Directory containing the ``*_features.pt`` files.
        feature: Optional single feature to extract from each file.

    Raises:
        ValueError: if *feature* is not one of the known feature names.
    """
    feature_options = (
        'specgram', 'melfilterbank', 'mfcc', 'opensmile',
        'sample_rate', 'checksum', 'transcription'
    )
    if feature is not None and feature not in feature_options:
        raise ValueError(f'Unrecognized feature {feature}. Options: {feature_options}')

    output = {}
    for recording_id in df['recording_id'].unique():
        loaded = torch.load(data_path / f"{recording_id}_features.pt")
        # Subselect the requested feature, otherwise keep the whole dict.
        output[recording_id] = loaded if feature is None else loaded[feature]
    return output
Loading