From 6bdba57fee014baf28c088d6989ceb0b8d48b248 Mon Sep 17 00:00:00 2001 From: Chaitanya Ahuja Date: Tue, 25 Aug 2020 03:28:01 -0400 Subject: [PATCH] added code and readme --- .gitignore | 5 + README.md | 124 ++- argsUtils.py | 44 + data/__init__.py | 9 + data/audio.py | 102 +++ data/common.py | 255 ++++++ data/dataUtils.py | 715 +++++++++++++++++ data/skeleton.py | 92 +++ data/text.py | 84 ++ dataloader_tutorial.ipynb | 1064 +++++++++++++++++++++++++ requirements.txt | 8 + youtube2croppedaudio/youtube2audio.py | 67 ++ zip_dataset.ipynb | 91 +++ 13 files changed, 2649 insertions(+), 11 deletions(-) create mode 100644 .gitignore create mode 100644 argsUtils.py create mode 100644 data/__init__.py create mode 100644 data/audio.py create mode 100644 data/common.py create mode 100644 data/dataUtils.py create mode 100644 data/skeleton.py create mode 100644 data/text.py create mode 100644 dataloader_tutorial.ipynb create mode 100644 requirements.txt create mode 100644 youtube2croppedaudio/youtube2audio.py create mode 100644 zip_dataset.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4ff7a34 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*~ +*# +__pycache__ +*pyc +.ipynb_checkpoints \ No newline at end of file diff --git a/README.md b/README.md index e7f4c55..8253726 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,124 @@ # PATS Dataset +For an overview of the dataset check the [website](http://chahuja.com/pats) +
+ +
-% Image
-% link to webpage
+# Structure of the dataset
+
+```sh
+pats/data
+  - cmu_intervals_df.csv
+  - missing_intervals.h5
+  - processed
+    - oliver # speakers
+      - XXXX.h5
+      - YYYY.h5
+      ...
+    - jon
+    ...
+    ...
+    - bee
+    - noah
+  - raw
+    - oliver
+    - oliver_cropped
+```
-## Download
+
+The dataset consists of:
+
+- `cmu_intervals_df.csv`: list of all intervals and the relevant meta information (similar to [Ginosar et al. 2019](https://github.com/amirbar/speech2gesture/blob/master/data/dataset.md))
+- `missing_intervals.h5`: list of all intervals that have an incomplete set of features. For the sake of uniformity they are excluded from the benchmark tests.
+- `processed`: h5 files containing processed features for pose, audio and transcripts for all speakers
+- `raw`: mp3 audio files corresponding to each interval, which are useful during rendering.
+
+## Processed Features
+- pose
+  - data: XY coordinates of the upper body pose relative to the neck joint. Joint order and parents can be found [here](https://github.com/chahuja/pats/data/skeleton.py#L40)
+  - normalize: same as data, but the size of the body is normalized across speakers. In other words, each speaker is scaled to have the same shoulder length. This is especially useful in style transfer experiments where we would like the style of gestures to be independent of the size of the speaker.
+  - confidence: confidence scores provided by OpenPose, with 1 being most confident and 0 being least confident.
+- audio
+  - log_mel_400: Log mel spectrograms extracted with the function [here](https://github.com/chahuja/pats/data/audio.py#L38)
+  - log_mel_512: Log mel spectrograms extracted with the function [here](https://github.com/chahuja/pats/data/audio.py#L32)
+  - silence: Using [VAD](https://github.com/chahuja/pats/data/audio.py#L65) we estimate which segments contain the speaker's voice and which contain only noise.
+- text
+  - bert: fixed pre-trained BERT embeddings of size 768
+  - tokens: tokens extracted using the BertTokenizer of [HuggingFace](https://huggingface.co)
+  - w2v: Word2Vec features of size 300
+  - meta: Pandas DataFrame with words, start_frame and end_frame
+
+## Raw Features
+We provide links to the original YouTube videos to help download the relevant audio files. Rendering the generated animations with audio requires the raw audio, which is also useful for user studies.
+
+# Dataset Download
+To download **processed** features of the dataset visit [here](http://chahuja.com/pats/download.html).
+
+To download **raw** features of the dataset run,
+```sh
+python youtube2croppedaudio/youtube2audio.py \
+-base_path pats/data/ \ # Path to dataset folder
+-speaker bee \ # Speaker name (optional). Downloads all speakers if not specified
+-interval_path cmu_intervals_df.csv
+```
+As raw audio files are downloaded from video streaming websites such as YouTube, some of them may not be available at the time of download. For the purposes of consistent benchmarking, the **processed** features should be used.
+
+# Data Loader
+As a part of this dataset we provide a DataLoader in [PyTorch](https://pytorch.org) to jumpstart your research. This DataLoader samples batches of aligned processed features of Pose, Audio and Transcripts for one or many speakers in a dictionary format. We describe the various [arguments](#arguments-of-class-data) of the class [`Data`](https://github.com/chahuja/pats/data/dataUtils.py#L51), which generates the DataLoaders.
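+
+A minimal usage sketch, mirroring the [tutorial notebook](dataloader_tutorial.ipynb); the path and speaker below are placeholders, so adjust them to wherever you extracted the dataset:
+
+```python
+from data import Data
+
+# assumes the processed features live under pats/data (see Dataset Download above)
+data = Data(path2data='pats/data',
+            speaker=['bee'],
+            modalities=['pose/data', 'audio/log_mel_512'],
+            fs_new=[15, 15],
+            batch_size=4,
+            window_hop=5)
+
+for batch in data.train:  # data.dev and data.test work the same way
+    print(batch['pose/data'].shape)  # batch x time x feature
+    break
+```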
+
+DataLoader Examples: [Ipython Notebook](dataloader_tutorial.ipynb)

## Requirements
+* pycasper
+
+```sh
+mkdir ../pycasper
+git clone https://github.com/chahuja/pycasper ../pycasper
+ln -s ../pycasper/pycasper .
+```
+
+* Create an [anaconda](https://www.anaconda.com/) environment named `pats` from `env.yaml`
+
+```sh
+conda env create -f env.yaml
+```
+
+## Arguments of class `Data`
+There are way too many arguments (#research) for `Data`. In most cases you will not need most of them and can leave them at their default values. We divide the arguments into **Essential**, **DataLoader Arguments**, **Text Arguments**, **Sampler Arguments** and **Others**.
+### Essential
+- `path2data (str)`: path to processed data e.g. "pats/data/processed"
+- `speaker (str or list)`: one or more speaker names. Find the list of speakers [here](https://github.com/chahuja/pats/data/common.py#L152).
+- `modalities (list)`: list of processed features to be loaded. Default- ['pose/data', 'audio/log_mel_512']. Find the list of all processed features [here](https://github.com/chahuja/pats#processed-features).
+- `fs_new (list)`: list of frame rates for each modality in modalities. Default- [15, 15]. Length of fs_new == Length of modalities.
+- `time (float)`: length of the window for each sample in seconds. Default- 4.3. The default value is recommended; it results in 64 frames of audio and pose when fs_new is 15.
+- `split (tuple or None)`: train, dev and test split as fractions. Default- None. Using None uses the pre-defined splits in cmu_intervals_df.csv. As an example of a tuple, (0.7, 0.1) represents the ratios of train and dev, hence the test split is 0.2.
+- `window_hop (int)`: number of frames a window hops in an interval to construct samples. Default- 0. Using 0 implies non-overlapping windows. For `window_hop` > 0, samples are created with the following formula `[sample[i:i+int(time*fs_new[0])] for i in range(0, len(sample), window_hop)]`
+
+### DataLoader Arguments
+- `batch_size (int)`: Size of each batch. Default- 100.
+- `shuffle (bool)`: Shuffle samples after each epoch. Default- True.
+- `num_workers (int)`: Number of workers to load the data. Default- 0.
+
+### Text Arguments
+- `filler (int)`: Get "text/filler" as a feature in the sampled batch. This feature is a tensor of shape 'batch x time', where each element indicates whether the spoken word was a filler word or not. The list of filler words is the same as nltk's stopword list for English. Default- 0. Use 1 to get the "text/filler" feature.
+- `repeat_text (int)`: If 1, the feature of each word token is repeated to match the length of its duration. For example, if a word is spoken for 10 frames of the pose and/or audio sequence, it is stacked 10 times. Hence the time dimensions of pose, audio and transcripts are the same. If 0, word tokens are not repeated. As each sample could have a different number of words, the shorter sequences are padded with zeros. Extra features "text/token_duration" and "text/token_count" are also part of the sample, which represent the duration of each token in frames and the number of tokens in each sequence respectively.
+
+### Sampler Arguments (Mutually exclusive unless specified)
+- `style_iters (int)`: If value > 0, [`AlternateClassSampler`](https://github.com/chahuja/pats/data/dataUtils.py#L618) is used as the sampler argument while building the train dataloader. This sampler is useful if two or more speakers are trained together. This sampler ensures that each mini-batch has an equal number of samples from each speaker.
Value refers to the number of iterations in each epoch. Default- 0.
+- `sample_all_styles (int)`: Can only be used with the argument `style_iters`. If value > 0, randomly selects `value` samples from each speaker to load. This is especially useful for performing inference in style transfer experiments, where the number of permutations of style transfer increases exponentially with the number of speakers. This argument puts an upper bound on the number of samples for each speaker, hence limiting the time to generate gestures for a limited number of inputs. Default- 0.
+- `num_training_sample (int or None)`: If value > 0, chooses a random subset of unique samples with cardinality of the set == value as the new training set. If value is None, all samples are considered for training. Default- None.
+- `quantile_sample (float, int, list or None)`: Subsamples the training set based on the average pose velocity of each sample: a fraction below 1 keeps samples above that velocity quantile, a list of two fractions keeps the two tails, and an integer greater than 1 rebalances the training set into that many velocity bins (see `get_quantile_sample` in [data/dataUtils.py](https://github.com/chahuja/pats/data/dataUtils.py)). Default- None.
+- `quantile_num_training_sample (int or None)`: Number of training iterations per epoch (each drawing `batch_size` samples) when `quantile_sample` is used in the rebalancing mode. Default- None.
+- `weighted (int)`: If value > 0, `torch.utils.data.WeightedRandomSampler` is used as the sampler argument while building the train dataloader. The weights are set to 1 for each sample. While this is equivalent to a uniform sampler, it provides the possibility of changing the weights for each sample while training. Default- 0.

-## Data Loader
-pytorch dataloader

+### Others
+- `load_data (bool)`: If True, loads the hdf5 files in RAM. If False, files are not loaded and the dataloaders will not work as intended. Useful for quick debugging.
+- `num_training_iters (int or None)`: If value > 0, changes the training sampler to sample with replacement, and value is the number of iterations per epoch. If value is None, the sampler samples without replacement and the number of iterations is inferred from the size of the dataset. Default- None.

## Render
-% render script
-% format of h5
-% download audios to render
+Todo.

-## FIle Contents
-% intervals.csv
-% .h5
+# Creating your own dataloader
+In case you prefer to create your own dataloaders, we recommend checking out the [structure of the h5 files](#processed-features) and the last sections of the [Ipython Notebook](dataloader_tutorial.ipynb). We have a class [`HDF5`](data/common.py#L16) with many static methods which might be useful for loading HDF5 files.
+
+# Issues
+All research is tagged as work in progress. If you find any issues with this code, feel free to raise issues or pull requests (even better) and I will get to it as soon as humanly possible.
\ No newline at end of file
diff --git a/argsUtils.py b/argsUtils.py
new file mode 100644
index 0000000..23951cd
--- /dev/null
+++ b/argsUtils.py
@@ -0,0 +1,44 @@
+import argparse
+import itertools
+from ast import literal_eval
+
+def get_args_perm():
+  parser = argparse.ArgumentParser()
+
+  ## Dataset Parameters
+  parser.add_argument('-path2data', nargs='+', type=str, default=['pats/data/'],
+                      help='path to data')
+  parser.add_argument('-speaker', nargs='+', type=literal_eval, default=['bee'],
+                      help='choose speaker or `all` to use all the speakers available')
+  parser.add_argument('-modalities', nargs='+', type=literal_eval, default=[['pose/data', 'audio/log_mel_512']],
+                      help='choose a set of modalities to be loaded by the dataloader')
+  parser.add_argument('-split', nargs='+', type=literal_eval, default=[None],
+                      help='(train,dev) split of data. default=None')
+  parser.add_argument('-batch_size', nargs='+', type=int, default=[32],
+                      help='minibatch size. 
Use batch_size=1 when using time=0') + parser.add_argument('-shuffle', nargs='+', type=int, default=[1], + help='shuffle the data after each epoch. default=True') + parser.add_argument('-time', nargs='+', type=int, default=[4.3], + help='time (in seconds) for each sample') + parser.add_argument('-fs_new', nargs='+', type=literal_eval, default=[[15, 15]], + help='subsample to the new frequency') + + + args, unknown = parser.parse_known_args() + print(args) + print(unknown) + + ## Create a permutation of all the values in argparse + args_dict = args.__dict__ + args_keys = sorted(args_dict) + args_perm = [dict(zip(args_keys, prod)) for prod in itertools.product(*(args_dict[names] for names in args_keys))] + + return args, args_perm + +def argparseNloop(loop): + args, args_perm = get_args_perm() + + for i, perm in enumerate(args_perm): + args.__dict__.update(perm) + print(args) + loop(args, i) diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000..6dc6e6d --- /dev/null +++ b/data/__init__.py @@ -0,0 +1,9 @@ +import sys +sys.path.insert(0, '..') + +from .dataUtils import * +from .transform import * +from .common import * +from .skeleton import * +from .audio import * +from .text import * diff --git a/data/audio.py b/data/audio.py new file mode 100644 index 0000000..d0ccef2 --- /dev/null +++ b/data/audio.py @@ -0,0 +1,102 @@ +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from pathlib import Path +import numpy as np +import pandas as pd +import pdb +from tqdm import tqdm +import warnings + +from common import Modality, MissingData + +class Audio(Modality): + def __init__(self, path2data='../dataset/groot/data', + path2outdata='../dataset/groot/data', + speaker='all', + preprocess_methods=['log_mel_512']): + super(Audio, self).__init__(path2data=path2data) + self.path2data = path2data + self.df = pd.read_csv(Path(self.path2data)/'cmu_intervals_df.csv', dtype=object) + self.df.loc[:, 'delta_time'] = self.df['delta_time'].apply(float) + self.df.loc[:, 'interval_id'] = self.df['interval_id'].apply(str) + + self.path2outdata = path2outdata + self.speaker = speaker + self.preprocess_methods = preprocess_methods + + self.missing = MissingData(self.path2data) + + def log_mel_512(self, y, sr, eps=1e-10): + spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512) + mask = (spec == 0).astype(np.float) + spec = mask * eps + (1-mask) * spec + return np.log(spec).transpose(1,0) + + def log_mel_400(self, y, sr, eps=1e-6): + y = librosa.core.resample(y, orig_sr=sr, target_sr=16000) ## resampling to 16k Hz + #pdb.set_trace() + sr = 16000 + n_fft = 512 + hop_length = 160 + win_length = 400 + S = librosa.core.stft(y=y.reshape((-1)), + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + center=False) + + S = np.abs(S) + spec = librosa.feature.melspectrogram(S=S, + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + power=1, + n_mels=64, + fmin=125.0, + fmax=7500.0, + norm=None) + mask = (spec == 0).astype(np.float) + spec = mask * eps + (1-mask) * spec + return np.log(spec).transpose(1,0) + + def silence(self, y, sr, eps=1e-6): + vad = webrtcvad.Vad(3) + y = librosa.core.resample(y, orig_sr=sr, target_sr=16000) ## resampling to 16k Hz + #pdb.set_trace() + fs_old = 16000 + fs_new = 15 + ranges = np.arange(0, y.shape[0], fs_old/fs_new) + starts = ranges[0:-1] + ends = ranges[1:] + + is_speeches = [] + for start, end in zip(starts, ends): + 
Ranges = np.arange(start, end, fs_old/100) + is_speech = [] + for s, e, in zip(Ranges[:-1], Ranges[1:]): + try: + is_speech.append(vad.is_speech(y[int(s):int(e)].tobytes(), fs_old)) + except: + pdb.set_trace() + is_speeches.append(int(np.array(is_speech, dtype=np.int).mean() <= 0.5)) + is_speeches.append(0) + return np.array(is_speeches, dtype=np.int) + + @property + def fs_map(self): + return { + 'log_mel_512': int(45.6*1000/512), #int(44.1*1000/512) #112 #round(22.5*1000/512) + 'log_mel_400': int(16.52 *1000/160), + 'silence': 15 + } + + def fs(self, modality): + modality = modality.split('/')[-1] + return self.fs_map[modality] + + @property + def h5_key(self): + return 'audio' diff --git a/data/common.py b/data/common.py new file mode 100644 index 0000000..451fb7e --- /dev/null +++ b/data/common.py @@ -0,0 +1,255 @@ +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import h5py +from pathlib import Path +import warnings +import pandas as pd +import numpy as np +import pdb + +from argsUtils import * +from tqdm import tqdm + +class HDF5(): + def __init__(self): + pass + + ''' + Create a file if it does not exist, else appends data to key + ''' + @staticmethod + def append(filename, key, data): + h5 = HDF5.h5_open(filename, 'a') + try: + HDF5.update_dataset(h5, key, data) + except: + #pdb.set_trace() + warnings.warn('could not update dataset {} with filename {}'.format(key, filename)) + HDF5.h5_close(h5) + + @staticmethod + def load(filename, key): + h5 = HDF5.h5_open(filename, 'r') + data = h5[key] + return data, h5 + + @staticmethod + def isDatasetInFile(filename, key): + h5 = HDF5.h5_open(filename, 'r') + if key in h5: + h5.close() + return True + else: + h5.close() + return False + + @staticmethod + def h5_open(filename, mode): + ## create the parent directory if does not exist + os.makedirs(Path(filename).parent, exist_ok=True) + return h5py.File(filename, mode) + + @staticmethod + def h5_close(h5): + h5.close() + + @staticmethod + def add_dataset(h5, key, data, exist_ok=False): + if key in h5: + if exist_ok: + warnings.warn('dataset {} already exists. Updating data...'.format(key)) + del h5[key] + h5.create_dataset(key, data=data) + else: + warnings.warn('dataset {} already exists. Skipping...'.format(key)) + else: + h5.create_dataset(key, data=data) + + @staticmethod + def update_dataset(h5, key, data): + HDF5.add_dataset(h5, key, data, exist_ok=True) + + ''' + Delete a dataset in an hdf file + + Arguments + h5: file pointer to the hdf file + key: key to be deleted + + Return + ``True`` if key found and deleted + ``False`` if key not found + ''' + @staticmethod + def del_dataset(h5, key): + if key in h5: + del h5[key] + return True + else: + warnings.warn('Key not found. 
Skipping...') + return False + + @staticmethod + def add_key(base_key, sub_keys=[]): + if isinstance(sub_keys, str): + sub_keys = [sub_keys] + + sub_keys = '/'.join(sub_keys) + new_key = (Path(base_key)/Path(sub_keys)).as_posix() + return new_key + + +class Modality(HDF5): + def __init__(self, path2data='../dataset/groot/data', + path2outdata='../dataset/groot/data', + speaker='all', + preprocess_methods=['log_mel']): + super(Modality, self).__init__() + self.path2data = path2data + self.df = pd.read_csv(Path(self.path2data)/'cmu_intervals_df.csv', dtype=object) + self.df.loc[:, 'delta_time'] = self.df['delta_time'].apply(float) + self.df.loc[:, 'interval_id'] = self.df['interval_id'].apply(str) + + self.path2outdata = path2outdata + self.speaker = speaker + self.preprocess_methods = preprocess_methods + + def preprocess(self): + raise NotImplementedError + + def del_keys(self, h5_key): + if self.speaker != 'all': + speakers = [self.speaker] + else: + speakers = self.speakers + + for speaker in tqdm(speakers, desc='speakers', leave=False): + tqdm.write('Speaker: {}'.format(speaker)) + df_speaker = self.get_df_subset(speaker) + interval_ids = df_speaker['interval_id'].unique() + for preprocess_method in self.preprocess_methods: + for interval_id in tqdm(interval_ids, desc='intervals'): + filename = Path(self.path2outdata)/'processed'/speaker/'{}.h5'.format(interval_id) + key = self.add_key(h5_key[0], [preprocess_method]) + + ## delete dataset + h5 = self.h5_open(filename.as_posix(), 'a') + key_flag = self.del_dataset(h5, key) + if not key_flag: + break ## ignore files of a speaker if the first file does not have ``key`` + self.h5_close(h5) + + def get_df_subset(self, column, value): + if isinstance(value, list): + return self.df[self.df[column].isin(value)] + else: + return self.df[self.df[column] == value] + + @property + def speakers(self): + return [ + 'oliver', #TV sitting high_freq + 'jon', #TV sitting + 'conan', #TV standing high_freq + 'rock', #lec sitting + 'chemistry', #lec sitting + 'ellen', #TV standing + 'almaram', #eval sitting + 'angelica', #eval sitting + 'seth', #TV sitting low frequency + 'shelly', #TV sitting + 'colbert', #TV standing high_freq + 'corden', #TV standing + 'fallon', #TV standing + 'huckabee', #TV standing + 'maher', #TV standing + 'lec_cosmic', #lec sitting + 'lec_evol', #lec sitting + 'lec_hist', #lec sitting + 'lec_law', #lec sitting + 'minhaj', #TV standing + 'ytch_charisma', #yt sitting + 'ytch_dating', #yt sitting + 'ytch_prof', #yt sitting + 'bee', #TV standing + 'noah' #TV sitting + ] + + @property + def inv_speakers(self): + dc = {} + for i, speaker in enumerate(self.speakers): + dc[speaker] = i + return dc + + def speaker_id(self, speaker): + return self.inv_speakers[speaker] + +class MissingData(HDF5): + def __init__(self, path2data): + super(MissingData, self).__init__() + self.path2file = Path(path2data)/'missing_intervals.h5' + if not os.path.exists(self.path2file): + h5 = HDF5.h5_open(self.path2file, 'a') + HDF5.h5_close(h5) + self.key = 'intervals' + self.missing_data_list = [] + + def append_interval(self, data): + self.missing_data_list.append(data) + warnings.warn('interval_id: {} not found.'.format(data)) + + def save_intervals(self, missing_data_list): + ''' + Append `data` to the missing_intervals.h5 file + ''' + dt = h5py.special_dtype(vlen=str) + if HDF5.isDatasetInFile(self.path2file, self.key): + intervals, h5 = HDF5.load(self.path2file, self.key) + intervals = set(intervals[()]) + h5.close() + intervals.update(set(missing_data_list) 
- {None}) + intervals = np.array(list(intervals), dtype=dt) + else: + intervals = np.array(list(set(missing_data_list) - {None}), dtype=dt) + + HDF5.append(self.path2file, self.key, intervals) + + def save(self, missing_data_list): + ''' + Add new missing `data` from the current set to the missing_intervals.h5 file + ''' + dt = h5py.special_dtype(vlen=str) + intervals = np.array(list(set(missing_data_list) - {None}), dtype=dt) + HDF5.append(self.path2file, self.key, intervals) + + def load_intervals(self): + if HDF5.isDatasetInFile(self.path2file, self.key): + intervals, h5 = HDF5.load(self.path2file, self.key) + intervals = set(intervals[()]) + h5.close() + else: + intervals = set() + return intervals + +''' +Delete keys from the processed dataset stored in hdf files + +path2data: Irrelevant +path2outdata: path to processed data +speaker: 'all' or a particular speaker +preprocess_methods: list of preprocess_methods to delete +modalities: modality to delete. + deleting keys of different modalities must be deleted separately + eg: 'audio', 'pose' etc. +''' +def delete_keys(args, exp_num): + modality = Modality(args.path2data, args.path2outdata, + args.speaker, args.preprocess_methods) + modality.del_keys(args.modalities) + +if __name__ == '__main__': + argparseNloop(delete_keys) diff --git a/data/dataUtils.py b/data/dataUtils.py new file mode 100644 index 0000000..25a7637 --- /dev/null +++ b/data/dataUtils.py @@ -0,0 +1,715 @@ +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from tqdm import tqdm +from pathlib import Path +import time +import pdb +from joblib import Parallel, delayed +import numpy as np +from nltk.corpus import stopwords + +from skeleton import * +from audio import * +from text import * +from common import * +from argsUtils import * +import bisect + +import torch +from torch.utils.data import Dataset, DataLoader, ConcatDataset, Sampler +from torch.utils.data._utils.collate import default_collate +from transformers import BertTokenizer +import logging +logging.getLogger('transformers').setLevel(logging.CRITICAL) + +from functools import partial + +class DummyData(Dataset): + def __init__(self, variable_list=['pose', 'audio'], length=1000, random=False, pause=False): + super(DummyData, self).__init__() + self.variable_list = variable_list + self.len = length + self.pause = pause + + if random: + self.data = {variable:torch.rand(self.len, 30, 50) + 1 for variable in self.variable_list} + else: + self.data = {variable:torch.arange(self.len) + 1 for variable in self.variable_list} + + def __getitem__(self, idx): + if self.pause: + time.sleep(self.pause) + return {variable:self.data[variable][idx].to(torch.double) for variable in self.variable_list} + + def __len__(self): + return self.len + + +class Data(Modality): + r''' + Wrapper for DataLoaders + + Arguments: + path2data (str): path to dataset. + speaker (str): speaker name. + modalities (list of str): list of modalities to wrap in the dataloader. These modalities are basically keys of the hdf5 files which were preprocessed earlier (default: ``['pose/data', 'audio/log_mel']``) + fs_new (list, optional): new frequency of modalities, to which the data is up/downsampled to. (default: ``[15, 15]``). + time (float, optional): time snippet length in seconds. (default: ``4.3``). + split (tuple or None, optional): split fraction of train and dev sets. Must add up to less than 1. 
If ``None``, use ``dataset`` columns in the master dataframe (loaded in self.df) to decide train, dev and test split. (default: ``None``). + batch_size (int, optional): batch size of the dataloader. (default: ``100``). + shuffle (boolean, optional): set to ``True`` to have the data reshuffled at every epoch (default: ``False``). + num_workers (int, optional): set to values >0 to have more workers to load the data. argument for torch.utils.data.DataLoader. (default: ``15``). + + Example:: + from data.dataUtils import Data + data = Data('../dataset/groot/data/', 'oliver', ['pose/data'], [15]) + + for batch in data.train: + break + + print(batch). + + ''' + def __init__(self, path2data, speaker, + modalities = ['pose/data', 'audio/log_mel_512'], + fs_new=[15, 15], time=4.3, + split=None, + batch_size=100, shuffle=True, num_workers=0, + window_hop=0, + load_data=True, + style_iters=0, + num_training_sample=None, + sample_all_styles=0, + repeat_text=1, + quantile_sample=None, + quantile_num_training_sample=None, + weighted=0, + filler=0, + num_training_iters=None): + super().__init__(path2data=path2data) + self.path2data = path2data + self.speaker = speaker + self.modalities = modalities + self.fs_new = fs_new + self.time = time + self.split = split + self.batch_size = batch_size + self.shuffle = shuffle + self.num_workers = num_workers + self.window_hop = window_hop + self.load_data = load_data + self.style_iters = style_iters ## used to call a train sampler + self.num_training_sample = num_training_sample + self.sample_all_styles = sample_all_styles + self.repeat_text = repeat_text + self.quantile_sample = quantile_sample + self.quantile_num_training_sample = quantile_num_training_sample + self.weighted = weighted + self.filler = filler + if self.filler: + self.stopwords = stopwords.words('english') + self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + else: + self.stopwords, self.tokenizer = None, None + self.num_training_iters = num_training_iters + + self.text_in_modalities = False + for modality in self.modalities: + if 'text' in modality: + self.text_in_modalities = True + + self.missing = MissingData(self.path2data) + + if isinstance(self.speaker, str): + self.speaker = [self.speaker] + + ## Load all modalities + self.modality_classes = self._load_modality_classes() + + ## Load the master table + self.df = pd.read_csv((Path(self.path2data)/'cmu_intervals_df.csv').as_posix()) + self.df = self.df.append(pd.read_csv((Path(self.path2data)/'cmu_intervals_df_transforms.csv').as_posix())) ## file with evil twins + self.df.loc[:, 'interval_id'] = self.df['interval_id'].apply(str) + + ## Check for missing_data + #self.missing_data + + if speaker[0] == 'all': +# self.df = self.get_df_subset('speaker', self.speakers) + self.speaker = self.speakers + else: + pass +# self.df = self.get_df_subset('speaker', speaker) + + self.df = self.get_df_subset('speaker', self.speaker) + ## Create Style Dictionary + self.style_dict = {sp:i for i, sp in enumerate(self.speaker)} + + assert len(self.df.values), 'speaker `{}` not found'.format(speaker) + + #if self.load_data: + ## get train-dev-test split + self.datasets = self.tdt_split() + self.dataLoader_kwargs = {'batch_size':batch_size, + 'shuffle':shuffle, + 'num_workers':num_workers, + 'pin_memory':False} + + ## if not repeat_text, do not repeat the word vectors to match the fs + #if True: #not self.repeat_text: + if self.text_in_modalities: + ## always keep text/token_duration at the end to comply with the collate_fn_pad + pad_keys = 
['text/w2v', 'text/bert', 'text/filler', 'text/tokens', 'text/token_duration'] + self.dataLoader_kwargs.update({'collate_fn':partial(collate_fn_pad, pad_key=pad_keys, dim=0)}) + + self.update_dataloaders(time, window_hop) + + def _load_modality_classes(self): + modality_map = {} + for modality in self.modalities: + mod = modality.split('/')[0] + modality_map[modality] = self.mod_map(mod) + + return modality_map + + def mod_map(self, mod): + mod_map_dict = { + 'pose': Skeleton2D, + 'audio': Audio, + 'text': Text + } + return mod_map_dict[mod](path2data=self.path2data, speaker=self.speaker) + + # def get_df_subset(self, column, value): + # if isinstance(value, list): + # return self.df[self.df[column].isin(value)] + # else: + # return self.df[self.df[column] == value] + + def getSpeaker(self, x): + return self.get_df_subset('interval_id', x)['speaker'].values[0] + + def getPath2file(self, x): + return (Path(self.path2data)/'processed'/self.getSpeaker(x)/str(x)).as_posix() + '.h5' + + def getStyle(self, interval_id): + df_subset = self.get_df_subset('interval_id', interval_id) + speaker = df_subset.speaker.iloc[0] + try: + style = self.style_dict[speaker] + except: + raise 'speaker style for {} not found'.format(speaker) + return style + + def get_transforms_missing_intervals(self, missing_intervals): + transforms = [] + for speaker in self.speaker: + if '|' in speaker: + transforms.append(speaker.split('|')[-1]) + + transforms = sorted(list(set(transforms))) + new_missing_intervals = set() + for transform in transforms: + for interval in missing_intervals: + new_missing_intervals.update({'{}|{}'.format(interval, transform)}) + + missing_intervals.update(new_missing_intervals) + return missing_intervals + + def order_intervals(self, intervals): + interval_dict = {i:[] for i in self.style_dict} + for interval in intervals: + interval_dict[self.getSpeaker(interval)].append(interval) + intervals_dict = [(k, interval_dict[k]) for k in interval_dict] + ordered_intervals = [] + for tup in intervals_dict: + ordered_intervals += tup[1] + return intervals_dict, ordered_intervals + + @property + def minidataKwargs(self): + minidataKwargs = {'modalities':self.modalities, + 'fs_new':self.fs_new, + 'time':self.time, + 'modality_classes':self.modality_classes, + 'window_hop':self.window_hop, + 'repeat_text':self.repeat_text, + 'text_in_modalities':self.text_in_modalities, + 'filler':self.filler, + 'stopwords':self.stopwords, + 'tokenizer':self.tokenizer} + return minidataKwargs + + def get_minidata_list(self, intervals): + return [MiniData(self.getPath2file(interval_id), style=self.getStyle(interval_id), **self.minidataKwargs) + for interval_id in tqdm(intervals)] + + def tdt_split(self): + if not self.split: + df_train = self.get_df_subset('dataset', 'train') + df_dev = self.get_df_subset('dataset', 'dev') + df_test = self.get_df_subset('dataset', 'test') + else: + length = self.df.shape[0] + end_train = int(length*self.split[0]) + start_dev = end_train + end_dev = int(start_dev + length*self.split[1]) + start_test = end_dev + + df_train = self.df[:end_train] + df_dev = self.df[start_dev:end_dev] + df_test = self.df[start_test:] + + ## get missing intervals + missing_intervals = self.missing.load_intervals() + missing_intervals = self.get_transforms_missing_intervals(missing_intervals) + + ## get new train/dev/test intervals + get_intervals = lambda x: sorted(list(set(x['interval_id'].unique()) - missing_intervals)) + train_intervals = get_intervals(df_train) + dev_intervals = get_intervals(df_dev) + 
test_intervals = get_intervals(df_test) + + self.train_intervals_all = train_intervals + self.dev_intervals_all = dev_intervals + self.test_intervals_all = test_intervals + + if not self.load_data: ## load a sample of the data just to get the shape + train_intervals = train_intervals[:10] + dev_intervals = dev_intervals[:10] + test_intervals = test_intervals[:10] + + ## update_train_intervals + train_intervals, dev_intervals, test_intervals, train_intervals_dict = self.update_intervals(train_intervals, dev_intervals, test_intervals) + + self.train_intervals = train_intervals + self.dev_intervals = dev_intervals + self.test_intervals = test_intervals + + dataset_train = ConcatDatasetIndex(self.get_minidata_list(train_intervals)) + dataset_dev = ConcatDatasetIndex(self.get_minidata_list(dev_intervals)) + dataset_test = ConcatDatasetIndex(self.get_minidata_list(test_intervals)) + + self.dataset_train = dataset_train + self.train_intervals_dict = train_intervals_dict + self.train_sampler = self.get_train_sampler(dataset_train, train_intervals_dict) + + return {'train': dataset_train, + 'dev': dataset_dev, + 'test': dataset_test} + + def update_dataloaders(self, time, window_hop): + ## update idx_list for all minidata + for key in self.datasets: + for d_ in self.datasets[key].datasets: + d_.update_idx_list(time, window_hop) + + train_dataLoader_kwargs = self.dataLoader_kwargs.copy() + if self.train_sampler: + train_dataLoader_kwargs['shuffle'] = False + + self.train = DataLoader(ConcatDatasetIndex(self.datasets['train'].datasets), sampler=self.train_sampler, **train_dataLoader_kwargs) + self.dev = DataLoader(ConcatDatasetIndex(self.datasets['dev'].datasets), **self.dataLoader_kwargs) + self.test = DataLoader(ConcatDatasetIndex(self.datasets['test'].datasets), **self.dataLoader_kwargs) + + def get_alternate_class_sampler(self, dataset, intervals_dict, num_samples): + class_count = [] + interval_offset = 0 + for tup in intervals_dict: + count = 0 + for i in range(len(tup[1])): + count+=len(dataset.datasets[i+interval_offset]) + class_count.append(count) + interval_offset += len(tup[1]) + + return AlternateClassSampler(class_count, num_samples*self.batch_size) + + def update_intervals(self, train_intervals, dev_intervals, test_intervals): + def subsample_intervals(x): + temp = [] + for x_ in x: + if self.sample_all_styles > 0: + temp.extend(x_[1][:self.sample_all_styles]) + elif self.sample_all_styles == -1: + temp.extend(x_[1]) + return temp + + if self.sample_all_styles != 0: + train_intervals_dict, train_intervals = self.order_intervals(train_intervals) + dev_intervals_dict, dev_intervals = self.order_intervals(dev_intervals) + test_intervals_dict, test_intervals = self.order_intervals(test_intervals) + train_intervals = subsample_intervals(train_intervals_dict) + dev_intervals = subsample_intervals(dev_intervals_dict) + test_intervals = subsample_intervals(test_intervals_dict) + elif self.style_iters > 0: ## using AlternateClassSampler + train_intervals_dict, train_intervals = self.order_intervals(train_intervals) + else: + train_intervals_dict = None + return train_intervals, dev_intervals, test_intervals, train_intervals_dict + + def get_quantile_sample(self, data, q): + pose_modality = None + for key in self.modalities: + if 'pose' in key: + pose_modality = key + break + assert pose_modality is not None, "can't find pose modality" + if isinstance(q, float) or isinstance(q, int): + if q<1: + kind = 'above' + elif q>1: + kind = 'rebalance' + q = int(q) + else: + raise 'q can\'t be 1 or 
negative' + elif isinstance(q, list): + assert np.array([q_<=1 and q_>=0 for q_ in q]).all(), 'quantile_sample is in [0,1]' + assert len(q) == 2 + kind = 'tail' + + ## get distribution of velocities + diff = lambda x, idx: x[1:, idx] - x[:-1, idx] + vel = lambda x, idx: (((diff(x, idx))**2).sum(-1)**0.5).mean() + samples = [] + for batch in tqdm(data, desc='quantile_sample_calc'): + pose = batch[pose_modality] + pose = pose.reshape(pose.shape[0], 2, -1).transpose(0, 2, 1) + samples.append(vel(pose, list(range(1, pose.shape[1])))) + + min_sample, max_sample = min(samples), max(samples) + if kind == 'above': + v0 = np.quantile(np.array(samples, dtype=np.float), q) + print('above {} percentile'.format(v0)) + elif kind == 'tail': + v0 = [np.quantile(np.array(samples, dtype=np.float), q[0]), np.quantile(np.array(samples, dtype=np.float), q[1])] + print('below {} and above {} percentile'.format(*v0)) + elif kind == 'rebalance': + v0 = torch.arange(min_sample, max_sample+1e-5, (max_sample - min_sample)/q) + print('rebalaced data'+(' {}'*len(v0)).format(*v0)) + + def in_subset(v, v0): + if kind == 'above': + return v > v0 + elif kind == 'tail': + return (v > v0[1]) or (v < v0[0]) + elif kind == 'rebalance': + starts, ends = v0[:-1], v0[1:] + interval = ((starts <= v) * (v <= ends)) + if interval.any(): + return interval.int().argmax().item() + else: + raise 'incorrect interval' + + if kind in ['tail', 'above']: + subset_idx = [] + elif kind == 'rebalance': + subset_idx = [[] for _ in range(len(v0) - 1)] + + for i, batch in tqdm(enumerate(data), desc='quantile subset'): + pose = batch[pose_modality] + pose = pose.reshape(pose.shape[0], 2, -1).transpose(0, 2, 1) + v = vel(pose, list(range(1, pose.shape[1]))) + if kind in ['tail', 'above']: + if in_subset(v, v0): + subset_idx.append(i) + elif kind == 'rebalance': + subset_idx[in_subset(v, v0)].append(i) + + return subset_idx, kind + + def get_train_sampler(self, dataset_train, train_intervals_dict): + ## Style iterations with AlternateClassSampler + if self.style_iters > 0 and self.sample_all_styles == 0: + train_sampler = self.get_alternate_class_sampler(dataset_train, train_intervals_dict, self.style_iters) + ## Sampler with lesser number of samples for few-shot learning. 
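+    ## (implemented as a SubsetRandomSampler over a random permutation of the training indices)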
+ elif self.num_training_sample is not None: + subset_idx = torch.randperm(len(dataset_train)) + train_sampler = torch.utils.data.SubsetRandomSampler(subset_idx[:self.num_training_sample]) + elif self.quantile_sample is not None: + subset_idx, kind = self.get_quantile_sample(dataset_train, self.quantile_sample) + if kind in ['above', 'tail']: + train_sampler = torch.utils.data.SubsetRandomSampler(subset_idx) + elif kind in ['rebalance'] and self.quantile_num_training_sample is not None: + subset_idx = [torch.LongTensor(li) for li in subset_idx] + train_sampler = BalanceClassSampler(subset_idx, int(self.quantile_num_training_sample)*self.batch_size) + elif self.weighted: + train_sampler = torch.utils.data.WeightedRandomSampler([1]*len(dataset_train), self.weighted*self.batch_size) + elif self.num_training_iters is not None: + train_sampler = torch.utils.data.RandomSampler(dataset_train, num_samples=self.num_training_iters*self.batch_size, replacement=True) + else: + train_sampler = torch.utils.data.RandomSampler(dataset_train) + #train_sampler = None + + return train_sampler + + def close_hdf5_files(self, files): + for h5 in files: + h5.close() + + @property + def shape(self): + for minidata in self.train.dataset.datasets: + if len(minidata) > 0: + break + shape = {} + for modality, feats_shape in zip(self.modalities, minidata.shapes): + start = minidata.idx_start_list_dict[modality][0] + end = minidata.idx_end_list_dict[modality][0] + interval = minidata.idx_interval_dict[modality] + length = len(range(start, end, interval)) + shape.update({modality:[length, feats_shape[-1]]}) + return shape + +class MiniData(Dataset, HDF5): + def __init__(self, path2h5, modalities, fs_new, time, modality_classes, window_hop, style=0, repeat_text=1, text_in_modalities=False, filler=0, **kwargs): + super(MiniData, self).__init__() + self.path2h5 = path2h5 + self.modalities = modalities + self.fs_new = fs_new + self.time = time + self.modality_classes = modality_classes + self.window_hop = window_hop + self.style = style + self.repeat_text = repeat_text + self.text_in_modalities = text_in_modalities + self.filler = filler + + ## load modality shapes and maybe data + self.shapes = [] + self.data = [] + for modality in self.modalities: + try: + data, h5 = self.load(self.path2h5, modality) + except: + print(self.path2h5, modality) + sys.exit(1) + + self.shapes.append(data.shape) + self.data.append(data[()]) + h5.close() + + if self.text_in_modalities: + try: + self.text_df = pd.read_hdf(self.path2h5, key='text/meta') + except: + self.text_df = None + if self.filler: + self.stopwords = kwargs['stopwords'] + self.tokenizer = kwargs['tokenizer'] + + ## create idx lists + self.idx_start_list_dict = {} + self.idx_end_list_dict = {} + self.idx_interval_dict = {} + + self.update_idx_list(self.time, self.window_hop) + + def update_idx_list(self, time, window_hop=0): + for modality, fs_new, shape in zip(self.modalities, self.fs_new, self.shapes): + fs = self.modality_classes[modality].fs(modality) + window = int(time*fs) + assert window_hop < window, 'hop size {} must be less than window size {}'.format(window_hop, int(time*fs)) + + fs_ratio = round(fs/fs_new) + self.idx_interval_dict[modality] = fs_ratio + + if not window_hop: + time_splits = np.r_[range(0, shape[0]-window, int(window))] + else: + time_splits = np.r_[range(0, shape[0]-window, int(window_hop*fs_ratio))] + self.idx_start_list_dict[modality] = time_splits[:] + self.idx_end_list_dict[modality] = time_splits + window + + #len_starts = 
[len(self.idx_start_list_dict[modality]) for modality in self.idx_start_list_dict] + #raise len_starts[0] == len_starts[1], 'number of idxes are not consistent in file {}'.format(self.path2h5) + + def __len__(self): + return min([len(self.idx_start_list_dict[modality]) for modality in self.modalities]) + #return len(self.idx_start_list_dict[self.modalities[0]]) + + def __getitem__(self, idx): + item = {} + ## args.modalities = ['pose/normalize', 'text/w2v'] + for i, modality in enumerate(self.modalities): + ## read from loaded data + data = self.data[i] + + ## open h5 file + #data, h5 = self.load(self.path2h5, modality) + + start = self.idx_start_list_dict[modality][idx] + end = self.idx_end_list_dict[modality][idx] + interval = self.idx_interval_dict[modality] + + item[modality] = data[start:end:interval].astype(np.float64) + start_time = data[0:start:interval].shape[0] / self.fs_new[-1] + + if 'text' in modality: + vec = item[modality] + indices = [0] ## starts in 64 frames + if self.text_df is None or modality == 'text/tokens': ## to be used with self.repeat_text = 0 + for t in range(1, vec.shape[0]): + if (vec[t] - vec[indices[-1]]).sum() != 0: + indices.append(t) + else: + text_df_ = self.text_df[(start <= self.text_df['end_frame']) & (end > self.text_df['start_frame'])] + starts_ = text_df_['start_frame'].values - start + starts_[0] = 0 + indices = list(starts_.astype(np.int)) + if not self.repeat_text: + item.update({modality:vec[indices]}) ## if self.repeat_text == 0, update the text modality + + ## add filler masks + if self.filler: + filler = np.zeros((len(indices),)) + if self.text_df is None: + pass ## if text_df is not available, assume no word is filler + else: + words = self.text_df[(start <= self.text_df['end_frame']) & (end > self.text_df['start_frame'])].Word.values + words = [word.lower() for word in words] + if 'bert' in modality or 'tokens' in modality: + words = self.tokenizer.tokenize(' '.join(words)) + + for i, word in enumerate(words[:len(indices)]): + if word in self.stopwords: + filler[i] = 1 + if self.repeat_text: + filler_ = np.zeros((vec.shape[0], )) + end_indices = indices[1:] + [vec.shape[0]] + for i, (st, en) in enumerate(zip(indices, end_indices)): + filler_[st:en] = filler[i] + filler = filler_ + item.update({'text/filler':filler}) + + + ## duration of each word + indices_arr = np.array(indices).astype(np.int) + length_word = np.zeros_like(indices_arr) + length_word[:-1] = indices_arr[1:] - indices_arr[:-1] + duration = (end-start)/interval + length_word[-1] = duration - indices_arr[-1] + item.update({'text/token_duration':length_word.astype(np.int)}) + + ## close h5 file + #h5.close() + + ## start and end times of audio in the interval + #start_time = self.fs_new[-1] * data[0:start:interval].shape[0] + duration = item[self.modalities[0]].shape[0]/self.fs_new[-1] + #duration = ((end-start)/interval)/self.fs_new[-1] + end_time = start_time + duration + + item.update({'meta':{'interval_id':Path(self.path2h5).stem, + 'start':start_time, + 'end':end_time, + 'idx':idx}}) + + item['style'] = np.zeros(item[self.modalities[0]].shape[0]) + self.style + + return item + + def close_h5_files(self, files): + for h5 in files: + h5.close() + +class AlternateClassSampler(Sampler): + def __init__(self, class_count, num_samples, replacement=True): + self.num_samples_per_class = num_samples//len(class_count) + self.num_samples = self.num_samples_per_class*len(class_count) + self.class_count = class_count + self.starts = [0] + self.ends = [] + for counts in self.class_count: 
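+      ## cumulative offsets: samples of class i occupy indices [starts[i], ends[i]) in the concatenated dataset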
+ self.starts.append(self.starts[-1]+counts) + self.ends.append(self.starts[-1]) + self.starts = self.starts[:-1] + + def __iter__(self): + return iter(torch.stack([torch.randint(start, end, size=(self.num_samples_per_class, )) for start, end in zip(self.starts, self.ends)], dim=1).view(-1).tolist()) + + def __len__(self): + return self.num_samples + +class BalanceClassSampler(Sampler): + def __init__(self, classes, num_samples, replacement=True): + self.classes = classes + self.update_classes() + self.num_samples_per_class = num_samples//len(self.classes) + self.num_samples = self.num_samples_per_class*len(self.classes) + + def update_classes(self): + cl_list = [] + for cl in self.classes: + if cl.shape[0] > 0: + cl_list.append(cl) + self.classes = cl_list + + def __iter__(self): + return iter(torch.stack([class_idx[torch.randint(0, len(class_idx), size=(self.num_samples_per_class,))] for class_idx in self.classes], dim=1).view(-1).tolist()) + + def __len__(self): + return self.num_samples + +class ConcatDatasetIndex(ConcatDataset): + def __init__(self, datasets): + super().__init__(datasets) + + def __getitem__(self, idx): + if idx < 0: + if -idx > len(self): + raise ValueError("absolute value of index should not exceed dataset length") + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + batch = self.datasets[dataset_idx][sample_idx] + if isinstance(batch, dict): + batch.update({'idx':idx}) + return batch + + +def unittest(args, exp_num): + path2data = args.path2data + speaker = args.speaker + modalities = args.modalities + fs_new = args.fs_new + time = args.time + split = args.split + batch_size = args.batch_size + shuffle = args.shuffle + + data = Data(path2data=path2data, + speaker=speaker, + modalities=modalities, + fs_new=fs_new, + time=time, + split=split, + batch_size=batch_size, + shuffle=shuffle) + + print('Speaker: {}'.format(speaker)) + for batch in tqdm(data.train): + continue + sizes = {modality:batch[modality].shape for modality in modalities} + print('train') + print(sizes) + + for batch in tqdm(data.dev): + continue + sizes = {modality:batch[modality].shape for modality in modalities} + print('dev') + print(sizes) + + for batch in tqdm(data.test): + continue + sizes = {modality:batch[modality].shape for modality in modalities} + print('test') + print(sizes) + +if __name__ == '__main__': + argparseNloop(unittest) diff --git a/data/skeleton.py b/data/skeleton.py new file mode 100644 index 0000000..a87dea0 --- /dev/null +++ b/data/skeleton.py @@ -0,0 +1,92 @@ +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from datetime import datetime + +from common import Modality, MissingData + +from pathlib import Path +import numpy as np +import pandas as pd +import pdb +from tqdm import tqdm +import yaml +import warnings + +from pycasper.pathUtils import replace_Nth_parent + +class Skeleton2D(Modality): + def __init__(self, path2data='../dataset/groot/data/speech2gesture_data', + path2outdata='../dataset/groot/data', + speaker='all', + preprocess_methods=['data']): + super(Skeleton2D, self).__init__(path2data=path2data) + self.path2data = path2data + self.df = pd.read_csv(Path(self.path2data)/'cmu_intervals_df.csv', dtype=object) + self.df.loc[:, 'delta_time'] = self.df['delta_time'].apply(float) + self.df.loc[:, 
'interval_id'] = self.df['interval_id'].apply(str) + + self.path2outdata = path2outdata + self.speaker = speaker + self.preprocess_methods = preprocess_methods + + self.missing = MissingData(self.path2outdata) + + + + @property + def parents(self): + return [-1, + 0, 1, 2, + 0, 4, 5, + 0, 7, 7, + 6, + 10, 11, 12, 13, + 10, 15, 16, 17, + 10, 19, 20, 21, + 10, 23, 24, 25, + 10, 27, 28, 29, + 3, + 31, 32, 33, 34, + 31, 36, 37, 38, + 31, 40, 41, 42, + 31, 44, 45, 46, + 31, 48, 49, 50] + + @property + def joint_subset(self): + ## choose only the relevant skeleton key-points (removed nose and eyes) + return np.r_[range(7), range(10, len(self.parents))] + + @property + def root(self): + return 0 + + @property + def joint_names(self): + return ['Neck', + 'RShoulder', 'RElbow', 'RWrist', + 'LShoulder', 'LElbow', 'LWrist', + 'Nose', 'REye', 'LEye', + 'LHandRoot', + 'LHandThumb1', 'LHandThumb2', 'LHandThumb3', 'LHandThumb4', + 'LHandIndex1', 'LHandIndex2', 'LHandIndex3', 'LHandIndex4', + 'LHandMiddle1', 'LHandMiddle2', 'LHandMiddle3', 'LHandMiddle4', + 'LHandRing1', 'LHandRing2', 'LHandRing3', 'LHandRing4', + 'LHandLittle1', 'LHandLittle2', 'LHandLittle3', 'LHandLittle4', + 'RHandRoot', + 'RHandThumb1', 'RHandThumb2', 'RHandThumb3', 'RHandThumb4', + 'RHandIndex1', 'RHandIndex2', 'RHandIndex3', 'RHandIndex4', + 'RHandMiddle1', 'RHandMiddle2', 'RHandMiddle3', 'RHandMiddle4', + 'RHandRing1', 'RHandRing2', 'RHandRing3', 'RHandRing4', + 'RHandLittle1', 'RHandLittle2', 'RHandLittle3', 'RHandLittle4' + ] + + def fs(self, modality): + return 15 + + @property + def h5_key(self): + return 'pose' diff --git a/data/text.py b/data/text.py new file mode 100644 index 0000000..b192c1e --- /dev/null +++ b/data/text.py @@ -0,0 +1,84 @@ +import os +import sys +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from pathlib import Path +import numpy as np +import pandas as pd +import pdb +from tqdm import tqdm +import warnings +import h5py + +from pycasper.pathUtils import replace_Nth_parent +from common import Modality, MissingData, HDF5 + +import warnings + +import torch +from torch.utils.data._utils.collate import default_collate +from functools import partial + +def pad(datasets, key, dim): + sizes = [] + for data in datasets: + data = data[key] + sizes.append(data.shape[dim]) + max_length = max(sizes) + new_datasets = [] + lengths = [] + for data in datasets: + data = data[key] + length = data.shape[dim] + zero_shape = list(data.shape) + zero_shape[dim] = max_length-length + new_datasets.append(np.concatenate([data, np.zeros(zero_shape)], axis=dim)) + lengths.append(length) + return default_collate(new_datasets), default_collate(lengths) + +def collate_fn_pad(batch, pad_key='text/meta', dim=0): + if isinstance(batch[0], dict): + data_dict = {} + for key in batch[0]: + if key in pad_key: + padded_outs = pad(batch, key, dim=dim) + if key == pad_key[-1]: ## TODO hardcoded to use the last key which is text/token_duration + data_dict[key], data_dict['text/token_count'] = padded_outs[0], padded_outs[1] + else: + data_dict[key] = padded_outs[0] + else: + data_dict[key] = default_collate([d[key] for d in batch]) + return data_dict + else: + return default_collate(batch) + +class Text(Modality): + def __init__(self, path2data='../dataset/groot/data', + path2outdata='../dataset/groot/data', + speaker='all', + preprocess_methods=['w2v'], + text_aligned=0): + super(Text, self).__init__(path2data=path2data) + self.path2data = path2data + 
self.df = pd.read_csv(Path(self.path2data)/'cmu_intervals_df.csv', dtype=object) + self.df.loc[:, 'delta_time'] = self.df['delta_time'].apply(float) + self.df.loc[:, 'interval_id'] = self.df['interval_id'].apply(str) + + self.path2outdata = path2outdata + self.speaker = speaker + self.preprocess_methods = preprocess_methods + + self.missing = MissingData(self.path2data) + + ## list of word2-vec models + self.w2v_models = [] + self.text_aligned = text_aligned + + + def fs(self, modality): + return 15 + + @property + def h5_key(self): + return 'text' diff --git a/dataloader_tutorial.ipynb b/dataloader_tutorial.ipynb new file mode 100644 index 0000000..cd0ad3f --- /dev/null +++ b/dataloader_tutorial.ipynb @@ -0,0 +1,1064 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading Data as a pytorch DataLoader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Dataloader\n", + "- Single Speaker\n", + "- 3 Modalities" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "from data import Data\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "common_kwargs = dict(path2data = 'pats/data',\n", + " speaker = ['lec_cosmic'],\n", + " modalities = ['pose/data', 'audio/log_mel_512', 'text/bert'],\n", + " fs_new = [15, 15, 15],\n", + " batch_size = 4,\n", + " window_hop = 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 65/65 [00:04<00:00, 16.23it/s]\n", + "100%|██████████| 9/9 [00:00<00:00, 14.09it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 18.26it/s]\n" + ] + } + ], + "source": [ + "data = Data(**common_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Data` has 3 DataLoader objets, `data.train`, `data.dev` and `data.test`. Let's sample a batch from `data.train`." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "for batch in data.train:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All elements of the dictionary have a \"batch x time x feature\" order. Let's look at the shapes of all the elements of the dictionary `batch`." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pose/data: torch.Size([4, 64, 104])\n", + "audio/log_mel_512: torch.Size([4, 64, 128])\n", + "text/bert: torch.Size([4, 64, 768])\n", + "text/token_duration: torch.Size([4, 17])\n", + "text/token_count: torch.Size([4])\n", + "style: torch.Size([4, 64])\n", + "idx: torch.Size([4])\n" + ] + } + ], + "source": [ + "for key in batch.keys():\n", + " if key != 'meta':\n", + " print('{}: {}'.format(key, batch[key].shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"pose/data\" has 104 dimensions which is the same as 52 joints with XY coordinates. Let's reshape it to a more obvious format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([4, 64, 2, 52])\n" + ] + } + ], + "source": [ + "pose = batch['pose/data']\n", + "pose = pose.reshape(pose.shape[0], pose.shape[1], 2, -1)\n", + "print(pose.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apart from the requested modalities -i.e. pose, audio and text- we get some extra elements. Let's quickly gloss throught them.\n", + "- shape of \"text/bert\" along time is the same as \"pose/data\", hence they are temporally aligned.\n", + "- shape of \"text/token_duration\" implies the maximum length of a sentence in this mini-batch is 17\n", + "- \"idx\" refers to the idx of the object of the `Data` class\n", + "- \"style\" is the relative style id of the speakers in the dataset. In this case, all the values will be 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multi-Speaker DataLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "common_kwargs.update(dict(speaker=['lec_cosmic', 'lec_evol']))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 197/197 [00:03<00:00, 50.41it/s]\n", + "100%|██████████| 49/49 [00:01<00:00, 47.31it/s]\n", + "100%|██████████| 151/151 [00:02<00:00, 60.29it/s]\n" + ] + } + ], + "source": [ + "data = Data(**common_kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "for batch in data.train:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the same as Basic DataLoader, except data from both speakers will be sampled allowing to train a multi-speaker model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other text features\n", + "In case we do not want to use fixed pre-trained embeddings, we can use \"text/tokens\" as a modality. These tokens represent the indices extracted by `BertTokenizer` from [HuggingFace](https://huggingface.co) and can be used to fine-tune transformer based embeddings. In this example, we use `repeat_text=0` which does not repeat the text/tokens modality to align it with pose and/or audio." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "common_kwargs.update(dict(modalities = ['pose/data', 'audio/log_mel_512', 'text/tokens'],\n", + " repeat_text = 0))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 197/197 [00:06<00:00, 32.74it/s]\n", + "100%|██████████| 49/49 [00:01<00:00, 32.11it/s]\n", + "100%|██████████| 151/151 [00:03<00:00, 39.60it/s]\n" + ] + } + ], + "source": [ + "data = Data(**common_kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "for batch in data.train:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([4, 13])" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch['text/tokens'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[12761., 7366., 2008., 2003., 2428., 4187., 2005., 4824., 0.,\n", + " 0., 0., 0., 0.],\n", + " [ 2172., 2062., 5294., 2292., 1005., 1055., 2360., 2702., 2335.,\n", + " 2062., 5294., 2009., 2052.],\n", + " [ 2428., 3147., 2437., 2235., 2515., 1996., 2590., 0., 0.,\n", + " 0., 0., 0., 0.],\n", + " [ 5072., 1998., 27226., 2021., 2036., 1996., 4195., 0., 0.,\n", + " 0., 0., 0., 0.]], dtype=torch.float64)" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch['text/tokens']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataLoaders with Samplers" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "common_kwargs.update(dict(style_iters=100))" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 197/197 [00:03<00:00, 59.95it/s]\n", + "100%|██████████| 49/49 [00:00<00:00, 68.05it/s]\n", + "100%|██████████| 151/151 [00:02<00:00, 69.11it/s]\n" + ] + } + ], + "source": [ + "data = Data(**common_kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:00<00:00, 405.04it/s]\n" + ] + } + ], + "source": [ + "for batch in tqdm(data.train):\n", + " continue" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the same as the Multi-Speaker Dataloader, except the \"style\" element will now have 0 or 1 based on which speaker's data it it. We can be sure that every batch will have both styles as we use the `style_iters` argument. 
The number of iterations per epoch is 100 which is the value of style_iters" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=torch.float64)" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch['style']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch\n", + "!pip install transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: tqdm in /work/cahuja/local/conda3/lib/python3.7/site-packages (4.32.1)\n", + "Requirement already satisfied: joblib in /work/cahuja/local/conda3/lib/python3.7/site-packages (0.16.0)\n", + "Collecting youtube-dl\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/61/1c/a86837929eff24827b117d577584cc1a2a85dfdb5a91465d17c8b298f0d0/youtube_dl-2020.7.28-py2.py3-none-any.whl (1.8MB)\n", + "\u001b[K |████████████████████████████████| 1.8MB 4.0MB/s eta 0:00:01\n", + "\u001b[?25hInstalling collected packages: youtube-dl\n", + "Successfully installed youtube-dl-2020.7.28\n" + ] + } + ], + "source": [ + "!pip install tqdm\n", + "!pip install joblib\n", + "!pip install youtube-dl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "import h5py\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working with h5 files\n", + "In case these dataloaders do not suit your needs, it is possible to read individual interval files. We have created a class `HDF5` with many static methods to load data from these h5 files. \n", + "\n", + "**Caution** - Not closing h5 files properly can give persistent errors and may require a system restart.\n", + "\n", + "**Caution-2** - It is recommended to ignore intervals in `missing_intervals.h5` as those intervals do not have complate data. 
The DataLoaders take care of that, but manually accessing h5 files does not" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "from data import HDF5" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "audio: \n", + "pose: \n", + "text: \n" + ] + } + ], + "source": [ + "h5 = HDF5.h5_open('pats/data/processed/bee/cmu0000025735.h5', 'r')\n", + "print(h5.keys())\n", + "for key in h5.keys():\n", + " print('{}: {}'.format(key, h5[key].keys()))\n", + "h5.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading a key" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "data, h5 = HDF5.load('pats/data/processed/bee/cmu0000025735.h5', key='pose/data')\n", + "data = data[()]\n", + "h5.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(292, 104)" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading missing_intervals.h5" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "missing, h5 = HDF5.load('pats/data/missing_intervals.h5', key='intervals')\n", + "missing = missing[()]\n", + "h5.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['115309', '147056', 'cmu0000022349', ..., '5227', '13510', '25204'],\n", + " dtype=object)" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "missing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading Transcripts as a DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
" + ], + "text/plain": [ + " Word start_frame end_frame\n", + "0 do 0.0 7.0\n", + "1 you 7.0 9.0\n", + "2 have 9.0 10.0\n", + "3 to 10.0 12.0\n", + "4 be 12.0 15.0\n", + "5 on 15.0 21.0\n", + "6 Sunday 21.0 27.0\n", + "7 candidate 27.0 36.0\n", + "8 Pete 36.0 40.0\n", + "9 buttigieg 40.0 49.0\n", + "10 hell. 49.0 55.0\n", + "11 Fox 55.0 61.0\n", + "12 News 61.0 64.0\n", + "13 Town 64.0 70.0\n", + "14 Hall 70.0 73.0\n", + "15 as 73.0 76.0\n", + "16 I'm 76.0 78.0\n", + "17 sure 78.0 81.0\n", + "18 you 81.0 84.0\n", + "19 did 84.0 87.0\n", + "20 not 87.0 93.0\n", + "21 see 93.0 97.0\n", + "22 since 97.0 103.0\n", + "23 it 103.0 103.0\n", + "24 was 103.0 105.0\n", + "25 the 105.0 106.0\n", + "26 same 106.0 111.0\n", + "27 night 111.0 111.0\n", + "28 as 111.0 114.0\n", + "29 the 114.0 115.0\n", + "30 Game 115.0 120.0\n", + "31 of 120.0 121.0\n", + "32 Thrones 121.0 129.0\n", + "33 finale 129.0 130.0\n", + "34 footage 130.0 151.0\n", + "35 seems 151.0 159.0\n", + "36 like 159.0 162.0\n", + "37 a 162.0 163.0\n", + "38 smart 163.0 166.0\n", + "39 candidate 166.0 175.0\n", + "40 but 175.0 177.0\n", + "41 the 177.0 187.0\n", + "42 worst 187.0 192.0\n", + "43 scheduled 192.0 199.0\n", + "44 media 199.0 204.0\n", + "45 event 204.0 208.0\n", + "46 since 208.0 213.0\n", + "47 Jay-Z 213.0 222.0\n", + "48 decided 222.0 231.0\n", + "49 September 231.0 238.0\n", + "50 11th 238.0 245.0\n", + "51 2001 245.0 258.0\n", + "52 was 258.0 261.0\n", + "53 the 261.0 262.0\n", + "54 day 262.0 262.0\n", + "55 to 262.0 267.0\n", + "56 release 267.0 271.0\n", + "57 his 271.0 276.0\n", + "58 iconic 276.0 283.0\n", + "59 album 283.0 292.0" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_hdf('pats/data/processed/bee/cmu0000025735.h5', key='text/meta')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..81060cd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +torch==1.6.0 +transformers==3.0.2 +tqdm==4.32.1 +joblib==0.16.0 +youtube-dl==2020.7.28 +nltk==3.4.4 +h5py==2.9.0 +pandas==0.24.2 \ No newline at end of file diff --git a/youtube2croppedaudio/youtube2audio.py b/youtube2croppedaudio/youtube2audio.py new file mode 100644 index 0000000..4daa1e9 --- /dev/null +++ b/youtube2croppedaudio/youtube2audio.py @@ -0,0 +1,67 @@ +from __future__ import unicode_literals +import argparse +from subprocess import call +import pandas as pd +import os +from tqdm import tqdm +from joblib import Parallel, delayed + + +def downloader(row): + link = row['video_link'] + if 'youtube' in link: + try: + output_dir = os.path.join(BASE_PATH, "raw_full", row["speaker"]) + output_path = os.path.join(BASE_PATH, "raw_full", row["speaker"], row["video_link"][-11:] + ".mp3") + if not(os.path.exists(os.path.dirname(output_dir))): + os.makedirs(os.path.dirname(output_dir)) + #download audio + command = 'youtube-dl {link} -o {output_path} --extract-audio --audio-format mp3'.format(link=link, output_path=output_path) + res1 = call(command, shell=True) + except Exception as e: + print(e) + else: #using youtube-dl to download jon + 
output_path = os.path.join(BASE_PATH, "raw_full", row["speaker"],str(row["video_fn"])[:-4] + ".mp3") + print(link) + command = 'youtube-dl {link} --extract-audio --audio-format mp3 -o "{output_path}"'.format(link=link, output_path=output_path) + res1 = call(command, shell=True) + +def save_interval(input_fn, start, end, output_fn): + cmd = 'ffmpeg -i "%s" -ss %s -to %s -strict -2 "%s" -y' % ( + input_fn, start, end, output_fn) + res3 = call(cmd, shell=True) + +def crop_tool(interval): + try: + start_time = str(pd.to_datetime(interval['start_time']).time()) + end_time = str(pd.to_datetime(interval['end_time']).time()) + video_id = (interval["video_link"][-11:]) + OUTPUT_DIR = os.path.join(args.base_path, "raw" , interval['speaker'] + "_cropped") + if not(os.path.isdir(OUTPUT_DIR)): os.makedirs(OUTPUT_DIR) + if (interval["speaker"] == 'jon') and ('youtube' not in interval["video_link"]): + video_id = (interval["video_fn"])[:-4] + input_fn = os.path.join(args.base_path,"raw_full", interval['speaker'], video_id + ".mp3") + output_fn = os.path.join(args.base_path, "raw" , interval['speaker'] + "_cropped", "%s.mp3"%(interval['interval_id'])) + if not(os.path.exists(output_fn)): + save_interval(input_fn, str(start_time), str(end_time), output_fn) + except Exception as e: + print(e) + print("couldn't crop interval: %s"%interval) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-base_path', '--base_path', help='base folder path of dataset') + parser.add_argument('-speaker', '--speaker', help='download videos of a specific speaker') + parser.add_argument('-interval_path', '--interval_path', help='path to the intervals_df file') + args = parser.parse_args() + BASE_PATH = args.base_path #Path which to create raw folder and has intervals_df and video_links.csv + + df = pd.read_csv(os.path.join(BASE_PATH, args.interval_path)) + if args.speaker: + df = df[df['speaker'] == args.speaker] + df_download = df.drop_duplicates(subset=['video_link']) + Parallel(n_jobs=-1)(delayed(downloader)(row) for _, row in tqdm(df_download.iterrows(), total=df_download.shape[0])) + Parallel(n_jobs=-1)(delayed(crop_tool)(interval) for _, interval in tqdm(df.iterrows(), total=df.shape[0])) + command = "rm -r {}".format(os.path.join(BASE_PATH, "raw_full")) + res1 = call(command, shell=True) \ No newline at end of file diff --git a/zip_dataset.ipynb b/zip_dataset.ipynb new file mode 100644 index 0000000..013782f --- /dev/null +++ b/zip_dataset.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import zipfile\n", + "from tqdm import tqdm\n", + "from data import Modality\n", + "\n", + "def zipdir(path, ziph):\n", + " # ziph is zipfile handle\n", + " for root, dirs, files in tqdm(os.walk(path)):\n", + " for file in files:\n", + " ziph.write(os.path.join(root, file))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/25 [00:00
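For completeness, below is a minimal, hypothetical sketch of how the `zipdir` helper defined in `zip_dataset.ipynb` above could be used to archive one speaker's processed folder. The speaker folder path follows the layout used elsewhere in this patch; the output archive name `bee.zip` is an assumption for illustration, not something the notebook prescribes.

```python
import os
import zipfile

from tqdm import tqdm


def zipdir(path, ziph):
    # ziph is a zipfile handle; walk the folder and add every file it contains
    for root, dirs, files in tqdm(os.walk(path)):
        for file in files:
            ziph.write(os.path.join(root, file))


# Hypothetical usage: archive the processed features of a single speaker.
# 'pats/data/processed/bee' matches the folder layout used in this patch;
# 'bee.zip' is an assumed output name.
speaker_dir = 'pats/data/processed/bee'
with zipfile.ZipFile('bee.zip', 'w', zipfile.ZIP_DEFLATED) as ziph:
    zipdir(speaker_dir, ziph)
```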
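Similarly, here is a minimal sketch of reading a single interval's pose features directly with `h5py` rather than the repository's `HDF5` wrapper shown in the tutorial notebook. The file path is the same interval used in the tutorial; wrapping the file in a context manager is one way to honour the caution about always closing h5 files. This is an illustration under those assumptions, not the dataloader's own access path.

```python
import h5py

# Same interval file that the tutorial reads via the HDF5 helper class.
path = 'pats/data/processed/bee/cmu0000025735.h5'

# A context manager guarantees the file is closed even if an error occurs,
# avoiding the stale-handle problems mentioned in the tutorial.
with h5py.File(path, 'r') as h5:
    pose = h5['pose/data'][()]    # numpy array of shape (time, 104)

# 104 pose dimensions = XY coordinates x 52 joints; split them out explicitly,
# mirroring the reshape used on batches in the tutorial.
pose = pose.reshape(pose.shape[0], 2, -1)
print(pose.shape)                 # (292, 2, 52) for this interval
```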