From f005132fef751c307100b0374715d22209a14eac Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Wed, 11 Oct 2017 10:53:10 -0400 Subject: [PATCH 001/201] Initial commit --- README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..b0dc68b --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# l3embedding +Learn and L3 embedding from audio/video pairs From 1a1081495a892a018c2957e3da5a4603cdd7330d Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Wed, 11 Oct 2017 11:21:52 -0400 Subject: [PATCH 002/201] Ignore pycharm files --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bc8a670 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/* \ No newline at end of file From 8a89d27bd3702b9e4a5f1f102e180b7e4720db76 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Wed, 11 Oct 2017 11:22:05 -0400 Subject: [PATCH 003/201] Initial structure --- l3embedding/model.py | 42 ++++++++++++++++++++++++++++++ l3embedding/train.py | 62 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 l3embedding/model.py create mode 100644 l3embedding/train.py diff --git a/l3embedding/model.py b/l3embedding/model.py new file mode 100644 index 0000000..4bf3874 --- /dev/null +++ b/l3embedding/model.py @@ -0,0 +1,42 @@ + +from keras.layers import Input, Convolution2D, BatchNormalization + +def construct_cnn_L3_orig(): + + # INPUT + x = Input(shape=(n_freq_cnn, n_frames_cnn, 1), dtype='float32') + + # CONV 1 + y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid', + activation='relu')(x) + y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y) + y = BatchNormalization()(y) + + # CONV 2 + y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid', + activation='relu')(y) + y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y) + y = BatchNormalization()(y) + + # CONV 3 + y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid', + activation='relu')(y) + # y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y) + y = BatchNormalization()(y) + + # Flatten for dense layers + y = Flatten()(y) + y = Dropout(0.5)(y) + y = Dense(n_dense_cnn, activation='relu')(y) + if large_cnn: + y = Dropout(0.5)(y) + y = Dense(n_dense_cnn, activation='relu')(y) + y = Dropout(0.5)(y) + y = Dense(n_classes, activation='sigmoid')(y) + + m = Model(inputs=x, outputs=y) + return m + + + +MODELS = {'cnn_L3_orig': construct_cnn_L3_orig} \ No newline at end of file diff --git a/l3embedding/train.py b/l3embedding/train.py new file mode 100644 index 0000000..40fc385 --- /dev/null +++ b/l3embedding/train.py @@ -0,0 +1,62 @@ + +import pescador +import pandas as pd +from tqdm import tqdm + + +def sampler(filename, file_list): + ''' + Sample audio/video fromthe filename, with 50% change of using pairs from + the same file and 50% chance of mixing audio/video from another file + chosen at random from the file_list. 
+ + Parameters + ---------- + filename + file_list + + Returns + ------- + + ''' + + +def data_generator(csv_file, batch_size=64): + ''' + - Load up CSV file + - Iterate over all training files + - Create a streamer per traning file + + Parameters + ---------- + csv_file + + Returns + ------- + + ''' + + seeds = [] + + for track in tqdm(tracks): + fname = os.path.join(working, + os.path.extsep.join([str(track), 'h5'])) + seeds.append(pescador.Streamer(sampler, fname, file_list)) + + # Send it all to a mux + mux = pescador.Mux(seeds, k, **kwargs) + + if batch_size == 1: + return mux + else: + return pescador.BufferedStreamer(mux, batch_size) + + +def train(csv_file, batch_size=64, rate=16, seed=20171011): + + train_gen = data_generator( + csv_file, + batch_size=batch_size, + lam=rate, + revive=True, + random_state=seed) \ No newline at end of file From 2d71992b6fbf4e54a94365b917b3fcd37ef4c5f9 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sat, 14 Oct 2017 20:07:55 -0400 Subject: [PATCH 004/201] Add basic functions to sample from audio and video files --- l3embedding/train.py | 148 +++++++++++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 32 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 40fc385..76fac58 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,51 +1,135 @@ +import glob import pescador -import pandas as pd +import random +import skvideo.io +import soundfile as sf from tqdm import tqdm -def sampler(filename, file_list): - ''' - Sample audio/video fromthe filename, with 50% change of using pairs from - the same file and 50% chance of mixing audio/video from another file - chosen at random from the file_list. +def get_file_list(data_dir): + """Return audio and video file list. - Parameters - ---------- - filename - file_list + Args: + data_dir: input directory that contains audio/ and video/ - Returns - ------- + Returns: + audio_files: list of audio files + video_files: list of video files - ''' + """ + audio_files = glob.glob('{}/audio/*'.format(data_dir)) + video_files = glob.glob('{}/video/*'.format(data_dir)) + return audio_files, video_files -def data_generator(csv_file, batch_size=64): - ''' - - Load up CSV file - - Iterate over all training files - - Create a streamer per traning file - Parameters - ---------- - csv_file +def video_to_audio(video_file): + """Return corresponding audio_file. - Returns - ------- + Args: + video_file: video_file - ''' + Returns: + audio_file - seeds = [] + """ + + *path, _, name = video_file.split('/') + name = name.split('.')[0] + '.flac' + return '/'.join(path + ['audio', name]) + + +def sample_one_second(audio_file, start=None): + """Return one second audio samples randomly if start is not specified, + otherwise, return one second audio samples starting with start (seconds). + + Args: + audio_file: audio_file to sample from + start: starting time to fetch one second samples + + Returns: + One second samples + + """ + + audio_data, sampling_frequency = sf.read(audio_file) + if not start: + start = random.randrange(len(audio_data) - sampling_frequency) + else: + start = int(start * sampling_frequency) + return audio_data[start:start+sampling_frequency] + + +def sample_one_frame(video_data, fps=30): + """Return one frame randomly and time (seconds). 
+ + Args: + video_data: video data to sample from + fps: frame per second + + Returns: + One frame sampled randomly and time in seconds + + """ - for track in tqdm(tracks): - fname = os.path.join(working, - os.path.extsep.join([str(track), 'h5'])) - seeds.append(pescador.Streamer(sampler, fname, file_list)) + num_frames = video_data.shape[0] + frame = random.randrange(num_frames - fps) + return video_data[frame, :, :, :], frame / fps - # Send it all to a mux - mux = pescador.Mux(seeds, k, **kwargs) +def sampler(video_file, audio_files): + """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, + 50% chance sample one second from another audio_file in the list of audio_files. + + Args: + video_file: video_file to sample from + audio_files: candidate audio_files to sample from + + Returns: + A generator that yields dictionary of video sample, audio sample, + and label (0: not from corresponding files, 1: from corresponding files) + + """ + + video_data = skvideo.io.vread(video_file) + while True: + sample_video_data, start = sample_one_frame(video_data) + + audio_file = video_to_audio(video_file) + if random.random() < 0.5: + audio_file = random.choice([af for af in audio_files if af != audio_file]) + sample_audio_data = sample_one_second(audio_file) + label = 0 + else: + sample_audio_data = sample_one_second(audio_file, start) + label = 1 + yield { + 'video': sample_video_data, + 'audio': sample_audio_data[:,0], + 'label': label + } + + +def data_generator(data_dir, k=32, batch_size=64): + """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. + + Args: + data_dir: directory to sample video and audio from + k: number of concurrent open streamer + batch_size: batch size + + Returns: + A generator that yield infinite video and audio samples from data_dir + + """ + + audio_files, video_files = get_file_list(data_dir) + seeds = [] + for video_file in tqdm(video_files): + seeds.append(pescador.Streamer(sampler, video_file, audio_files)) + + mux = pescador.Mux(seeds, k) if batch_size == 1: return mux else: @@ -59,4 +143,4 @@ def train(csv_file, batch_size=64, rate=16, seed=20171011): batch_size=batch_size, lam=rate, revive=True, - random_state=seed) \ No newline at end of file + random_state=seed) From 752d2af472ce62b1484413b18608b47f67448a94 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 17 Oct 2017 12:30:55 -0400 Subject: [PATCH 005/201] Add AudioSet ontology class --- audioset/__init__.py | 0 audioset/ontology.py | 127 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 audioset/__init__.py create mode 100644 audioset/ontology.py diff --git a/audioset/__init__.py b/audioset/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/audioset/ontology.py b/audioset/ontology.py new file mode 100644 index 0000000..d39a002 --- /dev/null +++ b/audioset/ontology.py @@ -0,0 +1,127 @@ +import json +import os + + +class ASOntologyNode(object): + def __init__(self, ontology, _id, name, description, citation_uri, positive_examples, child_ids, restrictions): + self.ontology = ontology + self.id = _id + self.name = name + self.description = description + self.citation_uri = citation_uri + self.positive_examples = positive_examples + self.child_ids = child_ids + self.restrictions = restrictions + + # Set restrictions + self.abstract = False + self.blacklist = False + for restr in restrictions: + if restr == 'abstract': + self.abstract = True + elif restr == 
'blacklist': + self.blacklist = True + + self.parent_id = None + + @property + def children(self): + """ + Get a list of this node's children + """ + return self.ontology.get_node_children(self) + + @property + def parent(self): + if self.parent_id is None: + return None + else: + return self.ontology.get_node(self.parent_id) + + def is_child(self, q_child): + q_child = self.ontology.ensure_node(q_child) + # DFS to find child + for child in self.children: + if child == q_child: + return True + elif child.is_child(q_child): + return True + return False + + def is_parent(self, q_parent): + q_parent = self.ontology.ensure_node(q_parent) + return q_parent.is_child(self) + + + +class ASOntology(object): + def __init__(self, ontology_path): + self._nodes = {} + + # Make sure the path to the ontology file exists + if not os.path.exists(ontology_path): + error_msg = 'Cannot find ontology at "{}"' + raise ValueError(error_msg.format(ontology_path)) + + # Load the ontology object + with open(ontology_path, 'r') as f: + ontology_list = json.load(f) + + # Create ontology nodes + for ontology_item in ontology_list: + _id = ontology_item['id'] + node = ASOntologyNode( + self, + _id, + ontology_item['name'], + ontology_item['description'], + ontology_item['citation_uri'], + ontology_item['positive_examples'], + ontology_item['child_ids'], + ontology_item['restrictions'] + ) + self._nodes[_id] = node + + self._init_tree() + + def _init_tree(self): + + # Do a pass through the node to assign parents + for node in self._nodes.values(): + for child in node.children: + child.parent_id = node.id + + # Do another pass to find the top level nodes, which have not + # been assigned a parent + self.top_level_node_ids = [] + for node in self._nodes.values(): + if node.parent is None: + self.top_level_node_ids.append(node.id) + + @property + def top_level_nodes(self): + return [self.get_node(node_id) for node_id in self.top_level_node_ids] + + def ensure_node(self, node): + if not isinstance(node, ASOntologyNode): + node = self._nodes[node] + + return node + + def get_node_children(self, node): + """ + Get a list of children nodes of a given node + """ + node = self.ensure_node(node) + + return [self._nodes[child_id] for child_id in node.child_ids] + + def get_node(self, node_id): + """ + Get a node with the given id + """ + if node_id not in self._nodes: + raise ValueError('No node with ID {}'.format(node_id)) + return self._nodes[node_id] + + From c3a1b3b8ea36111d9cfa133c7cd85da0eb3271f6 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 21 Oct 2017 14:11:06 -0400 Subject: [PATCH 006/201] Add preliminary training code (no eval yet) --- l3embedding/model.py | 173 ++++++++++++++++++++++++++++++++++--------- l3embedding/train.py | 117 +++++++++++++++++++++++++++-- 2 files changed, 252 insertions(+), 38 deletions(-) diff --git a/l3embedding/model.py b/l3embedding/model.py index 4bf3874..140a6ff 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -1,42 +1,149 @@ +from keras.models import Model +from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D,\ + Flatten, Concatenate, Dense +from kapre.time_frequency import Spectrogram -from keras.layers import Input, Convolution2D, BatchNormalization def construct_cnn_L3_orig(): + """ + Constructs a model that replicates that used in Look, Listen and Learn + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . 
+ + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + """ + #### + # Image subnetwork + #### + # INPUT + x_i = Input(shape=(224, 224, 3), dtype='float32') + + # CONV BLOCK 1 + n_filter_i_1 = 64 + filt_size_i_1 = (3, 3) + pool_size_i_1 = (2,2) + y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + activation='relu')(x_i) + y_i = BatchNormalization()(y_i) + y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i) + + # CONV BLOCK 2 + n_filter_i_2 = 128 + filt_size_i_2 = (3, 3) + pool_size_i_2 = (2,2) + y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i) + + # CONV BLOCK 3 + n_filter_i_3 = 256 + filt_size_i_3 = (3, 3) + pool_size_i_3 = (2,2) + y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i) + + # CONV BLOCK 4 + n_filter_i_4 = 512 + filt_size_i_4 = (3, 3) + pool_size_i_4 = (28, 28) + y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', + activation='relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_4, strides=2, padding='same')(y_i) + y_i = Flatten()(y_i) + + + #### + # Audio subnetwork + #### + n_dft = 512 + n_hop = 16 + asr = 48000 + audio_window_dur = 1 # INPUT - x = Input(shape=(n_freq_cnn, n_frames_cnn, 1), dtype='float32') - - # CONV 1 - y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid', - activation='relu')(x) - y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y) - y = BatchNormalization()(y) - - # CONV 2 - y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid', - activation='relu')(y) - y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y) - y = BatchNormalization()(y) - - # CONV 3 - y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid', - activation='relu')(y) - # y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y) - y = BatchNormalization()(y) - - # Flatten for dense layers - y = Flatten()(y) - y = Dropout(0.5)(y) - y = Dense(n_dense_cnn, activation='relu')(y) - if large_cnn: - y = Dropout(0.5)(y) - y = Dense(n_dense_cnn, activation='relu')(y) - y = Dropout(0.5)(y) - y = Dense(n_classes, activation='sigmoid')(y) - - m = Model(inputs=x, outputs=y) - return m + x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') + + # SPECTROGRAM PREPROCESSING + # 257 x 199 x 1 + y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop, + return_decibel_spectrogram=True)(x_a) + # CONV BLOCK 1 + n_filter_a_1 = 64 + filt_size_a_1 = (3, 3) + pool_size_a_1 = (2,2) + y_a= Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + activation='relu')(y_a) + y_a= BatchNormalization()(y_a) + y_a= Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + activation='relu')(y_a) + y_a= BatchNormalization()(y_a) + y_a= MaxPooling2D(pool_size=pool_size_a_1, strides=2, 
padding='same')(y_a) + + # CONV BLOCK 2 + n_filter_a_2 = 128 + filt_size_a_2 = (3, 3) + pool_size_a_2 = (2,2) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2, padding='same')(y_a) + + # CONV BLOCK 3 + n_filter_a_3 = 256 + filt_size_a_3 = (3, 3) + pool_size_a_3 = (2,2) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2, padding='same')(y_a) + + # CONV BLOCK 4 + n_filter_a_4 = 512 + filt_size_a_4 = (3, 3) + pool_size_a_4 = (32, 24) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_4, strides=2, padding='same')(y_a) + + y_a = Flatten()(y_a) + + + + # Merge the subnetworks + y = Concatenate()([y_i, y_a]) + y = Dense(128, activation='relu')(y) + y = Dense(2, activation='softmax')(y) + m = Model(inputs=[x_i, x_a], outputs=y) + return m, [x_i, x_a], y MODELS = {'cnn_L3_orig': construct_cnn_L3_orig} \ No newline at end of file diff --git a/l3embedding/train.py b/l3embedding/train.py index 40fc385..49e4060 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,7 +1,12 @@ - +from .model import construct_cnn_L3_orig +import json +import os +import pickle import pescador import pandas as pd from tqdm import tqdm +import keras +from keras.optimizers import Adam def sampler(filename, file_list): @@ -52,11 +57,113 @@ def data_generator(csv_file, batch_size=64): return pescador.BufferedStreamer(mux, batch_size) -def train(csv_file, batch_size=64, rate=16, seed=20171011): +class LossHistory(keras.callbacks.Callback): + + def __init__(self, outfile): + super().__init__() + self.outfile = outfile + + def on_train_begin(self, logs={}): + self.loss = [] + self.val_loss = [] + + # def on_batch_end(self, batch, logs={}): + def on_epoch_end(self, epoch, logs={}): + self.loss.append(logs.get('loss')) + self.val_loss.append(logs.get('val_loss')) + + loss_dict = {'loss': self.loss, 'val_loss': self.val_loss} + with open(self.outfile, 'wb') as fp: + pickle.dump(loss_dict, fp) + + +def train(csv_file, model_id, output_dir, epochs=150, epoch_size=512, + batch_size=64, validation_size=1024, rate=16, + seed=20171011, verbose=False): + m, inputs, outputs = construct_cnn_L3_orig() + loss = 'binary_crossentropy' + metrics = ['accuracy'] + #monitor = 'val_loss' + + # Make sure the directories we need exist + model_dir = os.path.join(output_dir, model_id) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + if not os.path.isdir(model_dir): + os.makedirs(model_dir) + + print('Compile model...') + m.compile(Adam(), + loss=loss, + metrics=metrics) + + # Save the model + model_spec_path = os.path.join(model_dir, 'model_spec.pkl') + model_spec = keras.utils.serialize_keras_object(m) + with open(model_spec_path, 'wb') as fd: + pickle.dump(model_spec, fd) + model_json_path = os.path.join(model_dir, 'model.json') + model_json = m.to_json() + with open(model_json_path, 'w') as fd: + 
json.dump(model_json, fd, indent=2) + + weight_path = os.path.join(model_dir, 'model.h5') + + cb = [] + cb.append(keras.callbacks.ModelCheckpoint(weight_path, + save_best_only=True, + verbose=1,)) + #monitor=monitor)) + + history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') + cb.append(LossHistory(history_checkpoint)) + + history_csvlog = os.path.join(model_dir, 'history_csvlog.csv') + cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True, + separator=',')) + train_gen = data_generator( csv_file, batch_size=batch_size, - lam=rate, - revive=True, - random_state=seed) \ No newline at end of file + random_state=seed).tuples('video', 'audio', 'label') + + train_gen = pescador.maps.keras_tuples(train_gen, + ['video', 'audio'], + 'label') + + # Fit the model + print('Fit model...') + if verbose: + verbosity = 1 + else: + verbosity = 2 + history = m.fit_generator(train_gen, epoch_size, epochs, + # validation_data=gen_val, + # validation_steps=validation_size, + callbacks=cb, + verbose=verbosity) + + print('Done training. Saving results to disk...') + # Save history + with open(os.path.join(model_dir, 'history.pkl'), 'wb') as fd: + pickle.dump(history.history, fd) + + # Evaluate model + print('Evaluate model...') + # Load best params + m.load_weights(weight_path) + with open(os.path.join(output_dir, 'index_test.json'), 'r') as fp: + test_idx = json.load(fp)['id'] + + # Compute eval scores + #results = score_model(output_dir, pump, model, test_idx, working, + # strong_label_file, duration, modelid, + # use_orig_duration=True) + + # Save results to disk + #results_file = os.path.join(model_dir, 'results.json') + #with open(results_file, 'w') as fp: + # json.dump(results, fp, indent=2) + + #print('Done!') \ No newline at end of file From 044c0f02004dd16be4edf14fa6768a55fbfd362e Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sat, 21 Oct 2017 15:04:02 -0400 Subject: [PATCH 007/201] sample any 1 second contains video, move open files out --- l3embedding/train.py | 45 +++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 76fac58..3b4085a 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,7 +1,7 @@ import glob +import random import pescador -import random import skvideo.io import soundfile as sf from tqdm import tqdm @@ -40,9 +40,9 @@ def video_to_audio(video_file): return '/'.join(path + ['audio', name]) -def sample_one_second(audio_file, start=None): +def sample_one_second(audio_file, start, label): """Return one second audio samples randomly if start is not specified, - otherwise, return one second audio samples starting with start (seconds). + otherwise, return one second audio samples including start (seconds). 
Args: audio_file: audio_file to sample from @@ -54,11 +54,11 @@ def sample_one_second(audio_file, start=None): """ audio_data, sampling_frequency = sf.read(audio_file) - if not start: - start = random.randrange(len(audio_data) - sampling_frequency) + if label: + start = min(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) else: - start = int(start * sampling_frequency) - return audio_data[start:start+sampling_frequency] + start = random.randrange(len(audio_data) - sampling_frequency) + return audio_data[start:start+sampling_frequency], start / sampling_frequency def sample_one_frame(video_data, fps=30): @@ -93,21 +93,26 @@ def sampler(video_file, audio_files): """ video_data = skvideo.io.vread(video_file) + audio_file = video_to_audio(video_file) + + if random.random() < 0.5: + audio_file = random.choice([af for af in audio_files if af != audio_file]) + label = 0 + else: + label = 1 + while True: - sample_video_data, start = sample_one_frame(video_data) - - audio_file = video_to_audio(video_file) - if random.random() < 0.5: - audio_file = random.choice([af for af in audio_files if af != audio_file]) - sample_audio_data = sample_one_second(audio_file) - label = 0 - else: - sample_audio_data = sample_one_second(audio_file, start) - label = 1 + sample_video_data, video_start = sample_one_frame(video_data) + sample_audio_data, audio_start = sample_one_second(audio_file, video_start, label) + yield { 'video': sample_video_data, 'audio': sample_audio_data[:,0], - 'label': label + 'label': label, + 'audio_file': audio_file, + 'video_file': video_file, + 'audio_start': audio_start, + 'video_start': video_start } @@ -124,9 +129,11 @@ def data_generator(data_dir, k=32, batch_size=64): """ + random.seed(20171021) + audio_files, video_files = get_file_list(data_dir) seeds = [] - for video_file in tqdm(video_files): + for video_file in tqdm(random.sample(video_files, k)): seeds.append(pescador.Streamer(sampler, video_file, audio_files)) mux = pescador.Mux(seeds, k) From 5b76885df66651f1cb914d1dfd31f9a89eb1a4fe Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 1 Nov 2017 09:31:18 -0400 Subject: [PATCH 008/201] open audio file outside --- l3embedding/train.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 3b4085a..f4e63f9 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -40,7 +40,7 @@ def video_to_audio(video_file): return '/'.join(path + ['audio', name]) -def sample_one_second(audio_file, start, label): +def sample_one_second(audio_data, sampling_frequency, start, label): """Return one second audio samples randomly if start is not specified, otherwise, return one second audio samples including start (seconds). 
@@ -52,8 +52,6 @@ def sample_one_second(audio_file, start, label): One second samples """ - - audio_data, sampling_frequency = sf.read(audio_file) if label: start = min(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) else: @@ -101,9 +99,11 @@ def sampler(video_file, audio_files): else: label = 1 + audio_data, sampling_frequency = sf.read(audio_file) + while True: sample_video_data, video_start = sample_one_frame(video_data) - sample_audio_data, audio_start = sample_one_second(audio_file, video_start, label) + sample_audio_data, audio_start = sample_one_second(audio_data, sampling_frequency, video_start, label) yield { 'video': sample_video_data, @@ -116,7 +116,7 @@ def sampler(video_file, audio_files): } -def data_generator(data_dir, k=32, batch_size=64): +def data_generator(data_dir, k=32, batch_size=64, random_state=20171021): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -129,7 +129,7 @@ def data_generator(data_dir, k=32, batch_size=64): """ - random.seed(20171021) + random.seed(random_state) audio_files, video_files = get_file_list(data_dir) seeds = [] From 8962b39d9a295cc06548a6c30b3f9f845a669724 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 1 Nov 2017 13:08:52 -0400 Subject: [PATCH 009/201] Add resize function --- l3embedding/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index f4e63f9..c9b0dd4 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -2,6 +2,7 @@ import random import pescador +from scipy.misc import imresize import skvideo.io import soundfile as sf from tqdm import tqdm @@ -73,7 +74,7 @@ def sample_one_frame(video_data, fps=30): num_frames = video_data.shape[0] frame = random.randrange(num_frames - fps) - return video_data[frame, :, :, :], frame / fps + return imresize(video_data[frame, :, :, :], (224, 224)), frame / fps def sampler(video_file, audio_files): From c5bbd03d26c214ed605a86ca6e8458096b5a246f Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 1 Nov 2017 15:36:50 -0400 Subject: [PATCH 010/201] Change image sampling to follow paper Move script part of train.py to a different file Add retry loop to opening video files Add periodic model checkpoints to training --- l3embedding/train.py | 150 +++++++++++++------------------------------ train.py | 97 ++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 104 deletions(-) create mode 100644 train.py diff --git a/l3embedding/train.py b/l3embedding/train.py index b7c543e..61e0189 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,11 +1,10 @@ -import argparse import json import os import pickle import glob import random import pescador -from scipy.misc import imresize +import scipy.misc import skvideo.io import soundfile as sf from tqdm import tqdm @@ -68,7 +67,23 @@ def sample_one_second(audio_data, sampling_frequency, start, label): return audio_data[start:start+sampling_frequency], start / sampling_frequency -def sample_one_frame(video_data, fps=30): +def l3_frame_scaling(frame_data): + nx, ny, nc = frame_data.shape + scaling = 256.0 / min(nx, ny) + + new_nx, new_ny = int(scaling * nx), int(scaling * ny) + assert 256 in (new_nx, new_ny) + + + resized_frame_data = scipy.misc.imresize(frame_data, (new_nx, new_ny, nc)) + + start_x, start_y = random.randrange(new_nx - 224), random.randrange(new_ny - 224) + end_x, end_y = start_x + 224, start_y + 224 + + return resized_frame_data[start_x:end_x, start_y:end_y, :] + + 
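# A minimal usage sketch of the scaling/cropping scheme above, assuming numpy is
# available as np (it is not imported in this module at this point) and an arbitrary
# 256x340 example frame; the variable names below are illustrative only:
frame = np.random.randint(0, 255, (256, 340, 3)).astype(np.uint8)
crop = l3_frame_scaling(frame)  # short side is already 256, so only a random 224x224 window is taken
assert crop.shape == (224, 224, 3)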
+def sample_one_frame(video_data, fps=30, scaling_func=None): """Return one frame randomly and time (seconds). Args: @@ -79,13 +94,16 @@ def sample_one_frame(video_data, fps=30): One frame sampled randomly and time in seconds """ - + if not scaling_func: + scaling_func = l3_frame_scaling num_frames = video_data.shape[0] frame = random.randrange(num_frames - fps) - return imresize(video_data[frame, :, :, :], (224, 224)), frame / fps + frame_data = video_data[frame, :, :, :] + frame_data = scaling_func(frame_data) + return frame_data, frame / fps -def sampler(video_file, audio_files): +def sampler(video_file, audio_files, io_retries=10): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. @@ -98,8 +116,17 @@ def sampler(video_file, audio_files): and label (0: not from corresponding files, 1: from corresponding files) """ + for _ in range(io_retries): + try: + video_data = skvideo.io.vread(video_file) + break + except Exception as e: + print("Could not open {}. Retrying...".format(video_file)) + continue + else: + import pdb + pdb.set_trace() - video_data = skvideo.io.vread(video_file) audio_file = video_to_audio(video_file) if random.random() < 0.5: @@ -113,16 +140,18 @@ def sampler(video_file, audio_files): while True: sample_video_data, video_start = sample_one_frame(video_data) sample_audio_data, audio_start = sample_one_second(audio_data, sampling_frequency, video_start, label) + sample_audio_data = sample_audio_data[:,0] - yield { + sample = { 'video': sample_video_data, - 'audio': sample_audio_data[:,0], + 'audio': sample_audio_data, 'label': label, 'audio_file': audio_file, 'video_file': video_file, 'audio_start': audio_start, 'video_start': video_start } + yield sample def data_generator(data_dir, k=32, batch_size=64, random_state=20171021): @@ -179,7 +208,7 @@ def on_epoch_end(self, epoch, logs=None): #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, - random_seed=20171021, verbose=False): + random_state=20171021, verbose=False, checkpoint_interval=100): m, inputs, outputs = construct_cnn_L3_orig() loss = 'binary_crossentropy' metrics = ['accuracy'] @@ -208,6 +237,7 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, json.dump(model_json, fd, indent=2) weight_path = os.path.join(model_dir, 'model.h5') + checkpoint_weight_path = os.path.join(model_dir, 'model.{epoch:02d}.h5') cb = [] cb.append(keras.callbacks.ModelCheckpoint(weight_path, @@ -215,6 +245,10 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, verbose=1,)) #monitor=monitor)) + cb.append(keras.callbacks.ModelCheckpoint(checkpoint_weight_path, + #monitor=monitor, + period=checkpoint_interval)) + history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') cb.append(LossHistory(history_checkpoint)) @@ -223,11 +257,12 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, separator=',')) + print('Setting up data generator...') train_gen = data_generator( #train_csv_path, train_data_dir, batch_size=batch_size, - random_seed=random_seed, + random_state=random_state, k=num_streamers) train_gen = pescador.maps.keras_tuples(train_gen, @@ -269,96 +304,3 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, # 
json.dump(results, fp, indent=2) print('Done!') - - -def parse_arguments(): - """ - Parse arguments from the command line - - - Returns: - args: Argument dictionary - (Type: dict[str, *]) - """ - parser = argparse.ArgumentParser(description='Train an L3-like audio-visual correspondence model') - - parser.add_argument('-e', - '--num-epochs', - dest='num_epochs', - action='store', - type=int, - default=150, - help='Maximum number of training epochs') - - parser.add_argument('-es', - '--epoch-size', - dest='epoch_size', - action='store', - type=int, - default=512, - help='Number of training batches per epoch') - - parser.add_argument('-b', - '--batch-size', - dest='batch_size', - action='store', - type=int, - default=64, - help='Number of training examples per batch') - - parser.add_argument('-v', - '--validation-size', - dest='validation_size', - action='store', - type=int, - default=1024, - help='Number of trianing examples in the validation set') - - parser.add_argument('-s', - '--num-streamers', - dest='num_streamers', - action='store', - type=int, - default=32, - help='Number of pescador streamers that can be open concurrently') - - parser.add_argument('-r', - '--random-seed', - dest='random_seed', - action='store', - type=int, - default=20171021, - help='Random seed used to set the RNG state') - - parser.add_argument('-v', - '--verbose', - dest='verbose', - action='store_true', - default=False, - help='If True, print detailed messages') - - """ - parser.add_argument('train_csv_path', - action='store', - type=str, - help='Path to training csv file') - """ - parser.add_argument('train_data_dir', - action='store', - type=str, - help='Path to directory where training subset files are stored') - - parser.add_argument('model_id', - action='store', - type=str, - help='Identifier for this model') - - parser.add_argument('output_dir', - action='store', - type=str, - help='Path to directory where output files will be stored') - - - return vars(parser.parse_args()) - - diff --git a/train.py b/train.py new file mode 100644 index 0000000..e0a4447 --- /dev/null +++ b/train.py @@ -0,0 +1,97 @@ +import argparse +from l3embedding.train import * + + +def parse_arguments(): + """ + Parse arguments from the command line + + + Returns: + args: Argument dictionary + (Type: dict[str, *]) + """ + parser = argparse.ArgumentParser(description='Train an L3-like audio-visual correspondence model') + + parser.add_argument('-e', + '--num-epochs', + dest='num_epochs', + action='store', + type=int, + default=150, + help='Maximum number of training epochs') + + parser.add_argument('-es', + '--epoch-size', + dest='epoch_size', + action='store', + type=int, + default=512, + help='Number of training batches per epoch') + + parser.add_argument('-bs', + '--batch-size', + dest='batch_size', + action='store', + type=int, + default=64, + help='Number of training examples per batch') + + parser.add_argument('-vs', + '--validation-size', + dest='validation_size', + action='store', + type=int, + default=1024, + help='Number of trianing examples in the validation set') + + parser.add_argument('-s', + '--num-streamers', + dest='num_streamers', + action='store', + type=int, + default=32, + help='Number of pescador streamers that can be open concurrently') + + parser.add_argument('-r', + '--random-state', + dest='random_state', + action='store', + type=int, + default=20171021, + help='Random seed used to set the RNG state') + + parser.add_argument('-v', + '--verbose', + dest='verbose', + action='store_true', + default=False, + 
help='If True, print detailed messages') + + """ + parser.add_argument('train_csv_path', + action='store', + type=str, + help='Path to training csv file') + """ + parser.add_argument('train_data_dir', + action='store', + type=str, + help='Path to directory where training subset files are stored') + + parser.add_argument('model_id', + action='store', + type=str, + help='Identifier for this model') + + parser.add_argument('output_dir', + action='store', + type=str, + help='Path to directory where output files will be stored') + + + return vars(parser.parse_args()) + + +if __name__ == '__main__': + train(**(parse_arguments())) From 7b9c518ea4b60d78bfaff5317b0edfce00b84d6f Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Thu, 2 Nov 2017 12:20:50 -0400 Subject: [PATCH 011/201] s/min/max --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 61e0189..b87c1f3 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -61,7 +61,7 @@ def sample_one_second(audio_data, sampling_frequency, start, label): """ if label: - start = min(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) + start = max(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) else: start = random.randrange(len(audio_data) - sampling_frequency) return audio_data[start:start+sampling_frequency], start / sampling_frequency From 38fcfa42e8925e2fb1241113462f1b3da2e50463 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Thu, 2 Nov 2017 12:31:20 -0400 Subject: [PATCH 012/201] Custom read_video function --- l3embedding/train.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index b87c1f3..6aa8050 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,15 +1,18 @@ +import glob import json import os import pickle -import glob import random + +import keras +from keras.optimizers import Adam +import numpy as np import pescador import scipy.misc -import skvideo.io +from skvideo.io.ffmpeg import FFmpegReader import soundfile as sf from tqdm import tqdm -import keras -from keras.optimizers import Adam + from .model import construct_cnn_L3_orig @@ -103,7 +106,21 @@ def sample_one_frame(video_data, fps=30, scaling_func=None): return frame_data, frame / fps -def sampler(video_file, audio_files, io_retries=10): +def read_video(video_file): + reader = FFmpegReader(video_file) + T, M, N, C = reader.getShape() + videodata = np.zeros((T, M, N, C), dtype=np.uint8) + + length = M * N * C + for idx in range(T): + arr = np.fromstring(reader._proc.stdout.read(length), dtype=np.uint8) + if len(arr) != length: + break + videodata[idx, :, :, :] = arr.reshape((M, N, C)) + return videodata + + +def sampler(video_file, audio_files): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. @@ -116,17 +133,8 @@ def sampler(video_file, audio_files, io_retries=10): and label (0: not from corresponding files, 1: from corresponding files) """ - for _ in range(io_retries): - try: - video_data = skvideo.io.vread(video_file) - break - except Exception as e: - print("Could not open {}. 
Retrying...".format(video_file)) - continue - else: - import pdb - pdb.set_trace() + video_data = read_video(video_file) audio_file = video_to_audio(video_file) if random.random() < 0.5: From f43f01fd6bbd223101ef6939a1e5a50fa4023380 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sun, 5 Nov 2017 17:01:18 -0500 Subject: [PATCH 013/201] fix some configuration so the model can train --- l3embedding/model.py | 52 +++++++++++++++++++++----------------------- l3embedding/train.py | 22 ++++--------------- 2 files changed, 29 insertions(+), 45 deletions(-) diff --git a/l3embedding/model.py b/l3embedding/model.py index 140a6ff..1408376 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -24,7 +24,7 @@ def construct_cnn_L3_orig(): # CONV BLOCK 1 n_filter_i_1 = 64 filt_size_i_1 = (3, 3) - pool_size_i_1 = (2,2) + pool_size_i_1 = (2, 2) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', activation='relu')(x_i) y_i = BatchNormalization()(y_i) @@ -36,7 +36,7 @@ def construct_cnn_L3_orig(): # CONV BLOCK 2 n_filter_i_2 = 128 filt_size_i_2 = (3, 3) - pool_size_i_2 = (2,2) + pool_size_i_2 = (2, 2) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', activation='relu')(y_i) y_i = BatchNormalization()(y_i) @@ -48,7 +48,7 @@ def construct_cnn_L3_orig(): # CONV BLOCK 3 n_filter_i_3 = 256 filt_size_i_3 = (3, 3) - pool_size_i_3 = (2,2) + pool_size_i_3 = (2, 2) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', activation='relu')(y_i) y_i = BatchNormalization()(y_i) @@ -67,7 +67,7 @@ def construct_cnn_L3_orig(): y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', activation='relu')(y_i) y_i = BatchNormalization()(y_i) - y_i = MaxPooling2D(pool_size=pool_size_i_4, strides=2, padding='same')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_4, padding='same')(y_i) y_i = Flatten()(y_i) @@ -75,7 +75,7 @@ def construct_cnn_L3_orig(): # Audio subnetwork #### n_dft = 512 - n_hop = 16 + n_hop = 242 asr = 48000 audio_window_dur = 1 # INPUT @@ -88,55 +88,53 @@ def construct_cnn_L3_orig(): # CONV BLOCK 1 n_filter_a_1 = 64 filt_size_a_1 = (3, 3) - pool_size_a_1 = (2,2) - y_a= Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - activation='relu')(y_a) - y_a= BatchNormalization()(y_a) - y_a= Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - activation='relu')(y_a) - y_a= BatchNormalization()(y_a) - y_a= MaxPooling2D(pool_size=pool_size_a_1, strides=2, padding='same')(y_a) + pool_size_a_1 = (2, 2) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + activation='relu')(y_a) + y_a = BatchNormalization()(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) # CONV BLOCK 2 n_filter_a_2 = 128 filt_size_a_2 = (3, 3) - pool_size_a_2 = (2,2) + pool_size_a_2 = (2, 2) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - activation='relu')(y_a) + activation='relu')(y_a) y_a = BatchNormalization()(y_a) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - activation='relu')(y_a) + activation='relu')(y_a) y_a = BatchNormalization()(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2, padding='same')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) # CONV BLOCK 3 n_filter_a_3 = 256 filt_size_a_3 = (3, 3) - pool_size_a_3 = (2,2) + pool_size_a_3 = (2, 2) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - activation='relu')(y_a) + activation='relu')(y_a) y_a = BatchNormalization()(y_a) y_a = Conv2D(n_filter_a_3, 
filt_size_a_3, padding='same', - activation='relu')(y_a) + activation='relu')(y_a) y_a = BatchNormalization()(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2, padding='same')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) # CONV BLOCK 4 n_filter_a_4 = 512 filt_size_a_4 = (3, 3) pool_size_a_4 = (32, 24) y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - activation='relu')(y_a) + activation='relu')(y_a) y_a = BatchNormalization()(y_a) y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - activation='relu')(y_a) + activation='relu')(y_a) y_a = BatchNormalization()(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_4, strides=2, padding='same')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) y_a = Flatten()(y_a) - - # Merge the subnetworks y = Concatenate()([y_i, y_a]) y = Dense(128, activation='relu')(y) @@ -146,4 +144,4 @@ def construct_cnn_L3_orig(): return m, [x_i, x_a], y -MODELS = {'cnn_L3_orig': construct_cnn_L3_orig} \ No newline at end of file +MODELS = {'cnn_L3_orig': construct_cnn_L3_orig} diff --git a/l3embedding/train.py b/l3embedding/train.py index 6aa8050..efbd746 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -9,7 +9,7 @@ import numpy as np import pescador import scipy.misc -from skvideo.io.ffmpeg import FFmpegReader +from skvideo.io import vread import soundfile as sf from tqdm import tqdm @@ -106,20 +106,6 @@ def sample_one_frame(video_data, fps=30, scaling_func=None): return frame_data, frame / fps -def read_video(video_file): - reader = FFmpegReader(video_file) - T, M, N, C = reader.getShape() - videodata = np.zeros((T, M, N, C), dtype=np.uint8) - - length = M * N * C - for idx in range(T): - arr = np.fromstring(reader._proc.stdout.read(length), dtype=np.uint8) - if len(arr) != length: - break - videodata[idx, :, :, :] = arr.reshape((M, N, C)) - return videodata - - def sampler(video_file, audio_files): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. 
@@ -134,7 +120,7 @@ def sampler(video_file, audio_files): """ - video_data = read_video(video_file) + video_data = vread(video_file) audio_file = video_to_audio(video_file) if random.random() < 0.5: @@ -148,12 +134,12 @@ def sampler(video_file, audio_files): while True: sample_video_data, video_start = sample_one_frame(video_data) sample_audio_data, audio_start = sample_one_second(audio_data, sampling_frequency, video_start, label) - sample_audio_data = sample_audio_data[:,0] + sample_audio_data = sample_audio_data[:, 0].reshape((1, len(sample_audio_data))) sample = { 'video': sample_video_data, 'audio': sample_audio_data, - 'label': label, + 'label': np.array([label, 1 - label]), 'audio_file': audio_file, 'video_file': video_file, 'audio_start': audio_start, From 0c3e38b1dbeb483cf1a6e7eb8595d8f5adb0cf7d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 6 Nov 2017 01:15:33 -0500 Subject: [PATCH 014/201] Update Spectrogram parameters to reflect changes in kapre fork --- l3embedding/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/l3embedding/model.py b/l3embedding/model.py index 1408376..649a9af 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -75,7 +75,8 @@ def construct_cnn_L3_orig(): # Audio subnetwork #### n_dft = 512 - n_hop = 242 + n_win = 480 + n_hop = n_win//2 asr = 48000 audio_window_dur = 1 # INPUT @@ -83,8 +84,8 @@ def construct_cnn_L3_orig(): # SPECTROGRAM PREPROCESSING # 257 x 199 x 1 - y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop, - return_decibel_spectrogram=True)(x_a) + y_a = Spectrogram(n_dft=n_dft, n_win=n_win, n_hop=n_hop, + return_decibel_spectrogram=True, padding='valid')(x_a) # CONV BLOCK 1 n_filter_a_1 = 64 filt_size_a_1 = (3, 3) From d87bd436a5d6ca696fde465d17290a162123b011 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 7 Nov 2017 11:49:51 -0500 Subject: [PATCH 015/201] Add data augmentation from paper --- l3embedding/image.py | 14 +++++++++ l3embedding/train.py | 71 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 10 deletions(-) create mode 100644 l3embedding/image.py diff --git a/l3embedding/image.py b/l3embedding/image.py new file mode 100644 index 0000000..b638171 --- /dev/null +++ b/l3embedding/image.py @@ -0,0 +1,14 @@ +import numpy as np +import skimage.color + +def adjust_saturation(rgb_img, factor): + hsv_img = skimage.color.rgb2hsv(rgb_img) + hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, 0.0, 1.0) + return skimage.color.hsv2rgb(hsv_img) + + +def adjust_brightness(rgb_img, delta): + return np.clip(rgb_img - delta, 0.0, 1.0) + +def horiz_flip(rgb_img): + return rgb_img[:,::-1,:] diff --git a/l3embedding/train.py b/l3embedding/train.py index efbd746..12d187d 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -14,6 +14,7 @@ from tqdm import tqdm from .model import construct_cnn_L3_orig +from .image import * #TODO: Consider putting the sampling functionality into another file @@ -51,7 +52,7 @@ def video_to_audio(video_file): return '/'.join(path + ['audio', name]) -def sample_one_second(audio_data, sampling_frequency, start, label): +def sample_one_second(audio_data, sampling_frequency, start, label, augment=False): """Return one second audio samples randomly if start is not specified, otherwise, return one second audio samples including start (seconds). 
@@ -63,11 +64,22 @@ def sample_one_second(audio_data, sampling_frequency, start, label): One second samples """ + sampling_frequency = int(sampling_frequency) if label: start = max(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) else: start = random.randrange(len(audio_data) - sampling_frequency) - return audio_data[start:start+sampling_frequency], start / sampling_frequency + + audio_data = audio_data[start:start+sampling_frequency] + if augment: + # Make sure we don't clip + gain = 1 + random.random()*min(0.1, 1.0/np.abs(audio_data).max() - 1) + audio_data *= gain + audio_aug_params = {'gain': gain} + else: + audio_aug_params = {} + + return audio_data, start / sampling_frequency, audio_aug_params def l3_frame_scaling(frame_data): @@ -83,10 +95,17 @@ def l3_frame_scaling(frame_data): start_x, start_y = random.randrange(new_nx - 224), random.randrange(new_ny - 224) end_x, end_y = start_x + 224, start_y + 224 - return resized_frame_data[start_x:end_x, start_y:end_y, :] + bbox = { + 'start_x': start_x, + 'start_y': start_y, + 'end_x': end_x, + 'end_y': end_y + } + return resized_frame_data[start_x:end_x, start_y:end_y, :], bbox -def sample_one_frame(video_data, fps=30, scaling_func=None): + +def sample_one_frame(video_data, fps=30, scaling_func=None, augment=False): """Return one frame randomly and time (seconds). Args: @@ -102,11 +121,38 @@ def sample_one_frame(video_data, fps=30, scaling_func=None): num_frames = video_data.shape[0] frame = random.randrange(num_frames - fps) frame_data = video_data[frame, :, :, :] - frame_data = scaling_func(frame_data) - return frame_data, frame / fps + frame_data, bbox = scaling_func(frame_data) + + video_aug_params = {'bounding_box': bbox} + + if augment: + # Randomly horizontally flip the image + horizontal_flip = False + if random.random() < 0.5: + frame_data = horiz_flip(frame_data) + horizontal_flip = True + + # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py + # Add saturation jitter + saturation_factor = random.random() + 0.5 + frame_data = adjust_saturation(frame_data, saturation_factor) + + # Add brightness jitter + max_delta = 32. / 255. + brightness_delta = (2*random.random() - 1) * max_delta + frame_data = adjust_brightness(frame_data, brightness_delta) + + video_aug_params.update({ + 'horizontal_flip': horizontal_flip, + 'saturation_factor': saturation_factor, + 'brightness_delta': brightness_delta + }) + + + return frame_data, frame / fps, video_aug_params -def sampler(video_file, audio_files): +def sampler(video_file, audio_files, augment=False): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. 
@@ -132,8 +178,11 @@ def sampler(video_file, audio_files): audio_data, sampling_frequency = sf.read(audio_file) while True: - sample_video_data, video_start = sample_one_frame(video_data) - sample_audio_data, audio_start = sample_one_second(audio_data, sampling_frequency, video_start, label) + sample_video_data, video_start, video_aug_params \ + = sample_one_frame(video_data, augment=augment) + sample_audio_data, audio_start, audio_aug_params \ + = sample_one_second(audio_data, sampling_frequency, video_start, + label, augment=augment) sample_audio_data = sample_audio_data[:, 0].reshape((1, len(sample_audio_data))) sample = { @@ -143,7 +192,9 @@ def sampler(video_file, audio_files): 'audio_file': audio_file, 'video_file': video_file, 'audio_start': audio_start, - 'video_start': video_start + 'video_start': video_start, + 'audio_augment_params': audio_aug_params, + 'video_augment_params': video_aug_params } yield sample From cfd750d9749a0840ffbd087818a8a7be590ff2f2 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 7 Nov 2017 12:51:06 -0500 Subject: [PATCH 016/201] Add docstrings and fix brightness adjustment --- l3embedding/image.py | 36 +++++++++++++++++++++++++++++++++++- l3embedding/train.py | 20 ++++++++++++++++++-- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/l3embedding/image.py b/l3embedding/image.py index b638171..f14486e 100644 --- a/l3embedding/image.py +++ b/l3embedding/image.py @@ -1,14 +1,48 @@ import numpy as np +import skimage import skimage.color def adjust_saturation(rgb_img, factor): + """ + Adjust the saturation of an RGB image + + Args: + rgb_img: RGB image data array + factor: Multiplicative scaling factor to be applied to saturation + + Returns: + adjusted_img: RGB image with adjusted saturation + """ hsv_img = skimage.color.rgb2hsv(rgb_img) hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, 0.0, 1.0) return skimage.color.hsv2rgb(hsv_img) def adjust_brightness(rgb_img, delta): - return np.clip(rgb_img - delta, 0.0, 1.0) + """ + Adjust the brightness of an RGB image + + Args: + rgb_img: RGB image data array + delta: Additive (normalized) gain factor applied to each pixel + + Returns: + adjusted_img: RGB image with adjusted saturation + """ + imin, imax = skimage.dtype_limits(rgb_img) + # Convert delta into the range of the image data + delta = rgb_img.dtype.type((imax - imin) * delta) + + return np.clip(rgb_img + delta, imin, imax) def horiz_flip(rgb_img): + """ + Horizontally flip the given image + + Args: + rgb_img: RGB image data array + + Returns: + flipped_img: Horizontally flipped image + """ return rgb_img[:,::-1,:] diff --git a/l3embedding/train.py b/l3embedding/train.py index 12d187d..d614433 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -30,9 +30,14 @@ def get_file_list(data_dir): video_files: list of video files """ + data_dir_contents = set(os.listdir(data_dir)) + if 'audio' in data_dir_contents and 'video' in data_dir_contents: + audio_files = glob.glob('{}/audio/*'.format(data_dir)) + video_files = glob.glob('{}/video/*'.format(data_dir)) + else: + audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) + video_files = glob.glob('{}/**/video/*'.format(data_dir)) - audio_files = glob.glob('{}/audio/*'.format(data_dir)) - video_files = glob.glob('{}/video/*'.format(data_dir)) return audio_files, video_files @@ -83,6 +88,17 @@ def sample_one_second(audio_data, sampling_frequency, start, label, augment=Fals def l3_frame_scaling(frame_data): + """ + Scale and crop an video frame, using the method from Look, Listen and 
Learn + + + Args: + frame_data: video frame data array + + Returns: + scaled_frame_data: scaled and cropped frame data + bbox: bounding box for the cropped image + """ nx, ny, nc = frame_data.shape scaling = 256.0 / min(nx, ny) From db1315bd320a6699b02eab171055a4d28639d386 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 7 Nov 2017 14:10:09 -0500 Subject: [PATCH 017/201] Add augmentation option to training functions and add use of validation set --- l3embedding/train.py | 30 ++++++++++++++++++++++-------- train.py | 14 +++++++++++++- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index d614433..0d02354 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -215,7 +215,7 @@ def sampler(video_file, audio_files, augment=False): yield sample -def data_generator(data_dir, k=32, batch_size=64, random_state=20171021): +def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, augment=False): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -233,7 +233,7 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021): audio_files, video_files = get_file_list(data_dir) seeds = [] for video_file in tqdm(random.sample(video_files, k)): - seeds.append(pescador.Streamer(sampler, video_file, audio_files)) + seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) mux = pescador.Mux(seeds, k) if batch_size == 1: @@ -267,9 +267,9 @@ def on_epoch_end(self, epoch, logs=None): #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, -def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, +def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, - random_state=20171021, verbose=False, checkpoint_interval=100): + random_state=20171021, verbose=False, checkpoint_interval=100, augment=False): m, inputs, outputs = construct_cnn_L3_orig() loss = 'binary_crossentropy' metrics = ['accuracy'] @@ -318,18 +318,32 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, separator=',')) - print('Setting up data generator...') + print('Setting up train data generator...') train_gen = data_generator( #train_csv_path, train_data_dir, batch_size=batch_size, random_state=random_state, - k=num_streamers) + k=num_streamers, + augment=augment) train_gen = pescador.maps.keras_tuples(train_gen, ['video', 'audio'], 'label') + print('Setting up validation data generator...') + val_gen = data_generator( + validation_data_dir, + batch_size=batch_size, + random_state=random_state, + k=num_streamers) + + val_gen = pescador.maps.keras_tuples(val_gen, + ['video', 'audio'], + 'label') + + + # Fit the model print('Fit model...') if verbose: @@ -337,8 +351,8 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, else: verbosity = 2 history = m.fit_generator(train_gen, epoch_size, num_epochs, - # validation_data=gen_val, - # validation_steps=validation_size, + validation_data=val_gen, + validation_steps=validation_size, callbacks=cb, verbose=verbosity) diff --git a/train.py b/train.py index e0a4447..0e329f4 100644 --- a/train.py +++ b/train.py @@ -61,6 +61,13 @@ def parse_arguments(): default=20171021, help='Random seed used to set the RNG state') + parser.add_argument('-a', + '--augment', + dest='augment', + action='store_true', + default=False, + help='If True, 
performs data augmentation on audio and images') + parser.add_argument('-v', '--verbose', dest='verbose', @@ -77,7 +84,12 @@ def parse_arguments(): parser.add_argument('train_data_dir', action='store', type=str, - help='Path to directory where training subset files are stored') + help='Path to directory where training set files are stored') + + parser.add_argument('validation_data_dir', + action='store', + type=str, + help='Path to directory where validation set files are stored') parser.add_argument('model_id', action='store', From 2908f0727dcad395fa03dda5234f5a0b744adc37 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 7 Nov 2017 14:14:34 -0500 Subject: [PATCH 018/201] Randomize order of saturation and brightness jittering --- l3embedding/train.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 0d02354..e9bcc72 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -149,14 +149,26 @@ def sample_one_frame(video_data, fps=30, scaling_func=None, augment=False): horizontal_flip = True # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py - # Add saturation jitter - saturation_factor = random.random() + 0.5 - frame_data = adjust_saturation(frame_data, saturation_factor) - - # Add brightness jitter - max_delta = 32. / 255. - brightness_delta = (2*random.random() - 1) * max_delta - frame_data = adjust_brightness(frame_data, brightness_delta) + + # Randomize the order of saturation jitter and brightness jitter + if random.random() < 0.5: + # Add saturation jitter + saturation_factor = random.random() + 0.5 + frame_data = adjust_saturation(frame_data, saturation_factor) + + # Add brightness jitter + max_delta = 32. / 255. + brightness_delta = (2*random.random() - 1) * max_delta + frame_data = adjust_brightness(frame_data, brightness_delta) + else: + # Add brightness jitter + max_delta = 32. / 255. 
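+            # delta is drawn uniformly from [-max_delta, max_delta], mirroring the
+            # brightness jitter range in the Inception preprocessing linked above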
+ brightness_delta = (2*random.random() - 1) * max_delta + frame_data = adjust_brightness(frame_data, brightness_delta) + + # Add saturation jitter + saturation_factor = random.random() + 0.5 + frame_data = adjust_saturation(frame_data, saturation_factor) video_aug_params.update({ 'horizontal_flip': horizontal_flip, From a2aaf0bddf90aeb67c8d7b6aa653b365f44966a5 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 7 Nov 2017 23:24:19 -0500 Subject: [PATCH 019/201] Fix rescaling bug --- l3embedding/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index e9bcc72..57204b3 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -3,6 +3,7 @@ import os import pickle import random +import math import keras from keras.optimizers import Adam @@ -102,8 +103,8 @@ def l3_frame_scaling(frame_data): nx, ny, nc = frame_data.shape scaling = 256.0 / min(nx, ny) - new_nx, new_ny = int(scaling * nx), int(scaling * ny) - assert 256 in (new_nx, new_ny) + new_nx, new_ny = math.ceil(scaling * nx), math.ceil(scaling * ny) + assert 256 in (new_nx, new_ny), str((new_nx, new_ny)) resized_frame_data = scipy.misc.imresize(frame_data, (new_nx, new_ny, nc)) From b68eae552454724bf3e42473b1070973421aebc3 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 8 Nov 2017 12:46:37 -0500 Subject: [PATCH 020/201] Add more changes to the model so that it is consistent with paper * Move ReLU _after_ batch norm * Use weight decay (assumed L2) * Set learning rate to 1e-4 * Allow for volume gain less than 0dB, as paper seems like it should allow for that --- l3embedding/model.py | 57 +++++++++++++++++++++++++++++--------------- l3embedding/train.py | 12 ++++++---- train.py | 8 +++++++ 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/l3embedding/model.py b/l3embedding/model.py index 649a9af..37618d0 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -1,6 +1,7 @@ from keras.models import Model from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D,\ - Flatten, Concatenate, Dense + Flatten, Concatenate, Dense, Activation +import keras.regularizers as regularizers from kapre.time_frequency import Spectrogram @@ -15,6 +16,8 @@ def construct_cnn_L3_orig(): model: L3 CNN model (Type: keras.models.Model) """ + weight_decay = 1e-5 + l2_weight_decay = regularizers.l2(weight_decay) #### # Image subnetwork #### @@ -26,10 +29,12 @@ def construct_cnn_L3_orig(): filt_size_i_1 = (3, 3) pool_size_i_1 = (2, 2) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', - activation='relu')(x_i) + kernel_regularizer=l2_weight_decay)(x_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) + y_i = Activation('relu')(y_i) y_i = BatchNormalization()(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i) @@ -38,11 +43,13 @@ def construct_cnn_L3_orig(): filt_size_i_2 = (3, 3) pool_size_i_2 = (2, 2) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i) # CONV BLOCK 3 @@ 
-50,11 +57,13 @@ def construct_cnn_L3_orig(): filt_size_i_3 = (3, 3) pool_size_i_3 = (2, 2) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i) # CONV BLOCK 4 @@ -62,11 +71,13 @@ def construct_cnn_L3_orig(): filt_size_i_4 = (3, 3) pool_size_i_4 = (28, 28) y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', - activation='relu')(y_i) + kernel_regularizer=l2_weight_decay)(y_i) y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_4, padding='same')(y_i) y_i = Flatten()(y_i) @@ -91,11 +102,13 @@ def construct_cnn_L3_orig(): filt_size_a_1 = (3, 3) pool_size_a_1 = (2, 2) y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) # CONV BLOCK 2 @@ -103,11 +116,13 @@ def construct_cnn_L3_orig(): filt_size_a_2 = (3, 3) pool_size_a_2 = (2, 2) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) # CONV BLOCK 3 @@ -115,11 +130,13 @@ def construct_cnn_L3_orig(): filt_size_a_3 = (3, 3) pool_size_a_3 = (2, 2) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) # CONV BLOCK 4 @@ -127,19 +144,21 @@ def construct_cnn_L3_orig(): filt_size_a_4 = (3, 3) pool_size_a_4 = (32, 24) y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - activation='relu')(y_a) + kernel_regularizer=l2_weight_decay)(y_a) y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) y_a = Flatten()(y_a) # Merge the subnetworks y = Concatenate()([y_i, y_a]) - y = Dense(128, activation='relu')(y) - y = Dense(2, activation='softmax')(y) + y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) + y 
= Dense(2, activation='softmax', kernel_regularizer=l2_weight_decay)(y) m = Model(inputs=[x_i, x_a], outputs=y) return m, [x_i, x_a], y diff --git a/l3embedding/train.py b/l3embedding/train.py index 57204b3..ed61907 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -79,7 +79,8 @@ def sample_one_second(audio_data, sampling_frequency, start, label, augment=Fals audio_data = audio_data[start:start+sampling_frequency] if augment: # Make sure we don't clip - gain = 1 + random.random()*min(0.1, 1.0/np.abs(audio_data).max() - 1) + max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) + gain = 1 + random.uniform(-0.1, max_gain) audio_data *= gain audio_aug_params = {'gain': gain} else: @@ -280,9 +281,10 @@ def on_epoch_end(self, epoch, logs=None): #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, -def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, - batch_size=64, validation_size=1024, num_streamers=16, - random_state=20171021, verbose=False, checkpoint_interval=100, augment=False): +def train(train_data_dir, validation_data_dir, model_id, output_dir, + num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, + num_streamers=16, learning_rate=1e-4, random_state=20171021, + verbose=False, checkpoint_interval=100, augment=False): m, inputs, outputs = construct_cnn_L3_orig() loss = 'binary_crossentropy' metrics = ['accuracy'] @@ -296,7 +298,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs= os.makedirs(model_dir) print('Compile model...') - m.compile(Adam(), + m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) diff --git a/train.py b/train.py index 0e329f4..eafbe51 100644 --- a/train.py +++ b/train.py @@ -53,6 +53,14 @@ def parse_arguments(): default=32, help='Number of pescador streamers that can be open concurrently') + parser.add_argument('-lr', + '--learning-rate', + dest='learning_rate', + action='store', + type=float, + default=1e-4, + help='Optimization learning rate') + parser.add_argument('-r', '--random-state', dest='random_state', From f320653fe4474ed386f957363cd1b4a73469f7f3 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 10 Nov 2017 22:24:01 -0500 Subject: [PATCH 021/201] Split L3 network into separate audio and vision models --- l3embedding/audio_model.py | 100 ++++++++++++++++++++ l3embedding/model.py | 179 +++++++----------------------------- l3embedding/vision_model.py | 90 ++++++++++++++++++ 3 files changed, 225 insertions(+), 144 deletions(-) create mode 100644 l3embedding/audio_model.py create mode 100644 l3embedding/vision_model.py diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py new file mode 100644 index 0000000..410752d --- /dev/null +++ b/l3embedding/audio_model.py @@ -0,0 +1,100 @@ +from keras.models import Model +from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, \ + Flatten, Activation +from kapre.time_frequency import Spectrogram +import keras.regularizers as regularizers + +def construct_cnn_L3_orig_audio_model(): + """ + Constructs a model that replicates the audio subnetwork used in Look, + Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . 
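+
+    The audio front end computes a dB-scaled spectrogram of shape 257 x 199 x 1
+    (512-point DFT, 480-sample window, 240-sample hop over 1 s of 48 kHz input),
+    which four conv/BN/ReLU blocks reduce to a 512-dimensional embedding.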
+ + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + l2_weight_decay = regularizers.l2(weight_decay) + #### + # Audio subnetwork + #### + n_dft = 512 + n_win = 480 + n_hop = n_win//2 + asr = 48000 + audio_window_dur = 1 + # INPUT + x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') + + # SPECTROGRAM PREPROCESSING + # 257 x 199 x 1 + y_a = Spectrogram(n_dft=n_dft, n_win=n_win, n_hop=n_hop, + return_decibel_spectrogram=True, padding='valid')(x_a) + # CONV BLOCK 1 + n_filter_a_1 = 64 + filt_size_a_1 = (3, 3) + pool_size_a_1 = (2, 2) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) + + # CONV BLOCK 2 + n_filter_a_2 = 128 + filt_size_a_2 = (3, 3) + pool_size_a_2 = (2, 2) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) + + # CONV BLOCK 3 + n_filter_a_3 = 256 + filt_size_a_3 = (3, 3) + pool_size_a_3 = (2, 2) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) + + # CONV BLOCK 4 + n_filter_a_4 = 512 + filt_size_a_4 = (3, 3) + pool_size_a_4 = (32, 24) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + kernel_regularizer=l2_weight_decay)(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) + + y_a = Flatten(name='audio_embedding')(y_a) + + m = Model(inputs=x_a, outputs=y_a) + + return m, x_a, y_a diff --git a/l3embedding/model.py b/l3embedding/model.py index 37618d0..023f80b 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -1,13 +1,14 @@ from keras.models import Model -from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D,\ - Flatten, Concatenate, Dense, Activation +from keras.layers import concatenate, Dense import keras.regularizers as regularizers -from kapre.time_frequency import Spectrogram +from .vision_model import construct_cnn_L3_orig_vision_model +from .audio_model import construct_cnn_L3_orig_audio_model -def construct_cnn_L3_orig(): +def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a): """ - Constructs a model that replicates that used in Look, Listen and Learn + Merges the audio and vision subnetworks and adds additional fully connected + layers in the fashion of the model used in Look, 
Listen and Learn Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . @@ -15,153 +16,43 @@ def construct_cnn_L3_orig(): ------- model: L3 CNN model (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) """ + # Merge the subnetworks weight_decay = 1e-5 l2_weight_decay = regularizers.l2(weight_decay) - #### - # Image subnetwork - #### - # INPUT - x_i = Input(shape=(224, 224, 3), dtype='float32') - - # CONV BLOCK 1 - n_filter_i_1 = 64 - filt_size_i_1 = (3, 3) - pool_size_i_1 = (2, 2) - y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', - kernel_regularizer=l2_weight_decay)(x_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = Activation('relu')(y_i) - y_i = BatchNormalization()(y_i) - y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i) - - # CONV BLOCK 2 - n_filter_i_2 = 128 - filt_size_i_2 = (3, 3) - pool_size_i_2 = (2, 2) - y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i) - - # CONV BLOCK 3 - n_filter_i_3 = 256 - filt_size_i_3 = (3, 3) - pool_size_i_3 = (2, 2) - y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i) - - # CONV BLOCK 4 - n_filter_i_4 = 512 - filt_size_i_4 = (3, 3) - pool_size_i_4 = (28, 28) - y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) - y_i = BatchNormalization()(y_i) - y_i = Activation('relu')(y_i) - y_i = MaxPooling2D(pool_size=pool_size_i_4, padding='same')(y_i) - y_i = Flatten()(y_i) - - - #### - # Audio subnetwork - #### - n_dft = 512 - n_win = 480 - n_hop = n_win//2 - asr = 48000 - audio_window_dur = 1 - # INPUT - x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') - - # SPECTROGRAM PREPROCESSING - # 257 x 199 x 1 - y_a = Spectrogram(n_dft=n_dft, n_win=n_win, n_hop=n_hop, - return_decibel_spectrogram=True, padding='valid')(x_a) - # CONV BLOCK 1 - n_filter_a_1 = 64 - filt_size_a_1 = (3, 3) - pool_size_a_1 = (2, 2) - y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) + y = concatenate([vision_model(x_i), audio_model(x_a)]) + y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) + y = Dense(2, 
activation='softmax', kernel_regularizer=l2_weight_decay)(y) + m = Model(inputs=[x_i, x_a], outputs=y) - # CONV BLOCK 2 - n_filter_a_2 = 128 - filt_size_a_2 = (3, 3) - pool_size_a_2 = (2, 2) - y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) + return m, [x_i, x_a], y - # CONV BLOCK 3 - n_filter_a_3 = 256 - filt_size_a_3 = (3, 3) - pool_size_a_3 = (2, 2) - y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) - # CONV BLOCK 4 - n_filter_a_4 = 512 - filt_size_a_4 = (3, 3) - pool_size_a_4 = (32, 24) - y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) - y_a = BatchNormalization()(y_a) - y_a = Activation('relu')(y_a) - y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) +def construct_cnn_L3_orig(): + """ + Constructs a model that replicates that used in Look, Listen and Learn - y_a = Flatten()(y_a) + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . - # Merge the subnetworks - y = Concatenate()([y_i, y_a]) - y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) - y = Dense(2, activation='softmax', kernel_regularizer=l2_weight_decay)(y) + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + vision_model, x_i, y_i = construct_cnn_L3_orig_vision_model() + audio_model, x_a, y_a = construct_cnn_L3_orig_audio_model() - m = Model(inputs=[x_i, x_a], outputs=y) - return m, [x_i, x_a], y + return L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a) -MODELS = {'cnn_L3_orig': construct_cnn_L3_orig} +MODELS = { + 'cnn_L3_orig': construct_cnn_L3_orig, +} diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py new file mode 100644 index 0000000..2d0696c --- /dev/null +++ b/l3embedding/vision_model.py @@ -0,0 +1,90 @@ +from keras.models import Model +from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, \ + Flatten, Activation +import keras.regularizers as regularizers + + +def construct_cnn_L3_orig_vision_model(): + """ + Constructs a model that replicates the vision subnetwork used in Look, + Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . 
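+
+    The input is a 224 x 224 x 3 video frame; four conv/BN/ReLU blocks with 3x3
+    kernels and 2x2 max pooling (28x28 in the final block) reduce it to a
+    512-dimensional embedding after flattening.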
+ + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + l2_weight_decay = regularizers.l2(weight_decay) + #### + # Image subnetwork + #### + # INPUT + x_i = Input(shape=(224, 224, 3), dtype='float32') + + # CONV BLOCK 1 + n_filter_i_1 = 64 + filt_size_i_1 = (3, 3) + pool_size_i_1 = (2, 2) + y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + kernel_regularizer=l2_weight_decay)(x_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = Activation('relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i) + + # CONV BLOCK 2 + n_filter_i_2 = 128 + filt_size_i_2 = (3, 3) + pool_size_i_2 = (2, 2) + y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i) + + # CONV BLOCK 3 + n_filter_i_3 = 256 + filt_size_i_3 = (3, 3) + pool_size_i_3 = (2, 2) + y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i) + + # CONV BLOCK 4 + n_filter_i_4 = 512 + filt_size_i_4 = (3, 3) + pool_size_i_4 = (28, 28) + y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', + kernel_regularizer=l2_weight_decay)(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_4, padding='same')(y_i) + y_i = Flatten(name='vision_embedding')(y_i) + + m = Model(inputs=x_i, outputs=y_i) + + return m, x_i, y_i From 0e8254d55d05bb3fff274d38873cd46de0aa78ab Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sat, 11 Nov 2017 22:46:32 -0500 Subject: [PATCH 022/201] Add multi-gpus support --- l3embedding/train.py | 10 +- l3embedding/training_utils.py | 170 ++++++++++++++++++++++++++++++++++ train.py | 5 + 3 files changed, 181 insertions(+), 4 deletions(-) create mode 100644 l3embedding/training_utils.py diff --git a/l3embedding/train.py b/l3embedding/train.py index ed61907..76c2e37 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,9 +1,9 @@ import glob import json +import math import os import pickle import random -import math import keras from keras.optimizers import Adam @@ -14,8 +14,9 @@ import soundfile as sf from tqdm import tqdm -from .model import construct_cnn_L3_orig from .image import * +from .model import construct_cnn_L3_orig +from .training_utils import multi_gpu_model #TODO: Consider putting the sampling functionality into another file @@ -284,8 +285,9 @@ def on_epoch_end(self, epoch, logs=None): def train(train_data_dir, 
validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, learning_rate=1e-4, random_state=20171021, - verbose=False, checkpoint_interval=100, augment=False): - m, inputs, outputs = construct_cnn_L3_orig() + verbose=False, checkpoint_interval=100, augment=False, gpus=1): + single_gpu_model, inputs, outputs = construct_cnn_L3_orig() + m = multi_gpu_model(single_gpu_model, gpus=gpus) loss = 'binary_crossentropy' metrics = ['accuracy'] #monitor = 'val_loss' diff --git a/l3embedding/training_utils.py b/l3embedding/training_utils.py new file mode 100644 index 0000000..b59ac09 --- /dev/null +++ b/l3embedding/training_utils.py @@ -0,0 +1,170 @@ +# Copy of https://github.com/fchollet/keras/blob/master/keras/utils/training_utils.py +# Move import tensorflow as tf to the top to address the pickle issue +# https://github.com/fchollet/keras/issues/8123 + +from keras import backend as K +from keras.engine.training import Model +from keras.layers.core import Lambda +from keras.layers.merge import concatenate +import tensorflow as tf + + +def _get_available_devices(): + return [x.name for x in K.get_session().list_devices()] + + +def _normalize_device_name(name): + name = '/' + name.lower().split('device:')[1] + return name + + +def multi_gpu_model(model, gpus): + """Replicates a model on different GPUs. + + Specifically, this function implements single-machine + multi-GPU data parallelism. It works in the following way: + + - Divide the model's input(s) into multiple sub-batches. + - Apply a model copy on each sub-batch. Every model copy + is executed on a dedicated GPU. + - Concatenate the results (on CPU) into one big batch. + + E.g. if your `batch_size` is 64 and you use `gpus=2`, + then we will divide the input into 2 sub-batches of 32 samples, + process each sub-batch on one GPU, then return the full + batch of 64 processed samples. + + This induces quasi-linear speedup on up to 8 GPUs. + + This function is only available with the TensorFlow backend + for the time being. + + # Arguments + model: A Keras model instance. To avoid OOM errors, + this model could have been built on CPU, for instance + (see usage example below). + gpus: Integer >= 2, number of on GPUs on which to create + model replicas. + + # Returns + A Keras `Model` instance which can be used just like the initial + `model` argument, but which distributes its workload on multiple GPUs. + + # Example + + ```python + import tensorflow as tf + from keras.applications import Xception + from keras.utils import multi_gpu_model + import numpy as np + + num_samples = 1000 + height = 224 + width = 224 + num_classes = 1000 + + # Instantiate the base model (or "template" model). + # We recommend doing this with under a CPU device scope, + # so that the model's weights are hosted on CPU memory. + # Otherwise they may end up hosted on a GPU, which would + # complicate weight sharing. + with tf.device('/cpu:0'): + model = Xception(weights=None, + input_shape=(height, width, 3), + classes=num_classes) + + # Replicates the model on 8 GPUs. + # This assumes that your machine has 8 available GPUs. + parallel_model = multi_gpu_model(model, gpus=8) + parallel_model.compile(loss='categorical_crossentropy', + optimizer='rmsprop') + + # Generate dummy data. + x = np.random.random((num_samples, height, width, 3)) + y = np.random.random((num_samples, num_classes)) + + # This `fit` call will be distributed on 8 GPUs. + # Since the batch size is 256, each GPU will process 32 samples. 
+ parallel_model.fit(x, y, epochs=20, batch_size=256) + + # Save model via the template model (which shares the same weights): + model.save('my_model.h5') + ``` + + # On model saving + + To save the multi-gpu model, use `.save(fname)` or `.save_weights(fname)` + with the template model (the argument you passed to `multi_gpu_model), + rather than the model returned by `multi_gpu_model`. + """ + if K.backend() != 'tensorflow': + raise ValueError('`multi_gpu_model` is only available ' + 'with the TensorFlow backend.') + if gpus <= 1: + raise ValueError('For multi-gpu usage to be effective, ' + 'call `multi_gpu_model` with `gpus >= 2`. ' + 'Received: `gpus=%d`' % gpus) + + target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in range(gpus)] + available_devices = _get_available_devices() + available_devices = [_normalize_device_name(name) for name in available_devices] + for device in target_devices: + if device not in available_devices: + raise ValueError( + 'To call `multi_gpu_model` with `gpus=%d`, ' + 'we expect the following devices to be available: %s. ' + 'However this machine only has: %s. ' + 'Try reducing `gpus`.' % (gpus, + target_devices, + available_devices)) + + def get_slice(data, i, parts): + shape = tf.shape(data) + batch_size = shape[:1] + input_shape = shape[1:] + step = batch_size // parts + if i == gpus - 1: + size = batch_size - step * i + else: + size = step + size = tf.concat([size, input_shape], axis=0) + stride = tf.concat([step, input_shape * 0], axis=0) + start = stride * i + return tf.slice(data, start, size) + + all_outputs = [] + for i in range(len(model.outputs)): + all_outputs.append([]) + + # Place a copy of the model on each GPU, + # each getting a slice of the inputs. + for i in range(gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('replica_%d' % i): + inputs = [] + # Retrieve a slice of the input. + for x in model.inputs: + input_shape = tuple(x.get_shape().as_list())[1:] + slice_i = Lambda(get_slice, + output_shape=input_shape, + arguments={'i': i, + 'parts': gpus})(x) + inputs.append(slice_i) + + # Apply model on slice + # (creating a model replica on the target device). + outputs = model(inputs) + if not isinstance(outputs, list): + outputs = [outputs] + + # Save the outputs for merging back together later. + for o in range(len(outputs)): + all_outputs[o].append(outputs[o]) + + # Merge outputs on CPU. 
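+    # Each output's per-replica slices are concatenated along the batch axis, so
+    # the merged model returns full-batch outputs with the same shapes as the
+    # template model's outputs.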
+ with tf.device('/cpu:0'): + merged = [] + for outputs in all_outputs: + merged.append(concatenate(outputs, + axis=0)) + return Model(model.inputs, merged) diff --git a/train.py b/train.py index eafbe51..5a764c2 100644 --- a/train.py +++ b/train.py @@ -76,6 +76,11 @@ def parse_arguments(): default=False, help='If True, performs data augmentation on audio and images') + parser.add_argument('--gpus', + dest='gpus', + default=1, + help='Number of gpus used for data parallelism.') + parser.add_argument('-v', '--verbose', dest='verbose', From 9be139e2e583f1fdd50089040a6866ec59b4e612 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sun, 12 Nov 2017 10:28:22 -0500 Subject: [PATCH 023/201] default to int --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index 5a764c2..9710d39 100644 --- a/train.py +++ b/train.py @@ -78,6 +78,7 @@ def parse_arguments(): parser.add_argument('--gpus', dest='gpus', + type=int, default=1, help='Number of gpus used for data parallelism.') From 802910d402508f014ef122f6813b03f219903cbe Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sun, 12 Nov 2017 11:08:39 -0500 Subject: [PATCH 024/201] Sometimes audio_data is less than a second --- l3embedding/train.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 76c2e37..5e2f688 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -75,7 +75,10 @@ def sample_one_second(audio_data, sampling_frequency, start, label, augment=Fals if label: start = max(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) else: - start = random.randrange(len(audio_data) - sampling_frequency) + try: + start = random.randrange(len(audio_data) - sampling_frequency) + except ValueError: + start = 0 audio_data = audio_data[start:start+sampling_frequency] if augment: @@ -209,11 +212,13 @@ def sampler(video_file, audio_files, augment=False): audio_data, sampling_frequency = sf.read(audio_file) while True: - sample_video_data, video_start, video_aug_params \ - = sample_one_frame(video_data, augment=augment) - sample_audio_data, audio_start, audio_aug_params \ - = sample_one_second(audio_data, sampling_frequency, video_start, - label, augment=augment) + sample_video_data, video_start, video_aug_params = sample_one_frame(video_data, + augment=augment) + sample_audio_data, audio_start, audio_aug_params = sample_one_second(audio_data, + sampling_frequency, + video_start, + label, + augment=augment) sample_audio_data = sample_audio_data[:, 0].reshape((1, len(sample_audio_data))) sample = { From 4924284589c1cb138dcb3a227c31ecee721589e0 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sun, 12 Nov 2017 12:02:31 -0500 Subject: [PATCH 025/201] explicit handle len(audio_data) < sampling_frequency --- l3embedding/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 5e2f688..32c09f3 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -75,9 +75,9 @@ def sample_one_second(audio_data, sampling_frequency, start, label, augment=Fals if label: start = max(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) else: - try: + if len(audio_data) > sampling_frequency: start = random.randrange(len(audio_data) - sampling_frequency) - except ValueError: + else: start = 0 audio_data = audio_data[start:start+sampling_frequency] From d00dcb5f35e5cf64f5ce2baed00e71f8f9d0f0da Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sun, 
12 Nov 2017 16:44:21 -0500 Subject: [PATCH 026/201] use default 10 to save model --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 32c09f3..598ea61 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -290,7 +290,7 @@ def on_epoch_end(self, epoch, logs=None): def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, learning_rate=1e-4, random_state=20171021, - verbose=False, checkpoint_interval=100, augment=False, gpus=1): + verbose=False, checkpoint_interval=10, augment=False, gpus=1): single_gpu_model, inputs, outputs = construct_cnn_L3_orig() m = multi_gpu_model(single_gpu_model, gpus=gpus) loss = 'binary_crossentropy' From 45f88950c28cf875eae13b9d457057cf9f72ee79 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 13 Nov 2017 14:26:50 -0500 Subject: [PATCH 027/201] Make training a bit more robust * Handle case where clip is all zeros (by ignoring; we will want to revisit this) * Add ability to filter audio and video files so that an audio file will always correspond to a video file, and vice versa * Add command line argument for checkpoint interval --- l3embedding/train.py | 38 ++++++++++++++++++++++++++++++++------ train.py | 15 +++++++++++++++ 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 598ea61..17333b6 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -21,7 +21,18 @@ #TODO: Consider putting the sampling functionality into another file -def get_file_list(data_dir): +def get_filename(path): + """Return the filename of a path + + Args: path: path to file + + Returns: + filename: name of file (without extension) + """ + return os.path.splitext(os.path.basename(path))[0] + + +def get_file_list(data_dir, ensure_both=False): """Return audio and video file list. Args: @@ -40,6 +51,14 @@ def get_file_list(data_dir): audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) video_files = glob.glob('{}/**/video/*'.format(data_dir)) + if ensure_both: + audio_filenames = set([get_filename(path) for path in audio_files]) + video_filenames = set([get_filename(path) for path in video_files]) + + valid_filenames = audio_filenames & video_filenames + audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] + video_files = [path for path in video_files if get_filename(path) in valid_filenames] + return audio_files, video_files @@ -83,7 +102,11 @@ def sample_one_second(audio_data, sampling_frequency, start, label, augment=Fals audio_data = audio_data[start:start+sampling_frequency] if augment: # Make sure we don't clip - max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) + if np.abs(audio_data).max(): + max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) + else: + # TODO: Handle audio with all zeros + max_gain = 0.1 gain = 1 + random.uniform(-0.1, max_gain) audio_data *= gain audio_aug_params = {'gain': gain} @@ -235,7 +258,8 @@ def sampler(video_file, audio_files, augment=False): yield sample -def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, augment=False): +def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, + augment=False, ensure_both=False): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. 
Args: @@ -250,7 +274,7 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, augment random.seed(random_state) - audio_files, video_files = get_file_list(data_dir) + audio_files, video_files = get_file_list(data_dir, ensure_both=ensure_both) seeds = [] for video_file in tqdm(random.sample(video_files, k)): seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) @@ -290,7 +314,8 @@ def on_epoch_end(self, epoch, logs=None): def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, learning_rate=1e-4, random_state=20171021, - verbose=False, checkpoint_interval=10, augment=False, gpus=1): + verbose=False, checkpoint_interval=10, augment=False, gpus=1, + ensure_both_audio_video=False): single_gpu_model, inputs, outputs = construct_cnn_L3_orig() m = multi_gpu_model(single_gpu_model, gpus=gpus) loss = 'binary_crossentropy' @@ -347,7 +372,8 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, batch_size=batch_size, random_state=random_state, k=num_streamers, - augment=augment) + augment=augment, + ensure_both=ensure_both_audio_video) train_gen = pescador.maps.keras_tuples(train_gen, ['video', 'audio'], diff --git a/train.py b/train.py index 9710d39..8b08dd1 100644 --- a/train.py +++ b/train.py @@ -61,6 +61,14 @@ def parse_arguments(): default=1e-4, help='Optimization learning rate') + parser.add_argument('-ci', + '--checkpoint-interval', + dest='checkpoint_interval', + action='store', + type=float, + default=1e-4, + help='Optimization learning rate') + parser.add_argument('-r', '--random-state', dest='random_state', @@ -82,6 +90,13 @@ def parse_arguments(): default=1, help='Number of gpus used for data parallelism.') + parser.add_argument('-eb', + '--ensure-both-audio-video', + dest='ensure_both_audio_video', + action='store_true', + default=False, + help='If True, only use video and audio files that exist in corresponding pairs') + parser.add_argument('-v', '--verbose', dest='verbose', From af38305152f24cef5c050e6ecee0b292f576546f Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 13 Nov 2017 15:00:27 -0500 Subject: [PATCH 028/201] Zero pad data that is less than 1 second, and downmix channels rather than take a single channel --- l3embedding/train.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 17333b6..3d532a2 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -4,6 +4,7 @@ import os import pickle import random +import warnings import keras from keras.optimizers import Adam @@ -100,12 +101,19 @@ def sample_one_second(audio_data, sampling_frequency, start, label, augment=Fals start = 0 audio_data = audio_data[start:start+sampling_frequency] + if audio_data.shape[0] != sampling_frequency: + # Pad audio that isn't one second + warnings.warn('Got audio that is less than one second', UserWarning) + audio_data = np.pad(audio_data, + ((0, sampling_frequency - audio_data.shape[0]), (0,0)), + mode='constant') if augment: # Make sure we don't clip if np.abs(audio_data).max(): max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) else: # TODO: Handle audio with all zeros + warnings.warn('Got audio sample with all zeros', UserWarning) max_gain = 0.1 gain = 1 + random.uniform(-0.1, max_gain) audio_data *= gain @@ -232,7 +240,7 @@ def sampler(video_file, audio_files, augment=False): else: label = 1 - audio_data, sampling_frequency = 
sf.read(audio_file) + audio_data, sampling_frequency = sf.read(audio_file, always_2d=True) while True: sample_video_data, video_start, video_aug_params = sample_one_frame(video_data, @@ -242,7 +250,7 @@ def sampler(video_file, audio_files, augment=False): video_start, label, augment=augment) - sample_audio_data = sample_audio_data[:, 0].reshape((1, len(sample_audio_data))) + sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) sample = { 'video': sample_video_data, From b1133affc443310f2c0e966900cd9dda2be4e5eb Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 13 Nov 2017 16:22:06 -0500 Subject: [PATCH 029/201] Fix checkpoint interval CLI option; always ensure both audio and video --- l3embedding/train.py | 24 +++++++++++------------- train.py | 13 +++---------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 3d532a2..25c66eb 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -33,7 +33,7 @@ def get_filename(path): return os.path.splitext(os.path.basename(path))[0] -def get_file_list(data_dir, ensure_both=False): +def get_file_list(data_dir): """Return audio and video file list. Args: @@ -52,13 +52,13 @@ def get_file_list(data_dir, ensure_both=False): audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) video_files = glob.glob('{}/**/video/*'.format(data_dir)) - if ensure_both: - audio_filenames = set([get_filename(path) for path in audio_files]) - video_filenames = set([get_filename(path) for path in video_files]) + # Make sure that audio files and video files correspond to each other + audio_filenames = set([get_filename(path) for path in audio_files]) + video_filenames = set([get_filename(path) for path in video_files]) - valid_filenames = audio_filenames & video_filenames - audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] - video_files = [path for path in video_files if get_filename(path) in valid_filenames] + valid_filenames = audio_filenames & video_filenames + audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] + video_files = [path for path in video_files if get_filename(path) in valid_filenames] return audio_files, video_files @@ -267,7 +267,7 @@ def sampler(video_file, audio_files, augment=False): def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, - augment=False, ensure_both=False): + augment=False): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. 
Args: @@ -282,7 +282,7 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, random.seed(random_state) - audio_files, video_files = get_file_list(data_dir, ensure_both=ensure_both) + audio_files, video_files = get_file_list(data_dir) seeds = [] for video_file in tqdm(random.sample(video_files, k)): seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) @@ -322,8 +322,7 @@ def on_epoch_end(self, epoch, logs=None): def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, learning_rate=1e-4, random_state=20171021, - verbose=False, checkpoint_interval=10, augment=False, gpus=1, - ensure_both_audio_video=False): + verbose=False, checkpoint_interval=10, augment=False, gpus=1): single_gpu_model, inputs, outputs = construct_cnn_L3_orig() m = multi_gpu_model(single_gpu_model, gpus=gpus) loss = 'binary_crossentropy' @@ -380,8 +379,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, batch_size=batch_size, random_state=random_state, k=num_streamers, - augment=augment, - ensure_both=ensure_both_audio_video) + augment=augment) train_gen = pescador.maps.keras_tuples(train_gen, ['video', 'audio'], diff --git a/train.py b/train.py index 8b08dd1..7849ccd 100644 --- a/train.py +++ b/train.py @@ -65,9 +65,9 @@ def parse_arguments(): '--checkpoint-interval', dest='checkpoint_interval', action='store', - type=float, - default=1e-4, - help='Optimization learning rate') + type=int, + default=10, + help='The number of epochs between model checkpoints') parser.add_argument('-r', '--random-state', @@ -90,13 +90,6 @@ def parse_arguments(): default=1, help='Number of gpus used for data parallelism.') - parser.add_argument('-eb', - '--ensure-both-audio-video', - dest='ensure_both_audio_video', - action='store_true', - default=False, - help='If True, only use video and audio files that exist in corresponding pairs') - parser.add_argument('-v', '--verbose', dest='verbose', From 6dad75c592c4b68687b7b6fe4ae6574d95d4faf9 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Tue, 14 Nov 2017 10:35:53 -0500 Subject: [PATCH 030/201] Include all video files for samples --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 598ea61..07f4105 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -252,7 +252,7 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, augment audio_files, video_files = get_file_list(data_dir) seeds = [] - for video_file in tqdm(random.sample(video_files, k)): + for video_file in tqdm(video_files): seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) mux = pescador.Mux(seeds, k) From 1434d1383d1ad304dcbe57a672eb5e06b952e1cb Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 15 Nov 2017 23:16:43 -0500 Subject: [PATCH 031/201] video does not need to be 1 second long --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index bf5a79e..706130a 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -172,7 +172,7 @@ def sample_one_frame(video_data, fps=30, scaling_func=None, augment=False): if not scaling_func: scaling_func = l3_frame_scaling num_frames = video_data.shape[0] - frame = random.randrange(num_frames - fps) + frame = random.randrange(num_frames) frame_data = video_data[frame, :, :, :] frame_data, 
bbox = scaling_func(frame_data) From 01a8a2330283081d7d7b5897035171585c4c5c30 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 16 Nov 2017 14:49:37 -0500 Subject: [PATCH 032/201] Handle case where video is less than 1 second, change order of sampling to audio first so easier to sample video frames from entire video, save model weights only --- l3embedding/train.py | 65 +++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index bf5a79e..d5cf4cc 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -79,26 +79,23 @@ def video_to_audio(video_file): return '/'.join(path + ['audio', name]) -def sample_one_second(audio_data, sampling_frequency, start, label, augment=False): - """Return one second audio samples randomly if start is not specified, - otherwise, return one second audio samples including start (seconds). +def sample_one_second(audio_data, sampling_frequency, augment=False): + """Return one second audio samples randomly Args: - audio_file: audio_file to sample from - start: starting time to fetch one second samples + audio_data: audio data to sample from + sampling_frequency: audio sample rate + augment: if True, perturb the data in some fashion Returns: - One second samples + One second samples, start time, and augmentation parameters """ sampling_frequency = int(sampling_frequency) - if label: - start = max(0, int(start * sampling_frequency) - random.randint(0, sampling_frequency)) + if len(audio_data) > sampling_frequency: + start = random.randrange(len(audio_data) - sampling_frequency) else: - if len(audio_data) > sampling_frequency: - start = random.randrange(len(audio_data) - sampling_frequency) - else: - start = 0 + start = 0 audio_data = audio_data[start:start+sampling_frequency] if audio_data.shape[0] != sampling_frequency: @@ -158,21 +155,43 @@ def l3_frame_scaling(frame_data): return resized_frame_data[start_x:end_x, start_y:end_y, :], bbox -def sample_one_frame(video_data, fps=30, scaling_func=None, augment=False): +def sample_one_frame(video_data, start=None, fps=30, scaling_func=None, augment=False): """Return one frame randomly and time (seconds). Args: video_data: video data to sample from + start: start time of a one second window from which to sample fps: frame per second + scaling_func: function that rescales the sampled video frame + augment: if True, perturb the data in some fashion Returns: - One frame sampled randomly and time in seconds + One frame sampled randomly, start time in seconds, and augmentation parameters """ if not scaling_func: scaling_func = l3_frame_scaling + num_frames = video_data.shape[0] - frame = random.randrange(num_frames - fps) + if start is not None: + start_frame = int(start * fps) + # Sample frame from a one second window, or until the end of the video + # if the video is less than a second for some reason + # Audio should always be sampled one second from the end of the audio, + # so video frames we're sampling from should also be a second. 
If it's + # not, then our video is probably less than a second + duration = min(fps, num_frames - start_frame) + if duration != fps: + warnings.warn('Got video that is less than one second', UserWarning) + + if duration != 0: + frame = start + random.randrange(duration) + else: + warnings.warn('Got video with only a single frame', UserWarning) + frame = 0 + else: + frame = random.randrange(num_frames) + frame_data = video_data[frame, :, :, :] frame_data, bbox = scaling_func(frame_data) @@ -243,13 +262,13 @@ def sampler(video_file, audio_files, augment=False): audio_data, sampling_frequency = sf.read(audio_file, always_2d=True) while True: - sample_video_data, video_start, video_aug_params = sample_one_frame(video_data, - augment=augment) sample_audio_data, audio_start, audio_aug_params = sample_one_second(audio_data, sampling_frequency, - video_start, label, augment=augment) + sample_video_data, video_start, video_aug_params = sample_one_frame(video_data, + audio_start, + augment=augment) sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) sample = { @@ -327,7 +346,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, m = multi_gpu_model(single_gpu_model, gpus=gpus) loss = 'binary_crossentropy' metrics = ['accuracy'] - #monitor = 'val_loss' + monitor = 'val_loss' # Make sure the directories we need exist model_dir = os.path.join(output_dir, model_id) @@ -356,12 +375,14 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, cb = [] cb.append(keras.callbacks.ModelCheckpoint(weight_path, + save_weights_only=True, save_best_only=True, - verbose=1,)) - #monitor=monitor)) + verbose=1, + monitor=monitor)) cb.append(keras.callbacks.ModelCheckpoint(checkpoint_weight_path, - #monitor=monitor, + save_weights_only=True, + monitor=monitor, period=checkpoint_interval)) history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') From f2b959f75f3fd4dbe08dbe1b3c7ce591d5093610 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 17 Nov 2017 13:33:15 -0500 Subject: [PATCH 033/201] Fix typos, randomly shuffle files, only use multi GPU model when using more than one GPU --- l3embedding/train.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index d5cf4cc..fd8d8c5 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -185,7 +185,7 @@ def sample_one_frame(video_data, start=None, fps=30, scaling_func=None, augment= warnings.warn('Got video that is less than one second', UserWarning) if duration != 0: - frame = start + random.randrange(duration) + frame = start_frame + random.randrange(duration) else: warnings.warn('Got video with only a single frame', UserWarning) frame = 0 @@ -264,10 +264,9 @@ def sampler(video_file, audio_files, augment=False): while True: sample_audio_data, audio_start, audio_aug_params = sample_one_second(audio_data, sampling_frequency, - label, augment=augment) sample_video_data, video_start, video_aug_params = sample_one_frame(video_data, - audio_start, + start=audio_start, augment=augment) sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) @@ -302,6 +301,13 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, random.seed(random_state) audio_files, video_files = get_file_list(data_dir) + + # Randomly shuffle the files + ordering = list(range(len(audio_files))) + random.shuffle(ordering) + audio_files = [audio_files[idx] for idx in ordering] + 
video_files = [video_files[idx] for idx in ordering] + seeds = [] for video_file in tqdm(video_files): seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) @@ -342,8 +348,9 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, num_streamers=16, learning_rate=1e-4, random_state=20171021, verbose=False, checkpoint_interval=10, augment=False, gpus=1): - single_gpu_model, inputs, outputs = construct_cnn_L3_orig() - m = multi_gpu_model(single_gpu_model, gpus=gpus) + m, inputs, outputs = construct_cnn_L3_orig() + if gpus > 1: + m = multi_gpu_model(m, gpus=gpus) loss = 'binary_crossentropy' metrics = ['accuracy'] monitor = 'val_loss' From 4ccfcbbedbd444bbf7a4ddd1531390573baaa9a4 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 17 Nov 2017 13:51:42 -0500 Subject: [PATCH 034/201] User start frame instead of first frame when sample window is only a single frame and use last frame if the video was truncated --- l3embedding/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index fd8d8c5..d09cc15 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -184,11 +184,13 @@ def sample_one_frame(video_data, start=None, fps=30, scaling_func=None, augment= if duration != fps: warnings.warn('Got video that is less than one second', UserWarning) - if duration != 0: + if duration > 0: frame = start_frame + random.randrange(duration) else: warnings.warn('Got video with only a single frame', UserWarning) - frame = 0 + # For robustness, use the last frame if the start_frame goes past + # the end of video frame + frame = min(start_frame, num_frames - 1) else: frame = random.randrange(num_frames) From 7166e51a2b175e7e1d0ef64f7b3cbd2fb348dcda Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 22 Nov 2017 12:45:03 -0500 Subject: [PATCH 035/201] Close streamer if files cannot be opened --- l3embedding/train.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index d09cc15..ef53d39 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -252,7 +252,13 @@ def sampler(video_file, audio_files, augment=False): """ - video_data = vread(video_file) + try: + video_data = vread(video_file) + except Exception as e: + warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warnings.warn(warn_msg.format(video_file, type(e), e)) + raise StopIteration() + audio_file = video_to_audio(video_file) if random.random() < 0.5: @@ -261,7 +267,12 @@ def sampler(video_file, audio_files, augment=False): else: label = 1 - audio_data, sampling_frequency = sf.read(audio_file, always_2d=True) + try: + audio_data, sampling_frequency = sf.read(audio_file, always_2d=True) + except Exception as e: + warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' + warnings.warn(warn_msg.format(audio_file, type(e), e)) + raise StopIteration() while True: sample_audio_data, audio_start, audio_aug_params = sample_one_second(audio_data, @@ -314,6 +325,11 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, for video_file in tqdm(video_files): seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) + # TODO: + # Order 1024 streamers open + # Set rate 16? + # Set larger rate for validation (32) for stability + # Sampling is very delicate! 
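+    # Mux keeps k of these streamers active at a time and interleaves their
+    # samples; a streamer that stops early (e.g. its files could not be opened)
+    # is swapped out for another one from the seed pool.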
mux = pescador.Mux(seeds, k) if batch_size == 1: return mux From a2ffb8219cdb487d98860d62f3f1742a8f5560f1 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 23 Nov 2017 15:04:43 -0500 Subject: [PATCH 036/201] Change sampling scheme and add filtering * Change pescador streamers to open two videos and randomly choose video and audio * Add filtering (positive or negative) based on ytid and class --- l3embedding/train.py | 187 +- resources/ontology.json | 5690 +++++++++++++++++++++++++++++++++++++++ train.py | 98 +- 3 files changed, 5924 insertions(+), 51 deletions(-) create mode 100644 resources/ontology.json diff --git a/l3embedding/train.py b/l3embedding/train.py index ef53d39..9aa0485 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -5,6 +5,7 @@ import pickle import random import warnings +import csv import keras from keras.optimizers import Adam @@ -18,6 +19,7 @@ from .image import * from .model import construct_cnn_L3_orig from .training_utils import multi_gpu_model +from ..audioset.ontology import ASOntology #TODO: Consider putting the sampling functionality into another file @@ -33,7 +35,45 @@ def get_filename(path): return os.path.splitext(os.path.basename(path))[0] -def get_file_list(data_dir): +def load_metadata(metadata_path): + metadata = {} + with open(metadata_path, 'r') as f: + for idx, line in enumerate(f): + if idx in (0, 1): + continue + elif idx == 2: + fields = [field.strip() for field in line.lstrip('# ').rstrip().split(',')] + else: + row = [val.strip() for val in line.strip().split(',')] + ytid = row[0] + + entry = {field: val + for field, val in zip(fields, row[1:])} + + entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') + + metadata[ytid] = entry + + return metadata + + +def load_filters(filter_path): + filters = [] + + with open(filter_path, 'r') as f: + reader = csv.DictReader(filter_path) + for row in reader: + filters.append(row) + + return filters + +def get_ytid_from_filename(filename): + first_us_idx = filename.rindex('_') + second_us_idx = filename.rindeX('_', 0, first_us_idx) + return filename[:second_us_idx] + + +def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None): """Return audio and video file list. 
Args: @@ -57,6 +97,44 @@ def get_file_list(data_dir): video_filenames = set([get_filename(path) for path in video_files]) valid_filenames = audio_filenames & video_filenames + + if metadata_path and filter_path: + if not ontology_path: + raise ValueError('Must provide ontology path to filter') + + ontology = ASOntology(ontology_path) + + metadata = load_metadata(metadata_path) + filters = load_filters(filter_path) + + filtered_filenames = [] + + for filename in valid_filenames: + ytid = get_ytid_from_filename(filename) + video_metadata = metadata[ytid] + + accept = True + for _filter in filters: + filter_type = _filter['filter_type'] + filter_accept = _filter['accept_reject'] == 'accept' + string = _filter['string'] + + if filter_type == 'ytid': + match = ytid == string + + elif filter_type == 'label': + label = ontology.get_node(string).name + match = label in video_metadata['positive_labels'] + + # TODO: check this logic + if match == filter_accept: + accept = False + + if accept: + filtered_filenames.append(filename) + + valid_filenames = filtered_filenames + audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] video_files = [path for path in video_files if get_filename(path) in valid_filenames] @@ -238,13 +316,13 @@ def sample_one_frame(video_data, start=None, fps=30, scaling_func=None, augment= return frame_data, frame / fps, video_aug_params -def sampler(video_file, audio_files, augment=False): +def sampler(video_file_1, video_file_2, augment=False): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. Args: - video_file: video_file to sample from - audio_files: candidate audio_files to sample from + video_file_1: video_file to sample from + video_file_2: candidate audio_files to sample from Returns: A generator that yields dictionary of video sample, audio sample, @@ -253,28 +331,57 @@ def sampler(video_file, audio_files, augment=False): """ try: - video_data = vread(video_file) + video_data_1 = vread(video_file_1) + except Exception as e: + warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warnings.warn(warn_msg.format(video_file_1, type(e), e)) + raise StopIteration() + + try: + video_data_2 = vread(video_file_2) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' - warnings.warn(warn_msg.format(video_file, type(e), e)) + warnings.warn(warn_msg.format(video_file_2, type(e), e)) raise StopIteration() - audio_file = video_to_audio(video_file) + audio_file_1 = video_to_audio(video_file_1) + audio_file_2 = video_to_audio(video_file_2) - if random.random() < 0.5: - audio_file = random.choice([af for af in audio_files if af != audio_file]) - label = 0 - else: - label = 1 + try: + audio_data_1, sampling_frequency = sf.read(audio_file_1, always_2d=True) + except Exception as e: + warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' + warnings.warn(warn_msg.format(audio_file_1, type(e), e)) + raise StopIteration() try: - audio_data, sampling_frequency = sf.read(audio_file, always_2d=True) + audio_data_2, sampling_frequency = sf.read(audio_file_2, always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
- warnings.warn(warn_msg.format(audio_file, type(e), e)) + warnings.warn(warn_msg.format(audio_file_2, type(e), e)) raise StopIteration() while True: + + video_choice = random.random() < 0.5 + audio_choice = random.random() < 0.5 + + if audio_choice: + audio_file = audio_file_1 + audio_data = audio_data_1 + else: + audio_file = audio_file_2 + audio_data = audio_data_2 + + if video_choice: + video_file = video_file_1 + video_data = video_data_1 + else: + video_file = video_file_2 + video_data = video_data_2 + + label = int(video_choice != audio_choice) + sample_audio_data, audio_start, audio_aug_params = sample_one_second(audio_data, sampling_frequency, augment=augment) @@ -297,8 +404,9 @@ def sampler(video_file, audio_files, augment=False): yield sample -def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, - augment=False): +def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, + k=32, batch_size=64, random_state=20171021, + num_distractors=1, augment=False, rate=32): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -313,7 +421,8 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, random.seed(random_state) - audio_files, video_files = get_file_list(data_dir) + audio_files, video_files = get_file_list(data_dir, metadata_path=metadata_path, + filter_path=filter_path, ontology_path=ontology_path) # Randomly shuffle the files ordering = list(range(len(audio_files))) @@ -322,19 +431,21 @@ def data_generator(data_dir, k=32, batch_size=64, random_state=20171021, video_files = [video_files[idx] for idx in ordering] seeds = [] - for video_file in tqdm(video_files): - seeds.append(pescador.Streamer(sampler, video_file, audio_files, augment=augment)) + for video_file_1 in tqdm(video_files): + for _ in range(num_distractors): + video_file_2 = random.choice(video_files) + seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, augment=augment)) # TODO: # Order 1024 streamers open # Set rate 16? # Set larger rate for validation (32) for stability # Sampling is very delicate! 
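For orientation, the generator pipeline assembled here (one streamer per video pair -> rate-limited mux -> buffered batches -> Keras input/output tuples) can be exercised end to end with dummy data. The clip names, array shapes, and the small k/rate/batch values below are placeholders rather than the project's real configuration, and the label expression simply mirrors the convention used in this patch:

import random

import numpy as np
import pescador


def pair_sampler(file_1, file_2):
    # Stand-in for sampler(video_file_1, video_file_2): each iteration
    # independently decides which clip supplies the frame and which supplies
    # the audio, so roughly half of the samples are matched pairs.
    while True:
        video_choice = random.random() < 0.5
        audio_choice = random.random() < 0.5
        yield {
            'video': np.zeros((32, 32, 3), dtype=np.float32),   # dummy frame
            'audio': np.zeros((1, 480), dtype=np.float32),      # dummy waveform
            'label': np.array([int(video_choice != audio_choice)]),
        }


video_files = ['clip_{}.mp4'.format(i) for i in range(8)]        # hypothetical
seeds = [pescador.Streamer(pair_sampler, f, random.choice(video_files))
         for f in video_files]

mux = pescador.Mux(seeds, 4, rate=16)             # 4 open streams, ~16 samples each
batches = pescador.maps.buffer_stream(mux, 2)     # dicts of stacked arrays
gen = pescador.maps.keras_tuples(batches, ['video', 'audio'], 'label')

(video_batch, audio_batch), labels = next(gen)
print(video_batch.shape, audio_batch.shape, labels.shape)
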
- mux = pescador.Mux(seeds, k) + mux = pescador.Mux(seeds, k, rate=rate) if batch_size == 1: return mux else: - return pescador.BufferedStreamer(mux, batch_size) + return pescador.maps.buffer_stream(mux, batch_size) class LossHistory(keras.callbacks.Callback): @@ -363,9 +474,16 @@ def on_epoch_end(self, epoch, logs=None): #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, def train(train_data_dir, validation_data_dir, model_id, output_dir, - num_epochs=150, epoch_size=512, batch_size=64, validation_size=1024, - num_streamers=16, learning_rate=1e-4, random_state=20171021, - verbose=False, checkpoint_interval=10, augment=False, gpus=1): + num_epochs=150, + train_metadata_path=None, validation_metadata_path=None, + train_filter_path=None, validation_filter_path=None, + train_epoch_size=512, validation_epoch_size=1024, + train_batch_size=64, validation_batch_size=64, + train_num_streamers=16, validation_num_streamers=16, + train_mux_rate=16, validation_mux_rate=16, + learning_rate=1e-4, random_state=20171021, + verbose=False, checkpoint_interval=10, ontology_path=None, + augment=False, gpus=1): m, inputs, outputs = construct_cnn_L3_orig() if gpus > 1: m = multi_gpu_model(m, gpus=gpus) @@ -420,12 +538,14 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, print('Setting up train data generator...') train_gen = data_generator( - #train_csv_path, train_data_dir, - batch_size=batch_size, + metadata_path=train_metadata_path, + filter_path=train_filter_path, + batch_size=train_batch_size, random_state=random_state, - k=num_streamers, - augment=augment) + k=train_num_streamers, + augment=augment, + rate=train_mux_rate) train_gen = pescador.maps.keras_tuples(train_gen, ['video', 'audio'], @@ -434,9 +554,12 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, print('Setting up validation data generator...') val_gen = data_generator( validation_data_dir, - batch_size=batch_size, + metadata_path=validation_metadata_path, + filter_path=validation_filter_path, + batch_size=validation_batch_size, random_state=random_state, - k=num_streamers) + k=validation_num_streamers, + rate=validation_mux_rate) val_gen = pescador.maps.keras_tuples(val_gen, ['video', 'audio'], @@ -450,9 +573,9 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, verbosity = 1 else: verbosity = 2 - history = m.fit_generator(train_gen, epoch_size, num_epochs, + history = m.fit_generator(train_gen, train_epoch_size, num_epochs, validation_data=val_gen, - validation_steps=validation_size, + validation_steps=validation_epoch_size, callbacks=cb, verbose=verbosity) diff --git a/resources/ontology.json b/resources/ontology.json new file mode 100644 index 0000000..cee2c50 --- /dev/null +++ b/resources/ontology.json @@ -0,0 +1,5690 @@ +[ + { + "id": "/m/0dgw9r", + "name": "Human sounds", + "description": "Sounds produced by the human body through the actions of the individual.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/09l8g", "/m/01w250", "/m/09hlz4", "/m/0bpl036", "/m/0160x5", "/m/0k65p", "/m/01jg02", "/m/04xp5v", "/t/dd00012"], + "restrictions": ["abstract"] + }, + { + "id": "/m/09l8g", + "name": "Human voice", + "description": "The human voice consists of sound made by a human being using the vocal folds for talking, singing, laughing, crying, screaming, etc. 
The human voice is specifically a part of human sound production in which the vocal folds are the primary sound source.", + "citation_uri": "http://en.wikipedia.org/wiki/Human_voice", + "positive_examples": [], + "child_ids": ["/m/09x0r", "/m/07p6fty", "/m/03qc9zr", "/m/02rtxlg", "/m/01j3sz", "/m/0463cq4", "/m/07qw_06", "/m/07plz5l", "/m/015lz1", "/m/02fxyj", "/m/07s2xch", "/m/07r4k75", "/m/01j423"], + "restrictions": ["abstract"] + }, + { + "id": "/m/09x0r", + "name": "Speech", + "description": "Speech is the vocalized form of human communication, created out of the phonetic combination of a limited set of vowel and consonant speech sound units.", + "citation_uri": "http://en.wikipedia.org/wiki/Speech", + "positive_examples": ["youtu.be/8uI9H5jGRV8?start=30&end=40", "youtu.be/cz8QIJHjZh4?start=30&end=40", "youtu.be/91FIbeKyIhw?start=30&end=40", "youtu.be/V3HdocEv8gw?start=220&end=230", "youtu.be/a8igK2oWlVY?start=30&end=40", "youtu.be/2hvTtM7VdJg?start=30&end=40", "youtu.be/S7ytQItOrkk?start=30&end=40", "youtu.be/Zyl2Q3GepPA?start=30&end=40", "youtu.be/CU1qRm_Bcps?start=30&end=40"], + "child_ids": ["/m/05zppz", "/m/02zsn", "/m/0ytgt", "/m/01h8n0", "/m/02qldy", "/m/0261r1", "/m/0brhx"], + "restrictions": [] + }, + { + "id": "/m/05zppz", + "name": "Male speech, man speaking", + "description": "Speech uttered by an adult male human.", + "citation_uri": "", + "positive_examples": ["youtu.be/6niRPYpLOpQ?start=30&end=40", "youtu.be/fPgUDHwQOmM?start=320&end=330", "youtu.be/g3YNJ3ESIW0?start=30&end=40", "youtu.be/Dys9DJoFnGA?start=30&end=40", "youtu.be/3PESOesDniw?start=30&end=40", "youtu.be/E8RfeFAzHK4?start=30&end=40", "youtu.be/A_WSbkJozPs?start=90&end=100", "youtu.be/-yZuFZRojYA?start=30&end=40", "youtu.be/ECOK21oFttk?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02zsn", + "name": "Female speech, woman speaking", + "description": "Speech uttered by an adult female human.", + "citation_uri": "", + "positive_examples": ["youtu.be/4l05nCOnIRg?start=30&end=40", "youtu.be/5zyOhZsvIzI?start=30&end=40", "youtu.be/q0wv9AOl4r0?start=100&end=110", "youtu.be/AyPqm-tm2y4?start=30&end=40", "youtu.be/1xXqIpjxYG4?start=30&end=40", "youtu.be/xYPZJkfiyjk?start=90&end=100", "youtu.be/46lVxNflfDw?start=30&end=40", "youtu.be/8OpLsWMn6P0?start=30&end=40", "youtu.be/WLi5_z61zCE?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ytgt", + "name": "Child speech, kid speaking", + "description": "Speech uttered by a human child, i.e. 
a human whose voice does not yet resemble that of an adult.", + "citation_uri": "", + "positive_examples": ["youtu.be/iPIGoScYduI?start=210&end=220", "youtu.be/DiJgMH7vg1E?start=30&end=40", "youtu.be/yy0LZBmZRvE?start=60&end=70", "youtu.be/arz0xujSWdY?start=230&end=240", "youtu.be/-oC3FVOx62g?start=210&end=220", "youtu.be/5-R8zaohZrU?start=10&end=20", "youtu.be/OXJ77G_TqDc?start=40&end=50", "youtu.be/koW60o-aLjI?start=470&end=480", "youtu.be/uBVLhywbCQk?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01h8n0", + "name": "Conversation", + "description": "Interactive, spontaneous spoken communication between two or more people.", + "citation_uri": "http://en.wikipedia.org/wiki/Conversation", + "positive_examples": ["youtu.be/4FQxw_49xAk?start=30&end=40", "youtu.be/0WcG16XxghA?start=30&end=40", "youtu.be/RHfaNoYg8vs?start=170&end=180", "youtu.be/twGajApu-7g?start=560&end=570", "youtu.be/A0Lpt0VWYCA?start=50&end=60", "youtu.be/ermWzXxDjOQ?start=110&end=120", "youtu.be/sEF1KNfvY90?start=230&end=240", "youtu.be/DhlPa8lUosM?start=300&end=310"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02qldy", + "name": "Narration, monologue", + "description": "Speech by a single human aimed at an audience (either present or assumed, as in a video blog).", + "citation_uri": "http://en.wikipedia.org/wiki/Narration", + "positive_examples": ["youtu.be/8maPz8qrmnM?start=30&end=40", "youtu.be/8vmb60ydHtY?start=30&end=40", "youtu.be/fL52Quhhomc?start=30&end=40", "youtu.be/6tXGhWY1I8I?start=30&end=40", "youtu.be/FwRo7yXVnL0?start=30&end=40", "youtu.be/TKPjvQtnELU?start=360&end=370", "youtu.be/Cf6AAX-r6wk?start=30&end=40", "youtu.be/EmLsodfGoes?start=30&end=40", "youtu.be/20lm7Ow9RSY?start=30&end=40", "youtu.be/25K0bryItv4?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0261r1", + "name": "Babbling", + "description": "Speech-like sounds uttered by a human that lack the deeper structure and meaning of conventional speech. Babbling is a stage in a child's development of language.", + "citation_uri": "http://en.wikipedia.org/wiki/Babbling", + "positive_examples": ["youtu.be/ZR48r_8OhUI?start=50&end=60", "youtu.be/2ZxE0uMIfa0?start=10&end=20", "youtu.be/vFbu0dH7f-U?start=10&end=20", "youtu.be/soXBLBCaClA?start=90&end=100", "youtu.be/ICajcUYAan8?start=410&end=420", "youtu.be/lluzItlOMlQ?start=10&end=20", "youtu.be/EyCpVw_Xn60?start=130&end=140"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0brhx", + "name": "Speech synthesizer", + "description": "Artificially-produced human speech.", + "citation_uri": "http://en.wikipedia.org/wiki/Speech_synthesis", + "positive_examples": ["youtu.be/Y1ywO5WfUqE?start=90&end=100", "youtu.be/-2Krnsj-edU?start=40&end=50", "youtu.be/0npIcSXb1Bc?start=13&end=23", "youtu.be/0wVUdVYmZ5A?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07p6fty", + "name": "Shout", + "description": "Talk in a loud voice to deliberately command attention or overcome interfering sounds. 
Does not have the stronger emotional connotations of Yell or Scream.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=shout", + "positive_examples": ["youtu.be/-aY64NxBVas?start=480&end=490", "youtu.be/mPr3kCL5y10?start=210&end=220", "youtu.be/5YB13i75Nr4?start=60&end=70", "youtu.be/X04FOAQFn5I?start=130&end=140"], + "child_ids": ["/m/07q4ntr", "/m/07rwj3x", "/m/07sr1lc", "/m/04gy_2", "/t/dd00135"], + "restrictions": [] + }, + { + "id": "/m/07q4ntr", + "name": "Bellow", + "description": "A very loud utterance by a person that seems more animal than human.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=bellow", + "positive_examples": ["youtu.be/EquXTta7QU0?start=230&end=240", "youtu.be/gLl7DnPIZd4?start=20&end=30", "youtu.be/bCqNjZRWogs?start=30&end=40", "youtu.be/EzIa5SQC7R0?start=30&end=40", "youtu.be/rGbppkm9MYs?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rwj3x", + "name": "Whoop", + "description": "A loud hooting cry of exultation or excitement.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=whoop", + "positive_examples": ["youtu.be/TKwuq-O67VE?start=9&end=19", "youtu.be/yfwQgGVZtqI?start=30&end=40", "youtu.be/2Z0FSALnSmM?start=30&end=40", "youtu.be/TDJ746q3sAU?start=170&end=180", "youtu.be/pB4Bh36b1Ss?start=30&end=40", "youtu.be/4Gw8jFlJyLI?start=50&end=60", "youtu.be/7yGHDT7r2u8?start=1&end=11", "youtu.be/5yD_YeSuTzQ?start=150&end=160", "youtu.be/NPkO63FcFXM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07sr1lc", + "name": "Yell", + "description": "Utter or declare in a very loud voice that seems excessive or involuntary, or that is deliberately startling in order to command attention, as distinct from the pragmatically raised voice of Shout.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=yell", + "positive_examples": ["youtu.be/YMeuw-x86KU?start=30&end=40", "youtu.be/NMn-9tvzQZA?start=30&end=40", "youtu.be/AperXKEnTJ4?start=30&end=40", "youtu.be/pXawVUjQLok?start=30&end=40", "youtu.be/vOEF-FzwSlM?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04gy_2", + "name": "Battle cry", + "description": "A yell or chant taken up in battle, usually by members of the same combatant group, aiming at arousing aggression on one's own side and intimidating the enemy.", + "citation_uri": "http://en.wikipedia.org/wiki/Battle_cry", + "positive_examples": ["youtu.be/iEfkkyrecGA?start=210&end=220", "youtu.be/8-xVEzStUkg?start=10&end=20", "youtu.be/aNhVWZI5DXY?start=130&end=140", "youtu.be/4Gz_dmk3v_A?start=30&end=40", "youtu.be/RATOmpfWqUo?start=180&end=190", "youtu.be/DRQPYwom_mY?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00135", + "name": "Children shouting", + "description": "The boisterous vocalizations of a group of children, for instance in a playground.", + "citation_uri": "", + "positive_examples": ["youtu.be/W4AptCGYj5w?start=30&end=40", "youtu.be/w4VqralR-6E?start=10&end=20", "youtu.be/vsgHNEEOKxk?start=150&end=160", "youtu.be/7koyNkRHO8U?start=20&end=30", "youtu.be/Q8NC8aHonOU?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03qc9zr", + "name": "Screaming", + "description": "A sharp, high-pitched human vocalization; often an instinctive action indicating fear, pain, surprise, joy, anger, etc. 
Verbal content is absent or overwhelmed, unlike Shout and Yell.", + "citation_uri": "http://en.wikipedia.org/wiki/Screaming", + "positive_examples": ["youtu.be/ZHrf_y9q96Q?start=20&end=30", "youtu.be/GqTExeZqXzw?start=20&end=30", "youtu.be/vgpAqgBIpdM?start=220&end=230", "youtu.be/GUsI_2tYVYM?start=29&end=39", "youtu.be/52waxTykuLc?start=30&end=40", "youtu.be/auC_LgwFF8g?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02rtxlg", + "name": "Whispering", + "description": "Unvoiced speech, usually intended to be quieter to avoid causing disturbance or being overheard.", + "citation_uri": "http://en.wikipedia.org/wiki/Whispering", + "positive_examples": ["youtu.be/Tr6NMYItPJk?start=490&end=500", "youtu.be/wTjxa7yFyXQ?start=90&end=100", "youtu.be/a3xzicvE4Yk?start=270&end=280", "youtu.be/wAFz-WuuJ4o?start=210&end=220", "youtu.be/-26vNAAFWH0?start=350&end=360", "youtu.be/7S_iHrdfrvQ?start=30&end=40", "youtu.be/cus68zSHXDA?start=90&end=100", "youtu.be/CVGwF_-HXdo?start=540&end=550"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01j3sz", + "name": "Laughter", + "description": "The sound of rhythmical contractions of the diaphragm in response to stimuli such as tickling, or from humorous stories or thoughts.", + "citation_uri": "http://en.wikipedia.org/wiki/Laughter", + "positive_examples": ["youtu.be/37hqUFpvl6A?start=40&end=50", "youtu.be/sjAKBnCgKt8?start=220&end=230", "youtu.be/ULTB6y2aj8I?start=40&end=50", "youtu.be/c0V_HAul7rI?start=30&end=40", "youtu.be/SwZ0V8bc2QY?start=350&end=360", "youtu.be/aXS2iHSQE6g?start=10&end=20", "youtu.be/O1snVHXG2j4?start=390&end=400", "youtu.be/4e-qMxrus0A?start=80&end=90", "youtu.be/rPV1mbwWms0?start=60&end=70"], + "child_ids": ["/t/dd00001", "/m/07r660_", "/m/07s04w4", "/m/07sq110", "/m/07rgt08"], + "restrictions": [] + }, + { + "id": "/t/dd00001", + "name": "Baby laughter", + "description": "Laughter that is produced by a baby or infant.", + "citation_uri": "", + "positive_examples": ["youtu.be/2ZqInEYQDOM?start=30&end=40", "youtu.be/BRtGk2YmdJ0?start=20&end=30", "youtu.be/T89y6lwH1Zg?start=30&end=40", "youtu.be/lwb96iGdhZs?start=210&end=220", "youtu.be/m2OBZqJ8qgI?start=90&end=100", "youtu.be/kykRU_67e94?start=20&end=30", "youtu.be/RLiHf1zIuXY?start=50&end=60", "youtu.be/3dh5bfCN4Hs?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r660_", + "name": "Giggle", + "description": "A foolish or nervous laugh.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=giggle", + "positive_examples": ["youtu.be/XviVGrbiUmE?start=240&end=250", "youtu.be/UW7ioxBkBvc?start=410&end=420", "youtu.be/RDAnjeHmW5o?start=30&end=40", "youtu.be/fvg6SM8hnwM?start=30&end=40", "youtu.be/Ts-Y1-FHnUo?start=30&end=40", "youtu.be/K4sqsDcUfGM?start=30&end=40", "youtu.be/8DTw8Udr4rA?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s04w4", + "name": "Snicker", + "description": "A disrespectful laugh.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=snicker", + "positive_examples": ["youtu.be/0Xsny5M-hyE?start=140&end=150", "youtu.be/2sDFQiVB67I?start=80&end=90", "youtu.be/PgiVYe0HbNg?start=290&end=300", "youtu.be/ZMwa1h1Aa8I?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07sq110", + "name": "Belly laugh", + "description": "Very loud and abrupt laughter that encompasses much of the body.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=belly%20laugh", + "positive_examples": 
["youtu.be/x2guhF8m53o?start=90&end=100", "youtu.be/og_nfmnT-xA?start=180&end=190", "youtu.be/NUNcvnesc2I?start=310&end=320", "youtu.be/sWXpzTZWNAI?start=30&end=40", "youtu.be/I-XNolOQ1JQ?start=330&end=340", "youtu.be/h_zv_bxqtAM?start=440&end=450"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rgt08", + "name": "Chuckle, chortle", + "description": "Laugh quietly or with restraint.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=chuckle", + "positive_examples": ["youtu.be/C9WWt3dS1I8?start=50&end=60", "youtu.be/-iq2dAKozlc?start=50&end=60", "youtu.be/B_6FPEmhH70?start=40&end=50", "youtu.be/_zkfG4uVqA4?start=110&end=120", "youtu.be/XeKzuoN62QM?start=40&end=50", "youtu.be/3Ri8_DHI4fQ?start=30&end=40", "youtu.be/0WOwZJ9G2N8?start=110&end=120"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0463cq4", + "name": "Crying, sobbing", + "description": "Sound associated with the shedding of tears in response to an emotional state, arising from slow but erratic inhalation, occasional instances of breath holding and muscular tremor.", + "citation_uri": "http://en.wikipedia.org/wiki/Crying", + "positive_examples": ["youtu.be/RM6ekNDE2wc?start=15&end=25", "youtu.be/8Y2VGN-QsEs?start=10&end=20", "youtu.be/kEuZE_uu1rs?start=10&end=20", "youtu.be/5Fnc7yYDDvM?start=10&end=20", "youtu.be/rYyKWyIfVpU?start=10&end=20", "youtu.be/5KgGTZEsMsQ?start=10&end=20", "youtu.be/x9vxPUVx40Y?start=30&end=40", "youtu.be/fpWdNeGCKqc?start=40&end=50", "youtu.be/SfG3IkOprQc?start=60&end=70"], + "child_ids": ["/t/dd00002", "/m/07qz6j3"], + "restrictions": [] + }, + { + "id": "/t/dd00002", + "name": "Baby cry, infant cry", + "description": "The sound of a young child crying or bawling.", + "citation_uri": "", + "positive_examples": ["youtu.be/6T2jEKesKXY?start=30&end=40", "youtu.be/-mvGA8duqKM?start=29&end=39", "youtu.be/qc8n-R16jqI?start=30&end=40", "youtu.be/YFK6RwUBpHg?start=10&end=20", "youtu.be/VbtgXizyof4?start=30&end=40", "youtu.be/R6-KBbwNo1Q?start=30&end=40", "youtu.be/CmXA759VLZo?start=7&end=17", "youtu.be/qK4ag_CDBq4?start=30&end=40", "youtu.be/d-1zfgzgBgc?start=14&end=24"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qz6j3", + "name": "Whimper", + "description": "A plaintive, whining vocalization with some abrupt changes in loudness.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=whimper", + "positive_examples": ["youtu.be/Paa2p-PWR_A?start=110&end=120", "youtu.be/ym17PNCgwbQ?start=11&end=21", "youtu.be/ZucQqoFyibg?start=80&end=90", "youtu.be/DLKDQyjlmMc?start=30&end=40", "youtu.be/bEah4B4VQwg?start=30&end=40", "youtu.be/NLoDz8kI6hA?start=22&end=32", "youtu.be/YCvOvWQjUuw?start=50&end=60", "youtu.be/dFNG62WLBMk?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qw_06", + "name": "Wail, moan", + "description": "A weak or soft emotional vocalization, possibly indicating pain, sadness, or pleasure.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=wail", + "positive_examples": ["youtu.be/kLElQSpQEdI?start=110&end=120", "youtu.be/1SH36SQZxG4?start=510&end=520", "youtu.be/sArGQfiV2Ro?start=400&end=410", "youtu.be/7nZcwE5VxgM?start=470&end=480", "youtu.be/uiPxvdr6S8c?start=60&end=70", "youtu.be/VCOQOjLdIj0?start=510&end=520"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07plz5l", + "name": "Sigh", + "description": "An utterance made by exhaling audibly.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=sigh", + "positive_examples": 
["youtu.be/XOphuM8ZUhM?start=560&end=570", "youtu.be/giY25pWyJxM?start=140&end=150"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/015lz1", + "name": "Singing", + "description": "Musical sounds produced with the human voice, including sustained tonality, rhythm, and a variety of other vocal techniques.", + "citation_uri": "http://en.wikipedia.org/wiki/Singing", + "positive_examples": ["youtu.be/tYsnQUR-Ovg?start=210&end=220", "youtu.be/qo2MBnTtMrs?start=30&end=40", "youtu.be/XgPZjCl7Mog?start=110&end=120", "youtu.be/nJwUDaURmEU?start=160&end=170", "youtu.be/0ejZ-FIdeOE?start=80&end=90", "youtu.be/CzoJ3saJ4pw?start=70&end=80", "youtu.be/-62ed8TfM9E?start=100&end=110", "youtu.be/GpMPvnxkibA?start=190&end=200", "youtu.be/ZgxmMjcGBIk?start=30&end=40"], + "child_ids": ["/m/0l14jd", "/m/01swy6", "/m/02bk07", "/t/dd00003", "/t/dd00004", "/t/dd00005", "/t/dd00006", "/m/06bxc"], + "restrictions": [] + }, + { + "id": "/m/0l14jd", + "name": "Choir", + "description": "A musical ensemble of singers.", + "citation_uri": "http://en.wikipedia.org/wiki/Choir", + "positive_examples": ["youtu.be/BwSECmEnch0?start=30&end=40", "youtu.be/7KBnHYkfr88?start=280&end=290", "youtu.be/T1fzAEVrunY?start=40&end=50", "youtu.be/jvEf_n4skHM?start=210&end=220", "youtu.be/Kb8aE3C4aSw?start=370&end=380", "youtu.be/dZiS2K2eqhs?start=30&end=40", "youtu.be/OdhXgzLFed0?start=130&end=140", "youtu.be/Gx7yso0V26E?start=300&end=310", "youtu.be/94zsWCi4C6g?start=190&end=200"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01swy6", + "name": "Yodeling", + "description": "A form of singing that involves repeated and rapid changes of pitch between the low-pitch chest register and the high-pitch head register or falsetto.", + "citation_uri": "http://en.wikipedia.org/wiki/Yodeling", + "positive_examples": ["youtu.be/2eSbUlbLTKw?start=70&end=80", "youtu.be/DTzl8RG1oqA?start=70&end=80", "youtu.be/h3qfNR2JJdI?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02bk07", + "name": "Chant", + "description": "The rhythmic speaking or singing of words or sounds, often primarily on one or two main pitches called reciting tones.", + "citation_uri": "http://en.wikipedia.org/wiki/Chant", + "positive_examples": [], + "child_ids": ["/m/01c194"], + "restrictions": [] + }, + { + "id": "/m/01c194", + "name": "Mantra", + "description": "A melodic sacred utterance believed to have spiritual powers.", + "citation_uri": "http://en.wikipedia.org/wiki/Mantra", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00003", + "name": "Male singing", + "description": "Singing produced by an adult human male.", + "citation_uri": "", + "positive_examples": ["youtu.be/3MyiiJVkmZc?start=30&end=40", "youtu.be/Cxe11hqO8xM?start=40&end=50", "youtu.be/zwU8MqFhtLo?start=180&end=190", "youtu.be/7QYUijb6wEg?start=30&end=40", "youtu.be/5mvUQdbVvIs?start=30&end=40", "youtu.be/1Ct9cLEumKQ?start=30&end=40", "youtu.be/-KN8aM8jlRo?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00004", + "name": "Female singing", + "description": "Singing produced by an adult human female.", + "citation_uri": "", + "positive_examples": ["youtu.be/6Mp3JkuGIiI?start=30&end=40", "youtu.be/w9a0J7_9zN8?start=30&end=40", "youtu.be/-7xRJ-Osv1E?start=30&end=40", "youtu.be/G0wvmTxaLl4?start=30&end=40", "youtu.be/5j9weh6E9Yc?start=30&end=40", "youtu.be/-Y77HcFQZRo?start=30&end=40", "youtu.be/4MLHuuh6Al0?start=30&end=40", "youtu.be/6_YVfDV_GSA?start=6&end=16", 
"youtu.be/0ow_OtEBvRM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00005", + "name": "Child singing", + "description": "Singing produced by a young human whose voice has not yet reached its adult state.", + "citation_uri": "", + "positive_examples": ["youtu.be/0aqR4zaYEn4?start=30&end=40", "youtu.be/RmaUdIhQhsI?start=60&end=70", "youtu.be/5pIdH6p3kuo?start=30&end=40", "youtu.be/hnJhbClGpNI?start=90&end=100", "youtu.be/4d9EfAmGVFs?start=30&end=40", "youtu.be/77ojcYuZn7A?start=30&end=40", "youtu.be/B8wfW-vq_Pk?start=60&end=70", "youtu.be/-UFvq4-iS-Y?start=560&end=570", "youtu.be/265WZznzbko?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00006", + "name": "Synthetic singing", + "description": "Singing-like sounds produced by artificial means, such as an electronic synthesizer. Does not include pre-recorded sounds that originate in a human voice.", + "citation_uri": "", + "positive_examples": ["youtu.be/d6-bQMCz7j0?start=30&end=40", "youtu.be/-NC5li0b4eY?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06bxc", + "name": "Rapping", + "description": "The sound of \"spoken or chanted rhyming lyrics\", serving the role of a vocal line in music but with normal speech intonation instead of a melodic line.", + "citation_uri": "http://en.wikipedia.org/wiki/Rapping", + "positive_examples": ["youtu.be/o0UkYQyz7Go?start=10&end=20", "youtu.be/TlAoG8vYhZc?start=110&end=120", "youtu.be/x7RGnBNjxUc?start=40&end=50", "youtu.be/oqkwwyS91MU?start=100&end=110", "youtu.be/vUgzSosGFl0?start=100&end=110", "youtu.be/n75IvFxTnPc?start=500&end=510", "youtu.be/rQZOndx4yQk?start=220&end=230", "youtu.be/UABPePYISsQ?start=30&end=40", "youtu.be/gAX5Kae75y0?start=30&end=40", "youtu.be/IDhovk37whw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02fxyj", + "name": "Humming", + "description": "A wordless tone produced by forcing the sound to emerge from the nose, often with a melody.", + "citation_uri": "http://en.wikipedia.org/wiki/Humming", + "positive_examples": ["youtu.be/rDybDMElvEQ?start=30&end=40", "youtu.be/fDk8UFZFPqQ?start=30&end=40", "youtu.be/KRb26VscbJ8?start=120&end=130"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s2xch", + "name": "Groan", + "description": "An utterance expressing pain or disapproval. Specifically refers to humans.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=groan", + "positive_examples": ["youtu.be/UfJczyLcB40?start=170&end=180", "youtu.be/NrD9vrfyBLc?start=40&end=50", "youtu.be/AjQpqRcpG4E?start=30&end=40", "youtu.be/OHIIpJzP8_8?start=20&end=30", "youtu.be/Dp0FeEXOdgM?start=260&end=270", "youtu.be/vL1goz4n82k?start=140&end=150", "youtu.be/DkK4GuqTzHw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r4k75", + "name": "Grunt", + "description": "A short low gruff noise, resembling the sound made by animals such as pigs. Specifically refers to humans.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=grunt", + "positive_examples": ["youtu.be/qt6ffKY0Vhs?start=160&end=170", "youtu.be/IUcXh9Zz0cQ?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01j423", + "name": "Yawn", + "description": "The sound of a reflex consisting of the simultaneous inhalation of air and the stretching of the eardrums, followed by an exhalation of breath, commonly associated with tiredness. 
Specifically refers to humans.", + "citation_uri": "http://en.wikipedia.org/wiki/Yawn", + "positive_examples": ["youtu.be/nKdPjkVUbzc?start=90&end=100", "youtu.be/yMmc3uMa5xY?start=30&end=40", "youtu.be/C6Dh2wI6yss?start=70&end=80", "youtu.be/m5UQZ-zHd6I?start=13&end=23"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01w250", + "name": "Whistling", + "description": "High-pitched tone produced by blowing or sucking air through a small opening between the lips. Used to draw attention, as a personal tic, and to reproduce a melody.", + "citation_uri": "http://en.wikipedia.org/wiki/Whistling", + "positive_examples": ["youtu.be/MJJv-SQVbQY?start=140&end=150", "youtu.be/snXiUKVgEVI?start=30&end=40", "youtu.be/7lqma04srnA?start=380&end=390", "youtu.be/78F0DHyl8BU?start=30&end=40", "youtu.be/_UahfxFAx0g?start=10&end=20", "youtu.be/EJr6MusXp-o?start=26&end=36", "youtu.be/zhTVzULeZQU?start=12&end=22"], + "child_ids": ["/m/079vc8"], + "restrictions": [] + }, + { + "id": "/m/079vc8", + "name": "Wolf-whistling", + "description": "A two-toned whistle commonly made to show high interest or approval of something or someone (typically sexual).", + "citation_uri": "http://en.wikipedia.org/wiki/Wolf-whistling", + "positive_examples": ["youtu.be/wE527Sqn0E8?start=12&end=22", "youtu.be/YykyGidfpfw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09hlz4", + "name": "Respiratory sounds", + "description": "Sounds generated by the movement of air through the respiratory system (nose, mouth, trachea, and lungs).", + "citation_uri": "http://en.wikipedia.org/wiki/Respiratory_sounds", + "positive_examples": [], + "child_ids": ["/m/0lyf6", "/m/01b_21", "/m/01hsr_", "/m/07ppn3j"], + "restrictions": ["abstract"] + }, + { + "id": "/m/0lyf6", + "name": "Breathing", + "description": "Air being moved in and out of the lungs in order to sustain life.", + "citation_uri": "http://en.wikipedia.org/wiki/Breathing", + "positive_examples": ["youtu.be/hd2yDnpnkUU?start=30&end=40", "youtu.be/1PFsaGB6rIw?start=220&end=230", "youtu.be/ncyIS2o9m74?start=30&end=40", "youtu.be/fBLIOyO67Ro?start=90&end=100", "youtu.be/YhSBpiBlIec?start=10&end=20", "youtu.be/e1urSpgRg7M?start=40&end=50", "youtu.be/XUPDODziWEw?start=80&end=90"], + "child_ids": ["/m/07mzm6", "/m/01d3sd", "/m/07s0dtb", "/m/07pyy8b", "/m/07q0yl5"], + "restrictions": [] + }, + { + "id": "/m/07mzm6", + "name": "Wheeze", + "description": "A continuous, coarse, whistling sound produced by obstruction of the respiratory airways during breathing.", + "citation_uri": "http://en.wikipedia.org/wiki/Wheeze", + "positive_examples": ["youtu.be/NPVIL-WOOM4?start=30&end=40", "youtu.be/IvfCYhQXLd0?start=3&end=13", "youtu.be/6J9XeiKqzw0?start=18&end=28", "youtu.be/SVz7X1gqlcw?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01d3sd", + "name": "Snoring", + "description": "The sound resulting from obstructed respiratory airways during breathing while sleeping.", + "citation_uri": "http://en.wikipedia.org/wiki/Snoring", + "positive_examples": ["youtu.be/mn824q6UoMg?start=140&end=150", "youtu.be/TiJ-Szo9yHQ?start=20&end=30", "youtu.be/oFiA12sNPyo?start=20&end=30", "youtu.be/c5DPrfjcnsw?start=100&end=110", "youtu.be/9LoDdANZIwg?start=50&end=60", "youtu.be/qWLuleckhOc?start=110&end=120", "youtu.be/3553HVNfiP8?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s0dtb", + "name": "Gasp", + "description": "A short labored intake of breath with the mouth open. 
Often indicates shock or surprise.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=gasp", + "positive_examples": ["youtu.be/4bU0HWzCXMc?start=170&end=180", "youtu.be/soVRoIbewMM?start=50&end=60", "youtu.be/EvkxkkJp-v8?start=190&end=200"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pyy8b", + "name": "Pant", + "description": "Fast and noisy breathing, often associated with physical exertion.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=pant", + "positive_examples": ["youtu.be/LCB8rW5cvWk?start=30&end=40", "youtu.be/rgOSgFy0S3Q?start=350&end=360", "youtu.be/gOB0nM0PRTc?start=30&end=40", "youtu.be/e90rd071P-E?start=20&end=30", "youtu.be/jRcrVp2LZTo?start=50&end=60", "youtu.be/Kn-WVGzPVZg?start=20&end=30", "youtu.be/JWP10M-t0jo?start=110&end=120", "youtu.be/yOO9UmPZfns?start=460&end=470"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07q0yl5", + "name": "Snort", + "description": "The sound of a single, hard exhalation through the nose.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=snort", + "positive_examples": ["youtu.be/2PXcJCAVf38?start=50&end=60", "youtu.be/AJIRlmjloBo?start=300&end=310", "youtu.be/3xvs8X-u8NA?start=140&end=150", "youtu.be/fSilzq77oTE?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01b_21", + "name": "Cough", + "description": "A sudden and often repetitively occurring reflex which consists of an inhalation, a forced exhalation against a closed glottis, and a violent release of air following opening of the glottis.", + "citation_uri": "http://en.wikipedia.org/wiki/Cough", + "positive_examples": ["youtu.be/c-yI09r8D5w?start=70&end=80", "youtu.be/NIzPRH3ko6s?start=50&end=60", "youtu.be/cLBKVLHFABc?start=40&end=50", "youtu.be/WkaNCu_1l68?start=10&end=20", "youtu.be/vA_AIILl76w?start=20&end=30"], + "child_ids": ["/m/0dl9sf8"], + "restrictions": [] + }, + { + "id": "/m/0dl9sf8", + "name": "Throat clearing", + "description": "Forced exhalation through a partially-closed glottis to disrupt built-up mucus, or used as a social signal to draw attention to one's presence.", + "citation_uri": "", + "positive_examples": ["youtu.be/xAj1xof1FeU?start=510&end=520", "youtu.be/D40HPmkVjEU?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01hsr_", + "name": "Sneeze", + "description": "A convulsive expulsion of air from the lungs through the nose and mouth, usually caused by foreign particles irritating the nasal mucous.", + "citation_uri": "http://en.wikipedia.org/wiki/Sneeze", + "positive_examples": ["youtu.be/8714J5_siEY?start=40&end=50", "youtu.be/SG3MrCBKskE?start=70&end=80", "youtu.be/gyF_tG0M9cM?start=10&end=20", "youtu.be/qDXFW2kemrE?start=250&end=260", "youtu.be/StJLcW5FKoI?start=20&end=30", "youtu.be/HMmbLN0NoYo?start=130&end=140"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07ppn3j", + "name": "Sniff", + "description": "Inhale audibly through the nose.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=sniff", + "positive_examples": ["youtu.be/BDrbBzzJ3hg?start=210&end=220", "youtu.be/MH33LSP9CiI?start=120&end=130", "youtu.be/-xyTvA1F16M?start=590&end=600"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0bpl036", + "name": "Human locomotion", + "description": "Sounds resulting from human body actions involved in changing location.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/06h7j", "/m/07qv_x_", "/m/07pbtc8"], + "restrictions": ["abstract"] + }, + { + 
"id": "/m/06h7j", + "name": "Run", + "description": "Move fast by using one's feet.", + "citation_uri": "http://en.wikipedia.org/wiki/Running", + "positive_examples": ["youtu.be/F4Rqp8W59rY?start=30&end=40", "youtu.be/Q_29jnpYobY?start=30&end=40", "youtu.be/5uXUyEf8bzY?start=30&end=40", "youtu.be/9TDFjw6eF5M?start=30&end=40", "youtu.be/3cjFmZe9muo?start=300&end=310", "youtu.be/_rABcqr2InE?start=250&end=260"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qv_x_", + "name": "Shuffle", + "description": "Walk while dragging the feet, often indicating impaired mobility.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=shuffle", + "positive_examples": ["youtu.be/SJRNlFrlL6I?start=1&end=11", "youtu.be/u-xy9MsPuAI?start=7&end=17"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pbtc8", + "name": "Walk, footsteps", + "description": "The sound of feet or shoes contacting the ground in conventional human locomotion.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=walk", + "positive_examples": ["youtu.be/Dl_-9HuKqD0?start=220&end=230", "youtu.be/ulYCw2ZEiz4?start=50&end=60", "youtu.be/0qes0wPqdEs?start=30&end=40", "youtu.be/XYm5hJfJaFo?start=80&end=90", "youtu.be/DBRvVR2KHeg?start=230&end=240", "youtu.be/V851WPfpbwA?start=230&end=240", "youtu.be/Ix5uPjC1gHI?start=70&end=80", "youtu.be/2Z4THg6VA1A?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0160x5", + "name": "Digestive", + "description": "Sounds associated with the human function of eating and processing nutrition (food).", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/03cczk", "/m/07pdhp0", "/m/0939n_", "/m/01g90h", "/m/03q5_w", "/m/02p3nc", "/m/02_nn"], + "restrictions": ["abstract"] + }, + { + "id": "/m/03cczk", + "name": "Chewing, mastication", + "description": "Food being crushed and ground by teeth.", + "citation_uri": "http://en.wikipedia.org/wiki/Mastication", + "positive_examples": ["youtu.be/EBnrA85wsc4?start=530&end=540", "youtu.be/EkCIh67xI_w?start=30&end=40", "youtu.be/fCQMflXjGU8?start=460&end=470", "youtu.be/A7MnWIjUVw8?start=40&end=50", "youtu.be/q2m5ChTEe4k?start=450&end=460", "youtu.be/mE092i7zPqY?start=19&end=29", "youtu.be/G3it26Z8E8U?start=30&end=40", "youtu.be/5xAEW3g6en8?start=100&end=110", "youtu.be/9lKmiUIJfGc?start=280&end=290"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pdhp0", + "name": "Biting", + "description": "Gripping or chewing off with the teeth and jaws.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=biting", + "positive_examples": ["youtu.be/3CkwNL_UGoA?start=30&end=40", "youtu.be/WoA9_cYAlfc?start=240&end=250", "youtu.be/XMGG2w26ea4?start=30&end=40", "youtu.be/PRsBCvps7D0?start=30&end=40", "youtu.be/BH4UwESvYw8?start=160&end=170", "youtu.be/X8ulg-9etoU?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0939n_", + "name": "Gargling", + "description": "Air from the lungs bubbling through a liquid in the mouth.", + "citation_uri": "http://en.wikipedia.org/wiki/Gargling", + "positive_examples": ["youtu.be/Dz8filbrC98?start=60&end=70", "youtu.be/e8saB9V1UXo?start=70&end=80", "youtu.be/SVSjNao4ZFQ?start=80&end=90", "youtu.be/Rk6O2IWjwFg?start=520&end=530", "youtu.be/ACNcxeQ-Rns?start=20&end=30", "youtu.be/bQR7moK_WZg?start=30&end=40", "youtu.be/QS--7Slaacc?start=30&end=40", "youtu.be/AbNB97ANUDA?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01g90h", + "name": "Stomach rumble", + "description": 
"A rumbling, growling or gurgling noise produced by movement of the contents of the gastro-intestinal tract.", + "citation_uri": "http://en.wikipedia.org/wiki/Stomach_rumble", + "positive_examples": ["youtu.be/Gq9nl91baGg?start=110&end=120", "youtu.be/kCXc7_Xie2s?start=12&end=22", "youtu.be/Fk8-06ipZYo?start=400&end=410", "youtu.be/2_Z3pMnrWKY?start=400&end=410"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03q5_w", + "name": "Burping, eructation", + "description": "The release of gas from the digestive tract through the mouth.", + "citation_uri": "http://en.wikipedia.org/wiki/Burping", + "positive_examples": ["youtu.be/IsnR6Ck3j44?start=160&end=170", "youtu.be/8xMdlr7YTr4?start=70&end=80", "youtu.be/uhOh3x9I6bw?start=20&end=30", "youtu.be/cQAiS0dk8Zc?start=130&end=140", "youtu.be/1WI9921OeRc?start=570&end=580", "youtu.be/N_ggss-wQmQ?start=40&end=50", "youtu.be/Xo6WBR_cFXQ?start=80&end=90", "youtu.be/X9oZGgZij0c?start=220&end=230", "youtu.be/IP-zbN5rVNs?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02p3nc", + "name": "Hiccup", + "description": "A strong, involuntary contraction of the diaphragm that may repeat several times per minute.", + "citation_uri": "http://en.wikipedia.org/wiki/Hiccup", + "positive_examples": ["youtu.be/m8yH5TLD1oQ?start=190&end=200", "youtu.be/g1mJnZVGQkk?start=30&end=40", "youtu.be/JQ2XH_Ci_tk?start=80&end=90", "youtu.be/7gdSJ30FfNU?start=460&end=470", "youtu.be/VMdN2OuP6fk?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02_nn", + "name": "Fart", + "description": "The release of gasses produced by digestion through the anus; flatulence.", + "citation_uri": "http://en.wikipedia.org/wiki/Fart", + "positive_examples": ["youtu.be/xOkFu3ahJAs?start=50&end=60", "youtu.be/O9kJ6tBGGR0?start=70&end=80", "youtu.be/DKnViWndkPc?start=10&end=20", "youtu.be/qtLaibhO5hI?start=30&end=40", "youtu.be/ZMi9XHLZzSk?start=20&end=30", "youtu.be/Le9QaCTQFQ4?start=150&end=160"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0k65p", + "name": "Hands", + "description": "Sounds originating primarily from the hands and fingers of humans.", + "citation_uri": "", + "positive_examples": ["youtu.be/tHASgPEwR_4?start=60&end=70", "youtu.be/s9ODY2HdI7g?start=30&end=40", "youtu.be/qPsooNWBc90?start=40&end=50", "youtu.be/S-n7Rkg32tE?start=230&end=240", "youtu.be/nGmhfmCVnnc?start=30&end=40", "youtu.be/O9z0AAuWG20?start=420&end=430", "youtu.be/PU-4TfhBjrA?start=60&end=70", "youtu.be/cJAm1VXhvGA?start=150&end=160", "youtu.be/KSOxq1eovtw?start=30&end=40"], + "child_ids": ["/m/025_jnm", "/m/0l15bq"], + "restrictions": [] + }, + { + "id": "/m/025_jnm", + "name": "Finger snapping", + "description": "A snapping or clicking sound usually caused by building tension between the thumb and another finger and then moving the other finger forcefully downward to hits the palm at high speed.", + "citation_uri": "http://en.wikipedia.org/wiki/Finger_snapping", + "positive_examples": ["youtu.be/Q6hIvsdfCF8?start=10&end=20", "youtu.be/p42W37beu88?start=90&end=100", "youtu.be/vxrM-HU9ITQ?start=30&end=40", "youtu.be/UPJl7MtJwhw?start=30&end=40", "youtu.be/p6VIZQAV5PQ?start=30&end=40", "youtu.be/lmSmwPsWvII?start=83&end=93", "youtu.be/JwWoO-dafDc?start=190&end=200", "youtu.be/_e7dRfvWLPo?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l15bq", + "name": "Clapping", + "description": "A percussive sound made by a human striking together the palms of their two hands, often quickly and repeatedly to 
express appreciation or approval. This category is to be used for distinct claps by a single person; for a group of people clapping together, use Applause.", + "citation_uri": "http://en.wikipedia.org/wiki/Clapping", + "positive_examples": ["youtu.be/ATHjFAM3ehk?start=80&end=90", "youtu.be/TCI_qydc554?start=20&end=30", "youtu.be/8byxt0IjVcQ?start=30&end=40", "youtu.be/0aSF6X4-Ksg?start=50&end=60", "youtu.be/-FRGpQEpEWM?start=30&end=40", "youtu.be/ikdaGO5jUYU?start=80&end=90", "youtu.be/3qEb6Y-D7Mw?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01jg02", + "name": "Heart sounds, heartbeat", + "description": "Noises generated by the beating heart and the resultant flow of blood through it.", + "citation_uri": "http://en.wikipedia.org/wiki/Heart_sounds", + "positive_examples": ["youtu.be/40YB3ROwAbE?start=30&end=40", "youtu.be/g9IEEN6Zm8E?start=100&end=110", "youtu.be/T9I8C9l3fR0?start=30&end=40", "youtu.be/nFjBft9nEhs?start=30&end=40", "youtu.be/PkgYuDzOvnc?start=30&end=40", "youtu.be/sbcqHphoMDA?start=100&end=110"], + "child_ids": ["/m/01jg1z"], + "restrictions": [] + }, + { + "id": "/m/01jg1z", + "name": "Heart murmur", + "description": "Heart sounds produced by abnormal blood flow across one of the heart valves, typically detected by a stethoscope.", + "citation_uri": "http://en.wikipedia.org/wiki/Heart_murmur", + "positive_examples": ["youtu.be/tpvMXSfxGOg?start=30&end=40", "youtu.be/m_QTr-0Y_Ek?start=110&end=120"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/04xp5v", + "name": "Otoacoustic emission", + "description": "A sound generated within the inner ear, generally understood to result from positive feedback in the highly-sensitive ear mechanism.", + "citation_uri": "http://en.wikipedia.org/wiki/Otoacoustic_emission", + "positive_examples": [], + "child_ids": ["/m/0pv6y"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0pv6y", + "name": "Tinnitus, ringing in the ears", + "description": "A sound perceived when no external sound is present, often described as ringing, usually related to short-term trauma or longer-lasting damage to the ear.", + "citation_uri": "http://en.wikipedia.org/wiki/Tinnitus", + "positive_examples": ["youtu.be/4XBH8x2mcZk?start=60&end=70", "youtu.be/8WFmDgu2Z9Y?start=10&end=20", "youtu.be/ZEHdCaVwtvs?start=270&end=280", "youtu.be/eCWPT9pS7so?start=40&end=50", "youtu.be/OE5fIoveLoM?start=30&end=40"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/t/dd00012", + "name": "Human group actions", + "description": "Sounds produced by groups of humans rather than individuals.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/0l15bq", "/m/053hz1", "/m/028ght", "/m/07rkbfh", "/m/03qtwd", "/m/07qfr4h", "/m/04v5dt", "/t/dd00013", "/t/dd00135"], + "restrictions": ["abstract"] + }, + { + "id": "/m/053hz1", + "name": "Cheering", + "description": "Encouraging, stimulating, or exciting vocalizations indicating approval or welcoming persons, announcements, etc.", + "citation_uri": "http://en.wikipedia.org/wiki/Cheering", + "positive_examples": ["youtu.be/YtItqbSzKBs?start=190&end=200", "youtu.be/7yntRBheP3M?start=30&end=40", "youtu.be/rJ20qPKtLnU?start=70&end=80", "youtu.be/HxPdX8caIaM?start=10&end=20", "youtu.be/Sdl9uYLySyU?start=30&end=40", "youtu.be/OqpuMxjAYTs?start=180&end=190", "youtu.be/oxBsRnUc_6U?start=250&end=260", "youtu.be/4ul8uXkumZM?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/028ght", + "name": "Applause", + "description": 
"Group expression of approval by the collective act of clapping by a group.", + "citation_uri": "http://en.wikipedia.org/wiki/Applause", + "positive_examples": ["youtu.be/lAD74Ey6bL4?start=510&end=520", "youtu.be/7WSH77zPigA?start=30&end=40", "youtu.be/kqzL3xznfiU?start=350&end=360", "youtu.be/0-ygGIzPk6Y?start=30&end=40", "youtu.be/Uzj6a_Btyqs?start=450&end=460", "youtu.be/e1gdVg7DcV4?start=360&end=370", "youtu.be/Rb2L0PmwcvA?start=420&end=430", "youtu.be/ep3rfNAyXGs?start=40&end=50", "youtu.be/rpoTKm-QB7Y?start=15&end=25"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rkbfh", + "name": "Chatter", + "description": "The combined sound of casual speech by many people in individual simultaneous conversations.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=chatter", + "positive_examples": ["youtu.be/ViYMLK4dujU?start=200&end=210", "youtu.be/XXsfmD0Bss0?start=400&end=410", "youtu.be/NyEcj25jlSY?start=40&end=50", "youtu.be/Fvbyp4w6EaM?start=220&end=230", "youtu.be/3YCmyruGOK8?start=140&end=150", "youtu.be/VmPeBqOwI-Y?start=140&end=150"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03qtwd", + "name": "Crowd", + "description": "The sound of a large group of people gathered together.", + "citation_uri": "http://en.wikipedia.org/wiki/Crowd", + "positive_examples": ["youtu.be/kUKWkSKQEJY?start=310&end=320", "youtu.be/V5PKN1kIPh8?start=120&end=130", "youtu.be/y_betJ7lbvs?start=190&end=200", "youtu.be/xGPtos-dfDU?start=360&end=370", "youtu.be/qlsptFT16nA?start=350&end=360", "youtu.be/yNkD_yqgUk0?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qfr4h", + "name": "Hubbub, speech noise, speech babble", + "description": "Loud, disordered, unintelligible speech noise from many sources.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=hubbub", + "positive_examples": ["youtu.be/7kw_my9mSSY?start=30&end=40", "youtu.be/9heeSXrNtXY?start=60&end=70", "youtu.be/_frFEY2PG-8?start=140&end=150", "youtu.be/WIKHKccnhCg?start=190&end=200", "youtu.be/DTmnaeDISTY?start=20&end=30", "youtu.be/4Csr25pn41Q?start=30&end=40", "youtu.be/c8CsTpDQlaE?start=280&end=290", "youtu.be/ca5yYSLOcpY?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04v5dt", + "name": "Booing", + "description": "Vocal display of displeasure for someone or something, generally an entertainer, by loudly yelling \"boo!\" or making other noises of disparagement.", + "citation_uri": "http://en.wikipedia.org/wiki/Booing", + "positive_examples": ["youtu.be/-Cz0muXWdeU?start=310&end=320", "youtu.be/X6xMdMyx4Q8?start=190&end=200", "youtu.be/07ckQxmU9KM?start=23&end=33"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00013", + "name": "Children playing", + "description": "The sound of children at play, including any of vocalization or the sounds of their activities.", + "citation_uri": "", + "positive_examples": ["youtu.be/oxj5zL56UNQ?start=400&end=410", "youtu.be/MkLjgMq-XWo?start=20&end=30", "youtu.be/ZUu13n3vyT8?start=290&end=300", "youtu.be/H867X9MA2x4?start=130&end=140", "youtu.be/SjkBir6CfV8?start=20&end=30", "youtu.be/sQDMqzfpd-Q?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0jbk", + "name": "Animal", + "description": "All sound produced by the bodies and actions of nonhuman animals.", + "citation_uri": "", + "positive_examples": ["youtu.be/kECE3qqOknU?start=240&end=250", "youtu.be/gDGP5i3GeGM?start=30&end=40", "youtu.be/Fup9UWyLXjA?start=190&end=200", 
"youtu.be/YCJs-dwWEks?start=30&end=40", "youtu.be/DNfdqCO9P18?start=30&end=40", "youtu.be/-oRgzBsFneA?start=22&end=32", "youtu.be/ao4g6YKxso8?start=20&end=30", "youtu.be/7j1G8xvOlYw?start=15&end=25", "youtu.be/7nC42YgzxrA?start=120&end=130"], + "child_ids": ["/m/068hy", "/m/0ch8v", "/m/01280g"], + "restrictions": [] + }, + { + "id": "/m/068hy", + "name": "Domestic animals, pets", + "description": "Sounds of animals kept in close proximity to humans for company, protection, and entertainment.", + "citation_uri": "http://en.wikipedia.org/wiki/Pet", + "positive_examples": ["youtu.be/3SQwwK4JU2I?start=50&end=60", "youtu.be/88AWuXFZkTU?start=17&end=27", "youtu.be/Byo0kKnIASM?start=10&end=20", "youtu.be/6Flj55tSIQE?start=10&end=20", "youtu.be/3Cu8KEB9C74?start=70&end=80", "youtu.be/bTLrBsAy-cE?start=30&end=40", "youtu.be/xwJUplydiuw?start=70&end=80"], + "child_ids": ["/m/0bt9lr", "/m/01yrx"], + "restrictions": [] + }, + { + "id": "/m/0bt9lr", + "name": "Dog", + "description": "Any sounds coming from the familiar domesticated canid which has been selectively bred over millennia for companionship, protection, as well as for superior sensory capabilities, and other useful behaviors.", + "citation_uri": "http://en.wikipedia.org/wiki/Dog", + "positive_examples": ["youtu.be/2M9XExD5qZc?start=30&end=40", "youtu.be/x5Z7h8G-tYs?start=30&end=40", "youtu.be/8UQNzfJX-O8?start=80&end=90", "youtu.be/7h0FRbJSy7I?start=30&end=40", "youtu.be/iyg_5TVi7_Q?start=450&end=460", "youtu.be/1spRiYwd09c?start=70&end=80", "youtu.be/TsePRLNK3Ho?start=40&end=50", "youtu.be/7ajTgyQVczE?start=50&end=60", "youtu.be/azavLi-x9Cw?start=310&end=320"], + "child_ids": ["/m/05tny_", "/m/07r_k2n", "/m/07qf0zm", "/m/07rc7d9", "/m/0ghcn6", "/t/dd00136", "/m/07srf8z"], + "restrictions": [] + }, + { + "id": "/m/05tny_", + "name": "Bark", + "description": "Principal communication sound produced by dogs. 
Often transliterated as woof, especially for large dogs.", + "citation_uri": "http://en.wikipedia.org/wiki/Bark_(sound)", + "positive_examples": ["youtu.be/PGfUJCl4M2E?start=80&end=90", "youtu.be/o-a6xZnOvP4?start=40&end=50", "youtu.be/G8BSfIQcDI0?start=110&end=120", "youtu.be/NKfyi7q0JNU?start=20&end=30", "youtu.be/IYFq35F80hY?start=20&end=30", "youtu.be/YlgYRxOe7vA?start=170&end=180", "youtu.be/RVKEE-0WdLM?start=30&end=40", "youtu.be/CecFyrZB-FM?start=60&end=70"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r_k2n", + "name": "Yip", + "description": "A sharp high-pitched bark or cry, typically from a miniature dog.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=yip", + "positive_examples": ["youtu.be/5ROJv9qD0e8?start=30&end=40", "youtu.be/9_5DNgJO-Gs?start=15&end=25", "youtu.be/-5IoPSM59xM?start=9&end=19"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qf0zm", + "name": "Howl", + "description": "The long plaintive cry of a dog, wolf, or other canidae.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=howl", + "positive_examples": ["youtu.be/i0j8eJ97eBg?start=20&end=30", "youtu.be/ZhgQ8G03Zjw?start=10&end=20", "youtu.be/UT43tXUgAqk?start=50&end=60", "youtu.be/TmcPW-sbKUw?start=30&end=40", "youtu.be/PWiI1jGK8kQ?start=60&end=70", "youtu.be/5ztulE7OhaA?start=50&end=60", "youtu.be/-hDuDDv0lbQ?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rc7d9", + "name": "Bow-wow", + "description": "Dog communication sound that is more tonal and less abrupt than a classic bark.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=bow-wow", + "positive_examples": ["youtu.be/c8T5BXR0Oxo?start=30&end=40", "youtu.be/2D-Zq-hS5vY?start=30&end=40", "youtu.be/FtqlvNd8meU?start=230&end=240", "youtu.be/N-5sU5MnwtE?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ghcn6", + "name": "Growling", + "description": "A low, guttural vocalization produced by animals as a warning, a sign of aggression, or to express anger.", + "citation_uri": "http://en.wikipedia.org/wiki/Growling", + "positive_examples": ["youtu.be/awGgmjI0oaI?start=19&end=29", "youtu.be/a9R4pBS_nnM?start=10&end=20", "youtu.be/goy6XN59IFY?start=490&end=500", "youtu.be/H-BEdsuYFFw?start=40&end=50", "youtu.be/YfxejAWkNcY?start=20&end=30", "youtu.be/jL6K_C9hRFs?start=80&end=90", "youtu.be/kPkat62eHrc?start=10&end=20", "youtu.be/xdl3MP9X1DM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00136", + "name": "Whimper (dog)", + "description": "Muted dog vocalization indicating submission, fear, or pain.", + "citation_uri": "", + "positive_examples": ["youtu.be/2VwfCvSyQ2U?start=30&end=40", "youtu.be/pstdKyySdbQ?start=30&end=40", "youtu.be/-PveIRMYamI?start=15&end=25", "youtu.be/-BL3E3sLc78?start=30&end=40", "youtu.be/63VwvFLgIak?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07srf8z", + "name": "Bay", + "description": "The sound made by a hound on the scent.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=bay", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01yrx", + "name": "Cat", + "description": "Sounds associated with the familiar, furry, domestic carnivorous mammal.", + "citation_uri": "http://en.wikipedia.org/wiki/Cat", + "positive_examples": ["youtu.be/5abeJmVq3hU?start=100&end=110", "youtu.be/NdJ910zuXrk?start=40&end=50", "youtu.be/_DkdN717Pj8?start=10&end=20", 
"youtu.be/ZJcDVzElozo?start=120&end=130", "youtu.be/mGcIhQw0d5U?start=40&end=50", "youtu.be/hsAnIv1pOx0?start=30&end=40", "youtu.be/nAKQA8W1Vqs?start=27&end=37", "youtu.be/fgdBZgILLzs?start=30&end=40"], + "child_ids": ["/m/02yds9", "/m/07qrkrw", "/m/07rjwbb", "/m/0f25s6", "/m/07r81j2", "/m/0ghcn6"], + "restrictions": [] + }, + { + "id": "/m/02yds9", + "name": "Purr", + "description": "A tonal fluttering sound made by cats to indicate contentment or relaxed pleasure.", + "citation_uri": "http://en.wikipedia.org/wiki/Purr", + "positive_examples": ["youtu.be/suUMt34qu5c?start=30&end=40", "youtu.be/U80msVD7UwY?start=370&end=380", "youtu.be/RJcWUsq9S7k?start=90&end=100", "youtu.be/IEMsniLsvrg?start=20&end=30", "youtu.be/FFUPnvY3dXA?start=270&end=280", "youtu.be/OEHhFtvE50A?start=30&end=40", "youtu.be/I_tDt5W2x2Y?start=10&end=20", "youtu.be/EHziu6xjukk?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qrkrw", + "name": "Meow", + "description": "The classic tonal communication sound made by cats.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=meow", + "positive_examples": ["youtu.be/b3dYlDbU_78?start=30&end=40", "youtu.be/wDbhmulc2Lk?start=30&end=40", "youtu.be/8y_kK7fV1QU?start=10&end=20", "youtu.be/_zgFmCdbFSk?start=16&end=26", "youtu.be/95EUjkKsr1c?start=2&end=12", "youtu.be/AOj_Kpbg9xU?start=30&end=40", "youtu.be/T-7ed-4B_oc?start=30&end=40", "youtu.be/-w3qkf-V-PY?start=30&end=40", "youtu.be/3n--reua_a0?start=27&end=37"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rjwbb", + "name": "Hiss", + "description": "A fricative sound, such as from a cat giving warning, or an audience indicating disapproval.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=hiss", + "positive_examples": ["youtu.be/7fuyNj3y8z8?start=30&end=40", "youtu.be/sNFrUosKoq4?start=140&end=150", "youtu.be/3GkJeelJCB0?start=30&end=40", "youtu.be/uN4LHeFWBHY?start=30&end=40", "youtu.be/l4fED3GwuwY?start=30&end=40", "youtu.be/-dN7rnGYtNQ?start=30&end=40", "youtu.be/5wScdWqlS0c?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0f25s6", + "name": "Cat communication", + "description": "Other sound deliberately made by cats to communicate with other animals.", + "citation_uri": "http://en.wikipedia.org/wiki/Cat_communication", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07r81j2", + "name": "Caterwaul", + "description": "The yowling sound made by a cat in heat.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=caterwaul", + "positive_examples": ["youtu.be/02dEE23cyGU?start=2&end=12", "youtu.be/u3huL-igOL0?start=160&end=170", "youtu.be/1tC7bTilFWk?start=30&end=40", "youtu.be/Vvyroqu2QN4?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ch8v", + "name": "Livestock, farm animals, working animals", + "description": "Sound made by animals associated with farms and agriculture.", + "citation_uri": "http://en.wikipedia.org/wiki/Livestock", + "positive_examples": ["youtu.be/T5xYIPkOfXc?start=60&end=70", "youtu.be/lug9rO_5UGU?start=50&end=60", "youtu.be/rm9Bwksyl_A?start=30&end=40", "youtu.be/o58iPCU6vOs?start=60&end=70", "youtu.be/rs4Bpk_OXfc?start=30&end=40", "youtu.be/zUinGkfkq7Y?start=20&end=30", "youtu.be/RqYjn1aDHLY?start=50&end=60"], + "child_ids": ["/m/03k3r", "/m/0ffhf", "/m/01xq0k1", "/m/068zj", "/m/03fwl", "/m/07bgp", "/m/025rv6n"], + "restrictions": [] + }, + { + "id": "/m/03k3r", + "name": "Horse", + "description": 
"Sounds associated with the familiar domesticated large, fast, and strong ungulate mammal.", + "citation_uri": "http://en.wikipedia.org/wiki/Horse", + "positive_examples": ["youtu.be/yajlx6BNbPI?start=160&end=170", "youtu.be/Da48zrkaU3E?start=340&end=350", "youtu.be/14ZSZeRJiG4?start=300&end=310", "youtu.be/dzSssD-ITNo?start=30&end=40", "youtu.be/5cCq9hL1Jxg?start=2&end=12", "youtu.be/3cc2GDUvdHo?start=14&end=24"], + "child_ids": ["/m/07rv9rh", "/m/07q5rw0", "/t/dd00139", "/t/dd00140"], + "restrictions": [] + }, + { + "id": "/m/07rv9rh", + "name": "Clip-clop", + "description": "The sound of a horse's hoofs hitting on a hard surface.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=clip-clop", + "positive_examples": ["youtu.be/06JrdfCNSso?start=3&end=13", "youtu.be/HRFAq0D6ROY?start=30&end=40", "youtu.be/-qbPc4nh5ac?start=13&end=23", "youtu.be/DuWkYCyOrN0?start=30&end=40", "youtu.be/Ej-VJMQVxuc?start=30&end=40", "youtu.be/0O5xF0PdtBg?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07q5rw0", + "name": "Neigh, whinny", + "description": "The characteristic communication sound made by a horse.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=neigh", + "positive_examples": ["youtu.be/8I88xIoT3qM?start=110&end=120", "youtu.be/7wfM1zUa_zo?start=30&end=40", "youtu.be/CRlC9US2Vgs?start=30&end=40", "youtu.be/6L_HclNjdWg?start=30&end=40", "youtu.be/DcNVcRK4F4g?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00139", + "name": "Snort (horse)", + "description": "The sound of a rapid exhale through a horse's nose.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=snort", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00140", + "name": "Nicker", + "description": "A soft neighing sound made by a horse.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=nicker", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ffhf", + "name": "Donkey, ass", + "description": "Sounds associated with the domesticated working animal that is a smaller member of the horse family.", + "citation_uri": "http://en.wikipedia.org/wiki/Sheep", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01xq0k1", + "name": "Cattle, bovinae", + "description": "Sounds associated with the most common type of large domesticated ungulates.", + "citation_uri": "http://en.wikipedia.org/wiki/Cattle", + "positive_examples": ["youtu.be/6Sny_-shK20?start=100&end=110", "youtu.be/pX9hIaLLBHU?start=20&end=30", "youtu.be/fSQaU5F914s?start=60&end=70", "youtu.be/ZyP7RBGTrgk?start=420&end=430", "youtu.be/Yu7FiBS9Qjg?start=50&end=60", "youtu.be/oLm3xrAkH0c?start=70&end=80", "youtu.be/rae7_kchsho?start=4&end=14", "youtu.be/LC4vDpnkDtM?start=90&end=100"], + "child_ids": ["/m/07rpkh9", "/m/0239kh", "/m/01hhp3"], + "restrictions": [] + }, + { + "id": "/m/07rpkh9", + "name": "Moo", + "description": "The classic communication sound made by a cow or bull.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=moo", + "positive_examples": ["youtu.be/nWhh5mJP-x0?start=13&end=23", "youtu.be/ev_fiaC03oQ?start=29&end=39", "youtu.be/eXLPS2FHCaQ?start=25&end=35", "youtu.be/8lIh0qRN7PE?start=220&end=230", "youtu.be/GSAd8G_JsFs?start=8&end=18", "youtu.be/wDxqpIVXXEU?start=19&end=29", "youtu.be/s_SLQwB8PlI?start=7&end=17", "youtu.be/eI7IkWkGEpY?start=16&end=26"], + "child_ids": [], + "restrictions": [] + }, + { + "id": 
"/m/0239kh", + "name": "Cowbell", + "description": "A simple metal bell historically tied around the necks of cattle to keep track of their whereabouts.", + "citation_uri": "http://en.wikipedia.org/wiki/Cowbell_(instrument)", + "positive_examples": ["youtu.be/uXyeDpCw83c?start=50&end=60", "youtu.be/NSrLZ4Dc_FI?start=60&end=70", "youtu.be/bFGQQRHryuk?start=70&end=80", "youtu.be/KWRI6Lj8GEk?start=11&end=21", "youtu.be/pdOskdFwRPg?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01hhp3", + "name": "Yak", + "description": "The sound of the long-haired bovid found throughout the Himalaya region of southern Central Asia.", + "citation_uri": "http://en.wikipedia.org/wiki/Yak", + "positive_examples": ["youtu.be/BV3sltuQaJE?start=20&end=30", "youtu.be/ZcULeCFl-1o?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/068zj", + "name": "Pig", + "description": "Sounds associated with the highly intelligent, very common, domesticated relative of the boar.", + "citation_uri": "http://en.wikipedia.org/wiki/Pig", + "positive_examples": ["youtu.be/nV9B3G4L51c?start=80&end=90", "youtu.be/DX2WJrS04rU?start=270&end=280", "youtu.be/4C1WjCNy3i8?start=520&end=530", "youtu.be/rLhXPcqUK8A?start=40&end=50", "youtu.be/-pDS3yN4B1s?start=50&end=60", "youtu.be/FK9IFAJyD7w?start=100&end=110", "youtu.be/HuKFVWSbXK8?start=120&end=130"], + "child_ids": ["/t/dd00018"], + "restrictions": [] + }, + { + "id": "/t/dd00018", + "name": "Oink", + "description": "Common vocalization sound associated with pigs.", + "citation_uri": "", + "positive_examples": ["youtu.be/5Dl9LrilWZ4?start=30&end=40", "youtu.be/NG204aLoF1A?start=30&end=40", "youtu.be/p48WaCOxqfk?start=30&end=40", "youtu.be/l-N4TZ2y-5c?start=10&end=20", "youtu.be/r-uANPTedEA?start=30&end=40", "youtu.be/_YbVlWmj-jA?start=30&end=40", "youtu.be/sVnc5_CN3K0?start=4&end=14", "youtu.be/SMQfBU8YhUY?start=30&end=40", "youtu.be/jqAfRUCPONk?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03fwl", + "name": "Goat", + "description": "Sounds of the medium-sized domesticated bovidae.", + "citation_uri": "http://en.wikipedia.org/wiki/Goat", + "positive_examples": ["youtu.be/pa8vtRejCe0?start=250&end=260", "youtu.be/s1-fRJvMRjU?start=340&end=350", "youtu.be/TYu_V6-nH68?start=130&end=140", "youtu.be/CFD5LdJ4c6g?start=240&end=250", "youtu.be/HflbP7GyruI?start=360&end=370", "youtu.be/Jftp72Bumqs?start=60&end=70", "youtu.be/gsljeuRL8B8?start=40&end=50", "youtu.be/oDUlhlroLJ4?start=370&end=380"], + "child_ids": ["/m/07q0h5t"], + "restrictions": [] + }, + { + "id": "/m/07q0h5t", + "name": "Bleat", + "description": "Communicative vocalization of sheep and goats.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=bleat", + "positive_examples": ["youtu.be/MUuAhLSHoIk?start=30&end=40", "youtu.be/czUsMpKy3cI?start=30&end=40", "youtu.be/T2YD9v6ae5U?start=4&end=14", "youtu.be/do67PhyvEy4?start=30&end=40", "youtu.be/jY58mai0TEY?start=8&end=18", "youtu.be/xgya66dtxqk?start=400&end=410", "youtu.be/fuHQHczH3qw?start=24&end=34"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07bgp", + "name": "Sheep", + "description": "Sounds associated with the familiar quadrupedal, ruminant mammal typically kept as livestock.", + "citation_uri": "http://en.wikipedia.org/wiki/Sheep", + "positive_examples": ["youtu.be/P2UCPNoBdag?start=30&end=40", "youtu.be/WLxh17NMGA4?start=9&end=19", "youtu.be/6LC34Fb6G0Y?start=30&end=40", "youtu.be/Ee8uGUrgy9w?start=30&end=40", "youtu.be/97KbMMfCle4?start=30&end=40", 
"youtu.be/TdHQYarFtTk?start=11&end=21"], + "child_ids": ["/m/07q0h5t"], + "restrictions": [] + }, + { + "id": "/m/025rv6n", + "name": "Fowl", + "description": "Sounds associated with the bird species commonly raised for human consumption.", + "citation_uri": "http://en.wikipedia.org/wiki/Fowl", + "positive_examples": ["youtu.be/McKxt245g4w?start=30&end=40", "youtu.be/GBNj5Bsy6Fk?start=30&end=40", "youtu.be/X31JvVFCg-w?start=510&end=520", "youtu.be/gLdEFc0LNhI?start=100&end=110", "youtu.be/pd1CH9ukqEw?start=140&end=150", "youtu.be/V1F-0D3G0wc?start=220&end=230", "youtu.be/r2zp3mOVp0g?start=30&end=40", "youtu.be/XZzicSdJnIE?start=10&end=20", "youtu.be/JjGjX6lI-V8?start=230&end=240", "youtu.be/WTKUjbur9_c?start=30&end=40", "youtu.be/cU9AJygC17I?start=130&end=140", "youtu.be/B1TLRcWkYEI?start=20&end=30", "youtu.be/CLlaHYLiThk?start=30&end=40"], + "child_ids": ["/m/09b5t", "/m/01rd7k", "/m/09ddx", "/m/0dbvp"], + "restrictions": [] + }, + { + "id": "/m/09b5t", + "name": "Chicken, rooster", + "description": "Sounds of the very common and widespread domesticated fowl, raised for its eggs and meat.", + "citation_uri": "http://en.wikipedia.org/wiki/Chicken", + "positive_examples": ["youtu.be/jmcpNukZnxA?start=400&end=410", "youtu.be/ky48d3dPjuE?start=20&end=30", "youtu.be/LFscB8kM8Rg?start=22&end=32", "youtu.be/Jur2s-PBjrw?start=26&end=36", "youtu.be/AAsnk4u8lWk?start=60&end=70", "youtu.be/dzNfwINix_I?start=20&end=30", "youtu.be/8mgRjPyCf1E?start=180&end=190", "youtu.be/AfIilScU_Vg?start=10&end=20", "youtu.be/Mb5le4wgNw4?start=27&end=37"], + "child_ids": ["/m/07st89h", "/m/07qn5dc"], + "restrictions": [] + }, + { + "id": "/m/07st89h", + "name": "Cluck", + "description": "The soft, low-pitched, and repetitive sound made by chickens.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=cluck", + "positive_examples": ["youtu.be/kf9UZ4KULLE?start=70&end=80", "youtu.be/3xAjsgDxa-k?start=30&end=40", "youtu.be/h10aRwbbCII?start=230&end=240", "youtu.be/pDYOiUrK08c?start=60&end=70", "youtu.be/HmMyQjUUYXI?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qn5dc", + "name": "Crowing, cock-a-doodle-doo", + "description": "The multi-syllable warning call made by a rooster (male chicken).", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=cock-a-doodle-doo", + "positive_examples": ["youtu.be/iYv9pCf8EGg?start=210&end=220", "youtu.be/onu5WVubMBc?start=170&end=180", "youtu.be/RXo_RIfsF5w?start=220&end=230", "youtu.be/aQZoGTQPAss?start=30&end=40", "youtu.be/U9zltf0ENc0?start=30&end=40", "youtu.be/a_Lz2H2hwuE?start=1&end=11", "youtu.be/dX-hbl17gPA?start=5&end=15", "youtu.be/EMVnxrYkzoM?start=13&end=23", "youtu.be/Q7d8Cpmd-Iw?start=25&end=35", "youtu.be/HdgmT8_o5zQ?start=60&end=70"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01rd7k", + "name": "Turkey", + "description": "Sounds associated with the large poultry bird, originally domesticated in the Americas.", + "citation_uri": "http://en.wikipedia.org/wiki/Domesticated_turkey", + "positive_examples": ["youtu.be/g13AFjJIpMU?start=19&end=29", "youtu.be/V1F-0D3G0wc?start=220&end=230", "youtu.be/r2zp3mOVp0g?start=30&end=40", "youtu.be/XZzicSdJnIE?start=10&end=20", "youtu.be/JjGjX6lI-V8?start=230&end=240"], + "child_ids": ["/m/07svc2k"], + "restrictions": [] + }, + { + "id": "/m/07svc2k", + "name": "Gobble", + "description": "The characteristic sound made by a turkey cock.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=gobble", + "positive_examples": 
["youtu.be/Y-q_sD0qBFY?start=30&end=40", "youtu.be/0jWAWkL8Pfw?start=11&end=21", "youtu.be/d5qR0aff0Kk?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09ddx", + "name": "Duck", + "description": "Sounds associated with this collection of waterfowl species.", + "citation_uri": "http://en.wikipedia.org/wiki/Duck", + "positive_examples": ["youtu.be/ACBC8no-jDs?start=140&end=150", "youtu.be/7v0S0E5ROXU?start=30&end=40", "youtu.be/L9AKbLaXCWs?start=40&end=50", "youtu.be/UiqRyK6SJjI?start=13&end=23", "youtu.be/320LuZSMlio?start=200&end=210", "youtu.be/320LuZSMlio?start=50&end=60", "youtu.be/S9cndpKCDTE?start=40&end=50"], + "child_ids": ["/m/07qdb04"], + "restrictions": [] + }, + { + "id": "/m/07qdb04", + "name": "Quack", + "description": "The harsh communication sound made by a duck.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=quack", + "positive_examples": ["youtu.be/rBod0icgIsE?start=28&end=38", "youtu.be/k8O07-MW96Q?start=30&end=40", "youtu.be/h2GONUHtAe0?start=29&end=39", "youtu.be/OWFFBIDXwt0?start=30&end=40", "youtu.be/7OSkKoylZmo?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dbvp", + "name": "Goose", + "description": "Sounds associated with this species of larger waterfowl.", + "citation_uri": "http://en.wikipedia.org/wiki/Goose", + "positive_examples": ["youtu.be/WTKUjbur9_c?start=30&end=40", "youtu.be/cU9AJygC17I?start=130&end=140", "youtu.be/B1TLRcWkYEI?start=20&end=30", "youtu.be/CLlaHYLiThk?start=30&end=40", "youtu.be/ImKzYS6swjg?start=430&end=440", "youtu.be/8UJKZE773Ro?start=110&end=120", "youtu.be/s9sbPkAALxw?start=110&end=120"], + "child_ids": ["/m/07qwf61"], + "restrictions": [] + }, + { + "id": "/m/07qwf61", + "name": "Honk", + "description": "The cry of a goose.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=honk", + "positive_examples": ["youtu.be/Yvd3CNp4kT8?start=30&end=40", "youtu.be/DnMzD1SMmsY?start=13&end=23", "youtu.be/Cz1TdU8GjiA?start=30&end=40", "youtu.be/4JTY-UtZLPg?start=30&end=40", "youtu.be/VRJQZQcnv5s?start=30&end=40", "youtu.be/o5dJd0ZXHrc?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01280g", + "name": "Wild animals", + "description": "Sounds associated with animals not normally raised by humans.", + "citation_uri": "http://en.wikipedia.org/wiki/Wildlife", + "positive_examples": ["youtu.be/89IuqwavDU8?start=10&end=20", "youtu.be/8R6H_3neiL8?start=20&end=30", "youtu.be/kQ0eoHaF9X8?start=60&end=70", "youtu.be/bSusH6RqC64?start=360&end=370", "youtu.be/Ku1l5EnaZAQ?start=170&end=180", "youtu.be/UKZrOQfUlIM?start=30&end=40", "youtu.be/JZuwem3Y9Rc?start=250&end=260", "youtu.be/hxnKQk51cMg?start=70&end=80"], + "child_ids": ["/m/0cdnk", "/m/015p6", "/m/01z5f", "/m/06hps", "/m/03vt0", "/m/09ld4", "/m/078jl", "/m/032n05"], + "restrictions": [] + }, + { + "id": "/m/0cdnk", + "name": "Roaring cats (lions, tigers)", + "description": "Sounds associated with the larger and more ferocious wild relatives of cats.", + "citation_uri": "http://en.wikipedia.org/wiki/Panthera", + "positive_examples": ["youtu.be/R7LrWG51wi4?start=190&end=200", "youtu.be/rSu1Cr43S2s?start=10&end=20", "youtu.be/ztymBfhfdPY?start=10&end=20", "youtu.be/5e_s-Aaic4k?start=90&end=100", "youtu.be/__-8wVOYH3c?start=70&end=80", "youtu.be/pQBQBVcSjLk?start=290&end=300", "youtu.be/OrKEBfUj95I?start=120&end=130"], + "child_ids": ["/m/04cvmfc", "/m/0ghcn6"], + "restrictions": [] + }, + { + "id": "/m/04cvmfc", + "name": "Roar", + "description": "A deep, bellowing outburst of 
sound forced through an open mouth, made by a lion or other \"big cat\" species.", + "citation_uri": "http://en.wikipedia.org/wiki/Roar_(utterance)", + "positive_examples": ["youtu.be/V_K53wcR_hc?start=200&end=210", "youtu.be/IFQGJ_jeTMQ?start=370&end=380", "youtu.be/JbPNmYKs7P0?start=38&end=48"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/015p6", + "name": "Bird", + "description": "Sounds associated with the large group of flying vertebrates.", + "citation_uri": "http://en.wikipedia.org/wiki/Bird", + "positive_examples": ["youtu.be/UPYKFMdmkBI?start=30&end=40", "youtu.be/h3DVqb55DRQ?start=10&end=20", "youtu.be/aBQ9m59MrWw?start=40&end=50", "youtu.be/P1HP0zaWpW4?start=150&end=160", "youtu.be/W1Wp4zzxtZc?start=10&end=20", "youtu.be/U0YJy6vQA4c?start=20&end=30", "youtu.be/orgwzt45ojE?start=30&end=40", "youtu.be/9yPUjp9LLsE?start=310&end=320", "youtu.be/hjicO_5mHF8?start=100&end=110", "youtu.be/TSbkLVHTETY?start=30&end=40"], + "child_ids": ["/m/020bb7", "/m/0h0rv", "/m/04s8yn", "/m/09d5_", "/m/01dwxx", "/m/05_wcq"], + "restrictions": [] + }, + { + "id": "/m/020bb7", + "name": "Bird vocalization, bird call, bird song", + "description": "Bird communication calls, often considered melodious to the human ear.", + "citation_uri": "http://en.wikipedia.org/wiki/Bird_vocalization", + "positive_examples": ["youtu.be/ouPPVBBOODI?start=40&end=50", "youtu.be/s2QGitj2jUc?start=170&end=180", "youtu.be/wT52hDSLFTo?start=340&end=350", "youtu.be/492jB4DXETM?start=240&end=250", "youtu.be/uQB9xtJvgUk?start=30&end=40", "youtu.be/LCFO6o812y0?start=350&end=360", "youtu.be/OqoxikHu7eY?start=20&end=30", "youtu.be/QtZ6qZdacwc?start=70&end=80", "youtu.be/2bUbSeUd9n8?start=30&end=40"], + "child_ids": ["/m/07pggtn", "/m/07sx8x_"], + "restrictions": [] + }, + { + "id": "/m/07pggtn", + "name": "Chirp, tweet", + "description": "A brief sound as made by a small bird.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=chirp", + "positive_examples": ["youtu.be/TSGfF4opE18?start=120&end=130", "youtu.be/DAmwRR8EWlc?start=380&end=390", "youtu.be/YWipp1KTXa8?start=480&end=490", "youtu.be/f6tGN34_izc?start=90&end=100", "youtu.be/QS8t1QDrxfo?start=130&end=140", "youtu.be/AbQBlpdjdu0?start=40&end=50", "youtu.be/sqghcdp_C8I?start=110&end=120", "youtu.be/Eh44C8ocwdM?start=550&end=560", "youtu.be/_JkYH257o2c?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07sx8x_", + "name": "Squawk", + "description": "Abrasive, unpleasant sharp sound produced by a bird.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=squawk", + "positive_examples": ["youtu.be/RtxzI2FngL4?start=20&end=30", "youtu.be/9EqiEH9fgwc?start=270&end=280", "youtu.be/apb5dycPX-0?start=7&end=17", "youtu.be/KaL17J_SMzo?start=400&end=410", "youtu.be/pw6ZPTpn0bE?start=580&end=590"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0h0rv", + "name": "Pigeon, dove", + "description": "Sounds associated with these species of birds having short necks and short, slender bills.", + "citation_uri": "http://en.wikipedia.org/wiki/Columbidae", + "positive_examples": ["youtu.be/CF_cw6igH9s?start=30&end=40", "youtu.be/ngXDfqnWHEU?start=30&end=40", "youtu.be/K2xIzdu-Ga0?start=30&end=40", "youtu.be/LZEGjYrMM8o?start=30&end=40", "youtu.be/F5LddF9IU80?start=30&end=40", "youtu.be/mmPHyB4iAPA?start=280&end=290", "youtu.be/gI7kgvkz1DI?start=30&end=40", "youtu.be/jrdY2k4s37c?start=30&end=40", "youtu.be/jDY9LdYDqhI?start=270&end=280"], + "child_ids": ["/m/07r_25d"], + "restrictions": [] + }, + { + "id": 
"/m/07r_25d", + "name": "Coo", + "description": "The sound made by a pigeon or dove.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=coo", + "positive_examples": ["youtu.be/AvwSHC92GlY?start=100&end=110", "youtu.be/IHAteeW3JME?start=30&end=40", "youtu.be/R5OI1ftOAPs?start=30&end=40", "youtu.be/LcaLdDWhTb4?start=30&end=40", "youtu.be/qhuSO2o3nN8?start=30&end=40", "youtu.be/qizXoi1Re6s?start=30&end=40", "youtu.be/TeLKSjNW35g?start=9&end=19"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04s8yn", + "name": "Crow", + "description": "Sounds associated with the species of medium-to-large birds, Corvus.", + "citation_uri": "http://en.wikipedia.org/wiki/Corvus", + "positive_examples": ["youtu.be/NJgk6yyMcIY?start=490&end=500", "youtu.be/EdvEBEGRWkM?start=30&end=40", "youtu.be/EdWgYZ4ffls?start=60&end=70", "youtu.be/iDYQtYqbFMA?start=40&end=50", "youtu.be/VHQjG81NcXE?start=30&end=40", "youtu.be/g6hFofVOmVU?start=170&end=180"], + "child_ids": ["/m/07r5c2p"], + "restrictions": [] + }, + { + "id": "/m/07r5c2p", + "name": "Caw", + "description": "The relatively harsh sound made by corvine birds.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=caw", + "positive_examples": ["youtu.be/xpWvIgTbq80?start=30&end=40", "youtu.be/VYiC9fRYZwM?start=10&end=20", "youtu.be/F7TE4bTue80?start=7&end=17", "youtu.be/gyW4t5ogLyE?start=30&end=40", "youtu.be/zQjpa4I__Ag?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09d5_", + "name": "Owl", + "description": "Sounds associated with the mostly solitary and nocturnal carnivorous birds from the order Strigiformes.", + "citation_uri": "http://en.wikipedia.org/wiki/Owl", + "positive_examples": ["youtu.be/PJRrqdRbh0Q?start=40&end=50", "youtu.be/LEJWmpErDRY?start=30&end=40", "youtu.be/HQuy1_EDCkA?start=30&end=40", "youtu.be/hVsaTVUOzxk?start=30&end=40", "youtu.be/ZLXmK5dVGe0?start=30&end=40", "youtu.be/yw-hqbvLLbc?start=30&end=40"], + "child_ids": ["/m/07r_80w"], + "restrictions": [] + }, + { + "id": "/m/07r_80w", + "name": "Hoot", + "description": "The loud raucous cry of an owl.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=hoot", + "positive_examples": ["youtu.be/m_39THY6PfY?start=30&end=40", "youtu.be/Zhu5ogju62A?start=1&end=11", "youtu.be/ILv7AQF-ltI?start=30&end=40", "youtu.be/dOeLFbMH09A?start=25&end=35"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01dwxx", + "name": "Gull, seagull", + "description": "Sounds associated with the family of seabirds often encountered in coastal regions.", + "citation_uri": "http://en.wikipedia.org/wiki/Owl", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05_wcq", + "name": "Bird flight, flapping wings", + "description": "Sounds caused by a bird's use of its wings to travel through the air.", + "citation_uri": "http://en.wikipedia.org/wiki/Bird_flight", + "positive_examples": ["youtu.be/BUM9FcAvxJc?start=50&end=60", "youtu.be/SwZxKE3CnEg?start=80&end=90", "youtu.be/RBs0R_be7B0?start=50&end=60", "youtu.be/v_IaZzdchY8?start=30&end=40", "youtu.be/miNSZxKnDsM?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01z5f", + "name": "Canidae, dogs, wolves", + "description": "Sounds associated with the wild relatives of domesticated dogs.", + "citation_uri": "http://en.wikipedia.org/wiki/Canidae", + "positive_examples": ["youtu.be/OFBbWkmHWok?start=40&end=50", "youtu.be/IRcPJNZkCTc?start=40&end=50", "youtu.be/L_1BdQxn5Is?start=30&end=40", 
"youtu.be/0HMa_BIbTo0?start=60&end=70", "youtu.be/ExcQZFswFks?start=60&end=70", "youtu.be/HV346cLZxpA?start=30&end=40", "youtu.be/zxEWn0uqw0A?start=30&end=40", "youtu.be/CqKcyXt6laA?start=14&end=24"], + "child_ids": ["/m/07qf0zm", "/m/0ghcn6"], + "restrictions": [] + }, + { + "id": "/m/06hps", + "name": "Rodents, rats, mice", + "description": "Sounds associated with the small mammals of the order Rodentia, found in vast numbers on all continents except Antarctica.", + "citation_uri": "http://en.wikipedia.org/wiki/Rodent", + "positive_examples": ["youtu.be/BGEMgl1xjac?start=340&end=350", "youtu.be/mrcePXm-R0Y?start=110&end=120", "youtu.be/g3ITqDViDAk?start=210&end=220", "youtu.be/U3zf6nQ6pUE?start=10&end=20", "youtu.be/xtzjBhvhBmY?start=150&end=160"], + "child_ids": ["/m/04rmv", "/m/02021", "/m/07r4gkf"], + "restrictions": [] + }, + { + "id": "/m/04rmv", + "name": "Mouse", + "description": "Sounds associated with the familiar small rodent.", + "citation_uri": "http://en.wikipedia.org/wiki/Mouse", + "positive_examples": ["youtu.be/CtfoWY3GqM4?start=60&end=70", "youtu.be/hvBFjdYnW4U?start=30&end=40", "youtu.be/KU4WtmR753Y?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02021", + "name": "Chipmunk", + "description": "Sounds associated with the small, striped rodents of the family Sciuridae.", + "citation_uri": "http://en.wikipedia.org/wiki/Chipmunk", + "positive_examples": ["youtu.be/36XjVkEAC_A?start=13&end=23", "youtu.be/6-X5_NZA9Pw?start=2&end=12", "youtu.be/kipCBDj28-U?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r4gkf", + "name": "Patter", + "description": "A quick succession of light rapid sounds, usually suggesting the locomotion of small animals.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=patter", + "positive_examples": ["youtu.be/l5CAOayPe4E?start=50&end=60", "youtu.be/2ay8uwJNX1E?start=30&end=40"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/03vt0", + "name": "Insect", + "description": "Sounds associated with invertebrates from the arthropod phylum that have a chitinous exoskeleton, a three-part body, three pairs of jointed legs, compound eyes, and one pair of antennae.", + "citation_uri": "http://en.wikipedia.org/wiki/Insect", + "positive_examples": ["youtu.be/hIBwx8JqEJI?start=410&end=420", "youtu.be/Df8ayuKMUUs?start=50&end=60", "youtu.be/2VZeKEW4crE?start=50&end=60", "youtu.be/T455LBU8lfU?start=230&end=240", "youtu.be/CL7qusrB888?start=20&end=30", "youtu.be/WVQz8LM_SNI?start=70&end=80"], + "child_ids": ["/m/09xqv", "/m/09f96", "/m/0h2mp", "/m/01h3n"], + "restrictions": [] + }, + { + "id": "/m/09xqv", + "name": "Cricket", + "description": "Sounds associated with these relatives of grasshoppers characterized by enlarged hind legs, providing power for jumping. 
Crickets are mainly nocturnal, and are best known for the loud, persistent, chirping song of males trying to attract females.", + "citation_uri": "http://en.wikipedia.org/wiki/Cricket_(insect)", + "positive_examples": ["youtu.be/B3aFxHuR7bY?start=30&end=40", "youtu.be/LxUJwOOpJws?start=110&end=120", "youtu.be/jypEvhhECH4?start=70&end=80", "youtu.be/5JTshf6gISY?start=260&end=270", "youtu.be/X0xdCeFkk08?start=450&end=460"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09f96", + "name": "Mosquito", + "description": "Sounds associated with these small, midge-like flies whose tube-like mouth parts pierce the hosts' skin to consume blood.", + "citation_uri": "http://en.wikipedia.org/wiki/Mosquito", + "positive_examples": ["youtu.be/OIkpYphVHVI?start=30&end=40", "youtu.be/crtrt0HbAtg?start=30&end=40", "youtu.be/hq3e3aKFMjQ?start=30&end=40", "youtu.be/ccmAp6PBdDI?start=30&end=40", "youtu.be/p41gzO8pd4k?start=21&end=31"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0h2mp", + "name": "Fly, housefly", + "description": "Sounds associated with these insects of the order Diptera, which use only a single pair of wings to fly.", + "citation_uri": "http://en.wikipedia.org/wiki/Fly", + "positive_examples": ["youtu.be/OAuVRzogngc?start=340&end=350", "youtu.be/6UOObVAQS_M?start=30&end=40", "youtu.be/41G_oW9IQrk?start=20&end=30", "youtu.be/kiMG-hDyOPk?start=120&end=130", "youtu.be/U24BkLFGDtg?start=490&end=500"], + "child_ids": ["/m/07pjwq1"], + "restrictions": [] + }, + { + "id": "/m/07pjwq1", + "name": "Buzz", + "description": "The sound of rapid vibration, commonly the wings of a flying insect.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=buzz", + "positive_examples": ["youtu.be/hOqnxZQerhw?start=30&end=40", "youtu.be/PEiKSjl0x0M?start=3&end=13", "youtu.be/tHk9vA5xIPU?start=30&end=40", "youtu.be/-BZuLO9Uvnw?start=30&end=40", "youtu.be/A_n04sE1oyw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01h3n", + "name": "Bee, wasp, etc.", + "description": "Sounds associated with these closely-related species of flying insects known for pollination, honey-production, and the ability to sting humans.", + "citation_uri": "http://en.wikipedia.org/wiki/Bee", + "positive_examples": ["youtu.be/GWmgGN3qwLk?start=280&end=290", "youtu.be/rdwmkUJzLRQ?start=400&end=410", "youtu.be/CqkChVlZ0Zc?start=480&end=490", "youtu.be/uGS7O46tlSo?start=70&end=80", "youtu.be/Uax2BiqOKEg?start=130&end=140", "youtu.be/N2arYiB9j-g?start=100&end=110"], + "child_ids": ["/m/07pjwq1"], + "restrictions": [] + }, + { + "id": "/m/09ld4", + "name": "Frog", + "description": "Sounds associated with the short-bodied, tailless amphibians composing the order Anura.", + "citation_uri": "http://en.wikipedia.org/wiki/Frog", + "positive_examples": ["youtu.be/duywiEdH7kU?start=9&end=19", "youtu.be/-IHAaycWLLo?start=30&end=40", "youtu.be/5NPWv-2F3eM?start=30&end=40", "youtu.be/gff36H5kddQ?start=26&end=36", "youtu.be/JIVTdjGj2eM?start=30&end=40", "youtu.be/T_maPCFQreY?start=60&end=70", "youtu.be/Li8cODFJBJk?start=60&end=70"], + "child_ids": ["/m/07st88b"], + "restrictions": [] + }, + { + "id": "/m/07st88b", + "name": "Croak", + "description": "The harsh hoarse utterance of a frog.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=croak", + "positive_examples": ["youtu.be/_pFSFgPWnjE?start=30&end=40", "youtu.be/Lc2gZT8Tloc?start=30&end=40", "youtu.be/cX3Zrrld1Tk?start=9&end=19", "youtu.be/QfkpNf93T-Y?start=30&end=40", "youtu.be/_Av6uyAugVQ?start=30&end=40", 
"youtu.be/BA8YrMd7KVk?start=320&end=330", "youtu.be/_SNr4XpbqNQ?start=30&end=40", "youtu.be/WCYs6hXENGc?start=11&end=21"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/078jl", + "name": "Snake", + "description": "Sounds associated with the elongated, legless, carnivorous reptiles of the suborder Serpentes.", + "citation_uri": "http://en.wikipedia.org/wiki/Snake", + "positive_examples": ["youtu.be/6_TjTbIt-xo?start=30&end=40", "youtu.be/rf1OS5Crog8?start=30&end=40", "youtu.be/w3Bdu-s9Tpk?start=30&end=40", "youtu.be/sn0_4kZwmCw?start=90&end=100"], + "child_ids": ["/m/07rjwbb", "/m/07qn4z3"], + "restrictions": [] + }, + { + "id": "/m/07qn4z3", + "name": "Rattle", + "description": "A rapid series of short loud sounds as of small hard items loose inside a container.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=rattle", + "positive_examples": ["youtu.be/cmNsnLQSFjw?start=4&end=14", "youtu.be/TSrlEhhOF5U?start=20&end=30", "youtu.be/8lhdsS_xrvg?start=30&end=40", "youtu.be/f5pPVPk_MY4?start=160&end=170", "youtu.be/uCpEA5JND_c?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/032n05", + "name": "Whale vocalization", + "description": "Sounds produced by whales for different kinds of communication.", + "citation_uri": "http://en.wikipedia.org/wiki/Whale_vocalization", + "positive_examples": ["youtu.be/SayTcZUipZI?start=250&end=260", "youtu.be/H20tJk2cu-U?start=30&end=40", "youtu.be/u4-VEvmu9kU?start=30&end=40", "youtu.be/hx-UWi3VS8o?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04rlf", + "name": "Music", + "description": "Music is an art form and cultural activity whose medium is sound and silence. The common elements of music are pitch, rhythm, dynamics, and the sonic qualities of timbre and texture.", + "citation_uri": "http://en.wikipedia.org/wiki/Music", + "positive_examples": ["youtu.be/E83fRA1wxKU?start=420&end=430", "youtu.be/uNve6Iyh5jA?start=170&end=180", "youtu.be/xMsVXjoVvBY?start=570&end=580", "youtu.be/MM1Fi3tOaKU?start=30&end=40", "youtu.be/obPlMMjBfJo?start=30&end=40", "youtu.be/3MTKFrRID6Y?start=30&end=40", "youtu.be/jz0EDSpW79Q?start=570&end=580", "youtu.be/-UVOFRthq_c?start=30&end=40", "youtu.be/9QxbRgSE0I0?start=30&end=40", "youtu.be/8P-psJ8v8ZA?start=30&end=40"], + "child_ids": ["/m/04szw", "/m/0kpv1t", "/t/dd00027", "/t/dd00028", "/t/dd00030"], + "restrictions": [] + }, + { + "id": "/m/04szw", + "name": "Musical instrument", + "description": "Sounds specifically associated with instruments created, adapted or otherwise used to make musical sounds.", + "citation_uri": "http://en.wikipedia.org/wiki/Musical_instrument", + "positive_examples": ["youtu.be/7umb9yJpkKU?start=70&end=80", "youtu.be/obWlyVlIbXI?start=40&end=50", "youtu.be/5UMC51-0gtY?start=130&end=140", "youtu.be/7RedoApAa5g?start=50&end=60", "youtu.be/kkEwhEEe42E?start=50&end=60", "youtu.be/2SebmDnbwu8?start=120&end=130", "youtu.be/oqToPCSVKvE?start=30&end=40", "youtu.be/74CdH7qHL9Q?start=30&end=40", "youtu.be/QGL1xK5D8Fw?start=30&end=40"], + "child_ids": ["/m/0fx80y", "/m/05148p4", "/m/0l14md", "/m/05pd6", "/m/01kcd", "/m/0l14_3", "/m/085jw", "/m/03m5k", "/m/0l14jd", "/m/0395lw", "/m/03qjg", "/m/0mkg", "/m/0192l", "/m/02bxd", "/m/0l14l2", "/m/07kc_", "/m/0l14t7", "/m/05229", "/m/01vj9c", "/m/01hgjl"], + "restrictions": [] + }, + { + "id": "/m/0fx80y", + "name": "Plucked string instrument", + "description": "Sounds of a musical instrument using tensioned strings to produce tones, where the usual mode of playing is to pull 
and release the strings to cause them to vibrate.", + "citation_uri": "http://en.wikipedia.org/wiki/Plucked_string_instrument", + "positive_examples": ["youtu.be/623y1sJMR3k?start=50&end=60", "youtu.be/cW7XnBXFi6w?start=30&end=40", "youtu.be/OFWD8T1wqHo?start=80&end=90", "youtu.be/sYoJJnswtZc?start=180&end=190", "youtu.be/e6yoR0fEmR0?start=30&end=40", "youtu.be/BxOQq1ZNtCk?start=30&end=40", "youtu.be/wBK_7cToiNU?start=50&end=60", "youtu.be/h-UHIxkUhBM?start=90&end=100"], + "child_ids": ["/m/0342h", "/m/018j2", "/m/0jtg0", "/m/04rzd", "/m/01bns_", "/m/07xzm"], + "restrictions": [] + }, + { + "id": "/m/0342h", + "name": "Guitar", + "description": "Sounds of a string instrument with between 4 and 18 strings (typically 6) running over a fretboard, typically played by strumming or plucking the strings with the right hand while fretting the strings with the fingers of the left hand.", + "citation_uri": "http://en.wikipedia.org/wiki/Guitar", + "positive_examples": ["youtu.be/4M8jrnFOsv4?start=250&end=260", "youtu.be/3a7ZQYncZzo?start=30&end=40", "youtu.be/03zEmRftWkE?start=30&end=40", "youtu.be/0EZTOSkuFy0?start=30&end=40", "youtu.be/WhcHkiTCg2Y?start=10&end=20", "youtu.be/YophyPW6R2M?start=40&end=50", "youtu.be/NtjgQir_GwM?start=390&end=400"], + "child_ids": ["/m/02sgy", "/m/018vs", "/m/042v_gx", "/m/06w87", "/m/01glhc", "/m/07s0s5r"], + "restrictions": [] + }, + { + "id": "/m/02sgy", + "name": "Electric guitar", + "description": "Sounds of a guitar that uses a pickup to convert the vibration of its strings (which are typically made of metal, and which occurs when a guitarist strums, plucks, or fingerpicks the strings) into electrical impulses, which are then amplified and converted to sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Electric_guitar", + "positive_examples": ["youtu.be/vsseT3Rm88s?start=250&end=260", "youtu.be/GCz2JE4UUxs?start=120&end=130", "youtu.be/rEpG-An1E_g?start=470&end=480", "youtu.be/qOkBZPbv85k?start=410&end=420", "youtu.be/q03HDM5GvMY?start=20&end=30", "youtu.be/17Lx5LoeI3s?start=30&end=40", "youtu.be/uUQyXlqMlVo?start=60&end=70", "youtu.be/5_yzPAQHclw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/018vs", + "name": "Bass guitar", + "description": "Sounds of a relative of the electric guitar with a longer neck and 4 to 6 strings. The common 4-string bass is usually tuned one octave lower than the four lowest pitched strings of a guitar.", + "citation_uri": "http://en.wikipedia.org/wiki/Bass_guitar", + "positive_examples": ["youtu.be/p4Wapz6rZWc?start=120&end=130", "youtu.be/f1yunD_7XFI?start=50&end=60", "youtu.be/3YtCNw7vY3A?start=10&end=20", "youtu.be/08uDXandwEQ?start=60&end=70", "youtu.be/9bQeCu-hYfE?start=40&end=50", "youtu.be/8tV2X-AvdY8?start=50&end=60", "youtu.be/XFQzDpqzkT4?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/042v_gx", + "name": "Acoustic guitar", + "description": "Sounds of a guitar that produces sound acoustically by transmitting the vibration of the strings from its body to the air. 
Includes guitars with both nylon (or gut) strings, and those with steel strings.", + "citation_uri": "http://en.wikipedia.org/wiki/Acoustic_guitar", + "positive_examples": ["youtu.be/0_MLIPfb84s?start=30&end=40", "youtu.be/-37uZp3L3vw?start=30&end=40", "youtu.be/zeAO1j2LStk?start=10&end=20", "youtu.be/-SOoc9Rr39Y?start=12&end=22", "youtu.be/-DPoX0GdToU?start=130&end=140", "youtu.be/GDfQPFcG74k?start=150&end=160", "youtu.be/4gZkMShMpmc?start=30&end=40", "youtu.be/OSSa7XnAxkk?start=460&end=470", "youtu.be/mPEQ9bBWUYE?start=210&end=220", "youtu.be/ygonIq0Qatk?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06w87", + "name": "Steel guitar, slide guitar", + "description": "Sounds of a guitar whose string length is controlled by a hard object placed on the string (the \"slide\" or \"steel\") instead of by pressing against the frets with the fingers.", + "citation_uri": "http://en.wikipedia.org/wiki/Steel_guitar", + "positive_examples": ["youtu.be/0fcW2Nu_yk4?start=70&end=80", "youtu.be/gGEMCZmcJo0?start=130&end=140", "youtu.be/KvDjUWfxLvw?start=210&end=220", "youtu.be/JY_l4taOZG4?start=180&end=190", "youtu.be/Zne2aFaPEjE?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01glhc", + "name": "Tapping (guitar technique)", + "description": "Sounds of a guitar playing technique, where a string is fretted and set into vibration as part of a single motion of being pushed onto the fretboard, as opposed to the standard technique being fretted with one hand and picked with the other.", + "citation_uri": "http://en.wikipedia.org/wiki/Tapping", + "positive_examples": ["youtu.be/hrOa0R7Lfvg?start=10&end=20", "youtu.be/ML3A4Z1UrG4?start=60&end=70", "youtu.be/sAD_dtC_6cM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s0s5r", + "name": "Strum", + "description": "The sound of playing a guitar by dragging a finger or plectrum over multiple strings near-simultaneously.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=strum", + "positive_examples": ["youtu.be/J8Fp-ol3oWk?start=30&end=40", "youtu.be/-_WO2kys2tU?start=30&end=40", "youtu.be/14jEF7Bhx0Q?start=30&end=40", "youtu.be/-eUHclSdrz4?start=30&end=40", "youtu.be/2a-u6r7pPgU?start=30&end=40", "youtu.be/-rDW578Jqpw?start=30&end=40", "youtu.be/yfsV_LY2rS4?start=470&end=480", "youtu.be/tj375STXuek?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/018j2", + "name": "Banjo", + "description": "Sounds of a 4-, 5-, or 6-stringed instrument with a thin membrane stretched over a frame or cavity as a resonator, called the head.", + "citation_uri": "http://en.wikipedia.org/wiki/Banjo", + "positive_examples": ["youtu.be/l_7B1wnTNtA?start=20&end=30", "youtu.be/R-coz17NCsE?start=250&end=260", "youtu.be/miFBEtS8iaw?start=70&end=80", "youtu.be/M2w7M0xhCok?start=30&end=40", "youtu.be/zm0qNj8QaLA?start=160&end=170", "youtu.be/3kaCL9Zu7NY?start=170&end=180", "youtu.be/1COTNE5iwpw?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0jtg0", + "name": "Sitar", + "description": "Sounds of a plucked stringed instrument used mainly in Hindustani and Indian classical music, which derives its distinctive timbre and resonance from sympathetic strings, bridge design, a long hollow neck, and a gourd-shaped resonance chamber.", + "citation_uri": "http://en.wikipedia.org/wiki/Sitar", + "positive_examples": ["youtu.be/UB5Lh65DhGI?start=240&end=250", "youtu.be/NvAlKJXWuz0?start=220&end=230", 
"youtu.be/JLyg8PK6VzY?start=150&end=160", "youtu.be/9SvlmFWEsJE?start=530&end=540", "youtu.be/XLv-1DYujvo?start=470&end=480", "youtu.be/eVmuNiSlM08?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04rzd", + "name": "Mandolin", + "description": "Sounds of an instrument in the lute family, commonly with 4 courses of doubled metal strings, usually plucked with a plectrum or \"pick\".", + "citation_uri": "http://en.wikipedia.org/wiki/Mandolin", + "positive_examples": ["youtu.be/EiL4B7P8w5E?start=160&end=170", "youtu.be/UfCnuSlcb8c?start=230&end=240", "youtu.be/V2TOF7oO5iw?start=30&end=40", "youtu.be/6hBBOM8wkzY?start=30&end=40", "youtu.be/7vugt_BNbK0?start=230&end=240"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01bns_", + "name": "Zither", + "description": "Sounds of a class of stringed instruments consisting of many strings stretched across a thin, flat body, played by strumming or plucking.", + "citation_uri": "http://en.wikipedia.org/wiki/Zither", + "positive_examples": ["youtu.be/dvmutEzAcjA?start=140&end=150", "youtu.be/-VtQ1KQPAzU?start=30&end=40", "youtu.be/-Mtj9kQgkk4?start=30&end=40", "youtu.be/03Am-YeLdK8?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07xzm", + "name": "Ukulele", + "description": "Sounds of a small four-stringed guitar-like instument that originated in Hawaii.", + "citation_uri": "http://en.wikipedia.org/wiki/Ukulele", + "positive_examples": ["youtu.be/Szh-0r2nefs?start=30&end=40", "youtu.be/c6AjhdkSNWI?start=30&end=40", "youtu.be/lawaVpYj3t4?start=210&end=220", "youtu.be/UNJX3yYxJjE?start=30&end=40", "youtu.be/aVjQk_ZTOQU?start=230&end=240", "youtu.be/fxDPFgl_uhU?start=30&end=40", "youtu.be/7LhnENv-9vw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05148p4", + "name": "Keyboard (musical)", + "description": "Sounds of a musical instrument played using a keyboard, most commonly the piano, organ, and various electronic keyboards, including synthesizers and digital pianos.", + "citation_uri": "http://en.wikipedia.org/wiki/Keyboard_instrument", + "positive_examples": ["youtu.be/11bWvQaQhrM?start=440&end=450", "youtu.be/E7PfYoBL14c?start=80&end=90", "youtu.be/OOKSJ45MgmI?start=30&end=40", "youtu.be/8f3xNkAxq8E?start=20&end=30", "youtu.be/YfMY4JtzeEA?start=340&end=350", "youtu.be/6nTvu7T6cGk?start=30&end=40", "youtu.be/_HxoLyv0zA4?start=130&end=140", "youtu.be/1wYs3gtvFRk?start=340&end=350", "youtu.be/msC0nzoxin4?start=120&end=130"], + "child_ids": ["/m/05r5c", "/m/013y1f", "/m/0l14qv", "/m/03q5t"], + "restrictions": [] + }, + { + "id": "/m/05r5c", + "name": "Piano", + "description": "Sounds of a musical instrument played via a row of keys that the performer presses down or strikes with the fingers and thumbs of both hands, to trigger a decaying note whose loudness reflects the vigor of the press.", + "citation_uri": "http://en.wikipedia.org/wiki/Piano", + "positive_examples": ["youtu.be/jETQAE_vBQI?start=30&end=40", "youtu.be/MvA6dSsIPdM?start=380&end=390", "youtu.be/Sq8vJKMiTbA?start=210&end=220", "youtu.be/vTCVxCW77wA?start=200&end=210", "youtu.be/5IAzn0I7IOs?start=30&end=40", "youtu.be/FR9EWioJq6s?start=590&end=600", "youtu.be/rPYyIE1lfeA?start=410&end=420"], + "child_ids": ["/m/01s0ps"], + "restrictions": [] + }, + { + "id": "/m/01s0ps", + "name": "Electric piano", + "description": "Sounds of a music instrument with a piano-style keyboard, where pressing keys causes mechanical hammers to strike metal strings, metal reeds, or wire tines, leading to 
vibrations which are converted into electrical signals by magnetic pickups.", + "citation_uri": "http://en.wikipedia.org/wiki/Electric_piano", + "positive_examples": ["youtu.be/skeFawZe__U?start=30&end=40", "youtu.be/m484xjK104Y?start=20&end=30", "youtu.be/ExBXCryXOak?start=80&end=90", "youtu.be/-DDiBs4JIxc?start=140&end=150", "youtu.be/-YATTKBtmRA?start=190&end=200"], + "child_ids": ["/m/025cbm", "/m/0bxl5"], + "restrictions": [] + }, + { + "id": "/m/025cbm", + "name": "Clavinet", + "description": "Sounds of an electrically-amplified keyboard instrument with a distinctive bright staccato sound, popular in funk, jazz-funk, rock, and soul songs.", + "citation_uri": "http://en.wikipedia.org/wiki/Clavinet", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0bxl5", + "name": "Rhodes piano", + "description": "Sounds of an electric piano invented by Harold Rhodes in which the hammers strike thin metal tines, which are then amplified via an electromagnetic pickup.", + "citation_uri": "http://en.wikipedia.org/wiki/Rhodes_piano", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/013y1f", + "name": "Organ", + "description": "Sounds of a keyboard instrument of one or more pipe divisions, each played with its own keyboard, played either with the hands on a keyboard or with the feet using pedals. Unlike the piano, organ notes typically sustain while the key is depressed, rather than dying away.", + "citation_uri": "http://en.wikipedia.org/wiki/Organ_(music)", + "positive_examples": ["youtu.be/ddoz52PLnnM?start=480&end=490", "youtu.be/-JXGBn-QDFY?start=60&end=70", "youtu.be/BBfp97N31ag?start=30&end=40", "youtu.be/tkPPIT_n4NQ?start=60&end=70", "youtu.be/XW2nILE5__g?start=80&end=90", "youtu.be/Kvpav8cFKhM?start=100&end=110", "youtu.be/VGwNAk50fcQ?start=50&end=60"], + "child_ids": ["/m/03xq_f", "/m/03gvt"], + "restrictions": [] + }, + { + "id": "/m/03xq_f", + "name": "Electronic organ", + "description": "Sounds of an electronic keyboard instrument designed to imitate the sound of the pipe organ.", + "citation_uri": "http://en.wikipedia.org/wiki/Electronic_organ", + "positive_examples": ["youtu.be/3KEPeUOABxc?start=30&end=40", "youtu.be/uE2oHfgljwE?start=230&end=240", "youtu.be/dyxYEQbWAxM?start=60&end=70", "youtu.be/sFq05nFNw4k?start=150&end=160", "youtu.be/nl0nKw-1VkU?start=30&end=40", "youtu.be/pk8CAgk7tHU?start=310&end=320", "youtu.be/zaw7smD79XA?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03gvt", + "name": "Hammond organ", + "description": "Sounds of an electric organ invented by Laurens Hammond and John M. Hanert in which sliding drawbars create a variety of sounds.", + "citation_uri": "http://en.wikipedia.org/wiki/Hammond_organ", + "positive_examples": ["youtu.be/d9r_kYpOvW8?start=30&end=40", "youtu.be/BjPvQnVQsOw?start=20&end=30", "youtu.be/YQSYAXHgZ24?start=30&end=40", "youtu.be/UxOSSoMg7eA?start=30&end=40", "youtu.be/n6JP70Sr76k?start=10&end=20", "youtu.be/2eP3SBgYDEQ?start=30&end=40", "youtu.be/aDJDaIgme3k?start=30&end=40", "youtu.be/HH3qmmKao2E?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l14qv", + "name": "Synthesizer", + "description": "Sounds associated with an electronic musical instrument that generates electric signals converted to sound through amplifiers and loudspeakers or headphones. 
Synthesizers may imitate instruments such as piano, organ, flute, or vocals; or they may generate novel electronic timbres.", + "citation_uri": "http://en.wikipedia.org/wiki/Synthesizer", + "positive_examples": ["youtu.be/DQ5KwZ2qBMY?start=390&end=400", "youtu.be/wb6eNKCdmgc?start=290&end=300", "youtu.be/drDfZAkVi4k?start=500&end=510", "youtu.be/XBDxkLByWRQ?start=130&end=140", "youtu.be/vxJ_lQpZcRk?start=550&end=560", "youtu.be/xlEZYQOt9-Q?start=210&end=220"], + "child_ids": ["/m/01v1d8", "/m/0gkd1"], + "restrictions": [] + }, + { + "id": "/m/01v1d8", + "name": "Sampler", + "description": "Sounds suggesting the use of an electronic musical instrument that uses recordings of sounds loaded or recorded into it by the user, then played back by means of the sampler program itself, a keyboard, sequencer, or another triggering device.", + "citation_uri": "http://en.wikipedia.org/wiki/Sampler_(musical_instrument)", + "positive_examples": ["youtu.be/P_Tr82fXp54?start=30&end=40", "youtu.be/EJXeCTtWX1I?start=200&end=210", "youtu.be/RNN-S-X61qM?start=30&end=40", "youtu.be/2KkNk9Ao7G4?start=150&end=160", "youtu.be/D0ZsJnmBde0?start=30&end=40", "youtu.be/CsGMebE_xgk?start=30&end=40", "youtu.be/9RBzsjga73s?start=10&end=20", "youtu.be/OMoXXckByJI?start=50&end=60", "youtu.be/bMp1fPnQyWo?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0gkd1", + "name": "Mellotron", + "description": "Sounds of an electro-mechanical, polyphonic tape replay keyboard in which each note is produced by a separate piece of pre-recorded tape, supplied in a corresponding set to simulate a particular instrument.", + "citation_uri": "http://en.wikipedia.org/wiki/Mellotron", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/03q5t", + "name": "Harpsichord", + "description": "Sounds of a keyboard-controlled instrument in which each key plucks a particular string.", + "citation_uri": "http://en.wikipedia.org/wiki/Harpsichord", + "positive_examples": ["youtu.be/3QyBsEwyI4g?start=230&end=240", "youtu.be/kaYBmFJzseo?start=400&end=410", "youtu.be/S_zInSBhWhU?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l14md", + "name": "Percussion", + "description": "The sound of a musical instrument that is sounded by being struck or scraped by a beater; struck, scraped or rubbed by hand; or struck against another similar instrument.", + "citation_uri": "http://en.wikipedia.org/wiki/Percussion_instrument", + "positive_examples": ["youtu.be/cEvgKicGrNs?start=30&end=40", "youtu.be/OK5Ev_1OcbU?start=220&end=230", "youtu.be/LszgixYT18E?start=30&end=40", "youtu.be/3yMo8i-ws7o?start=30&end=40", "youtu.be/dxPcrVYlaX8?start=30&end=40", "youtu.be/HFVIKa0XcPg?start=30&end=40", "youtu.be/x22NHqwKSNE?start=90&end=100"], + "child_ids": ["/m/02hnl", "/m/026t6", "/m/01qbl", "/m/0239kh", "/m/01sm1g", "/m/07brj", "/m/05r5wn", "/m/0mbct", "/m/016622", "/m/0j45pbj"], + "restrictions": [] + }, + { + "id": "/m/02hnl", + "name": "Drum kit", + "description": "Sounds of a collection of drums and other percussion instruments, typically cymbals, which are set up on stands to be played by a single player with drumsticks held in both hands and the feet operating pedals that control the hi-hat cymbal and the beater for the bass drum.", + "citation_uri": "http://en.wikipedia.org/wiki/Drum_kit", + "positive_examples": ["youtu.be/-VqHJfjcNl8?start=70&end=80", "youtu.be/-kgUDrfLbKw?start=10&end=20", "youtu.be/a7N-Oa4NKKI?start=30&end=40", 
"youtu.be/BuHnSV6592c?start=30&end=40", "youtu.be/klUmhkMVtN8?start=70&end=80", "youtu.be/hcjHzFKgXSE?start=30&end=40", "youtu.be/xaHjFAy54dU?start=40&end=50", "youtu.be/p9Rh85FgFYk?start=90&end=100", "youtu.be/lhH4TTaAQvU?start=150&end=160"], + "child_ids": ["/m/0cfdd"], + "restrictions": [] + }, + { + "id": "/m/0cfdd", + "name": "Drum machine", + "description": "Sounds of an electronic musical instrument designed to imitate the sound of drums, cymbals, other percussion instruments, and often basslines.", + "citation_uri": "http://en.wikipedia.org/wiki/Drum_machine", + "positive_examples": ["youtu.be/OEpMpYMjO9Y?start=30&end=40", "youtu.be/UmoQDRfLpiI?start=30&end=40", "youtu.be/0ZwxHoPdHqI?start=20&end=30", "youtu.be/aONVSrSlbYM?start=70&end=80", "youtu.be/EzALQdZ5RnI?start=30&end=40", "youtu.be/_jdcbOFrg4Q?start=360&end=370", "youtu.be/sfiDHExt6rI?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/026t6", + "name": "Drum", + "description": "The sound of an instrument consisting of at least one membrane, called a drumhead or drum skin, that is stretched over a shell and struck, either directly with the player's hands, or with a drum stick, to produce sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Drum", + "positive_examples": ["youtu.be/65uPPy5e8oI?start=30&end=40", "youtu.be/GZ-jXUN0Hpg?start=70&end=80", "youtu.be/P1f09xVBNTc?start=150&end=160", "youtu.be/lYALJ6L251c?start=320&end=330", "youtu.be/EyopPrqUxcU?start=180&end=190", "youtu.be/6WdMSnWhppk?start=280&end=290", "youtu.be/2ZEkawx_eL8?start=120&end=130", "youtu.be/7E4uaC3YkHM?start=60&end=70", "youtu.be/aJpy-R7ZFb8?start=30&end=40"], + "child_ids": ["/m/06rvn", "/m/0bm02", "/m/011k_j", "/m/01p970"], + "restrictions": [] + }, + { + "id": "/m/06rvn", + "name": "Snare drum", + "description": "The sound of a drum that produces a sharp staccato sound by virtue of metal coils (the \"snare\") stretched across one of the membranes.", + "citation_uri": "http://en.wikipedia.org/wiki/Snare_drum", + "positive_examples": ["youtu.be/1vhU7Zn-VBM?start=270&end=280", "youtu.be/_U8C951VmOc?start=30&end=40", "youtu.be/rZa2PFRUfyg?start=90&end=100", "youtu.be/UtlddariDo4?start=60&end=70", "youtu.be/3e7FToRhcTE?start=40&end=50", "youtu.be/7BEUpZOEFAQ?start=100&end=110", "youtu.be/dAAfKqP13QE?start=30&end=40", "youtu.be/B6XqwiZ_cmE?start=40&end=50", "youtu.be/kmiLtUMZPeA?start=60&end=70"], + "child_ids": ["/m/03t3fj", "/m/02k_mr"], + "restrictions": [] + }, + { + "id": "/m/03t3fj", + "name": "Rimshot", + "description": "The sound of an accented snare drum beat produced by simultaneously hitting the rim and head of a drum with a drum stick.", + "citation_uri": "http://en.wikipedia.org/wiki/Rimshot", + "positive_examples": ["youtu.be/BZLfT6ZqGHU?start=30&end=40", "youtu.be/1dXUNNc180g?start=270&end=280", "youtu.be/BMpPQuKbJ6U?start=30&end=40", "youtu.be/6ao2_JxtXjU?start=60&end=70", "youtu.be/-cNY6fbbldY?start=30&end=40", "youtu.be/LrLz1wRVdbg?start=30&end=40", "youtu.be/1Og2YfIVZKQ?start=20&end=30", "youtu.be/B6XqwiZ_cmE?start=40&end=50", "youtu.be/1Yw69hcsaYg?start=30&end=40", "youtu.be/1hTljKxD8aY?start=320&end=330"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02k_mr", + "name": "Drum roll", + "description": "A sustained sound produced by multiple, rapid strikes between drum sticks and drum, made possible by bouncing alternating sticks off the head to produce a temporally dense sequence.", + "citation_uri": "http://en.wikipedia.org/wiki/Drum_roll", + "positive_examples": 
["youtu.be/-r-iX1t7eE8?start=420&end=430", "youtu.be/vBHIihIl_vg?start=50&end=60", "youtu.be/jBIJvd3pUYE?start=230&end=240", "youtu.be/SC8RGKfaJrQ?start=60&end=70"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0bm02", + "name": "Bass drum", + "description": "The sound of a large drum that produces a note of low definite or indefinite pitch.", + "citation_uri": "http://en.wikipedia.org/wiki/Bass_drum", + "positive_examples": ["youtu.be/umIND3KZyqs?start=70&end=80", "youtu.be/h1L06porcL0?start=220&end=230", "youtu.be/khRBhDUyLb0?start=360&end=370", "youtu.be/ts76TqD6eiQ?start=30&end=40", "youtu.be/k36gVPLVPU4?start=50&end=60", "youtu.be/vYoHjD5slzg?start=80&end=90", "youtu.be/yTgBSKCZ1ag?start=240&end=250", "youtu.be/fokECmrANcs?start=310&end=320", "youtu.be/cBJGUfPyGcw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/011k_j", + "name": "Timpani", + "description": "The sound of an orchestral drum consisting of a skin stretched over a large bowl traditionally made of copper, where the head is tensioned to produce an identifiable pitch.", + "citation_uri": "http://en.wikipedia.org/wiki/Timpani", + "positive_examples": ["youtu.be/oczQMKP1GlU?start=190&end=200", "youtu.be/QVouc9CpXiw?start=30&end=40", "youtu.be/aoJ9P5IatjU?start=440&end=450", "youtu.be/boCwqpKwWsY?start=110&end=120", "youtu.be/IsFkruyADHc?start=410&end=420", "youtu.be/FzcVZOICJeU?start=240&end=250"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01p970", + "name": "Tabla", + "description": "The sound of a classical Hindustani percussion instrument consisting of a pair of hand drums of contrasting sizes and timbres, producing distinct pitches when struck. The larger drum has a pitch that can be manipulated with the player's hand.", + "citation_uri": "http://en.wikipedia.org/wiki/Tabla", + "positive_examples": ["youtu.be/WXUfeWxCKlI?start=250&end=260", "youtu.be/FwW2Rjd-Rds?start=70&end=80", "youtu.be/5FQI0Fzt57Q?start=10&end=20", "youtu.be/OVkxmOGRzEk?start=270&end=280", "youtu.be/m9qHnNaAskw?start=100&end=110", "youtu.be/8fA1_gzERUE?start=20&end=30", "youtu.be/XmIRBoXWbQ0?start=190&end=200"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01qbl", + "name": "Cymbal", + "description": "The sound of a percussion instrument consisting of thin, normally round plates of various alloys. They may be used in pairs, struck together, or as single cymbals struck with sticks.", + "citation_uri": "http://en.wikipedia.org/wiki/Cymbal", + "positive_examples": ["youtu.be/0nURxLEqvRc?start=170&end=180", "youtu.be/JnrAsNneLbo?start=110&end=120", "youtu.be/hZaqDtQprgI?start=130&end=140", "youtu.be/CBi8zo190Rc?start=70&end=80", "youtu.be/I6rouB_37sE?start=80&end=90", "youtu.be/MQ5RUPOYTRI?start=30&end=40", "youtu.be/2FViTj8j_oc?start=250&end=260", "youtu.be/AQ4y-cd5jTc?start=70&end=80"], + "child_ids": ["/m/03qtq", "/m/0bm0k"], + "restrictions": [] + }, + { + "id": "/m/03qtq", + "name": "Hi-hat", + "description": "The sound of a mating pair of small cymbals mounted atop one-another on a stand, the bottom fixed and the top activated up and down by a foot pedal. 
The rate of decay of the sound is changed between a shimmering and sharp effect by pressing on the pedal.", + "citation_uri": "http://en.wikipedia.org/wiki/Hi-hat", + "positive_examples": ["youtu.be/Jo-Rlw1dl0Y?start=70&end=80", "youtu.be/jEgdYuCQPtk?start=30&end=40", "youtu.be/4LPBTlzBfgU?start=230&end=240", "youtu.be/AVwL9t-Hhlw?start=30&end=40", "youtu.be/-A743FXw0VE?start=140&end=150"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0bm0k", + "name": "Crash cymbal", + "description": "The sound of a cymbal that produces a loud, sharp \"crash\" used for occasional accents.", + "citation_uri": "http://en.wikipedia.org/wiki/Crash_cymbal", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01sm1g", + "name": "Wood block", + "description": "The sound of a small slit drum made from a single piece of wood, making a very short, clean sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Wood_block", + "positive_examples": ["youtu.be/4qxz8m23ZCk?start=3&end=13", "youtu.be/4uBq4jhmsEw?start=3&end=13", "youtu.be/nHPrHjUaPDI?start=70&end=80", "youtu.be/pKk1OLOBap4?start=220&end=230"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07brj", + "name": "Tambourine", + "description": "The sound of a musical instrument consisting of a frame with pairs of small metal jingles. There is often a drumhead stretched across the frame, and the instrument is usually played by striking the frame or head with the hand.", + "citation_uri": "http://en.wikipedia.org/wiki/Tambourine", + "positive_examples": ["youtu.be/brAD8wCj00U?start=16&end=26", "youtu.be/sQY6hx71INQ?start=30&end=40", "youtu.be/U6aLiMIlqCc?start=250&end=260", "youtu.be/dlB9D-tD8Jg?start=13&end=23", "youtu.be/3P9Mp5FjMPo?start=120&end=130", "youtu.be/g-erbiFIy30?start=310&end=320", "youtu.be/gDnJoHpSL4M?start=40&end=50", "youtu.be/mU6cfEWw5Og?start=180&end=190", "youtu.be/U716X--8X18?start=150&end=160", "youtu.be/1ycXNv1vgFs?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05r5wn", + "name": "Rattle (instrument)", + "description": "The sound of an instrument that produces a sound when shaken, usually consisting of a closed rigid shell containing numerous small particles that strike the shell when shaken.", + "citation_uri": "http://en.wikipedia.org/wiki/Rattle_(percussion_instrument)", + "positive_examples": ["youtu.be/ZJtY7zzYROI?start=10&end=20", "youtu.be/L7VmcEuksqs?start=120&end=130", "youtu.be/FAhfuP_xh5Y?start=450&end=460", "youtu.be/kNMqGt7SNMo?start=10&end=20", "youtu.be/Z05m7IQ3-b0?start=110&end=120"], + "child_ids": ["/m/0xzly"], + "restrictions": [] + }, + { + "id": "/m/0xzly", + "name": "Maraca", + "description": "The sound of a specific kind of rattle that appears in many genres of Caribbean and Latin music. 
Players hold them by their handles, usually in pairs, and shake them.", + "citation_uri": "http://en.wikipedia.org/wiki/Maraca", + "positive_examples": ["youtu.be/tx39HB2qQfg?start=160&end=170", "youtu.be/UH--XDDtwUA?start=50&end=60", "youtu.be/BBncSiS4-jg?start=30&end=40", "youtu.be/UxKwjMKGLhQ?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0mbct", + "name": "Gong", + "description": "The sound of a musical percussion instrument consisting of a flat, circular metal disc which is hit with a mallet.", + "citation_uri": "http://en.wikipedia.org/wiki/Gong", + "positive_examples": ["youtu.be/JUrbfbmArBQ?start=290&end=300", "youtu.be/cj3eSVIrKs8?start=50&end=60", "youtu.be/ShuwAKNiOsc?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/016622", + "name": "Tubular bells", + "description": "Sounds of tuned percussion instruments consisting of a metal tube, tuned by altering its length in the range C4-F5.", + "citation_uri": "http://en.wikipedia.org/wiki/Tubular_bells", + "positive_examples": ["youtu.be/-bE5twf76Jw?start=30&end=40", "youtu.be/Ht3AjqvgRq0?start=20&end=30", "youtu.be/JKZ-_CVEJKY?start=210&end=220"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0j45pbj", + "name": "Mallet percussion", + "description": "Sounds of melodic percussion instruments consisting of an array of elements, one for each note, played with mallets.", + "citation_uri": "http://en.wikipedia.org/wiki/Mallet_percussion", + "positive_examples": ["youtu.be/Xk5sz4pmOdw?start=180&end=190", "youtu.be/1WYJ1qfQghA?start=30&end=40", "youtu.be/ZTu4mSOhOrU?start=190&end=200", "youtu.be/Z7jt1J4zpys?start=80&end=90", "youtu.be/soKp5kWpZAI?start=20&end=30", "youtu.be/OIeyYgZknyM?start=80&end=90"], + "child_ids": ["/m/0dwsp", "/m/0dwtp", "/m/0dwt5", "/m/0l156b"], + "restrictions": [] + }, + { + "id": "/m/0dwsp", + "name": "Marimba, xylophone", + "description": "Sounds of a percussion instrument consisting of a set of wooden bars struck with mallets to produce musical tones. 
Resonators suspended underneath the bars may amplify their sound.", +     "citation_uri": "http://en.wikipedia.org/wiki/Marimba", +     "positive_examples": ["youtu.be/boCwqpKwWsY?start=20&end=30", "youtu.be/Vvqf6rxvnJ8?start=40&end=50", "youtu.be/UB-F2trb4Fs?start=510&end=520", "youtu.be/ht4GN2h5bYo?start=440&end=450", "youtu.be/-iqoiKgKNYE?start=30&end=40"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0dwtp", +     "name": "Glockenspiel", +     "description": "Sounds of a percussion instrument composed of a set of tuned metal plates or tubes laid out like the keyboard of a piano.", +     "citation_uri": "http://en.wikipedia.org/wiki/Glockenspiel", +     "positive_examples": ["youtu.be/jAuskt6flA4?start=110&end=120", "youtu.be/66B8oYpHKfc?start=20&end=30", "youtu.be/LCUwzeqbhdY?start=40&end=50", "youtu.be/gJopkFkV6ho?start=20&end=30", "youtu.be/P9_5_2tHTM8?start=170&end=180"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0dwt5", +     "name": "Vibraphone", +     "description": "Sounds of a tuned mallet percussion instrument similar to a xylophone or glockenspiel with the addition of a motor-driven valve on each resonator that produces a tremolo or vibrato effect.", +     "citation_uri": "http://en.wikipedia.org/wiki/Vibraphone", +     "positive_examples": ["youtu.be/2M-P1FB9QAk?start=10&end=20", "youtu.be/mAbApaNedVc?start=230&end=240", "youtu.be/eXTErlP38Cc?start=60&end=70", "youtu.be/7i3HNSNwz4s?start=60&end=70", "youtu.be/VeITc7tV24k?start=420&end=430", "youtu.be/XDtWBZ0EUBk?start=50&end=60"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0l156b", +     "name": "Steelpan", +     "description": "Sounds of a tuned percussion instrument originally constructed from steel oil drums by hammering out small patches on the head to produce separate pitches.", +     "citation_uri": "http://en.wikipedia.org/wiki/Steelpan", +     "positive_examples": ["youtu.be/kMwTShj-LnY?start=90&end=100", "youtu.be/ZTfCj_u1wu0?start=80&end=90", "youtu.be/XC_k5KRn0yc?start=70&end=80", "youtu.be/qNIJV7f0CRY?start=430&end=440", "youtu.be/x56KbBd4wGU?start=40&end=50", "youtu.be/SxyxQndQL8g?start=30&end=40", "youtu.be/H_uOu7ipuYk?start=70&end=80"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/05pd6", +     "name": "Orchestra", +     "description": "The sound of a large instrumental ensemble (a dozen or more people), often used in classical music, that contains sections of string, brass, woodwind, and percussion instruments.", +     "citation_uri": "http://en.wikipedia.org/wiki/Orchestra", +     "positive_examples": ["youtu.be/AdXeApNvWEs?start=30&end=40", "youtu.be/INDEchOtcAs?start=140&end=150", "youtu.be/i1orsU9GvV4?start=210&end=220", "youtu.be/McaaPRnasK4?start=100&end=110", "youtu.be/XVocggdVOQU?start=30&end=40", "youtu.be/_LI3WVnOjPI?start=310&end=320", "youtu.be/57bXQfZlbzY?start=240&end=250"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/01kcd", +     "name": "Brass instrument", +     "description": "The sound of a musical instrument that produces sound by vibration of air in a tubular resonator in sympathy with the vibration of the player's lips.", +     "citation_uri": "http://en.wikipedia.org/wiki/Brass_instrument", +     "positive_examples": ["youtu.be/MQJSNWtvojI?start=10&end=20", "youtu.be/PaupGh4ZNpM?start=250&end=260", "youtu.be/2XAA3oS6ws4?start=170&end=180", "youtu.be/-k6TMxNASFQ?start=490&end=500", "youtu.be/uG4zreyH1Yo?start=60&end=70", "youtu.be/P3EpQMrRtpQ?start=40&end=50", "youtu.be/Di7Rs4QmYKI?start=150&end=160", "youtu.be/UB0P7ZG8DDg?start=20&end=30", "youtu.be/qhYax0Adtas?start=60&end=70"], + 
"child_ids": ["/m/0319l", "/m/07gql", "/m/07c6l", "/m/020w2", "/m/0y64j"], +     "restrictions": [] +   }, +   { +     "id": "/m/0319l", +     "name": "French horn", +     "description": "The sound of a brass instrument made of tubing wrapped into a coil with a flared bell. A modern French horn includes valves to control pitch by routing air into extra sections of tubing.", +     "citation_uri": "http://en.wikipedia.org/wiki/French_horn", +     "positive_examples": ["youtu.be/FfAGd9uFcuc?start=29&end=39", "youtu.be/dSAFI_547oQ?start=30&end=40", "youtu.be/wQhycJsKSPE?start=340&end=350", "youtu.be/EWnC6IbYpgs?start=440&end=450"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/07gql", +     "name": "Trumpet", +     "description": "The sound of a high-pitched brass instrument played by blowing air through almost-closed lips, producing a \"buzzing\" sound that starts a standing wave vibration in the air column inside the instrument.", +     "citation_uri": "http://en.wikipedia.org/wiki/Trumpet", +     "positive_examples": ["youtu.be/TaBJtSIFxNI?start=230&end=240", "youtu.be/2NY5vr-zuPk?start=30&end=40", "youtu.be/K6m0jA-zqkM?start=20&end=30", "youtu.be/jzlD88TWgAY?start=30&end=40", "youtu.be/eiVonYZuzUc?start=30&end=40", "youtu.be/njmeO5JDRAI?start=150&end=160"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/07c6l", +     "name": "Trombone", +     "description": "The sound of a brass instrument that includes a telescoping slide mechanism that varies the length of the instrument to change the pitch.", +     "citation_uri": "http://en.wikipedia.org/wiki/Trombone", +     "positive_examples": ["youtu.be/o6Lm-qQZM-w?start=30&end=40", "youtu.be/2DiNEVtvTlM?start=10&end=20", "youtu.be/TNG8t6gCDRI?start=440&end=450", "youtu.be/cTAZpiMmQ7E?start=40&end=50", "youtu.be/EU4RunterMg?start=110&end=120", "youtu.be/-3OxjnEhi3A?start=15&end=25"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/020w2", +     "name": "Cornet", +     "description": "The sound of a brass instrument distinguished from a trumpet by its conical bore, more compact shape, and mellower tone quality.", +     "citation_uri": "http://en.wikipedia.org/wiki/Cornet", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0y64j", +     "name": "Bugle", +     "description": "The sound of a simple brass instrument with no valves or other pitch-altering devices. All pitch control is done by varying the player's embouchure.", +     "citation_uri": "http://en.wikipedia.org/wiki/Bugle", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0l14_3", +     "name": "Bowed string instrument", +     "description": "Sounds of the subcategory of string instruments that are usually played by a bow rubbing the strings. 
Examples may include other playing techniques for these instruments, such as plucking.", + "citation_uri": "http://en.wikipedia.org/wiki/Bowed_string_instrument", + "positive_examples": ["youtu.be/VicqlR2dxTQ?start=20&end=30", "youtu.be/RK7KFZ9SWOc?start=30&end=40", "youtu.be/ucZ5P4EvFe0?start=140&end=150", "youtu.be/Pk5X4JnfT8o?start=220&end=230", "youtu.be/XxhwXQcX58Y?start=60&end=70", "youtu.be/SyXlIXHKfes?start=510&end=520"], + "child_ids": ["/m/02qmj0d", "/m/07y_7", "/m/01xqw", "/m/02fsn"], + "restrictions": [] + }, + { + "id": "/m/02qmj0d", + "name": "String section", + "description": "The sound of a group of string instruments typically present in a standard classical orchestra.", + "citation_uri": "http://en.wikipedia.org/wiki/String_section", + "positive_examples": ["youtu.be/59bu7JPOqR8?start=80&end=90", "youtu.be/rBOkr_DAlK4?start=500&end=510", "youtu.be/xKmDVsO2uSw?start=10&end=20", "youtu.be/ich_-wya6Cs?start=60&end=70", "youtu.be/SrQRhZVLFI0?start=530&end=540"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07y_7", + "name": "Violin, fiddle", + "description": "The sound of a wooden string instrument, typically with four strings tuned in perfect fifths, that is most commonly played by drawing a bow across its strings, though it can also be played by plucking the strings.", + "citation_uri": "http://en.wikipedia.org/wiki/Violin", + "positive_examples": ["youtu.be/5HLDUExzhYU?start=30&end=40", "youtu.be/DY9Xm0boaeA?start=30&end=40", "youtu.be/qTeVf7bOztU?start=30&end=40", "youtu.be/lzp88JPSpJE?start=130&end=140", "youtu.be/-fFHHJmhnuM?start=10&end=20", "youtu.be/QJPNBp9eI5M?start=80&end=90", "youtu.be/9a572z81TIw?start=30&end=40", "youtu.be/OiEPSWBQMl0?start=30&end=40", "youtu.be/JyiVnMB8Z1A?start=30&end=40", "youtu.be/2KffWn62wZ4?start=6&end=16"], + "child_ids": ["/m/0d8_n"], + "restrictions": [] + }, + { + "id": "/m/0d8_n", + "name": "Pizzicato", + "description": "The sound of playing a stringed instrument that is usually bowed (such as a violin) by plucking the strings with the fingers.", + "citation_uri": "http://en.wikipedia.org/wiki/Pizzicato", + "positive_examples": ["youtu.be/6DMtphlHzTM?start=30&end=40", "youtu.be/6cVpV4lX3wE?start=30&end=40", "youtu.be/8Wsmo4nUcUE?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01xqw", + "name": "Cello", + "description": "The sound of a larger, lower-pitched bowed string instrument with four strings tuned in perfect fifths, usually C2, G2, D3, and A3.", + "citation_uri": "http://en.wikipedia.org/wiki/Cello", + "positive_examples": ["youtu.be/4Atbvyzcyms?start=30&end=40", "youtu.be/ekDtSo-QQmo?start=60&end=70", "youtu.be/iu6qTrNQaro?start=240&end=250", "youtu.be/jIPgZAc_GZQ?start=30&end=40", "youtu.be/EayN_Jj0740?start=40&end=50", "youtu.be/CEeX04qZg4Q?start=120&end=130", "youtu.be/u-qDCzZk6a8?start=60&end=70", "youtu.be/5e7zBZTv6Bo?start=540&end=550"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02fsn", + "name": "Double bass", + "description": "The sound of a large, bowed string instrument, the lowest-pitched in a standard symphony orchestra, with strings usually tuned to E1, A1, D2, and G2.", + "citation_uri": "http://en.wikipedia.org/wiki/Double_bass", + "positive_examples": ["youtu.be/0MzKqnXHNW8?start=220&end=230", "youtu.be/X7u0UeMNg5c?start=20&end=30", "youtu.be/QvRueweYq6w?start=150&end=160", "youtu.be/Hg2v0Wb8WV4?start=60&end=70", "youtu.be/1xrK5P_fSBA?start=40&end=50", "youtu.be/6yZ67KvDFX8?start=30&end=40", "youtu.be/nhp2HwPoZRw?start=550&end=560"], + 
"child_ids": [], + "restrictions": [] + }, + { + "id": "/m/085jw", + "name": "Wind instrument, woodwind instrument", + "description": "Sounds of a family of musical instruments in which a column of air within a resonator is set into vibration by the player blowing into a mouthpiece set at the end of the resonator.", + "citation_uri": "http://en.wikipedia.org/wiki/Wind_instrument", + "positive_examples": ["youtu.be/P8Ep-MUeZpE?start=30&end=40", "youtu.be/4vKah6NGacI?start=40&end=50", "youtu.be/xEJKVm-XlTI?start=140&end=150", "youtu.be/ko0_jYVXOjw?start=10&end=20", "youtu.be/Mjf34K-cGf0?start=50&end=60", "youtu.be/DVWoxLvpHH4?start=190&end=200", "youtu.be/dfUZdxXl_kU?start=570&end=580"], + "child_ids": ["/m/0l14j_", "/m/06ncr", "/m/01wy6", "/m/05kms", "/m/01c3q"], + "restrictions": [] + }, + { + "id": "/m/0l14j_", + "name": "Flute", + "description": "The sound of an aerophone or reedless wind instrument that produces its sound from the flow of air across an opening.", + "citation_uri": "http://en.wikipedia.org/wiki/Flute", + "positive_examples": ["youtu.be/tF0u90ESFIs?start=30&end=40", "youtu.be/-oOzwEopvdY?start=80&end=90", "youtu.be/oXK4afYJ-ko?start=30&end=40", "youtu.be/xuEwpyrAE40?start=170&end=180", "youtu.be/14Ki89E_S_8?start=120&end=130", "youtu.be/-8ozvTgx--E?start=30&end=40", "youtu.be/sapHfiXGCfY?start=110&end=120"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06ncr", + "name": "Saxophone", + "description": "The sound of a woodwind instrument made of brass and played with a single-reed mouthpiece similar to that of the clarinet.", + "citation_uri": "http://en.wikipedia.org/wiki/Saxophone", + "positive_examples": ["youtu.be/wq5wg0L-KF4?start=50&end=60", "youtu.be/jZ-guxYSD2E?start=320&end=330", "youtu.be/5h40xBO4cSM?start=45&end=55", "youtu.be/VBLMzXqcjeA?start=160&end=170", "youtu.be/VDUb4wHGDBE?start=40&end=50", "youtu.be/EtQeywjkRqI?start=60&end=70"], + "child_ids": ["/m/02pprs", "/m/03t22m"], + "restrictions": [] + }, + { + "id": "/m/02pprs", + "name": "Alto saxophone", + "description": "The sound of the most common member of the saxophone family covering a mid-range of pitches.", + "citation_uri": "http://en.wikipedia.org/wiki/Alto_saxophone", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/03t22m", + "name": "Soprano saxophone", + "description": "The sound of a higher-register variety of the saxophone, popular as a lead instrument in jazz.", + "citation_uri": "http://en.wikipedia.org/wiki/Soprano_saxophone", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01wy6", + "name": "Clarinet", + "description": "The sound of a woodwind instrument consisting of a single-reed mouthpiece, a straight cylindrical tube with an almost cylindrical bore, and a flared bell.", + "citation_uri": "http://en.wikipedia.org/wiki/Clarinet", + "positive_examples": ["youtu.be/fL1dLazpseU?start=10&end=20", "youtu.be/1qdxACwrgl0?start=150&end=160", "youtu.be/2dUAk6Ml4O0?start=40&end=50", "youtu.be/AHlMvXkziEY?start=30&end=40", "youtu.be/KTePg_avLAE?start=480&end=490", "youtu.be/Q4KNKaFsyCs?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05kms", + "name": "Oboe", + "description": "The sounds of straight-bored double-reed woodwind instruments.", + "citation_uri": "http://en.wikipedia.org/wiki/Oboe", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01c3q", + "name": "Bassoon", + "description": "The sound of the low-pitched 
double-reed woodwind instrument with a folded bore.", +     "citation_uri": "http://en.wikipedia.org/wiki/Bassoon", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/03m5k", +     "name": "Harp", +     "description": "The sound of a stringed musical instrument with a number of individual strings, plucked with the fingers, running at an angle to its soundboard.", +     "citation_uri": "http://en.wikipedia.org/wiki/Harp", +     "positive_examples": ["youtu.be/KVveqUp3QJk?start=190&end=200", "youtu.be/2htboubCq14?start=50&end=60", "youtu.be/GU8khdLxccg?start=30&end=40", "youtu.be/RhdGQ6vyLBg?start=80&end=90", "youtu.be/op8pav9qypA?start=70&end=80", "youtu.be/M88ZrlaX0Xs?start=320&end=330", "youtu.be/JCGhzN11lAw?start=40&end=50"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0395lw", +     "name": "Bell", +     "description": "The sound of a simple instrument usually consisting of metal cast in the shape of a hollow cup, whose sides form a resonator which vibrates in a single tone upon being struck by a \"clapper\" suspended within, or by a separate mallet or hammer.", +     "citation_uri": "http://en.wikipedia.org/wiki/Bell", +     "positive_examples": ["youtu.be/e-MkVbRmlIo?start=80&end=90", "youtu.be/hTb_hLSPH3w?start=200&end=210", "youtu.be/7WoxYO6rWLo?start=430&end=440", "youtu.be/btFRBcWGlbU?start=550&end=560", "youtu.be/tYZLPFuzNIs?start=20&end=30", "youtu.be/Qklbd5OLmac?start=60&end=70", "youtu.be/8Jp638z8SEs?start=30&end=40", "youtu.be/u3doJa_O5p8?start=30&end=40", "youtu.be/AlSWSI50R7U?start=60&end=70"], +     "child_ids": ["/m/03w41f", "/m/0239kh", "/m/027m70_", "/m/0gy1t2s", "/m/07n_g", "/m/0f8s22", "/m/0150b9"], +     "restrictions": [] +   }, +   { +     "id": "/m/03w41f", +     "name": "Church bell", +     "description": "The sound of a large bell usually encountered in churches where it is used either to signify the hour or the time for worshippers to go to church.", +     "citation_uri": "http://en.wikipedia.org/wiki/Church_bell", +     "positive_examples": ["youtu.be/CwhbaRYl3kQ?start=180&end=190", "youtu.be/8c_pb6dimmU?start=180&end=190", "youtu.be/hTb_hLSPH3w?start=200&end=210", "youtu.be/ySKvlZdECBI?start=30&end=40", "youtu.be/xDFzNhcSDCU?start=30&end=40", "youtu.be/whFx67_zhFw?start=100&end=110", "youtu.be/YL_yzbTuiic?start=10&end=20", "youtu.be/qMZ1nj4fDas?start=290&end=300", "youtu.be/ITFzMwD9Q64?start=150&end=160"], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/027m70_", +     "name": "Jingle bell", +     "description": "The sound of a type of bell which produces a distinctive 'jingle' sound, especially in large numbers. 
They are commonly produced from a single piece of sheet metal bent into a roughly spherical shape containing a small ball bearing.", + "citation_uri": "http://en.wikipedia.org/wiki/Jingle_bell", + "positive_examples": ["youtu.be/m4ysMjnrTnQ?start=10&end=20", "youtu.be/TWQXP_v8opY?start=260&end=270", "youtu.be/VY9hzBD-Sh4?start=100&end=110", "youtu.be/dl793c9RCJY?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0gy1t2s", + "name": "Bicycle bell", + "description": "The sound of a percussive signaling instrument mounted on a bicycle for warning pedestrians and other cyclists.", + "citation_uri": "http://en.wikipedia.org/wiki/Bicycle_bell", + "positive_examples": ["youtu.be/LK8j0Q4UU4g?start=10&end=20", "youtu.be/s0F2kzW8quU?start=160&end=170", "youtu.be/k7oGk-ozhKI?start=30&end=40", "youtu.be/kTkG_z_GWFs?start=18&end=28", "youtu.be/OY-jfOrL-_A?start=20&end=30", "youtu.be/vSPaTX3nq5A?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07n_g", + "name": "Tuning fork", + "description": "The sound of an acoustic resonator in the form of a two-pronged fork with the prongs formed from a U-shaped bar of elastic metal. It emits a pure musical tone and is frequently used as a standard of pitch to tune musical instruments.", + "citation_uri": "http://en.wikipedia.org/wiki/Tuning_fork", + "positive_examples": ["youtu.be/A96UdAiDxfI?start=30&end=40", "youtu.be/NxPqGv1MYh4?start=180&end=190", "youtu.be/s7sRKH_EdAY?start=60&end=70", "youtu.be/rlSvS7InhjE?start=30&end=40", "youtu.be/X1iK3Un7bbI?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0f8s22", + "name": "Chime", + "description": "The sound of a musical instrument consisting of multiple tuned bells, usually rung by an automated mechanism to produce a short, characteristic tone or melody.", + "citation_uri": "http://en.wikipedia.org/wiki/Chime_(bell_instrument)", + "positive_examples": ["youtu.be/2cJw4p0NzL8?start=30&end=40", "youtu.be/-OIgmZ_M7Co?start=60&end=70", "youtu.be/JO3yqua94PM?start=17&end=27", "youtu.be/B4WnT62BRog?start=500&end=510", "youtu.be/wiYHKu0-9XM?start=170&end=180", "youtu.be/ofNRyTtPZ8I?start=10&end=20", "youtu.be/So5SUEh2bZc?start=100&end=110", "youtu.be/m9qHnNaAskw?start=20&end=30", "youtu.be/_RbYie-Fxo4?start=60&end=70", "youtu.be/caVRLI71CAA?start=25&end=35", "youtu.be/ZpZsTWwUCSY?start=3&end=13"], + "child_ids": ["/m/026fgl"], + "restrictions": [] + }, + { + "id": "/m/026fgl", + "name": "Wind chime", + "description": "The sound of an instrument constructed from suspended tubes, rods, bells, or other objects that are often made of metal or wood along with some type of weight the tubes or rods can strike when they or another wind-catching surface are blown by the natural movement of air outside.", + "citation_uri": "http://en.wikipedia.org/wiki/Wind_chime", + "positive_examples": ["youtu.be/1zmAwtZgohE?start=170&end=180", "youtu.be/3aR0vdDSuAA?start=0&end=10", "youtu.be/2xzSvnP5LHU?start=10&end=20", "youtu.be/LhXxwS30-es?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0150b9", + "name": "Change ringing (campanology)", + "description": "Sounds of the art of ringing a set of (typically large) tuned bells in a controlled manner to produce variations in their sounding order.", + "citation_uri": "http://en.wikipedia.org/wiki/Change_ringing", + "positive_examples": ["youtu.be/OP1OQKL0LJc?start=280&end=290", "youtu.be/NmKVkgXB64g?start=280&end=290", "youtu.be/2_Ea3T14HNA?start=130&end=140", 
"youtu.be/vaMn20oJ4Bc?start=10&end=20", "youtu.be/MJI_9fDsX3I?start=40&end=50", "youtu.be/OMMjjf5O1gY?start=240&end=250", "youtu.be/KHapBfftSoE?start=70&end=80", "youtu.be/gKEIFL2uFgU?start=30&end=40", "youtu.be/DhEiv81k5BI?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03qjg", + "name": "Harmonica", + "description": "The sound of a wind instrument played by using the mouth to direct air into or out of one or more holes along a mouthpiece. Behind each hole is a chamber containing at least one reed, producing a specific note or notes.", + "citation_uri": "http://en.wikipedia.org/wiki/Harmonica", + "positive_examples": ["youtu.be/079QG8nZA7k?start=110&end=120", "youtu.be/yDAkWVjjJFo?start=200&end=210", "youtu.be/gM0V0aomTyA?start=20&end=30", "youtu.be/pbGa3ut9s24?start=60&end=70", "youtu.be/IRCpKYdBrwo?start=120&end=130", "youtu.be/ybpj_cvRPII?start=30&end=40", "youtu.be/edPpQ4A1TY4?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0mkg", + "name": "Accordion", + "description": "The sound of a family of box-shaped bellows-driven musical instruments played by pressing buttons or keys which allow air to flow across strips of brass or steel, called reeds, that vibrate to produce sound inside the body.", + "citation_uri": "http://en.wikipedia.org/wiki/Accordion", + "positive_examples": ["youtu.be/WXCy_B98FoI?start=210&end=220", "youtu.be/_NUdDndH7kw?start=140&end=150", "youtu.be/MNAPqNAJLRk?start=30&end=40", "youtu.be/1UmPZEeakuQ?start=110&end=120", "youtu.be/JzgMUjrm4HQ?start=100&end=110", "youtu.be/i5LHakiuDAM?start=131&end=141", "youtu.be/KNJCUmcwLYY?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0192l", + "name": "Bagpipes", + "description": "The sound of a wind instrument using enclosed reeds fed from a constant reservoir of air in the form of a bag.", + "citation_uri": "http://en.wikipedia.org/wiki/Bagpipes", + "positive_examples": ["youtu.be/vFpiDoOZc8Y?start=30&end=40", "youtu.be/GKbldfNmkZ8?start=120&end=130", "youtu.be/s6K8qalzgI8?start=30&end=40", "youtu.be/uUoEB3DBSjo?start=30&end=40", "youtu.be/eFzPYy6IWq4?start=100&end=110", "youtu.be/xyvS_KsEJC0?start=60&end=70", "youtu.be/oGeklgrvf2g?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02bxd", + "name": "Didgeridoo", + "description": "The sound of an Indigenous Australian wind instrument, usually cylindrical or conical, measuring anywhere from 1 to 3 m long.", + "citation_uri": "http://en.wikipedia.org/wiki/Didgeridoo", + "positive_examples": ["youtu.be/xqdv0dHk8sM?start=50&end=60", "youtu.be/d2jVpLHIDYA?start=400&end=410", "youtu.be/dimAXdB5M4A?start=30&end=40", "youtu.be/_LbQtA860Ys?start=30&end=40", "youtu.be/ZYcHl5hSG0c?start=80&end=90", "youtu.be/ELLJd_c3cG8?start=100&end=110", "youtu.be/GD89xqFvx9I?start=30&end=40", "youtu.be/6AnnPCJJ8dM?start=80&end=90", "youtu.be/NHz8UGzTMAY?start=120&end=130"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l14l2", + "name": "Shofar", + "description": "The sound of an ancient musical horn made of ram's horn, used for Jewish religious purposes.", + "citation_uri": "http://en.wikipedia.org/wiki/Shofar", + "positive_examples": ["youtu.be/PmiLDro91gk?start=570&end=580", "youtu.be/ZzB115GeM6M?start=70&end=80", "youtu.be/x6f0yu_JzsE?start=40&end=50", "youtu.be/pq9irpvQhMk?start=4&end=14"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07kc_", + "name": "Theremin", + "description": "The sound of an early electronic musical instrument 
controlled without physical contact via two metal antennas that sense the relative position of the player's hands and control oscillators for frequency with one hand, and amplitude with the other.", + "citation_uri": "http://en.wikipedia.org/wiki/Theremin", + "positive_examples": ["youtu.be/W8iIOvGNNcE?start=100&end=110", "youtu.be/y_MbC4fVy2g?start=80&end=90", "youtu.be/8bakI0ITCqQ?start=580&end=590", "youtu.be/5DUdq5JPUsQ?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l14t7", + "name": "Singing bowl", + "description": "The sound of a type of bell that sits with the bottom surface resting, allowing the rim to vibrate to produce sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Singing_bowl", + "positive_examples": ["youtu.be/Y2MwHholr3c?start=340&end=350", "youtu.be/Q3FWUtPD08o?start=240&end=250", "youtu.be/jGIWR_k1Yt4?start=400&end=410", "youtu.be/ToQqkcyH21U?start=30&end=40", "youtu.be/t3Ufg50eARI?start=290&end=300"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05229", + "name": "Musical ensemble", + "description": "The sound of a group of people who perform instrumental and/or vocal music.", + "citation_uri": "http://en.wikipedia.org/wiki/Musical_ensemble", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01vj9c", + "name": "Bass (instrument role)", + "description": "Sounds of musical instruments that produce tones in the low-pitched range, typically providing a tonic root in a musical ensemble.", + "citation_uri": "http://en.wikipedia.org/wiki/Bass_(instrument)", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01hgjl", + "name": "Scratching (performance technique)", + "description": "The sound of a technique of moving a vinyl record back and forth on a turntable while optionally manipulating the crossfader on a DJ mixer.", + "citation_uri": "http://en.wikipedia.org/wiki/Scratching", + "positive_examples": ["youtu.be/Gt1DQkzcDCk?start=90&end=100", "youtu.be/Iue-KvTNGRY?start=30&end=40", "youtu.be/inTLqCdltCc?start=60&end=70", "youtu.be/8zf62sLj0IM?start=140&end=150", "youtu.be/ZkVC4Iw3YDE?start=590&end=600", "youtu.be/crgM97w4fM4?start=490&end=500", "youtu.be/SI7uLBg7HIk?start=230&end=240", "youtu.be/xU1dkNbkRkU?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0kpv1t", + "name": "Music genre", + "description": "Portmanteau class holding categories that represent styles or classifications of music, useful in identifying and organizing similar musical artists or recordings.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/064t9", "/m/0glt670", "/m/06by7", "/m/06j6l", "/m/0gywn", "/m/06cqb", "/m/01lyv", "/m/02x8m", "/m/02w4v", "/m/06j64v", "/m/03_d0", "/m/026z9", "/m/0ggq0m", "/m/02lkt", "/m/0g293", "/m/0155w", "/m/05fw6t", "/m/02v2lh", "/m/0y4f8", "/m/0164x2", "/m/02mscn", "/m/028sqc", "/m/06rqw", "/m/02p0sh1", "/m/05rwpb"], + "restrictions": ["abstract"] + }, + { + "id": "/m/064t9", + "name": "Pop music", + "description": "A genre of popular music that originated in the West during the 1950s and 1960s. Pop music is eclectic, often borrowing elements from urban, dance, rock, Latin, country, and other styles. 
Songs are typically short to medium-length with repeated choruses, melodic tunes, and hooks.", + "citation_uri": "http://en.wikipedia.org/wiki/Pop_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0glt670", + "name": "Hip hop music", + "description": "Hip hop or rap music formed in the United States in the 1970s and consists of stylized rhythmic music that commonly accompanies rhythmic and rhyming speech (\"rapping\").", + "citation_uri": "http://en.wikipedia.org/wiki/Hip_hop_music", + "positive_examples": [], + "child_ids": ["/m/04j_h4", "/m/0n8zsc8", "/m/02cz_7"], + "restrictions": [] + }, + { + "id": "/m/04j_h4", + "name": "Grime music", + "description": "A genre of music that emerged in East London in the early 2000s as a development of UK garage and jungle.", + "citation_uri": "http://en.wikipedia.org/wiki/Grime_music", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0n8zsc8", + "name": "Trap music", + "description": "A music genre that originated from hip hop in the early 1990s in the Southern United States, typified by its aggressive lyrical content and sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Trap_music", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/02cz_7", + "name": "Beatboxing", + "description": "A form of vocal percussion primarily involving the art of mimicking drum machines using one's mouth, lips, tongue, and voice.", + "citation_uri": "http://en.wikipedia.org/wiki/Beatboxing", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06by7", + "name": "Rock music", + "description": "A genre of popular music that originated as \"rock and roll\" in the United States in the 1950s, and developed into a range of different styles in the 1960s and later. Compared to pop music, rock places a higher degree of emphasis on musicianship, live performance, and an ideology of authenticity.", + "citation_uri": "http://en.wikipedia.org/wiki/Rock_music", + "positive_examples": [], + "child_ids": ["/m/03lty", "/m/05r6t", "/m/0dls3", "/m/0dl5d", "/m/07sbbz2", "/m/05w3f"], + "restrictions": [] + }, + { + "id": "/m/03lty", + "name": "Heavy metal", + "description": "A genre of rock music that developed in the late 1960s and early 1970s, consisting of a thick, massive sound, characterized by highly amplified distortion, extended guitar solos, emphatic beats, and overall loudness.", + "citation_uri": "http://en.wikipedia.org/wiki/Heavy_metal_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05r6t", + "name": "Punk rock", + "description": "A rock music genre that developed between 1974 and 1976 typically composed of short or fast-paced songs, with hard-edged melodies and singing styles, stripped-down instrumentation, and often political, anti-establishment lyrics.", + "citation_uri": "http://en.wikipedia.org/wiki/Punk_rock", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dls3", + "name": "Grunge", + "description": "A subgenre of alternative rock that emerged during the mid-1980s fusing elements of punk rock and heavy metal, such as distorted electric guitars. 
Lyrics are typically angst-filled, addressing themes such as social alienation, apathy, confinement, and a desire for freedom.", + "citation_uri": "http://en.wikipedia.org/wiki/Grunge", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dl5d", + "name": "Progressive rock", + "description": "A rock music subgenre that developed in the late 1960s as an attempt to give greater artistic credibility to rock music. Songs were replaced by musical suites that often stretched to 20 or 40 minutes in length and contained symphonic influences, extended musical themes, fantasy-like ambience and lyrics, and complex orchestrations.", + "citation_uri": "http://en.wikipedia.org/wiki/Progressive_rock", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07sbbz2", + "name": "Rock and roll", + "description": "A genre of popular music that originated and evolved in the United States during the late 1940s and early 1950s, from a combination of African-American genres such as blues, boogie-woogie, jump blues, jazz, and gospel music, together with Western swing and country music.", + "citation_uri": "http://en.wikipedia.org/wiki/Rock_and_roll", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05w3f", + "name": "Psychedelic rock", + "description": "A style of rock music influenced by psychedelic culture and attempts to replicate and enhance the mind-altering experiences of psychedelic drugs, most notably LSD. It often uses new recording techniques and effects and sometimes draws on sources such as the ragas and drones of Indian music.", + "citation_uri": "http://en.wikipedia.org/wiki/Psychedelic_rock", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06j6l", + "name": "Rhythm and blues", + "description": "A genre of popular African-American music that originated in the 1940s as urbane, rocking, jazz based music with a heavy, insistent beat. 
Lyrics focus heavily on the themes of triumphs and failures in terms of relationships, freedom, economics, aspirations, and sex.", + "citation_uri": "http://en.wikipedia.org/wiki/Rhythm_and_blues", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0gywn", + "name": "Soul music", + "description": "A popular music genre that combines elements of African-American gospel music, rhythm and blues and jazz.", + "citation_uri": "http://en.wikipedia.org/wiki/Soul_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06cqb", + "name": "Reggae", + "description": "A music genre that originated in Jamaica in the late 1960s, strongly influenced by traditional mento as well as American jazz and rhythm and blues, instantly recognizable from the counterpoint between the bass and drum downbeat, and the offbeat rhythm section.", + "citation_uri": "http://en.wikipedia.org/wiki/Reggae", + "positive_examples": [], + "child_ids": ["/m/0190y4"], + "restrictions": [] + }, + { + "id": "/m/0190y4", + "name": "Dub", + "description": "A genre of music that grew out of reggae, consisting predominantly of instrumental remixes of existing recordings achieved by significantly manipulating and reshaping the recordings, usually by removing the vocals from an existing music piece, and emphasizing the drum and bass parts.", + "citation_uri": "http://en.wikipedia.org/wiki/Dub_music", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01lyv", + "name": "Country", + "description": "A genre of United States popular music with origins in folk, Blues and Western music, often consisting of ballads and dance tunes with generally simple forms and harmonies accompanied by mostly string instruments such as banjos, electric and acoustic guitars, dobros, and fiddles as well as harmonicas.", + "citation_uri": "http://en.wikipedia.org/wiki/Country_music", + "positive_examples": [], + "child_ids": ["/m/015y_n", "/m/0gg8l"], + "restrictions": [] + }, + { + "id": "/m/015y_n", + "name": "Swing music", + "description": "A form of American music that dominated in the 1930s and 1940s, using a strong rhythm section of double bass and drums as the anchor, a lead section of brass, woodwind, and sometimes stringed instruments, medium to fast tempos, and a \"lilting\" swing time rhythm.", + "citation_uri": "http://en.wikipedia.org/wiki/Swing_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0gg8l", + "name": "Bluegrass", + "description": "A form of American roots music from the Appalachias with mixed roots in Irish, Scottish, Welsh, and English traditional music, later influenced by the music of African-Americans through incorporation of jazz elements.", + "citation_uri": "http://en.wikipedia.org/wiki/Bluegrass_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02x8m", + "name": "Funk", + "description": "A music genre that originated in the 1960s when African American musicians created a rhythmic, danceable new form of music that de-emphasized melody and chord progressions to bring a strong rhythmic groove of a bass line and drum part to the foreground.", + "citation_uri": "http://en.wikipedia.org/wiki/Funk", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02w4v", + "name": "Folk music", + "description": "A genre that evolved from traditional music during the 20th century folk revival. 
One meaning often given is that of old songs with no known composers; another is music that has been transmitted and evolved by a process of oral transmission or performed by custom over a long period of time.", + "citation_uri": "http://en.wikipedia.org/wiki/Folk_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06j64v", + "name": "Middle Eastern music", + "description": "Music originating from the vast region from Morocco to Iran, including the Arabic countries of the Middle East and North Africa, the Iraqi traditions of Mesopotamia, Iranian traditions of Persia, the Hebrew music of Israel, Armenian music, the varied traditions of Cypriot music, the music of Turkey, traditional Assyrian music, Berbers of North Africa, and Coptic Christians in Egypt.", + "citation_uri": "http://en.wikipedia.org/wiki/Middle_Eastern_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03_d0", + "name": "Jazz", + "description": "A music genre that originated from African American communities of New Orleans during the late 19th and early 20th centuries in the form of independent traditional and popular musical styles, all linked by the common bonds of African American and European American musical parentage with a performance orientation.", + "citation_uri": "http://en.wikipedia.org/wiki/Jazz", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/026z9", + "name": "Disco", + "description": "A genre of dance music containing elements of funk, soul, pop, and salsa that achieved popularity during the mid-1970s to the early 1980s.", + "citation_uri": "http://en.wikipedia.org/wiki/Disco", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ggq0m", + "name": "Classical music", + "description": "Art music produced or rooted in the traditions of Western music, including both liturgical and secular music, over the broad span of time from roughly the 11th century to the present day.", + "citation_uri": "http://en.wikipedia.org/wiki/Classical_music", + "positive_examples": [], + "child_ids": ["/m/05lls"], + "restrictions": [] + }, + { + "id": "/m/05lls", + "name": "Opera", + "description": "An art form in which singers and musicians perform a dramatic work combining text and musical score, usually in a theatrical setting.", + "citation_uri": "http://en.wikipedia.org/wiki/Opera", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02lkt", + "name": "Electronic music", + "description": "A large set of predominantly popular and dance genres in which synthesizers and other electronic instruments are the primary sources of sound.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/03mb9", "/m/07gxw", "/m/07s72n", "/m/029h7y", "/m/0283d", "/m/0m0jc", "/m/08cyft", "/m/0fd3y", "/m/07lnk", "/m/0m0fw", "/m/0bmfpc"], + "restrictions": [] + }, + { + "id": "/m/03mb9", + "name": "House music", + "description": "A genre of electronic dance music originating in Chicago in the early 1980s which is generally dance-based music characterized by repetitive 4/4 beats, rhythms mainly provided by drum machines, off-beat hi-hat cymbals, and synthesized basslines.", + "citation_uri": "http://en.wikipedia.org/wiki/House_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07gxw", + "name": "Techno", + "description": "A form of electronic dance music that emerged in Detroit during the 
mid-to-late 1980s, resulting from the melding of African American music with electronic music by artists such as Kraftwerk, Giorgio Moroder, and Yellow Magic Orchestra.", +     "citation_uri": "http://en.wikipedia.org/wiki/Techno", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/07s72n", +     "name": "Dubstep", +     "description": "A genre of electronic dance music that originated in South London in the late 1990s which generally features sparse, syncopated drum and percussion patterns with bass lines that contain prominent sub bass frequencies.", +     "citation_uri": "http://en.wikipedia.org/wiki/Dubstep", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/029h7y", +     "name": "Electro", +     "description": "A genre of electronic music and early hip hop influenced by funk and German and Japanese electropop which typically features drum machines and heavy electronic sounds, usually without vocals. If vocals are present they are delivered in a deadpan manner, often through electronic distortion such as vocoding and talkboxing.", +     "citation_uri": "http://en.wikipedia.org/wiki/Electro_(music)", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0283d", +     "name": "Drum and bass", +     "description": "A genre of electronic music that emerged from rave and oldschool jungle scenes in England during the early 1990s characterized by fast, syncopated breakbeats with heavy bass and sub-bass lines, sampled sources, and synthesizers.", +     "citation_uri": "http://en.wikipedia.org/wiki/Drum_and_bass", +     "positive_examples": [], +     "child_ids": ["/m/01f9gb"], +     "restrictions": [] +   }, +   { +     "id": "/m/01f9gb", +     "name": "Oldschool jungle", +     "description": "A genre of electronic music that developed in England in the early 1990s characterized by fast tempos, relatively slow and lyrical reggae-derived basslines, breakbeats, and heavily syncopated percussive loops, samples, and synthesized effects.", +     "citation_uri": "http://en.wikipedia.org/wiki/Oldschool_jungle", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0m0jc", +     "name": "Electronica", +     "description": "An umbrella term that encompasses a broad group of electronic-based styles such as techno, house, ambient, drum and bass, jungle, and industrial dance, among others.", +     "citation_uri": "http://en.wikipedia.org/wiki/Electronica", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/08cyft", +     "name": "Electronic dance music", +     "description": "A broad range of percussive electronic music genres including techno, house, trance, drum and bass, dubstep, and Jersey club produced largely for nightclubs, raves, and festivals, or by disc jockeys in the context of a live mix.", +     "citation_uri": "http://en.wikipedia.org/wiki/Electronic_dance_music", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0fd3y", +     "name": "Ambient music", +     "description": "A genre of music that developed in England in the 1970s from experimental and synthesizer-oriented styles which puts an emphasis on tone and atmosphere over traditional musical structure or rhythm.", +     "citation_uri": "http://en.wikipedia.org/wiki/Ambient_music", +     "positive_examples": [], +     "child_ids": ["/m/052smk"], +     "restrictions": [] +   }, +   { +     "id": "/m/052smk", +     "name": "Drone music", +     "description": "A minimalist genre of music that emphasizes the use of sustained or 
repeated sounds, notes, or tone-clusters with relatively slight harmonic variations.", +     "citation_uri": "http://en.wikipedia.org/wiki/Drone_music", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/07lnk", +     "name": "Trance music", +     "description": "A genre of electronic dance music that developed in Germany during the 1990s characterized by a tempo between 125 and 150 beats per minute, repeating melodic phrases, and a musical form that distinctly builds tension throughout a track by mixing layers with distinctly foreshadowed build-up and release.", +     "citation_uri": "http://en.wikipedia.org/wiki/Trance_music", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0m0fw", +     "name": "Noise music", +     "description": "A genre of music that challenges the distinction between musical and non-musical sound with the expressive use of acoustically or electronically generated noise.", +     "citation_uri": "http://en.wikipedia.org/wiki/Noise_music", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0bmfpc", +     "name": "UK garage", +     "description": "A genre of electronic music from England in the early 1990s that features a tempo around 130 beats per minute with a distinctive 4/4 percussive rhythm, syncopated hi-hats, cymbals, and snares, and occasionally beat-skipping kick drums and altered vocal samples.", +     "citation_uri": "http://en.wikipedia.org/wiki/UK_garage", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0g293", +     "name": "Music of Latin America", +     "description": "Music originating from Latin America which encompasses a wide variety of styles, including son, rumba, salsa, merengue, tango, samba, and bossa nova.", +     "citation_uri": "http://en.wikipedia.org/wiki/Music_of_Latin_America", +     "positive_examples": [], +     "child_ids": ["/m/02ccj9", "/m/0ln16", "/m/0kpck", "/m/080nby", "/m/05q7ms", "/m/0326g"], +     "restrictions": [] +   }, +   { +     "id": "/m/02ccj9", +     "name": "Cumbia", +     "description": "A dance-oriented music genre popular throughout Latin America which began as a courtship dance practiced among the African population on the Caribbean coasts of Colombia and Panama but later mixed with Amerindian and European instruments, steps, and musical characteristics.", +     "citation_uri": "http://en.wikipedia.org/wiki/Cumbia", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/0ln16", +     "name": "Salsa music", +     "description": "A genre of music that is based largely on Cuban, Puerto Rican, and Dominican popular dance music, occasionally incorporating elements of rock, R&B, and funk.", +     "citation_uri": "http://en.wikipedia.org/wiki/Salsa_music", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": [] +   }, +   { +     "id": "/m/0kpck", +     "name": "Soca music", +     "description": "A genre of Caribbean music that originated in Trinidad and Tobago in the late 1970s as an offshoot of kaiso and calypso, with influences from cadence, funk, and soul.", +     "citation_uri": "http://en.wikipedia.org/wiki/Soca_music", +     "positive_examples": [], +     "child_ids": [], +     "restrictions": ["blacklist"] +   }, +   { +     "id": "/m/080nby", +     "name": "Kuduro", +     "description": "A type of music and dance developed in Angola in the 1980s sampled from traditional Caribbean carnival music and Angolan semba around a fast, energetic 4/4 beat.", +     "citation_uri": "http://en.wikipedia.org/wiki/Kuduro", + 
"positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/05q7ms", + "name": "Funk carioca", + "description": "A type of dance music originally from Rio de Janeiro derived from Miami bass and African style music, also known as favela funk and baile funk.", + "citation_uri": "http://en.wikipedia.org/wiki/Funk_carioca", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0326g", + "name": "Flamenco", + "description": "A genre originating in Andalusian music and dance styles which includes cante, toque, baile, jaleo, palmas, and pitos.", + "citation_uri": "http://en.wikipedia.org/wiki/Flamenco", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0155w", + "name": "Blues", + "description": "A genre and musical form developed by African Americans in the United States around the end of the 19th century from African-American work songs and European-American folk music. The blues form, ubiquitous in jazz and rock and roll, is characterized by the call-and-response pattern, the blues scale and specific chord progressions, of which the twelve-bar blues is the most common. Blues shuffles or walking bass reinforce the trance-like rhythm and form a repetitive groove effect.", + "citation_uri": "http://en.wikipedia.org/wiki/Blues", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05fw6t", + "name": "Music for children", + "description": "Music performed for children, often designed to provide an entertaining means of teaching about cultures, good behavior, facts, and skills.", + "citation_uri": "http://en.wikipedia.org/wiki/Children's_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02v2lh", + "name": "New-age music", + "description": "A genre of music intended to create artistic inspiration, relaxation, and optimism used by listeners for yoga, massage, meditation, and reading as a method of stress management or to create a peaceful atmosphere. Includes both electronic and acoustic forms.", + "citation_uri": "http://en.wikipedia.org/wiki/New-age_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0y4f8", + "name": "Vocal music", + "description": "Music performed by one or more singers, with or without instrumental accompaniment, in which singing provides the main focus of the piece.", + "citation_uri": "http://en.wikipedia.org/wiki/Vocal_music", + "positive_examples": [], + "child_ids": ["/m/0z9c", "/m/02bk07", "/m/02cz_7"], + "restrictions": [] + }, + { + "id": "/m/0z9c", + "name": "A capella", + "description": "Group or solo singing without instrumental accompaniment.", + "citation_uri": "http://en.wikipedia.org/wiki/A_cappella", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0164x2", + "name": "Music of Africa", + "description": "Music whose style or form clearly indicates African origin or primarily African influence. Given the vastness of the continent, this covers many distinct musical traditions. 
Sub-Saharan African music frequently relies on percussion instruments including xylophones, drums, and tone-producing instruments such as the mbira or \"thumb piano.\"", + "citation_uri": "http://en.wikipedia.org/wiki/Music_of_Africa", + "positive_examples": [], + "child_ids": ["/m/0145m", "/m/022dgg"], + "restrictions": [] + }, + { + "id": "/m/0145m", + "name": "Afrobeat", + "description": "A combination of traditional Nigerian music, Ghanaian music, jazz, highlife, funk, and chanted vocals, fused with percussion and vocal styles, popularised in Africa in the 1970s.", + "citation_uri": "http://en.wikipedia.org/wiki/Afrobeat", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/022dgg", + "name": "Kwaito", + "description": "A music genre that emerged in South Africa, during the 1990s. It is a variant of house music featuring the use of African sounds and samples.", + "citation_uri": "http://en.wikipedia.org/wiki/Kwaito", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/02mscn", + "name": "Christian music", + "description": "Music that has been written to express either personal or a communal belief regarding Christian life and faith. Its forms vary widely across the world, according to culture and social context.", + "citation_uri": "http://en.wikipedia.org/wiki/Christian_music", + "positive_examples": [], + "child_ids": ["/m/016cjb"], + "restrictions": [] + }, + { + "id": "/m/016cjb", + "name": "Gospel music", + "description": "A genre of Christian music usually featuring dominant vocals with Christian lyrics. Gospel music is rooted in the black oral tradition, in which hymns and sacred songs were repeated in call and response fashion, with hand clapping and foot stomping as rhythmic accompaniment.", + "citation_uri": "http://en.wikipedia.org/wiki/Gospel_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/028sqc", + "name": "Music of Asia", + "description": "Musical styles originating from a large number of Asian countries located in Central, Southern, and East Asia.", + "citation_uri": "http://en.wikipedia.org/wiki/Music_of_Asia", + "positive_examples": [], + "child_ids": ["/m/015vgc", "/m/0dq0md"], + "restrictions": [] + }, + { + "id": "/m/015vgc", + "name": "Carnatic music", + "description": "A system of music commonly associated with the southern India, whose main emphasis is on vocal music. Although improvisation plays an important role, Carnatic music is mainly sung through compositions.", + "citation_uri": "http://en.wikipedia.org/wiki/Carnatic_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dq0md", + "name": "Music of Bollywood", + "description": "Hindi film songs featured in Bollywood films. Bollywood songs, along with dance, are a characteristic motif of Hindi cinema which gives it enduring popular appeal, cultural value, and context.", + "citation_uri": "http://en.wikipedia.org/wiki/Music_of_Bollywood", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06rqw", + "name": "Ska", + "description": "A music genre that originated in Jamaica in the late 1950s, combining elements of Caribbean mento and calypso with American jazz and rhythm and blues. 
It is characterized by a walking bass line accented with rhythms on the off-beat.", + "citation_uri": "http://en.wikipedia.org/wiki/Ska", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02p0sh1", + "name": "Traditional music", + "description": "Musical forms that have origins many generations into the past, commonly without formal notation or description, commonly familiar to people in a given culture.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05rwpb", + "name": "Independent music", + "description": "Music produced independent of major commercial record labels, possibly including a do-it-yourself approach to recording and publishing. The term indie is also used to describe music of this style regardless of actual production channel.", + "citation_uri": "http://en.wikipedia.org/wiki/Independent_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00027", + "name": "Musical concepts", + "description": "Portmanteau class for descriptions of music sounds that reflect their abstract music-theoretical properties rather than their physical realizations.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/074ft", "/m/09dsr", "/m/05jcn", "/m/022c7z", "/m/01gp74", "/m/0b128", "/m/021wwz", "/m/0kc2j", "/m/03w6d1"], + "restrictions": ["abstract", "blacklist"] + }, + { + "id": "/m/074ft", + "name": "Song", + "description": "A musical work intended to be sung by the human voice with distinct and fixed pitches and patterns using sound and silence and a variety of forms that often include the repetition of sections.", + "citation_uri": "http://en.wikipedia.org/wiki/Song", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/09dsr", + "name": "Melody", + "description": "A linear succession of musical tones that the listener perceives as a single entity; a combination of pitch and rhythm, also known as a tune, voice, or line.", + "citation_uri": "http://en.wikipedia.org/wiki/Melody", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/05jcn", + "name": "Musical note", + "description": "A pitched sound forming the \"atom\" of much written music.", + "citation_uri": "http://en.wikipedia.org/wiki/Musical_note", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/022c7z", + "name": "Beat", + "description": "The basic unit of musical time -- the pulse -- of the mensural level. 
The beat is often defined as the rhythm listeners would tap their toes to when listening to a piece of music.", + "citation_uri": "http://en.wikipedia.org/wiki/Beat_(music)", + "positive_examples": [], + "child_ids": ["/m/05xp3j"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/05xp3j", + "name": "Drum beat", + "description": "A rhythmic pattern establishing the meter and groove through the pulse and subdivision, played on drum kits and other percussion instruments.", + "citation_uri": "http://en.wikipedia.org/wiki/Drum_beat", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01gp74", + "name": "Chord", + "description": "Any musical set of usually three or more notes that is heard as if sounding simultaneously.", + "citation_uri": "http://en.wikipedia.org/wiki/Chord_(music)", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0b128", + "name": "Harmony", + "description": "The use of simultaneous pitches or chords. Harmony is often said to refer to the \"vertical\" aspect of music, as distinguished from melodic line, or the \"horizontal\" aspect.", + "citation_uri": "http://en.wikipedia.org/wiki/Harmony", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/021wwz", + "name": "Bassline", + "description": "The low-pitched instrumental part or line played by a rhythm section instrument such as the electric bass, double bass, cello, tuba, or keyboard in many forms of popular and other music.", + "citation_uri": "http://en.wikipedia.org/wiki/Bassline", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0kc2j", + "name": "Loop", + "description": "A repeating section of sound material, usually in electroacoustical music.", + "citation_uri": "http://en.wikipedia.org/wiki/Loop_(music)", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/03w6d1", + "name": "Drone", + "description": "A harmonic or monophonic effect or accompaniment where a note or chord is continuously sounded throughout most or all of a musical piece.", + "citation_uri": "http://en.wikipedia.org/wiki/Drone_(music)", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/t/dd00028", + "name": "Music role", + "description": "Portmanteau class for categories that describe music according to its functional role. 
Although the genre and instrumentation of examples of each of these classes may vary widely, a listener will be able to recognize that the music is being used for a common specific purpose as stated in each class definition.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/025td0t", "/m/02cjck", "/m/03r5q_", "/m/0l14gg", "/m/07pkxdp", "/m/01z7dr", "/m/0140xf", "/m/0ggx5q", "/m/04wptg", "/t/dd00029"], + "restrictions": ["abstract"] + }, + { + "id": "/m/025td0t", + "name": "Background music", + "description": "Styles of music or soundscapes primarily intended to be listened to passively and not meant to be the main focus of an audience.", + "citation_uri": "http://en.wikipedia.org/wiki/Background_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02cjck", + "name": "Theme music", + "description": "Music often written specifically for a radio program, television program, video game, or movie, and usually played during the intro, opening credits, and/or ending credits.", + "citation_uri": "http://en.wikipedia.org/wiki/Theme_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03r5q_", + "name": "Jingle (music)", + "description": "A short song or tune used in advertising and for other commercial uses. The jingle contains one or more hooks that explicitly promote the product or service being advertised.", + "citation_uri": "http://en.wikipedia.org/wiki/Jingle", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l14gg", + "name": "Soundtrack music", + "description": "Recorded music accompanying a movie, video, TV show, video game, or the like. Although any music can be used in a soundtrack, this specifically refers to music that suggests its use to accompany images, rather than something that happens to have been used that way.", + "citation_uri": "http://en.wikipedia.org/wiki/Soundtrack", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pkxdp", + "name": "Lullaby", + "description": "The act of singing a quiet song to lull a child to sleep.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=lullaby", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01z7dr", + "name": "Video game music", + "description": "The soundtrack that accompanies video games. Early video game music was once limited to simple melodies of early sound synthesizer technology; with advances in technology, video game music has now grown to include the same breadth and complexity associated with television and movie soundtracks.", + "citation_uri": "http://en.wikipedia.org/wiki/Video_game_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0140xf", + "name": "Christmas music", + "description": "A variety of genres of music normally performed or heard around the Christmas season.", + "citation_uri": "http://en.wikipedia.org/wiki/Christmas_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ggx5q", + "name": "Dance music", + "description": "Music composed specifically to facilitate or accompany dancing. 
It can be either a whole musical piece or part of a larger musical arrangement.", + "citation_uri": "http://en.wikipedia.org/wiki/Dance_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04wptg", + "name": "Wedding music", + "description": "Music played at wedding celebrations, including the ceremony and any festivities before or after the event.", + "citation_uri": "http://en.wikipedia.org/wiki/Wedding_music", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00029", + "name": "Birthday music", + "description": "Music specifically used or performed at birthday celebrations.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00030", + "name": "Music mood", + "description": "Portmanteau class for music categories that indicate the overall emotional affect of music, regardless of genre or instrumentation. These judgments are intrinsically subjective and likely to vary substantially between different musical cultures.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/t/dd00031", "/t/dd00032", "/t/dd00033", "/t/dd00034", "/t/dd00035", "/t/dd00036", "/t/dd00037"], + "restrictions": ["abstract"] + }, + { + "id": "/t/dd00031", + "name": "Happy music", + "description": "Music that evokes or conveys feelings of happiness.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00032", + "name": "Funny music", + "description": "Music that evokes or conveys amusement.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00033", + "name": "Sad music", + "description": "Music that evokes or conveys feelings of sadness.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00034", + "name": "Tender music", + "description": "Music that evokes or conveys feelings of tenderness.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00035", + "name": "Exciting music", + "description": "Music that evokes or conveys feelings of excitement.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00036", + "name": "Angry music", + "description": "Music that evokes or conveys feelings of anger.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00037", + "name": "Scary music", + "description": "Music that evokes or conveys feelings of fear.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/059j3w", + "name": "Natural sounds", + "description": "Sounds produced by natural sources in their normal soundscape, excluding animal and human sounds.", + "citation_uri": "http://en.wikipedia.org/wiki/Natural_sounds", + "positive_examples": [], + "child_ids": ["/m/03m9d0z", "/m/0jb2l", "/m/0838f", "/m/02_41"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/03m9d0z", + "name": "Wind", + "description": "Sounds caused by the large-scale flow of gases, especially the air flowing over the surface of the Earth.
Long-duration winds have various names associated with their average strength, such as breeze, gale, storm, and hurricane.", + "citation_uri": "http://en.wikipedia.org/wiki/Wind", + "positive_examples": ["youtu.be/JRyhdsQwFXM?start=110&end=120", "youtu.be/-PJrbyYLtS4?start=330&end=340", "youtu.be/dOU3KOnQM2Q?start=200&end=210", "youtu.be/48HQ_eeoygY?start=10&end=20", "youtu.be/HfaFm2c4a78?start=40&end=50", "youtu.be/aqAvJgN6GOE?start=220&end=230", "youtu.be/aN-oXqXdsVM?start=100&end=110", "youtu.be/cpFdPrI-8EQ?start=120&end=130", "youtu.be/U-WqWv0pXwI?start=20&end=30", "youtu.be/urfP_4xhJCE?start=90&end=100"], + "child_ids": ["/m/07q8f3b", "/m/09t49", "/t/dd00092"], + "restrictions": [] + }, + { + "id": "/m/07q8f3b", + "name": "Howl (wind)", + "description": "A loud sustained noise caused by wind, resembling the cry of a hound.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=howl", + "positive_examples": ["youtu.be/FhEJYUlNEqM?start=260&end=270", "youtu.be/JfzqqDmIwhY?start=30&end=40", "youtu.be/MRnpKEhDQNE?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09t49", + "name": "Rustling leaves", + "description": "The sound of dried leaves rubbing against one another in response to being moved by the wind or another cause.", + "citation_uri": "http://en.wikipedia.org/wiki/Leaf", + "positive_examples": ["youtu.be/18aaA2efGng?start=370&end=380", "youtu.be/0Hxth3DK480?start=140&end=150"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00092", + "name": "Wind noise (microphone)", + "description": "The noise produced when a strong air current passes over a microphone, causing large amplitude local turbulence, normally recorded as mechanical clipping as the microphone element exceeds its limits of linearity.", + "citation_uri": "", + "positive_examples": ["youtu.be/LvfGG-k7ZO8?start=250&end=260", "youtu.be/IfvejNeLtno?start=30&end=40", "youtu.be/CTmRRq187y4?start=30&end=40", "youtu.be/GGQDjbHCq-E?start=260&end=270", "youtu.be/VtdAZlFNuTw?start=150&end=160", "youtu.be/10JMr_3UN0g?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0jb2l", + "name": "Thunderstorm", + "description": "The sound of a storm characterized by the presence of lightning and its acoustic effect on the Earth's atmosphere, known as thunder.", + "citation_uri": "http://en.wikipedia.org/wiki/Thunderstorm", + "positive_examples": ["youtu.be/9gpdoYeFjoU?start=110&end=120", "youtu.be/zQptvcZ6Q64?start=30&end=40", "youtu.be/IADbYuioZ9s?start=190&end=200", "youtu.be/RllsaWFnyz0?start=10&end=20", "youtu.be/iqy2_Ofk2TQ?start=100&end=110", "youtu.be/AI1OweEW8C0?start=50&end=60", "youtu.be/9TKSDR1HANg?start=280&end=290", "youtu.be/ThUAAmLjUKM?start=20&end=30"], + "child_ids": ["/m/0ngt1"], + "restrictions": [] + }, + { + "id": "/m/0ngt1", + "name": "Thunder", + "description": "The sound caused by lightning.
Depending on the distance and nature of the lightning, thunder can range from a sharp, loud crack to a long, low rumble.", + "citation_uri": "http://en.wikipedia.org/wiki/Thunder", + "positive_examples": ["youtu.be/3y2aZEs1F5s?start=80&end=90", "youtu.be/S38pF1kVLYI?start=310&end=320", "youtu.be/cg1A8R7B7B8?start=30&end=40", "youtu.be/k4ggjvnuzbI?start=150&end=160", "youtu.be/Mmml5YKQYIY?start=400&end=410", "youtu.be/NN6kHRAbGeA?start=390&end=400", "youtu.be/JY9VX8KnreU?start=110&end=120", "youtu.be/1lMN9N4DBOc?start=200&end=210", "youtu.be/Jh-upSoOqTc?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0838f", + "name": "Water", + "description": "Sounds caused by motion of liquid water.", + "citation_uri": "http://en.wikipedia.org/wiki/Properties_of_water", + "positive_examples": ["youtu.be/UDEyewB7-Hg?start=60&end=70", "youtu.be/GOJiiisgi-Q?start=30&end=40", "youtu.be/s0O-mPb6yHE?start=30&end=40", "youtu.be/IZe1BGDPzA8?start=30&end=40", "youtu.be/jy6ZJRjl1No?start=70&end=80", "youtu.be/F0gZgJRxJhA?start=100&end=110", "youtu.be/btD25mb9BRE?start=50&end=60", "youtu.be/U2-7bxSMCzA?start=30&end=40", "youtu.be/Gw0gwDD8cts?start=20&end=30"], + "child_ids": ["/m/06mb1", "/m/0j6m2", "/m/0j2kx", "/m/05kq4", "/m/06wzb", "/m/07swgks"], + "restrictions": [] + }, + { + "id": "/m/06mb1", + "name": "Rain", + "description": "The sound of droplets of liquid water that have condensed from atmospheric water vapor and then fallen under gravity.", + "citation_uri": "http://en.wikipedia.org/wiki/Rain", + "positive_examples": ["youtu.be/5Ehgo2dKD3c?start=260&end=270", "youtu.be/aDViuftS7Wk?start=210&end=220", "youtu.be/nULDV0qOSDw?start=30&end=40", "youtu.be/wyXpYD4E-Lo?start=50&end=60", "youtu.be/QL-GkJ0BP0w?start=90&end=100", "youtu.be/uH3j2cZCzvk?start=80&end=90", "youtu.be/ZriuDmppWfw?start=140&end=150"], + "child_ids": ["/m/07r10fb", "/t/dd00038"], + "restrictions": [] + }, + { + "id": "/m/07r10fb", + "name": "Raindrop", + "description": "A sound clearly attributable to a single drop of rain.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=raindrop", + "positive_examples": ["youtu.be/AOYnd6OzO2U?start=30&end=40", "youtu.be/V_J1Zx1PbFU?start=30&end=40", "youtu.be/doi3DiqyMl0?start=6&end=16", "youtu.be/JFWiNHljGFI?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00038", + "name": "Rain on surface", + "description": "Steady sound of many rain drops striking a surface such as a roof or window.", + "citation_uri": "", + "positive_examples": ["youtu.be/1r0Y4VFXrmE?start=30&end=40", "youtu.be/1C7lFCTvd6U?start=30&end=40", "youtu.be/0j9XJr31TA4?start=24&end=34"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0j6m2", + "name": "Stream", + "description": "Sounds of a body of water with a current, confined within a bed and banks.", + "citation_uri": "http://en.wikipedia.org/wiki/Stream", + "positive_examples": ["youtu.be/5Hh3nZGtzaI?start=30&end=40", "youtu.be/r_qcJwjFuM4?start=100&end=110", "youtu.be/rUr2EL0sp7s?start=20&end=30", "youtu.be/12PDR1SiMoY?start=30&end=40", "youtu.be/P2kW_zbZEyc?start=210&end=220", "youtu.be/2YjgcnIKUmc?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0j2kx", + "name": "Waterfall", + "description": "The sound of water flowing over a vertical drop or a series of drops in the course of a stream or river.", + "citation_uri": "http://en.wikipedia.org/wiki/Waterfall", + "positive_examples": ["youtu.be/P2kW_zbZEyc?start=210&end=220", 
"youtu.be/0WQ5Xejk0cs?start=30&end=40", "youtu.be/FrHCQ5x2pNQ?start=20&end=30", "youtu.be/D-oU-1zhS-U?start=30&end=40", "youtu.be/E4duB2A-ces?start=50&end=60", "youtu.be/XbigWWtnGL4?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05kq4", + "name": "Ocean", + "description": "Sounds of the body of saline water that composes much of Earth's surface.", + "citation_uri": "http://en.wikipedia.org/wiki/Ocean", + "positive_examples": ["youtu.be/b1iahVw7phI?start=240&end=250", "youtu.be/bspTVc-6whI?start=70&end=80", "youtu.be/DVqUkCoFwgk?start=530&end=540", "youtu.be/Hi1HVjOqJa8?start=120&end=130", "youtu.be/Y6ft09j8rwI?start=370&end=380", "youtu.be/lQBqdnyzFH8?start=10&end=20", "youtu.be/2lEGN0fx9t4?start=10&end=20"], + "child_ids": ["/m/034srq"], + "restrictions": [] + }, + { + "id": "/m/034srq", + "name": "Waves, surf", + "description": "Sounds caused by the surface waves that occur on the free surface of oceans, seas, lakes, rivers, etc., resulting from the wind blowing over the surface.", + "citation_uri": "http://en.wikipedia.org/wiki/Wind_wave", + "positive_examples": ["youtu.be/F1acJgs48ug?start=420&end=430", "youtu.be/ciShfkeGGPs?start=140&end=150", "youtu.be/ZVveYlQuMAw?start=30&end=40", "youtu.be/3DhQmxwJ5f0?start=530&end=540", "youtu.be/acCLHAVHOO4?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06wzb", + "name": "Steam", + "description": "Sounds caused by water in the gas phase, which is formed when water boils.", + "citation_uri": "http://en.wikipedia.org/wiki/Steam", + "positive_examples": ["youtu.be/AoQcAv9swE4?start=220&end=230", "youtu.be/q73cABJraqg?start=60&end=70", "youtu.be/qgASsbk7Jo0?start=50&end=60", "youtu.be/AEGHk8z6Sbc?start=40&end=50", "youtu.be/UyC5UsOHaN8?start=270&end=280", "youtu.be/QKkSUUsM7kw?start=90&end=100"], + "child_ids": ["/m/07rjwbb"], + "restrictions": [] + }, + { + "id": "/m/07swgks", + "name": "Gurgling", + "description": "The bubbling sound of water flowing through a narrow constriction, such as from a bottle with a narrow neck.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=gurgling", + "positive_examples": ["youtu.be/ueGrX54zEs4?start=30&end=40", "youtu.be/PlPg7pFN-c0?start=30&end=40", "youtu.be/bN-pEae5Ojk?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02_41", + "name": "Fire", + "description": "Sounds resulting from the rapid oxidation of a material in the exothermic chemical process of combustion, releasing heat, light, and various reaction products.", + "citation_uri": "http://en.wikipedia.org/wiki/Fire", + "positive_examples": ["youtu.be/rRA29fWQ-yo?start=40&end=50", "youtu.be/ncTBfKkL98M?start=10&end=20", "youtu.be/xGzpK8xOJj8?start=220&end=230", "youtu.be/FkW00xIxnMM?start=160&end=170", "youtu.be/JocAxXHMaVw?start=40&end=50"], + "child_ids": ["/m/07pzfmf", "/m/0fjy1"], + "restrictions": [] + }, + { + "id": "/m/07pzfmf", + "name": "Crackle", + "description": "An irregular sequence of sharp sounds, as from sudden vaporization of liquids trapped in a burning solid, or from a collection of snapping noises.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=crackle", + "positive_examples": ["youtu.be/yD8-rFGIX1Y?start=80&end=90", "youtu.be/bbQgr9juJHU?start=200&end=210", "youtu.be/MNQlSUhlaoU?start=130&end=140", "youtu.be/o2ll07zNfvo?start=20&end=30", "youtu.be/eDRs0CPfUT8?start=350&end=360", "youtu.be/J1gpa4iJ-ds?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0fjy1", + "name": 
"Wildfire", + "description": "Sounds of an uncontrolled fire in an area of combustible vegetation that occurs in the countryside.", + "citation_uri": "http://en.wikipedia.org/wiki/Wildfire", + "positive_examples": ["youtu.be/7N-rIXXcb-Q?start=170&end=180", "youtu.be/gM1DIbory6A?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00041", + "name": "Sounds of things", + "description": "Set of sound classes referring to sounds that are immediately understood by listeners as arising from specific objects (rather than being heard more literally as \"sounds\").", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/07yv9", "/m/02mk9", "/t/dd00071", "/m/0395lw", "/m/07pp_mv", "/t/dd00077", "/m/07k1x", "/m/014zdl", "/m/083vt", "/m/039jq", "/m/04k94", "/t/dd00089", "/t/dd00133"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07yv9", + "name": "Vehicle", + "description": "Sounds of mobile machines that transport people or cargo.", + "citation_uri": "http://en.wikipedia.org/wiki/Vehicle", + "positive_examples": ["youtu.be/Pp-F-z-_yfk?start=160&end=170", "youtu.be/551dHGGa05I?start=10&end=20", "youtu.be/H4993LXvCZ0?start=110&end=120", "youtu.be/e_1B185lSIo?start=40&end=50", "youtu.be/bS647KEfsH4?start=30&end=40", "youtu.be/DvdscEsZh6A?start=70&end=80", "youtu.be/8ycflE3dIHw?start=20&end=30", "youtu.be/BjXjO7fsBT4?start=270&end=280", "youtu.be/f8vk98Ul2yQ?start=180&end=190", "youtu.be/xquV-Ui8cMc?start=260&end=270"], + "child_ids": ["/m/019jd", "/m/012f08", "/m/06d_3", "/m/0k5j", "/t/dd00061"], + "restrictions": [] + }, + { + "id": "/m/019jd", + "name": "Boat, Water vehicle", + "description": "Sounds of watercraft designed to float, plane, work, or travel on water.", + "citation_uri": "http://en.wikipedia.org/wiki/Boat", + "positive_examples": ["youtu.be/6j1Rp2fqNxI?start=540&end=550", "youtu.be/m55Fx5rDh8g?start=50&end=60", "youtu.be/xISRpYtGzaw?start=260&end=270", "youtu.be/vihk0nkq5Bg?start=80&end=90", "youtu.be/vGPlsl1S4mM?start=260&end=270", "youtu.be/2Uir5X9ZgyM?start=30&end=40", "youtu.be/adjWwiHzQAg?start=430&end=440"], + "child_ids": ["/m/0hsrw", "/m/056ks2", "/m/02rlv9", "/m/06q74"], + "restrictions": [] + }, + { + "id": "/m/0hsrw", + "name": "Sailboat, sailing ship", + "description": "Sounds of a boat (large or small) propelled partly or entirely by sails.", + "citation_uri": "http://en.wikipedia.org/wiki/Sailboat", + "positive_examples": ["youtu.be/bhs6VmkT8KI?start=340&end=350", "youtu.be/YcuMAHW6qBQ?start=130&end=140", "youtu.be/CyljecUybZw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/056ks2", + "name": "Rowboat, canoe, kayak", + "description": "Sounds of a boat propelled using the motion of oars in the water.", + "citation_uri": "http://en.wikipedia.org/wiki/Rowing", + "positive_examples": ["youtu.be/hoGzOkPLH1U?start=10&end=20", "youtu.be/hseqZnjc_t0?start=30&end=40", "youtu.be/EAt6jhYn8Nc?start=26&end=36", "youtu.be/nNbI-kSSeTg?start=30&end=40", "youtu.be/Rpo1MaZsBmA?start=170&end=180", "youtu.be/TMHeXKbz8vA?start=30&end=40", "youtu.be/CIN14f0ybzQ?start=360&end=370"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02rlv9", + "name": "Motorboat, speedboat", + "description": "Sounds of a boat which is powered by an inboard or outboard engine.", + "citation_uri": "http://en.wikipedia.org/wiki/Motorboat", + "positive_examples": ["youtu.be/PpWc5v7JxoE?start=30&end=40", "youtu.be/3xnZraMY7ug?start=30&end=40", "youtu.be/MdptfcMqSNU?start=110&end=120", "youtu.be/22MGaPGKU58?start=30&end=40", 
"youtu.be/ljWbztbJyGI?start=30&end=40", "youtu.be/KVCEkvej6jA?start=26&end=36", "youtu.be/6T2wUBeDDl0?start=10&end=20", "youtu.be/A0sa5M5sJjM?start=30&end=40", "youtu.be/1-H5_TcnWNY?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06q74", + "name": "Ship", + "description": "The sound of a large buoyant watercraft, distinguished from boats based on size, shape, and cargo or passenger capacity.", + "citation_uri": "http://en.wikipedia.org/wiki/Ship", + "positive_examples": ["youtu.be/q4w9sFyLvps?start=350&end=360", "youtu.be/BW4QjhWFfE0?start=70&end=80", "youtu.be/0xWD_SlWXBM?start=80&end=90", "youtu.be/ILAtgaMVy60?start=30&end=40", "youtu.be/er8zQZ4d0Ew?start=250&end=260"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/012f08", + "name": "Motor vehicle (road)", + "description": "Sounds of self-propelled land vehicles, usually wheeled, that do not operate on rails. Includes \"off-road\" vehicles designed for unmade roads.", + "citation_uri": "http://en.wikipedia.org/wiki/Motor_vehicle", + "positive_examples": ["youtu.be/tMPg88c5ciQ?start=160&end=170", "youtu.be/-znLAiE_r_w?start=30&end=40", "youtu.be/2R2qwpgb0Dw?start=30&end=40", "youtu.be/ko91_sThwdo?start=290&end=300", "youtu.be/u8rFWqmbx18?start=240&end=250", "youtu.be/g6o47ksKlpQ?start=70&end=80", "youtu.be/gKKzQNZiymI?start=130&end=140", "youtu.be/f1yc8bkg0sc?start=10&end=20", "youtu.be/B1vvkDwpgLQ?start=30&end=40"], + "child_ids": ["/m/0k4j", "/m/07r04", "/m/01bjv", "/m/03j1ly", "/m/04_sv", "/m/0btp2"], + "restrictions": [] + }, + { + "id": "/m/0k4j", + "name": "Car", + "description": "Sounds of a self-powered motor vehicle with usually four wheels used for transportation of one to eight people, designed to run primarily on roads.", + "citation_uri": "http://en.wikipedia.org/wiki/Car", + "positive_examples": ["youtu.be/EiyEPSJY_cM?start=20&end=30", "youtu.be/dThSTe35jb0?start=250&end=260", "youtu.be/a_35ez_FOY4?start=40&end=50", "youtu.be/S-VtBVNXxHw?start=30&end=40", "youtu.be/qKUVlPkkWNc?start=30&end=40", "youtu.be/TrczACxTtrg?start=270&end=280", "youtu.be/1dOV9NEPc60?start=8&end=18", "youtu.be/717m6D03130?start=100&end=110", "youtu.be/5xOlEfsD0_8?start=14&end=24"], + "child_ids": ["/m/0912c9", "/m/02mfyn", "/m/04gxbd", "/m/07rknqz", "/m/0h9mv", "/t/dd00134", "/m/0ltv"], + "restrictions": [] + }, + { + "id": "/m/0912c9", + "name": "Vehicle horn, car horn, honking", + "description": "The sound of a motor vehicle accessory whose purpose is to provide an audible alert of the vehicle's presence.", + "citation_uri": "http://en.wikipedia.org/wiki/Vehicle_horn", + "positive_examples": ["youtu.be/tlyFVJ7P-po?start=23&end=33", "youtu.be/_waM2-2QnK8?start=30&end=40", "youtu.be/-aYumc8KoXg?start=60&end=70"], + "child_ids": ["/m/07qv_d5"], + "restrictions": [] + }, + { + "id": "/m/07qv_d5", + "name": "Toot", + "description": "A blast of a horn.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=toot", + "positive_examples": ["youtu.be/PrNMIEfwmm4?start=240&end=250", "youtu.be/ikJlmuxCMgo?start=50&end=60", "youtu.be/0SoVB6iEHTU?start=100&end=110", "youtu.be/wN0i4-VxOgc?start=120&end=130", "youtu.be/Mcuclfu6kcg?start=100&end=110", "youtu.be/EYsAX8CaJ-4?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02mfyn", + "name": "Car alarm", + "description": "The loud sound of an electronic device installed in a vehicle in an attempt to discourage theft of the vehicle or its contents.", + "citation_uri": "http://en.wikipedia.org/wiki/Car_alarm", + 
"positive_examples": ["youtu.be/17VuPl9Wxvs?start=20&end=30", "youtu.be/o799nvBLsaQ?start=50&end=60", "youtu.be/VC9nqtFULW4?start=280&end=290", "youtu.be/n8gnTTCrC98?start=40&end=50", "youtu.be/4A1Ar1TIXIY?start=30&end=40", "youtu.be/0ct9AETvIdE?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04gxbd", + "name": "Power windows, electric windows", + "description": "The sound of automobile windows being raised and lowered by a built-in electric motor, controlled by a button or switch.", + "citation_uri": "http://en.wikipedia.org/wiki/Power_window", + "positive_examples": ["youtu.be/z-wdnR3D8y4?start=8&end=18", "youtu.be/PKDeI8PzYxk?start=30&end=40", "youtu.be/ta2vGgYNBgg?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rknqz", + "name": "Skidding", + "description": "The sound of a vehicle sliding on a road surface as a consequence of its wheels losing sufficient frictional contact.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=skidding", + "positive_examples": ["youtu.be/q9CaARH_mgI?start=40&end=50", "youtu.be/UtLGcgDmYhI?start=210&end=220", "youtu.be/TWcbU5_9jao?start=150&end=160", "youtu.be/0IUR0a8uR5c?start=30&end=40", "youtu.be/V3CQ5EUog80?start=180&end=190", "youtu.be/22olCB3wQaA?start=40&end=50", "youtu.be/3GHVkqqQZ_k?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0h9mv", + "name": "Tire squeal", + "description": "The high-pitched sound of a rubber vehicle tire sliding over a road surface when the torque applied to the wheel (to either accelerate or decelerate) exceeds the maximum frictional force that can be supported.", + "citation_uri": "http://en.wikipedia.org/wiki/Tire", + "positive_examples": ["youtu.be/ZLvD9fHgaUQ?start=20&end=30", "youtu.be/TCudWnNMr0s?start=10&end=20", "youtu.be/UNjIcZaQ9YM?start=400&end=410", "youtu.be/OoJJu9TzPXo?start=150&end=160", "youtu.be/_Jr8Wehn1uo?start=400&end=410", "youtu.be/YOuBSVA8m2k?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00134", + "name": "Car passing by", + "description": "The sound of a motorized vehicle as it passes by a listener close to the vehicle's path. The sound may include engine and tire noise and will typically involve a clear build-up and/or decay of intensity as the vehicle approaches and retreats, as well as possible Doppler shift.", + "citation_uri": "", + "positive_examples": ["youtu.be/-5-jLuQ6U_E?start=19&end=29", "youtu.be/NvReVInFNxk?start=30&end=40", "youtu.be/2FyalYzSF6g?start=17&end=27", "youtu.be/31iv4K9G_4E?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0ltv", + "name": "Race car, auto racing", + "description": "The sound of one or more motor vehicles engaged in a competition to complete a course as quickly as possible. 
Specialized race cars include many modifications, often making them much louder than common street vehicles.", + "citation_uri": "http://en.wikipedia.org/wiki/Auto_racing", + "positive_examples": ["youtu.be/VWkQ9L1Uc5M?start=270&end=280", "youtu.be/5BthTpmqpqY?start=580&end=590", "youtu.be/_dQop-gc2hA?start=50&end=60", "youtu.be/MlHCVDr_tAU?start=20&end=30", "youtu.be/bq9EO2TQx8w?start=20&end=30", "youtu.be/H6HLCq8o3KE?start=230&end=240"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r04", + "name": "Truck", + "description": "Sounds of large motor vehicles designed to transport cargo on roads.", + "citation_uri": "http://en.wikipedia.org/wiki/Truck", + "positive_examples": ["youtu.be/3k7zGsqUsbY?start=30&end=40", "youtu.be/fbNmnmuCha8?start=170&end=180", "youtu.be/j7muFWO1vN4?start=30&end=40", "youtu.be/d6dTtZwpSN8?start=150&end=160", "youtu.be/GVrspqytLM0?start=80&end=90", "youtu.be/Iu5uS7s-7ic?start=240&end=250", "youtu.be/fyYoxBvG29A?start=210&end=220"], + "child_ids": ["/m/0gvgw0", "/m/05x_td", "/m/02rhddq", "/m/03cl9h"], + "restrictions": [] + }, + { + "id": "/m/0gvgw0", + "name": "Air brake", + "description": "Sounds of a vehicle braking system in which compressed air is used to transmit force to the brake pads on the wheels. The pneumatic system makes an abrupt hissing sound when the pressure is released by venting the air.", + "citation_uri": "http://en.wikipedia.org/wiki/Air_brake_(road_vehicle)", + "positive_examples": ["youtu.be/8Wk-ZmlsUqY?start=28&end=38", "youtu.be/6OOX9BS-ydU?start=30&end=40", "youtu.be/PJULCvSqmPw?start=30&end=40", "youtu.be/oancmFO62ns?start=21&end=31", "youtu.be/9keCNLhx4a0?start=30&end=40", "youtu.be/NecAPvps9io?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05x_td", + "name": "Air horn, truck horn", + "description": "The sound of a pneumatic device mounted on large vehicles designed to create an extremely loud noise for signalling purposes.", + "citation_uri": "http://en.wikipedia.org/wiki/Air_horn", + "positive_examples": ["youtu.be/EF7i4DHBcnE?start=22&end=32", "youtu.be/AVC9tvCnKYE?start=30&end=40", "youtu.be/kcFS8Ugs3PU?start=30&end=40", "youtu.be/4nomoKwSVys?start=30&end=40", "youtu.be/fBMj2amDb7c?start=3&end=13", "youtu.be/ZIMZy7mOX7Y?start=30&end=40", "youtu.be/4iPQgalitDs?start=250&end=260", "youtu.be/58zUgdsdM0w?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02rhddq", + "name": "Reversing beeps", + "description": "The sound of an electronic warning device fitted to many commercial vehicles to alert pedestrians when the reverse gear is engaged, signaling that the vehicle is likely to move backwards.", + "citation_uri": "http://en.wikipedia.org/wiki/Reversing_(vehicle_maneuver)", + "positive_examples": ["youtu.be/9OfYjXOhqmk?start=17&end=27", "youtu.be/DlKTxOx07jA?start=60&end=70", "youtu.be/7bBUxUpozzQ?start=40&end=50", "youtu.be/gQ1KUsQayuY?start=190&end=200", "youtu.be/38F6eeIR-s0?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03cl9h", + "name": "Ice cream truck, ice cream van", + "description": "Sounds of a commercial vehicle that serves as a mobile retail outlet for ice cream, usually during the summer. 
Ice cream vans often play a repeating nursery-rhyme-style melody from speakers mounted on the vehicle's exterior to attract customers.", + "citation_uri": "http://en.wikipedia.org/wiki/Ice_cream_van", + "positive_examples": ["youtu.be/JXbMV5CqLkc?start=20&end=30", "youtu.be/6de_mfVMVik?start=40&end=50", "youtu.be/DRnIWAFMJOI?start=7&end=17", "youtu.be/fJpbrsTaWa4?start=19&end=29", "youtu.be/s1apsX51xBA?start=150&end=160", "youtu.be/kPIGN1jZdpk?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01bjv", + "name": "Bus", + "description": "Interior and exterior sounds of a road vehicle designed to carry many passengers.", + "citation_uri": "http://en.wikipedia.org/wiki/Bus", + "positive_examples": ["youtu.be/FLcqHUR58AU?start=60&end=70", "youtu.be/k0aJQaWGSYE?start=80&end=90", "youtu.be/h4hquL9bIEw?start=110&end=120", "youtu.be/YxvDDpmv-s4?start=30&end=40", "youtu.be/D3iiZju71yw?start=10&end=20", "youtu.be/NG3AQzmVgnE?start=14&end=24", "youtu.be/Tcdrj-kHoUc?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03j1ly", + "name": "Emergency vehicle", + "description": "The sound of any vehicle that is designated and authorized to respond to an emergency. Since emergency vehicles often need to travel at high speeds, they are frequently fitted with audible sirens to warn other motorists to give way.", + "citation_uri": "http://en.wikipedia.org/wiki/Emergency_vehicle", + "positive_examples": ["youtu.be/xV0eTva6SKQ?start=24&end=34", "youtu.be/E6sS2d-NeTE?start=20&end=30", "youtu.be/JWd6vF4yfG8?start=100&end=110", "youtu.be/VqczA8N9pqM?start=19&end=29", "youtu.be/5gyVNyAY6Ns?start=30&end=40", "youtu.be/WPASuIJUWns?start=180&end=190", "youtu.be/mohBlDuZyYw?start=410&end=420", "youtu.be/XDJ9sFKqTV8?start=90&end=100", "youtu.be/ywRo3KB5AuM?start=10&end=20", "youtu.be/jg9TpmmZ5Ws?start=18&end=28"], + "child_ids": ["/m/04qvtq", "/m/012n7d", "/m/012ndj"], + "restrictions": [] + }, + { + "id": "/m/04qvtq", + "name": "Police car (siren)", + "description": "The sound of a siren fitted to a ground vehicle used by police when responding to incidents.", + "citation_uri": "http://en.wikipedia.org/wiki/Police_car", + "positive_examples": ["youtu.be/C8Ne4NNA_CA?start=10&end=20", "youtu.be/lWDYbFDhV8k?start=40&end=50", "youtu.be/QRBD2TjXpcc?start=250&end=260", "youtu.be/ieDcpj6T7XI?start=30&end=40", "youtu.be/TBhbEn20dLs?start=120&end=130", "youtu.be/k9u987RcH64?start=2&end=12"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/012n7d", + "name": "Ambulance (siren)", + "description": "The sound of a siren fitted to a vehicle used to transport sick or injured people to a hospital or other medical facility.", + "citation_uri": "http://en.wikipedia.org/wiki/Ambulance", + "positive_examples": ["youtu.be/0UX7Cl2iJJs?start=110&end=120", "youtu.be/VDom_PHnwZA?start=40&end=50", "youtu.be/u0VWWoFT0mA?start=30&end=40", "youtu.be/ZGkgx5m0Jt4?start=100&end=110", "youtu.be/RcLlkeN5TRc?start=270&end=280", "youtu.be/PTOF-fb9S5k?start=10&end=20", "youtu.be/3M3FsgEyWW8?start=290&end=300"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/012ndj", + "name": "Fire engine, fire truck (siren)", + "description": "The sound of a siren fitted to a vehicle used to transport firefighters and their equipment to an emergency situation.", + "citation_uri": "http://en.wikipedia.org/wiki/Fire_engine", + "positive_examples": ["youtu.be/8Mnb35qaNBg?start=30&end=40", "youtu.be/AXzS-mChOB4?start=30&end=40", "youtu.be/0JPT13OUVV8?start=60&end=70", 
"youtu.be/iKHqGVW-Zu8?start=60&end=70", "youtu.be/7hPshpbbMBg?start=60&end=70", "youtu.be/orPJwjAeZHA?start=190&end=200", "youtu.be/qXAxJ0liwAE?start=70&end=80", "youtu.be/OP4mg83ynII?start=320&end=330"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04_sv", + "name": "Motorcycle", + "description": "Sounds of a small motor vehicle, usually with only two wheels. Motorcycles typically lack an external shell and seat riders astride the engine.", + "citation_uri": "http://en.wikipedia.org/wiki/Motorcycle", + "positive_examples": ["youtu.be/BPXptOeqaqQ?start=80&end=90", "youtu.be/IhesEJJid7g?start=110&end=120", "youtu.be/J_EJg7FeJVQ?start=140&end=150", "youtu.be/4_qnGJihHvM?start=320&end=330", "youtu.be/PtLaYbr0ezA?start=90&end=100", "youtu.be/T7t7eymKq8I?start=360&end=370", "youtu.be/dd8ExHtH4qM?start=390&end=400", "youtu.be/FtHpeHIK3s0?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0btp2", + "name": "Traffic noise, roadway noise", + "description": "The combined sounds of many motor vehicles traveling on roads.", + "citation_uri": "http://en.wikipedia.org/wiki/Traffic", + "positive_examples": ["youtu.be/LZ1bM5g_E0w?start=270&end=280", "youtu.be/7AN6Sfm3cFc?start=160&end=170", "youtu.be/yLW0Jd0_OAs?start=100&end=110", "youtu.be/FN1rC23Rrlg?start=170&end=180", "youtu.be/w5GiL7HGYj8?start=30&end=40", "youtu.be/K1k4sCWJS_g?start=30&end=40", "youtu.be/PIsYNyv77F4?start=570&end=580", "youtu.be/5ptjqWWz8W0?start=160&end=170"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06d_3", + "name": "Rail transport", + "description": "Sounds associated with wheeled vehicles running on rails (tracks).", + "citation_uri": "http://en.wikipedia.org/wiki/Rail_transport", + "positive_examples": ["youtu.be/nLvQz5b_520?start=140&end=150", "youtu.be/FD1ZidaXVv0?start=80&end=90", "youtu.be/mqbx_JLqq7c?start=40&end=50", "youtu.be/EL0ztYzolF0?start=410&end=420", "youtu.be/OGSX8NTNowE?start=70&end=80", "youtu.be/5jn-KJICC9g?start=110&end=120", "youtu.be/hQIu0GkaFZM?start=330&end=340", "youtu.be/x8r1bqLPm88?start=270&end=280"], + "child_ids": ["/m/07jdr", "/m/01g50p", "/t/dd00048", "/m/0195fx"], + "restrictions": [] + }, + { + "id": "/m/07jdr", + "name": "Train", + "description": "The sound of one or more coupled vehicles running on a rail track to transport cargo or passengers, propelled by a separate locomotive or individual motors in each railroad car.", + "citation_uri": "http://en.wikipedia.org/wiki/Train", + "positive_examples": ["youtu.be/NTcgZ8rSpNI?start=20&end=30", "youtu.be/ulhaRVr-cVc?start=120&end=130", "youtu.be/-TIHs3wuY9E?start=160&end=170", "youtu.be/q1na9CH3Ufw?start=320&end=330"], + "child_ids": ["/m/04zmvq", "/m/0284vy3"], + "restrictions": [] + }, + { + "id": "/m/04zmvq", + "name": "Train whistle", + "description": "The sound of an audible signaling device on a steam locomotive used to warn that the train is approaching, and to communicate with rail workers.", + "citation_uri": "http://en.wikipedia.org/wiki/Train_whistle", + "positive_examples": ["youtu.be/5rrwSsOeFso?start=20&end=30", "youtu.be/sCrfI81URlw?start=20&end=30", "youtu.be/30n4GYsBucU?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0284vy3", + "name": "Train horn", + "description": "The sound of an air horn which serves as an audible warning device on diesel and electric locomotives.", + "citation_uri": "http://en.wikipedia.org/wiki/Train_horn", + "positive_examples": ["youtu.be/9kKfuvM4b64?start=110&end=120", 
"youtu.be/DxOJ47vC7lk?start=280&end=290", "youtu.be/JHhEjsAkZoc?start=320&end=330", "youtu.be/SimGJNnJvjA?start=30&end=40", "youtu.be/pNnwMXKDsH8?start=110&end=120", "youtu.be/KeoJ5d2CwAM?start=10&end=20", "youtu.be/gTFCK9TuLOQ?start=30&end=40", "youtu.be/_O5NaJziPSo?start=490&end=500", "youtu.be/_flJpCGAs5Y?start=220&end=230"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01g50p", + "name": "Railroad car, train wagon", + "description": "The sound of a vehicle used for carrying cargo or passengers on a rail transport system.", + "citation_uri": "http://en.wikipedia.org/wiki/Railroad_car", + "positive_examples": ["youtu.be/6-RxMAsi5Lg?start=240&end=250", "youtu.be/AEA8ZJTyanM?start=280&end=290", "youtu.be/3ijC110Y964?start=40&end=50", "youtu.be/WWvv8lYU_Cc?start=30&end=40", "youtu.be/TA3E0Sk4Rno?start=410&end=420", "youtu.be/pYcn70JiIic?start=23&end=33", "youtu.be/Tk2dDhXIR1Y?start=120&end=130", "youtu.be/02w3vd_GgF0?start=20&end=30", "youtu.be/bYdlZOsrf60?start=450&end=460", "youtu.be/eONg_DTdnYQ?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00048", + "name": "Train wheels squealing", + "description": "The high-pitched tones caused by steel train wheels rubbing against rails, for instance when traversing a tight bend or when braking.", + "citation_uri": "", + "positive_examples": ["youtu.be/3aRfForfdwQ?start=130&end=140", "youtu.be/gMWHNN1ff3E?start=380&end=390", "youtu.be/ASkhovrsIPA?start=310&end=320"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0195fx", + "name": "Subway, metro, underground", + "description": "Sounds of high-capacity electric railways that serve as public transport, generally in urban areas.", + "citation_uri": "http://en.wikipedia.org/wiki/Rapid_transit", + "positive_examples": ["youtu.be/d1P3n4tQuqw?start=290&end=300", "youtu.be/i50lpZQbRm4?start=200&end=210", "youtu.be/-FBT_lY07L0?start=160&end=170", "youtu.be/UX6N6HRIS4w?start=10&end=20", "youtu.be/39TSxaQNm3E?start=50&end=60", "youtu.be/DB-nDjDvnfI?start=60&end=70", "youtu.be/U6QktI8TvUs?start=160&end=170", "youtu.be/XzfyPcaAEcE?start=420&end=430", "youtu.be/FF-FeQjcHo8?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0k5j", + "name": "Aircraft", + "description": "Sound of vehicles that fly by gaining support from the air.", + "citation_uri": "http://en.wikipedia.org/wiki/Aircraft", + "positive_examples": ["youtu.be/CwiCsQ1uuwE?start=40&end=50", "youtu.be/lAeup78oZR4?start=10&end=20", "youtu.be/7lQGTHLXdhM?start=390&end=400", "youtu.be/z794C41L1pI?start=60&end=70", "youtu.be/oFECKaZ_3oE?start=80&end=90", "youtu.be/zj2R0XoFr5k?start=70&end=80", "youtu.be/T0jJZL3GKT0?start=40&end=50", "youtu.be/Fgp8ljPZNd8?start=70&end=80"], + "child_ids": ["/m/014yck", "/m/09ct_", "/m/0cmf2"], + "restrictions": [] + }, + { + "id": "/m/014yck", + "name": "Aircraft engine", + "description": "The sound of the part of an aircraft that generates mechanical power. 
Aircraft engines are almost always lightweight piston engines, gas turbines, or, in small multicopter UAVs, electric motors.", + "citation_uri": "http://en.wikipedia.org/wiki/Aircraft_engine", + "positive_examples": ["youtu.be/fn3iwcnOPrg?start=90&end=100", "youtu.be/2coSAx95GQU?start=50&end=60", "youtu.be/RKpQP1AAtBk?start=260&end=270", "youtu.be/xx17bh-qHyY?start=60&end=70", "youtu.be/4IsppmrZ5o0?start=20&end=30", "youtu.be/RqcINAxN51c?start=390&end=400", "youtu.be/Dia6QIxORbM?start=140&end=150", "youtu.be/ZGs7MopJt6o?start=10&end=20", "youtu.be/TaJHKexu3sk?start=10&end=20", "youtu.be/FsmBCrwWsJk?start=30&end=40"], + "child_ids": ["/m/04229", "/m/02l6bg"], + "restrictions": [] + }, + { + "id": "/m/04229", + "name": "Jet engine", + "description": "The sound of an internal combustion airbreathing engine that generates thrust by discharging a fast-moving jet of gas. Jet aircraft use such engines for long-distance travel.", + "citation_uri": "http://en.wikipedia.org/wiki/Jet_engine", + "positive_examples": ["youtu.be/8IgKdVZ363c?start=30&end=40", "youtu.be/cPCbqyGC0-w?start=330&end=340", "youtu.be/ObFke2ZOH2g?start=130&end=140", "youtu.be/dENOlUJG-xI?start=160&end=170", "youtu.be/QXZBL0TQAdI?start=30&end=40", "youtu.be/pXDNMv9t990?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02l6bg", + "name": "Propeller, airscrew", + "description": "The sound of an aircraft propulsion system that converts rotary motion from an engine via a rotating set of blades into forward thrust through the air.", + "citation_uri": "http://en.wikipedia.org/wiki/Propeller_(aeronautics)", + "positive_examples": ["youtu.be/1ag_IA2r_Ig?start=10&end=20", "youtu.be/sJwx1lnXXts?start=120&end=130", "youtu.be/n7FIIQPIbPg?start=310&end=320", "youtu.be/CuF4tcbKVpM?start=120&end=130", "youtu.be/rXNzfRo9BXU?start=140&end=150"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09ct_", + "name": "Helicopter", + "description": "Sounds of a type of rotorcraft in which lift and thrust are supplied by rotors. 
This allows the helicopter to take off and land vertically, to hover, and to fly forward, backward, and laterally.", + "citation_uri": "http://en.wikipedia.org/wiki/Helicopter", + "positive_examples": ["youtu.be/xpZpYRkr43g?start=110&end=120", "youtu.be/W1713WwiTSA?start=10&end=20", "youtu.be/ALdPf99u3s4?start=390&end=400", "youtu.be/2F9iORblSyw?start=150&end=160", "youtu.be/vuucKdxP4Ko?start=20&end=30", "youtu.be/yQwe8tQ9nNY?start=100&end=110", "youtu.be/6pzE3TgR2u8?start=20&end=30", "youtu.be/EoPk7WTZ_GY?start=470&end=480", "youtu.be/ZLJMgS5ysaQ?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0cmf2", + "name": "Fixed-wing aircraft, airplane", + "description": "Sounds of an aircraft capable of flight using wings that generate lift caused by the vehicle's forward airspeed and the shape of the wings.", + "citation_uri": "http://en.wikipedia.org/wiki/Fixed-wing_aircraft", + "positive_examples": ["youtu.be/Gv7ZFYKFmqg?start=130&end=140", "youtu.be/8voZhq4ZUdw?start=60&end=70", "youtu.be/Ie2KkAvU1zU?start=130&end=140", "youtu.be/8vIButnIMp4?start=30&end=40", "youtu.be/ghcZErZzTPI?start=10&end=20", "youtu.be/t6-3INu2258?start=90&end=100", "youtu.be/j9N_Q9EXFP0?start=23&end=33", "youtu.be/XGammZXc2Eo?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00061", + "name": "Non-motorized land vehicle", + "description": "Sounds of human-powered vehicles used on roads or elsewhere on land.", + "citation_uri": "", + "positive_examples": ["youtu.be/Dv-Ey9Na0C8?start=4&end=14"], + "child_ids": ["/m/0199g", "/m/06_fw"], + "restrictions": [] + }, + { + "id": "/m/0199g", + "name": "Bicycle", + "description": "Sounds of a human-powered, pedal-driven vehicle, having two wheels attached to a frame, one behind the other.", + "citation_uri": "http://en.wikipedia.org/wiki/Bicycle", + "positive_examples": ["youtu.be/Cg5sZo1W_EI?start=190&end=200", "youtu.be/FPi-h5oLFbw?start=410&end=420", "youtu.be/m3SxgaD4vGw?start=30&end=40", "youtu.be/DkjX3KtPHCk?start=60&end=70", "youtu.be/JqrYxQ6_eak?start=480&end=490", "youtu.be/8nfOPcwObls?start=70&end=80", "youtu.be/6SFinhyJaCs?start=30&end=40"], + "child_ids": ["/m/0gy1t2s"], + "restrictions": [] + }, + { + "id": "/m/06_fw", + "name": "Skateboard", + "description": "Sounds of a type of sports equipment consisting of a board mounted on two pairs of wheels. A skateboard is typically propelled by pushing with one foot while the other remains on the board, or by simply standing on the deck while on a downward slope and allowing gravity to propel the board and rider.", + "citation_uri": "http://en.wikipedia.org/wiki/Skateboard", + "positive_examples": ["youtu.be/ja4EA1MiDs8?start=130&end=140", "youtu.be/4mLv61JADOQ?start=40&end=50", "youtu.be/kwQQSiYG1lk?start=70&end=80", "youtu.be/eKFbKpSfddE?start=25&end=35", "youtu.be/7awbfTNKjWc?start=30&end=40", "youtu.be/IGBRNQ9-9r0?start=250&end=260", "youtu.be/APYR7TeoGCQ?start=30&end=40", "youtu.be/yRzTXFjC-Hw?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02mk9", + "name": "Engine", + "description": "The sound of a machine designed to produce mechanical energy. Combustion engines burn a fuel to create heat, which then creates a force. Electric motors convert electrical energy into mechanical motion. 
Other classes of engines include pneumatic motors and clockwork motors.", + "citation_uri": "http://en.wikipedia.org/wiki/Engine", + "positive_examples": ["youtu.be/rD1XBW018e4?start=120&end=130", "youtu.be/21Hv4dPd-5c?start=21&end=31", "youtu.be/1OulOUPg7-8?start=3&end=13", "youtu.be/DgyFll5DJeQ?start=30&end=40", "youtu.be/ETVA-wrKgtA?start=120&end=130", "youtu.be/33eq2a2BeZI?start=30&end=40", "youtu.be/V0CRYp34c2k?start=30&end=40", "youtu.be/fO250kjXV7U?start=20&end=30"], + "child_ids": ["/t/dd00065", "/t/dd00066", "/t/dd00067", "/m/04229", "/m/01h82_", "/t/dd00130", "/m/07pb8fc", "/m/07q2z82"], + "restrictions": [] + }, + { + "id": "/t/dd00065", + "name": "Light engine (high frequency)", + "description": "The sound of a small engine such as a toy car, sewing machine, or moped.", + "citation_uri": "", + "positive_examples": ["youtu.be/i69icWu7vgM?start=120&end=130", "youtu.be/_yFwVTg-V-M?start=10&end=20", "youtu.be/Bt-h7IczpuM?start=40&end=50", "youtu.be/5xiXwDn7eJ8?start=80&end=90", "youtu.be/HI7BKGzGerc?start=190&end=200", "youtu.be/SaUTCeOPdUQ?start=50&end=60", "youtu.be/Lzec-fG2xwI?start=520&end=530", "youtu.be/ki9Mnrelw5s?start=360&end=370"], + "child_ids": ["/m/08j51y", "/m/01yg9g", "/m/01j4z9"], + "restrictions": [] + }, + { + "id": "/m/08j51y", + "name": "Dental drill, dentist's drill", + "description": "The sound of a small, high-speed drill used during dental procedures, usually to remove decay and shape tooth structure prior to the insertion of a filling or crown.", + "citation_uri": "http://en.wikipedia.org/wiki/Dental_drill", + "positive_examples": ["youtu.be/Z5I3qUNg8Xg?start=280&end=290", "youtu.be/52sTvbwi7Mg?start=130&end=140"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01yg9g", + "name": "Lawn mower", + "description": "The sound of a machine utilizing one or more revolving blades to cut grass, powered by an electric motor, a small internal combustion engine, or by a human pushing the machine.", + "citation_uri": "http://en.wikipedia.org/wiki/Lawn_mower", + "positive_examples": ["youtu.be/095CdSbYrFc?start=30&end=40", "youtu.be/CPjttawNaII?start=50&end=60", "youtu.be/zj_gAYMxfQ8?start=20&end=30", "youtu.be/LB4w_fc8F-A?start=60&end=70", "youtu.be/16d3D1AWFu8?start=20&end=30", "youtu.be/UM5xPbgiDbs?start=60&end=70"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01j4z9", + "name": "Chainsaw", + "description": "The sound of a portable mechanical saw which cuts with a set of teeth attached to a rotating chain. 
Chainsaws are commonly powered by a small two-stroke internal combustion engine.", + "citation_uri": "http://en.wikipedia.org/wiki/Chainsaw", + "positive_examples": ["youtu.be/uFenFl01Eu8?start=360&end=370", "youtu.be/8yBGrAo62iQ?start=70&end=80", "youtu.be/-RNbb1ZTu8Y?start=30&end=40", "youtu.be/HZ69dl6yTYM?start=90&end=100", "youtu.be/Ae6pZjdGfRY?start=30&end=40", "youtu.be/vRa8ncJ-Zrw?start=230&end=240", "youtu.be/iWOlAJb2ogg?start=30&end=40", "youtu.be/CDm5VBH6nL4?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00066", + "name": "Medium engine (mid frequency)", + "description": "The sound of a moderately-sized engine such as that which powers a motorcycle, sedan, or small truck.", + "citation_uri": "", + "positive_examples": ["youtu.be/1uTnWkTP9a0?start=24&end=34", "youtu.be/2zsjLiSNK0Q?start=30&end=40", "youtu.be/s0irKAYCfOE?start=30&end=40", "youtu.be/6Pv8cptxmaI?start=17&end=27", "youtu.be/2aQWtSZafxA?start=30&end=40", "youtu.be/0Xny2kBTr1U?start=340&end=350", "youtu.be/BrGwC9j8gZg?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00067", + "name": "Heavy engine (low frequency)", + "description": "The sound of a large, heavy-duty internal combustion engine, such as that which powers a large truck, locomotive, or ship. Jet engines, steam engines, rocket engines etc. are excluded.", + "citation_uri": "", + "positive_examples": ["youtu.be/-TBHymyAbCs?start=30&end=40", "youtu.be/-TFvKCs-Nlc?start=30&end=40", "youtu.be/QkBBROJPasI?start=30&end=40", "youtu.be/-kXfIGe5I8Y?start=28&end=38", "youtu.be/-olpPhZLSWI?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01h82_", + "name": "Engine knocking", + "description": "A metallic \"pinging\" sound caused by a malfunctioning spark-ignited internal combustion engine. 
Engine knocking indicates that the combustion of the air/fuel mixture in the cylinder is occurring at the wrong point in the piston's stroke.", + "citation_uri": "http://en.wikipedia.org/wiki/Engine_knocking", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/t/dd00130", + "name": "Engine starting", + "description": "The sound of an engine starting from rest, which may involve a specific starter mechanism (as in a standard car).", + "citation_uri": "", + "positive_examples": ["youtu.be/6O2fJUnSxnE?start=30&end=40", "youtu.be/4fl09vJXgH4?start=30&end=40", "youtu.be/tRl0-Q0ZJZI?start=18&end=28", "youtu.be/AnpUWRakgmE?start=150&end=160", "youtu.be/BEKB0WKczpg?start=15&end=25"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pb8fc", + "name": "Idling", + "description": "The sound of an engine (typically in an automobile) running without any load and at minimal RPM.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=idling", + "positive_examples": ["youtu.be/-QN2auumfOI?start=30&end=40", "youtu.be/BJb9Idgq_xo?start=30&end=40", "youtu.be/1E8u66vm870?start=20&end=30", "youtu.be/9H_aqeYUlak?start=17&end=27", "youtu.be/--lKwp2nVm0?start=30&end=40", "youtu.be/90oHXiTbKGU?start=30&end=40", "youtu.be/7t2M6fwsvzs?start=510&end=520", "youtu.be/73Vz2uj8iKY?start=300&end=310"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07q2z82", + "name": "Accelerating, revving, vroom", + "description": "The sound made by the engine of a vehicle undergoing acceleration, usually involving an increase in loudness and a rising pitch.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=accelerating", + "positive_examples": ["youtu.be/FkyO6R1Frgw?start=30&end=40", "youtu.be/bU-KxtzE1CA?start=30&end=40", "youtu.be/gif5KpUlVT0?start=90&end=100", "youtu.be/HxiEFMumhHY?start=28&end=38", "youtu.be/M4f5MMdu4mY?start=30&end=40", "youtu.be/1NGdiCVV7iM?start=30&end=40", "youtu.be/Y7nMy5Ei_2A?start=10&end=20", "youtu.be/x2mQVZ3NpG0?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00071", + "name": "Domestic sounds, home sounds", + "description": "Sounds that suggest they have been recorded in the home.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/02dgv", "/m/0642b4", "/m/0fqfqc", "/m/04brg2", "/m/023pjk", "/m/07pn_8q", "/m/0dxrf", "/m/0fx9l", "/m/02pjr4", "/g/11b630rrvh", "/m/02jz0l", "/m/0130jx", "/m/03dnzn", "/m/03wvsk", "/m/01jt3m", "/m/012xff", "/m/0d31p", "/m/01s0vc", "/m/0zmy2j9", "/m/03v3yw", "/m/0242l", "/m/05mxj0q", "/m/01lsmm", "/m/02g901", "/m/05rj2", "/m/0316dw", "/m/081rb"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/02dgv", + "name": "Door", + "description": "Sounds of a moving structure used to open or close an entrance to an enclosed space such as a building or vehicle.", + "citation_uri": "http://en.wikipedia.org/wiki/Door", + "positive_examples": ["youtu.be/yM25TgQQ6Ug?start=30&end=40", "youtu.be/XPZnacpWtGI?start=100&end=110", "youtu.be/OcDE75D-O9k?start=230&end=240", "youtu.be/HMp3VHakVKw?start=30&end=40", "youtu.be/kKhnHNY2eqs?start=18&end=28", "youtu.be/8nPjSFqIIoo?start=200&end=210", "youtu.be/nzPQjQppcjc?start=30&end=40"], + "child_ids": ["/m/03wwcy", "/m/02y_763", "/m/07rjzl8", "/m/07r4wb8", "/m/07qcpgn", "/m/07q6cd_"], + "restrictions": [] + }, + { + "id": "/m/03wwcy", + "name": "Doorbell", + "description": "The sound of a signaling device typically placed near a building's entrance with which a visitor can announce their presence to 
occupants.", + "citation_uri": "http://en.wikipedia.org/wiki/Doorbell", + "positive_examples": ["youtu.be/_oiZ9F6EVtw?start=130&end=140", "youtu.be/2NV5jIsXl0g?start=2&end=12", "youtu.be/-DoFDKsAOo4?start=30&end=40", "youtu.be/JvgYGGBGZpM?start=30&end=40", "youtu.be/hqCL8MieXp8?start=20&end=30"], + "child_ids": ["/m/07r67yg"], + "restrictions": [] + }, + { + "id": "/m/07r67yg", + "name": "Ding-dong", + "description": "A multi-tone chime made by a doorbell.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=ding-dong", + "positive_examples": ["youtu.be/bVvJzXMLaS8?start=20&end=30", "youtu.be/mP0zSmVOMNE?start=1&end=11", "youtu.be/pNoFoH57fOg?start=210&end=220"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02y_763", + "name": "Sliding door", + "description": "The sound of a type of door which opens horizontally by sliding, usually parallel to a wall.", + "citation_uri": "http://en.wikipedia.org/wiki/Sliding_door", + "positive_examples": ["youtu.be/Fs462T7ODUQ?start=30&end=40", "youtu.be/49thYHlXkSA?start=21&end=31", "youtu.be/bH_4hTLhZi4?start=30&end=40", "youtu.be/HeUnxmbKYqc?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rjzl8", + "name": "Slam", + "description": "The sound of a door or other hinged cover being closed violently.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=slam", + "positive_examples": ["youtu.be/0W4KKR9sUIo?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r4wb8", + "name": "Knock", + "description": "A sharp noise of a rigid surface being struck, usually without damage and deliberately, most often with the knuckles of the hand.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=knock", + "positive_examples": ["youtu.be/bWppskIvs40?start=30&end=40", "youtu.be/qhR8NGj4QJU?start=10&end=20", "youtu.be/Lc17Q6sGH48?start=120&end=130", "youtu.be/-OsX9u6oJw8?start=40&end=50", "youtu.be/ZQ0OfMZRPGM?start=10&end=20", "youtu.be/TIjqdHhg6n0?start=220&end=230"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qcpgn", + "name": "Tap", + "description": "The sound of a light collision between a rigid object and a rigid surface, for instance to deliberately attract attention.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=tap", + "positive_examples": ["youtu.be/DOG1JX-hIC8?start=30&end=40", "youtu.be/EcMb6OIRnWQ?start=30&end=40", "youtu.be/-WwGFIPH6us?start=30&end=40", "youtu.be/lII65O5C_H0?start=400&end=410", "youtu.be/ch4_bbDOKMc?start=30&end=40", "youtu.be/CbeZxCPsS2w?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07q6cd_", + "name": "Squeak", + "description": "A short, high-pitched noise without a sharp attack.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=squeak", + "positive_examples": ["youtu.be/sS9j4nxxw4c?start=30&end=40", "youtu.be/4YPpyKEKZzc?start=350&end=360", "youtu.be/z_iUAQCVLwo?start=130&end=140", "youtu.be/l9kNnBvzQyE?start=170&end=180", "youtu.be/7v51jrS0aWw?start=80&end=90", "youtu.be/VKhsmuzuwSA?start=460&end=470", "youtu.be/-2XJTklj-Y0?start=30&end=40", "youtu.be/0K8CacUc7bc?start=260&end=270", "youtu.be/UD0pisqyLKA?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0642b4", + "name": "Cupboard open or close", + "description": "The sound of the door to a storage cabinet being opened or closed.", + "citation_uri": "http://en.wikipedia.org/wiki/Cupboard", + "positive_examples": ["youtu.be/zdA3_n7rbVA?start=350&end=360", 
"youtu.be/f-4xmt1Ny_A?start=20&end=30", "youtu.be/VY4Z_VyHQx8?start=440&end=450"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0fqfqc", + "name": "Drawer open or close", + "description": "The sound of the opening or closing of a box-shaped container that fits into a piece of furniture in such a way that it can be drawn out horizontally to reach its contents.", + "citation_uri": "http://en.wikipedia.org/wiki/Drawer_(furniture)", + "positive_examples": ["youtu.be/CR3HqozYwos?start=130&end=140", "youtu.be/w3vWTE4U-C0?start=280&end=290", "youtu.be/RYyqlQusoWk?start=30&end=40", "youtu.be/u__XGncVbM4?start=190&end=200", "youtu.be/zNG3JJY1Gq0?start=430&end=440"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04brg2", + "name": "Dishes, pots, and pans", + "description": "The sound of non-destructive collisions between plates, dishes, glassware, cooking vessels, and kitchen and eating utensils.", + "citation_uri": "http://en.wikipedia.org/wiki/Tableware", + "positive_examples": ["youtu.be/5ohCslwvxdw?start=40&end=50", "youtu.be/il4kvln-pMk?start=160&end=170", "youtu.be/T4EAWRT6XoE?start=110&end=120", "youtu.be/T9z7Ju7S7rs?start=70&end=80", "youtu.be/i34M-tBZLtA?start=480&end=490", "youtu.be/It7mU9zMI4w?start=80&end=90", "youtu.be/1X3ab9ycVL0?start=180&end=190", "youtu.be/uA_0qDi-_a0?start=350&end=360"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/023pjk", + "name": "Cutlery, silverware", + "description": "The sounds of hand implements such as knife, fork, and spoon, that are typically made of steel and are used in preparing, serving, and eating food in the Western world.", + "citation_uri": "http://en.wikipedia.org/wiki/Cutlery", + "positive_examples": ["youtu.be/-MhcTz7hPIE?start=210&end=220", "youtu.be/2osey_ew_Kw?start=80&end=90", "youtu.be/3jSNHT62tAc?start=2&end=12", "youtu.be/4dG_VyOjAqc?start=200&end=210"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pn_8q", + "name": "Chopping (food)", + "description": "The sound of cutting ingredients into pieces with a knife or similar blade tool.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=chop", + "positive_examples": ["youtu.be/_BETXRVC4bs?start=30&end=40", "youtu.be/AGjDiJ4mfYY?start=25&end=35", "youtu.be/EtvVawYx2pc?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dxrf", + "name": "Frying (food)", + "description": "The sound of cooking food in oil or another fat.", + "citation_uri": "http://en.wikipedia.org/wiki/Frying", + "positive_examples": ["youtu.be/KP-sFwyyi2k?start=160&end=170", "youtu.be/N_0XMZaUHtE?start=510&end=520", "youtu.be/OQYqCbdDWFc?start=50&end=60", "youtu.be/HJRCxcMFOMk?start=290&end=300", "youtu.be/zCghjBsfJEc?start=30&end=40", "youtu.be/yGx08kFWCSE?start=100&end=110", "youtu.be/g765TOvaIcY?start=140&end=150", "youtu.be/CrIC2a3X1Y0?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0fx9l", + "name": "Microwave oven", + "description": "Sounds made by a kitchen appliance that heats food by exposing it to microwave radiation, including the noise of the fan, rotation mechanism, and microwave source, as well as the alert sound used to indicate that cooking is complete.", + "citation_uri": "http://en.wikipedia.org/wiki/Microwave_oven", + "positive_examples": ["youtu.be/bvMxz7Rmvus?start=110&end=120", "youtu.be/gxPVEluTzWg?start=130&end=140", "youtu.be/l0AcPKfa5hY?start=19&end=29", "youtu.be/ZusvtL69US0?start=60&end=70", "youtu.be/vHuPSThm1Hs?start=50&end=60", 
"youtu.be/JQsWiBWEQlc?start=470&end=480", "youtu.be/jPlFLbfXfWc?start=140&end=150", "youtu.be/OGRxucs06r8?start=280&end=290"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02pjr4", + "name": "Blender", + "description": "The sound of a kitchen and laboratory appliance used to mix, purée, or emulsify food and other substances, usually by means of metal blades at the base of a container spun rapidly by an electric motor.", + "citation_uri": "http://en.wikipedia.org/wiki/Blender", + "positive_examples": ["youtu.be/VGgQ2zyaClY?start=30&end=40", "youtu.be/DptBcsuywGc?start=90&end=100", "youtu.be/Uqj2GGPVo2E?start=420&end=430", "youtu.be/-ghB9TOCBR8?start=50&end=60", "youtu.be/FOUjOK2AZ30?start=550&end=560", "youtu.be/CcUoNeNvHOc?start=260&end=270", "youtu.be/pSyQDrAA9sg?start=140&end=150", "youtu.be/ATI2Fz0g4yk?start=3&end=13", "youtu.be/JZq60-sNsEY?start=290&end=300"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/g/11b630rrvh", + "name": "Kettle whistle", + "description": "The high-pitched sound of a whistle attached to the spout of an appliance for boiling water. When the water boils, steam is forced through the whistle, alerting users that boiling is complete.", + "citation_uri": "", + "positive_examples": ["youtu.be/5n1i0WeteUk?start=80&end=90", "youtu.be/Zo5WbFdzMKY?start=220&end=230"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02jz0l", + "name": "Water tap, faucet", + "description": "The sound of a hand-operated valve controlling the release of water, for instance as fitted to a sink or bath, and the resulting water flow.", + "citation_uri": "http://en.wikipedia.org/wiki/Tap_(valve)", + "positive_examples": ["youtu.be/-VvftMHnhLA?start=30&end=40", "youtu.be/iED4FnsSrt4?start=40&end=50", "youtu.be/ApyWX6Zstno?start=30&end=40", "youtu.be/5Ms9SsvnVIQ?start=80&end=90", "youtu.be/K23jUY6DnrE?start=390&end=400", "youtu.be/ixFsIj_CcN4?start=330&end=340", "youtu.be/Uu7u_EcCIXg?start=400&end=410"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0130jx", + "name": "Sink (filling or washing)", + "description": "Sounds originating from a bowl-shaped plumbing fixture that stores and drains water and that is used for washing hands, dishwashing, and other purposes.", + "citation_uri": "http://en.wikipedia.org/wiki/Sink", + "positive_examples": ["youtu.be/p4HKDsrHt80?start=310&end=320", "youtu.be/O4_OKqnPKU8?start=1&end=11", "youtu.be/IZpgE9_0SKw?start=70&end=80", "youtu.be/8kffb_v6MG0?start=90&end=100", "youtu.be/csNTcYADSIQ?start=90&end=100", "youtu.be/27NlyHgiaPw?start=120&end=130"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03dnzn", + "name": "Bathtub (filling or washing)", + "description": "Sounds originating from a large container for holding water in which a person may bathe.", + "citation_uri": "http://en.wikipedia.org/wiki/Bathtub", + "positive_examples": ["youtu.be/J3YxLTjCXt8?start=60&end=70", "youtu.be/9uSqapjvYrg?start=420&end=430", "youtu.be/NrqUBYJB1_w?start=100&end=110", "youtu.be/i8TgI5kv5AE?start=240&end=250"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03wvsk", + "name": "Hair dryer", + "description": "The sound of an electromechanical device designed to blow normal or hot air over damp hair, in order to dry the hair.", + "citation_uri": "http://en.wikipedia.org/wiki/Hair_dryer", + "positive_examples": ["youtu.be/ezfz-icEVek?start=520&end=530", "youtu.be/HbKbkTkvOL0?start=180&end=190", "youtu.be/jwGfwbsF4c4?start=30&end=40", "youtu.be/IsGRLS6yPuE?start=500&end=510"], + "child_ids": [], + 
"restrictions": [] + }, + { + "id": "/m/01jt3m", + "name": "Toilet flush", + "description": "The sound of a toilet flushing water along with any waste through a drainpipe.", + "citation_uri": "http://en.wikipedia.org/wiki/Flush_toilet", + "positive_examples": ["youtu.be/DZa-AIhRZDM?start=40&end=50", "youtu.be/wmkFfxfOHLA?start=120&end=130", "youtu.be/P3VT_FsOCW8?start=270&end=280", "youtu.be/sQEk9hRHvqQ?start=30&end=40", "youtu.be/kWpxOCSDKsw?start=10&end=20", "youtu.be/rw7A8gr2nms?start=30&end=40", "youtu.be/7UynE2aCvIQ?start=20&end=30", "youtu.be/uGfdRA5derI?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/012xff", + "name": "Toothbrush", + "description": "Sounds of an instrument used to clean the teeth and gums consisting of a head of tightly clustered bristles mounted on a handle.", + "citation_uri": "http://en.wikipedia.org/wiki/Toothbrush", + "positive_examples": ["youtu.be/JKX35FOb76Q?start=220&end=230", "youtu.be/WQBVIFHMJ40?start=50&end=60", "youtu.be/BpDu70Sa6qo?start=50&end=60", "youtu.be/bnjY4k3Pk5Y?start=20&end=30", "youtu.be/1BXaCy0ox3c?start=280&end=290", "youtu.be/AyA6zrWfkYY?start=310&end=320", "youtu.be/Gt50mgYkDH0?start=100&end=110"], + "child_ids": ["/m/04fgwm"], + "restrictions": [] + }, + { + "id": "/m/04fgwm", + "name": "Electric toothbrush", + "description": "The sound of a toothbrush that makes rapid, automatic bristle motions driven by an electric motor in order to clean teeth.", + "citation_uri": "http://en.wikipedia.org/wiki/Electric_toothbrush", + "positive_examples": ["youtu.be/dm0tSDrMGUA?start=360&end=370", "youtu.be/WG08A74Cnf4?start=70&end=80", "youtu.be/mV8WUIvEKk0?start=16&end=26"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0d31p", + "name": "Vacuum cleaner", + "description": "The sound of a device that uses an air pump to create a partial vacuum to suck up dust and dirt, usually from floors and other surfaces such as upholstery and draperies.", + "citation_uri": "http://en.wikipedia.org/wiki/Vacuum_cleaner", + "positive_examples": ["youtu.be/zPyYpAWVarc?start=150&end=160", "youtu.be/zmIwBvOXJXQ?start=130&end=140", "youtu.be/CgtYVinJPa8?start=290&end=300", "youtu.be/Qyolp0Qf9Nc?start=140&end=150", "youtu.be/cP-OvBQ5RmA?start=30&end=40", "youtu.be/QVZKgL4EORQ?start=220&end=230", "youtu.be/7HXiTVCIiQM?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01s0vc", + "name": "Zipper (clothing)", + "description": "The sound of a device consisting of two rows of small interlocking teeth for binding the edges of an opening of fabric or other flexible material, like on a garment or a bag.", + "citation_uri": "http://en.wikipedia.org/wiki/Zipper", + "positive_examples": ["youtu.be/0XX0K_HNflo?start=30&end=40", "youtu.be/39sSIF_5oiA?start=30&end=40", "youtu.be/zjeigOmIz7U?start=30&end=40", "youtu.be/mpJolOCZ-_E?start=30&end=40", "youtu.be/t1APkUsoNhg?start=18&end=28", "youtu.be/kckdevn05rM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0zmy2j9", + "name": "Velcro, hook and loop fastener", + "description": "Sounds of fasteners typically consisting of two lineal fabric strips, one featuring tiny hooks, the second consisting of even smaller and \"hairier\" loops.", + "citation_uri": "http://en.wikipedia.org/wiki/Hook_and_loop_fastener", + "positive_examples": ["youtu.be/i6_BUnSbcmg?start=380&end=390", "youtu.be/nawwAJZQfzU?start=21&end=31", "youtu.be/BqtXSwmi4o0?start=170&end=180", "youtu.be/Sc-fSFNSfkg?start=30&end=40", 
"youtu.be/Qe6J61XXTZw?start=30&end=40", "youtu.be/3ej1cmiQYGM?start=24&end=34"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03v3yw", + "name": "Keys jangling", + "description": "Sounds made by the mutual collision of small pieces of metal, which are often linked on a ring, used to operate locks.", + "citation_uri": "http://en.wikipedia.org/wiki/Key_(lock)", + "positive_examples": ["youtu.be/eHHBXazWbpc?start=180&end=190", "youtu.be/EMtKMP3HVs0?start=280&end=290", "youtu.be/4zEv7RY2kTg?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0242l", + "name": "Coin (dropping)", + "description": "The sound of a small, flat, round piece of hard material such as metal or plastic colliding with or bouncing on a hard surface.", + "citation_uri": "http://en.wikipedia.org/wiki/Coin", + "positive_examples": ["youtu.be/LT0udusisI4?start=570&end=580", "youtu.be/_-u7RBpA2_Y?start=40&end=50", "youtu.be/yZDtVd414io?start=20&end=30", "youtu.be/beZ5vqRqO5U?start=220&end=230", "youtu.be/D6_LkeTx4uE?start=350&end=360", "youtu.be/KRfb-rFWqZk?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05mxj0q", + "name": "Packing tape, duct tape", + "description": "Sounds associated with self-adhesive tape from wide rolls used to seal cardboard boxes and other temporary attachments. Specifically includes the distinctive noise of the tape being dispensed from a roll.", + "citation_uri": "http://en.wikipedia.org/wiki/Box-sealing_tape", + "positive_examples": ["youtu.be/30gm0gZpYrw?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01lsmm", + "name": "Scissors", + "description": "The sound of hand-operated shearing tools consisting of a pair of metal blades pivoted so that the sharpened edges slide against each other when the handles opposite to the pivot are closed.", + "citation_uri": "http://en.wikipedia.org/wiki/Scissors", + "positive_examples": ["youtu.be/HDZ2_fLVZm0?start=30&end=40", "youtu.be/orxEbCkwH9M?start=320&end=330", "youtu.be/pbwT9PIfJcU?start=300&end=310", "youtu.be/1hba9jmspEQ?start=30&end=40", "youtu.be/l2LSwFtqGKk?start=470&end=480", "youtu.be/3hBvEDX7Q8k?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02g901", + "name": "Electric shaver, electric razor", + "description": "The sound of a bladed tool primarily used in the removal of unwanted body hair where the blades are continually moved by an electric motor.", + "citation_uri": "http://en.wikipedia.org/wiki/Razor", + "positive_examples": ["youtu.be/raXE9JRtcHQ?start=40&end=50", "youtu.be/De4WXe0RE7A?start=300&end=310", "youtu.be/JODPEwUPyUE?start=40&end=50", "youtu.be/OS-kL4p0XpI?start=80&end=90", "youtu.be/EtM-2FtKBf0?start=30&end=40", "youtu.be/wHoSjefsmsE?start=420&end=430", "youtu.be/IEC8cLcZ6IE?start=170&end=180"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05rj2", + "name": "Shuffling cards", + "description": "The sound of playing cards being manually rearranged into a random order.", + "citation_uri": "http://en.wikipedia.org/wiki/Shuffling", + "positive_examples": ["youtu.be/83W_DUSS830?start=410&end=420", "youtu.be/nn9iLR99iAM?start=30&end=40", "youtu.be/4KMp9SlzNjk?start=380&end=390", "youtu.be/dI95aySekBI?start=50&end=60", "youtu.be/QfF8hRWie4c?start=30&end=40", "youtu.be/9ju96NjwhVY?start=260&end=270", "youtu.be/fu--BTYNcKw?start=230&end=240"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0316dw", + "name": "Typing", + "description": "The sound of writing or inputting text by 
pressing keys on a typewriter, computer keyboard, cell phone, or calculator.", + "citation_uri": "http://en.wikipedia.org/wiki/Typing", + "positive_examples": ["youtu.be/pfZpFTHWpEI?start=160&end=170", "youtu.be/_iOR9VnJ_iQ?start=30&end=40", "youtu.be/0_yN1k-S99I?start=30&end=40", "youtu.be/fuGiiIhekW8?start=390&end=400", "youtu.be/Rpx9ROxCJ04?start=190&end=200", "youtu.be/StRjbDLa23U?start=20&end=30", "youtu.be/-b1bMK154QU?start=280&end=290", "youtu.be/O9tM5h9beDM?start=30&end=40"], + "child_ids": ["/m/0c2wf", "/m/01m2v"], + "restrictions": [] + }, + { + "id": "/m/0c2wf", + "name": "Typewriter", + "description": "Sounds of a mechanical or electromechanical machine for printing characters by means of keyboard-operated types striking a ribbon to transfer ink or carbon impressions onto paper.", + "citation_uri": "http://en.wikipedia.org/wiki/Typewriter", + "positive_examples": ["youtu.be/I82-B02oOBg?start=380&end=390", "youtu.be/UJ9xKsKaKAA?start=30&end=40", "youtu.be/c40hgtfiYm0?start=30&end=40", "youtu.be/yoiR7G1Y85Y?start=30&end=40", "youtu.be/9H7ugTcVbXQ?start=10&end=20", "youtu.be/L4uwP5teSwM?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01m2v", + "name": "Computer keyboard", + "description": "Sounds created by a typewriter-style device which uses an arrangement of buttons or keys controlling switches to enter text or other information into a computer or other electronic device.", + "citation_uri": "http://en.wikipedia.org/wiki/Computer_keyboard", + "positive_examples": ["youtu.be/MK-hwHhgDLY?start=450&end=460", "youtu.be/7rGGBa_lZ54?start=260&end=270", "youtu.be/xMTVzNoGOxA?start=50&end=60", "youtu.be/jHtAPbCj0U4?start=410&end=420", "youtu.be/oBEvBG2D8AE?start=190&end=200", "youtu.be/9NcxNH75ByE?start=30&end=40", "youtu.be/eW83JnAnMww?start=260&end=270", "youtu.be/AL-dutJvx0s?start=30&end=40", "youtu.be/8GHLfJ6y6zA?start=570&end=580"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/081rb", + "name": "Writing", + "description": "The sound of inscribing signs and symbols, most often by using a purpose-made hand-held device, such as a pen or pencil, directly upon paper or similar media.", + "citation_uri": "http://en.wikipedia.org/wiki/Writing", + "positive_examples": ["youtu.be/JO3nmsEXrro?start=340&end=350", "youtu.be/Y8uYhcPssuE?start=80&end=90", "youtu.be/74Vqth_HvqU?start=130&end=140", "youtu.be/HIfhUyU9W20?start=80&end=90", "youtu.be/drtbr0LpLDI?start=22&end=32", "youtu.be/-YKVzemh9RQ?start=30&end=40", "youtu.be/eD-ANIqSWzU?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pp_mv", + "name": "Alarm", + "description": "The sound of an automatic signal warning of danger.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=alarm", + "positive_examples": ["youtu.be/uZDkS4PQPhg?start=30&end=40", "youtu.be/1Hf_1QejY4I?start=20&end=30", "youtu.be/a3jZ7QBDhgc?start=30&end=40", "youtu.be/v2Mmlp3YPPM?start=150&end=160", "youtu.be/M82tO6erP8Y?start=160&end=170", "youtu.be/ygXK90AVXFY?start=50&end=60"], + "child_ids": ["/m/07cx4", "/m/046dlr", "/m/03kmc9", "/m/03wwcy", "/m/030rvx", "/m/01y3hg", "/m/0c3f7m", "/m/02mfyn", "/m/0912c9", "/m/0gy1t2s", "/m/05x_td", "/m/04fq5q", "/m/0l156k"], + "restrictions": [] + }, + { + "id": "/m/07cx4", + "name": "Telephone", + "description": "The sound of a telecommunications device that permits two or more users to conduct a conversation when they are too far apart to be heard directly. 
This class covers sounds associated with the device itself, not voice that has been carried via the telephone network.", + "citation_uri": "http://en.wikipedia.org/wiki/Telephone", + "positive_examples": ["youtu.be/0uETEh6kc0o?start=30&end=40", "youtu.be/Y12efRHPBok?start=130&end=140", "youtu.be/jDsfFsfox9o?start=20&end=30", "youtu.be/S_ZdkIw32hY?start=30&end=40", "youtu.be/ackdwXSwINs?start=30&end=40", "youtu.be/5K7mSWlamvI?start=10&end=20", "youtu.be/CaARlo0W-WI?start=5&end=15", "youtu.be/sWxcqTcEWPo?start=330&end=340"], + "child_ids": ["/m/07pp8cl", "/m/01hnzm", "/m/01sb50", "/m/02c8p", "/m/015jpf", "/m/01z47d"], + "restrictions": [] + }, + { + "id": "/m/07pp8cl", + "name": "Telephone bell ringing", + "description": "The sound of a physical or synthesized electric bell ringing to signal an incoming telephone call.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=telephone%20bell", + "positive_examples": ["youtu.be/0RItg3CCgAc?start=280&end=290", "youtu.be/xIztYnMIWUA?start=30&end=40", "youtu.be/koZiVapPW4Q?start=220&end=230", "youtu.be/y8Kp8RmvKFw?start=30&end=40", "youtu.be/j9NQitaeuGg?start=30&end=40", "youtu.be/UJ8vpkVsY34?start=30&end=40", "youtu.be/v-C5UHWHDw4?start=11&end=21"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01hnzm", + "name": "Ringtone", + "description": "The synthesized or pre-recorded sounds made by contemporary electronic telephones to indicate an incoming call or text message.", + "citation_uri": "http://en.wikipedia.org/wiki/Ringtone", + "positive_examples": ["youtu.be/F91ueYOdGfI?start=400&end=410", "youtu.be/NUe_yfLTqc8?start=30&end=40", "youtu.be/LhqpEHInT_w?start=380&end=390", "youtu.be/8o7yBPCaYyM?start=380&end=390", "youtu.be/NXPSFXXzzkg?start=230&end=240", "youtu.be/JC70GNb_dmo?start=180&end=190", "youtu.be/sFok1rcnU6U?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01sb50", + "name": "Cellphone buzz, vibrating alert", + "description": "The low buzz emitted by the vibrotactile actuator used to provide \"silent alarms\" on mobile phones and other devices.", + "citation_uri": "http://en.wikipedia.org/wiki/Vibrating_alert", + "positive_examples": ["youtu.be/bipdap5oNJY?start=270&end=280"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02c8p", + "name": "Telephone dialing, DTMF", + "description": "The sound of a telephony signaling system where short combinations of tones within the frequency range of human speech are used to transmit individual digits.", + "citation_uri": "http://en.wikipedia.org/wiki/Dual-tone_multi-frequency_signaling", + "positive_examples": ["youtu.be/4eLBAlJq1cg?start=20&end=30", "youtu.be/mxxvJtatNx8?start=30&end=40", "youtu.be/WqP9GopZdAk?start=460&end=470", "youtu.be/soJspyOEeXw?start=410&end=420", "youtu.be/xHYs_58hXfg?start=60&end=70", "youtu.be/HherPtX1e5c?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/015jpf", + "name": "Dial tone", + "description": "The sound of a signal that a telephone exchange or other switching equipment sends to a telephone to confirm that it is ready to initiate a connection.", + "citation_uri": "http://en.wikipedia.org/wiki/Dial_tone", + "positive_examples": ["youtu.be/0Wfdq_3F9VA?start=3&end=13", "youtu.be/BcAuCN7qQzI?start=40&end=50", "youtu.be/j7whx_ntCog?start=180&end=190", "youtu.be/YTBCmc70I20?start=270&end=280"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01z47d", + "name": "Busy signal", + "description": "A sound (usually a simple repeating tone) played over a 
telephone line to indicate that the requested connection could not be completed, for instance because the remote party is already engaged in a different call.", + "citation_uri": "http://en.wikipedia.org/wiki/Busy_signal", + "positive_examples": ["youtu.be/CtweZ8ULo78?start=30&end=40", "youtu.be/NiHk39QzgVc?start=90&end=100", "youtu.be/iT9zHI02zDo?start=160&end=170"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/046dlr", + "name": "Alarm clock", + "description": "Sounds made by a clock that is designed to make an audible signal at a specific time. The primary use of alarm clocks is to awaken people from their night's sleep.", + "citation_uri": "http://en.wikipedia.org/wiki/Alarm_clock", + "positive_examples": ["youtu.be/q8i4B6lSshE?start=30&end=40", "youtu.be/fqQHxi-CBwY?start=20&end=30", "youtu.be/WSNtES6nmRI?start=50&end=60", "youtu.be/61X1X_LmORw?start=90&end=100", "youtu.be/stiK2hvKAeA?start=24&end=34"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03kmc9", + "name": "Siren", + "description": "The sound of a loud noise-making device used to provide warnings to people nearby. A siren typically consists of a single pitch that changes either smoothly or abruptly on timescales around one second.", + "citation_uri": "http://en.wikipedia.org/wiki/Siren_(alarm)", + "positive_examples": ["youtu.be/ZHnpad4e8rI?start=16&end=26", "youtu.be/bUmokBhoTlw?start=29&end=39", "youtu.be/kn09Reh4DZc?start=30&end=40", "youtu.be/BaZiXQC0aAw?start=30&end=40", "youtu.be/BxXcX0kuZTI?start=4&end=14", "youtu.be/b1dvYzcVcKA?start=30&end=40", "youtu.be/6g-ttVYeOX4?start=180&end=190", "youtu.be/o3nq59MHsTU?start=30&end=40", "youtu.be/Cldv-kSc7kU?start=30&end=40", "youtu.be/KjO-sgL8_10?start=60&end=70"], + "child_ids": ["/m/04qvtq", "/m/012n7d", "/m/012ndj", "/m/0dgbq"], + "restrictions": [] + }, + { + "id": "/m/0dgbq", + "name": "Civil defense siren", + "description": "The sound of a siren used to provide emergency population warning of approaching danger such as hurricanes, earthquakes, or military attacks.", + "citation_uri": "http://en.wikipedia.org/wiki/Civil_defense_siren", + "positive_examples": ["youtu.be/ExSi3SDF45o?start=30&end=40", "youtu.be/wVbDqtAXCug?start=30&end=40", "youtu.be/AY7UQYV1GpM?start=30&end=40", "youtu.be/fsaCfxi_VAs?start=30&end=40", "youtu.be/Xy4aPDjvXkQ?start=400&end=410", "youtu.be/rM4XSEZw-ug?start=30&end=40", "youtu.be/Q3ckbWJ9aI4?start=260&end=270", "youtu.be/Kc2-IHeqq0o?start=30&end=40", "youtu.be/8A30tAwXdzM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/030rvx", + "name": "Buzzer", + "description": "Sounds of a mechanical, electromechanical, or piezoelectric signaling device that emits a steady, highly-audible tone with a spectrum extending to high frequencies.", + "citation_uri": "http://en.wikipedia.org/wiki/Buzzer", + "positive_examples": ["youtu.be/-uuG4ZCJmyU?start=30&end=40", "youtu.be/3Ab0sgelCEw?start=30&end=40", "youtu.be/eJM8xH6ny28?start=30&end=40", "youtu.be/OtaGvBD1oEc?start=30&end=40", "youtu.be/Nbmc0U5HPl0?start=270&end=280", "youtu.be/xqOErPbPZFQ?start=70&end=80", "youtu.be/L3vuNVEuZeQ?start=20&end=30", "youtu.be/vaz3zMS8Cus?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01y3hg", + "name": "Smoke detector, smoke alarm", + "description": "Sounds emitted by a device that senses smoke, typically to warn of a fire.", + "citation_uri": "http://en.wikipedia.org/wiki/Smoke_detector", + "positive_examples": ["youtu.be/6CzgrjobyzQ?start=30&end=40", 
"youtu.be/BRnPYH9WHSM?start=30&end=40", "youtu.be/YmsNoKDfWUE?start=120&end=130", "youtu.be/acoqKEVczug?start=19&end=29"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0c3f7m", + "name": "Fire alarm", + "description": "Sounds of a system designed to warn people when smoke, fire, carbon monoxide or other emergencies are present. Alarms can be either motorized bells or wall-mountable sounders or horns.", + "citation_uri": "http://en.wikipedia.org/wiki/Fire_alarm_system", + "positive_examples": ["youtu.be/NyT8rRl6nd8?start=8&end=18", "youtu.be/oy-ZMaQbe9c?start=210&end=220", "youtu.be/Q9qGvtK8bBs?start=10&end=20", "youtu.be/e0cGR8OojpQ?start=50&end=60", "youtu.be/ZDxoinMcZj0?start=30&end=40", "youtu.be/AdW3WsQy7QY?start=170&end=180", "youtu.be/4xcSq8UjPCI?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04fq5q", + "name": "Foghorn", + "description": "A signaling sound used primarily in foggy conditions to warn watercraft of the presence of nearby vessels or other navigational hazards.", + "citation_uri": "http://en.wikipedia.org/wiki/Foghorn", + "positive_examples": ["youtu.be/0kgiYfKS_qE?start=30&end=40", "youtu.be/siWV-9iulHI?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l156k", + "name": "Whistle", + "description": "The sound of an instrument that produces sound from a stream of forced air.", + "citation_uri": "http://en.wikipedia.org/wiki/Whistle", + "positive_examples": ["youtu.be/zUbqCMZvqP4?start=330&end=340", "youtu.be/mVpQrw3yWuc?start=20&end=30", "youtu.be/wMor1YCRU4E?start=270&end=280"], + "child_ids": ["/g/11b630rrvh", "/m/06hck5"], + "restrictions": [] + }, + { + "id": "/m/06hck5", + "name": "Steam whistle", + "description": "The sound of a device used to produce sound with the aid of live steam, which acts as a vibrating system.", + "citation_uri": "http://en.wikipedia.org/wiki/Steam_whistle", + "positive_examples": ["youtu.be/4t7Mi3pnSA4?start=100&end=110", "youtu.be/00iiNF4tn4o?start=20&end=30", "youtu.be/E5DNG4kVPeg?start=20&end=30", "youtu.be/7kSxwHZYrL4?start=400&end=410", "youtu.be/VSVhZ2aDjro?start=30&end=40", "youtu.be/2FkfQgDiW_o?start=369&end=379"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00077", + "name": "Mechanisms", + "description": "Sounds that originate from human-created machinery and devices.", + "citation_uri": "", + "positive_examples": ["youtu.be/5K6zvn-Zdw0?start=1&end=11", "youtu.be/EFB1v8UGXZk?start=590&end=600", "youtu.be/wY3hhIqVvjo?start=60&end=70", "youtu.be/kcqmwG_rHFA?start=80&end=90"], + "child_ids": ["/m/02bm9n", "/m/01x3z", "/m/0l7xg", "/m/05zc1", "/m/0llzx", "/m/02x984l", "/m/025wky1", "/m/024dl", "/m/01m4t", "/m/0dv5r"], + "restrictions": [] + }, + { + "id": "/m/02bm9n", + "name": "Ratchet, pawl", + "description": "The sound of a mechanical device that allows continuous linear or rotary motion in only one direction while preventing motion in the opposite direction.", + "citation_uri": "http://en.wikipedia.org/wiki/Ratchet_(device)", + "positive_examples": ["youtu.be/CGk8RdSyKVs?start=30&end=40", "youtu.be/6Ks0uENycv4?start=1&end=11", "youtu.be/7S55nEHxIHE?start=30&end=40", "youtu.be/V2M4UI7O0Jo?start=0&end=6", "youtu.be/I3z4A5M-XEQ?start=4&end=14"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01x3z", + "name": "Clock", + "description": "Sounds made by an instrument designed to keep and indicate time.", + "citation_uri": "http://en.wikipedia.org/wiki/Clock", + "positive_examples": ["youtu.be/iq62ay8Lv4o?start=450&end=460", 
"youtu.be/FuWDfaQjbvs?start=150&end=160", "youtu.be/CJCwr2gl2CM?start=120&end=130", "youtu.be/G40Kohywqh4?start=20&end=30", "youtu.be/CuOWQr1a5-c?start=120&end=130", "youtu.be/G23Me3A60b8?start=20&end=30", "youtu.be/Qda2h9SXacs?start=20&end=30", "youtu.be/OL63NVm1v7A?start=20&end=30"], + "child_ids": ["/m/07qjznt", "/m/07qjznl"], + "restrictions": [] + }, + { + "id": "/m/07qjznt", + "name": "Tick", + "description": "A metallic tapping sound.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=tick", + "positive_examples": ["youtu.be/aTcKrIlKI9o?start=30&end=40", "youtu.be/y-hd-Ol4ES4?start=30&end=40", "youtu.be/GgpH6htMmQQ?start=30&end=40", "youtu.be/xNZ0aU2X5uQ?start=30&end=40", "youtu.be/R6VUgJzDwBQ?start=30&end=40", "youtu.be/WupBieE-8Us?start=30&end=40", "youtu.be/YKtbQkTmvnY?start=30&end=40", "youtu.be/t6Hlse8lKmg?start=20&end=30", "youtu.be/8nS-KGITte8?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qjznl", + "name": "Tick-tock", + "description": "A steady recurrent ticking sound as made by a clock.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=ticktock", + "positive_examples": ["youtu.be/gD5Pft3FKdU?start=8&end=18", "youtu.be/wRnYhBhofVY?start=30&end=40", "youtu.be/pDjj6MiFNuc?start=30&end=40", "youtu.be/4ftDFi4684Y?start=30&end=40", "youtu.be/y5cRJZB83ZA?start=4&end=14"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0l7xg", + "name": "Gears", + "description": "Sounds of rotating machine parts having cut teeth, or cogs, which mesh with other toothed parts to transmit torque.", + "citation_uri": "http://en.wikipedia.org/wiki/Gear", + "positive_examples": ["youtu.be/0HGFcukxF9Q?start=30&end=40", "youtu.be/0xyapQxC7yM?start=30&end=40", "youtu.be/3EnkJZx4b94?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05zc1", + "name": "Pulleys", + "description": "Sounds of a wheel on an axle or shaft that is designed to support movement and change of direction of a taut cable or belt along its circumference.", + "citation_uri": "http://en.wikipedia.org/wiki/Pulley", + "positive_examples": ["youtu.be/LYbzpPSQLbI?start=30&end=40", "youtu.be/j7KrjKwz1VY?start=30&end=40", "youtu.be/OuKRNNbgsGc?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0llzx", + "name": "Sewing machine", + "description": "Sounds of a machine used to stitch fabric and other materials together with thread.", + "citation_uri": "http://en.wikipedia.org/wiki/Sewing_machine", + "positive_examples": ["youtu.be/3ejlQ7u5kkU?start=220&end=230", "youtu.be/x7ROZ8Lps2k?start=130&end=140", "youtu.be/DMbLpqQlYjc?start=30&end=40", "youtu.be/r9MHNEIYNBY?start=80&end=90", "youtu.be/9ZjoQLYo03Y?start=130&end=140", "youtu.be/qZMkCBSNzLo?start=450&end=460", "youtu.be/AfZTLVMTAD4?start=30&end=40", "youtu.be/4GYEVk6ngdk?start=40&end=50", "youtu.be/4zEP9py4ZO0?start=310&end=320"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02x984l", + "name": "Mechanical fan", + "description": "Sounds of a machine used to create air flow via the motion of a rotating arrangement of vanes or blades.", + "citation_uri": "http://en.wikipedia.org/wiki/Mechanical_fan", + "positive_examples": ["youtu.be/DwTfsGSG3y8?start=30&end=40", "youtu.be/3qeBcAw5T8U?start=240&end=250", "youtu.be/ESeWXm2Obnc?start=60&end=70", "youtu.be/S5KErFSn92w?start=210&end=220", "youtu.be/rJZo6erXckU?start=180&end=190", "youtu.be/ZVtm-Pw6gJU?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/025wky1", + "name": 
"Air conditioning", + "description": "The sound of a device that removes heat from the air inside a building or vehicle, thus lowering the air temperature.", + "citation_uri": "http://en.wikipedia.org/wiki/Air_conditioning", + "positive_examples": ["youtu.be/wRBGmik5Dz0?start=160&end=170", "youtu.be/xBxsVs1BQh0?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/024dl", + "name": "Cash register", + "description": "Sounds of a mechanical or electronic device for registering and calculating transactions, usually attached to a drawer for storing cash.", + "citation_uri": "http://en.wikipedia.org/wiki/Cash_register", + "positive_examples": ["youtu.be/OY5485e1_yM?start=30&end=40", "youtu.be/mJiXm0TbS1w?start=460&end=470", "youtu.be/w3wyawCHZOM?start=110&end=120", "youtu.be/nvY30V3-kD8?start=250&end=260", "youtu.be/Wb3ZTFSc7bc?start=120&end=130", "youtu.be/GuFTgsvMAUc?start=23&end=33", "youtu.be/twQ9-kxoJ70?start=10&end=20", "youtu.be/dXKql7mi4rU?start=60&end=70"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01m4t", + "name": "Printer", + "description": "Sounds of a computer peripheral which makes a persistent human readable representation of graphics or text on paper or similar physical media.", + "citation_uri": "http://en.wikipedia.org/wiki/Printer_(computing)", + "positive_examples": ["youtu.be/NjDe3O45xjI?start=470&end=480", "youtu.be/xCK4-CaqGBw?start=40&end=50", "youtu.be/4uSbpeUmxAk?start=30&end=40", "youtu.be/mRtqC_FswFc?start=30&end=40", "youtu.be/7BzyG8MV_zs?start=30&end=40", "youtu.be/MFFrOFlAXQ8?start=30&end=40", "youtu.be/1qkdvq7Qlsc?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dv5r", + "name": "Camera", + "description": "Sounds of mechanical or electronic instruments that employ optics to record or capture images.", + "citation_uri": "http://en.wikipedia.org/wiki/Camera", + "positive_examples": ["youtu.be/sH33P94VpZQ?start=50&end=60", "youtu.be/QdGg7SIlTj8?start=350&end=360", "youtu.be/eF7BVZza7qg?start=390&end=400", "youtu.be/bq8-CK-dXfY?start=40&end=50", "youtu.be/djy4CXMXZls?start=310&end=320", "youtu.be/mYmn4kAwLG8?start=300&end=310", "youtu.be/a4eISIE97EU?start=260&end=270", "youtu.be/APJEtvVrv1Q?start=30&end=40", "youtu.be/vQwg9vG_7E0?start=40&end=50"], + "child_ids": ["/m/07bjf"], + "restrictions": [] + }, + { + "id": "/m/07bjf", + "name": "Single-lens reflex camera", + "description": "Sounds of camera that uses a mirror and prism system to permit the photographer to view through the lens and see exactly what will be captured.", + "citation_uri": "http://en.wikipedia.org/wiki/Single-lens_reflex_camera", + "positive_examples": ["youtu.be/7o2YlSlfLp8?start=30&end=40", "youtu.be/9hzWgd8CNoI?start=30&end=40", "youtu.be/4bibld_jSR0?start=30&end=40", "youtu.be/51pSJYU7wdU?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07k1x", + "name": "Tools", + "description": "Sounds associated with any physical item that can be used to achieve a goal, especially if the item is not consumed in the process.", + "citation_uri": "http://en.wikipedia.org/wiki/Tool", + "positive_examples": ["youtu.be/6EvqESUDuh4?start=150&end=160", "youtu.be/--hYWyUh_BY?start=30&end=40", "youtu.be/GJhuBSoQeRg?start=170&end=180", "youtu.be/gnytgNTz4Tw?start=460&end=470", "youtu.be/YpjgUzOBWSM?start=420&end=430", "youtu.be/WpeT6stfbac?start=40&end=50", "youtu.be/itFDsYrpdQ4?start=80&end=90", "youtu.be/5I8lmN8rwDM?start=540&end=550", "youtu.be/bE2hGIuk3nE?start=20&end=30"], + "child_ids": ["/m/03l9g", 
"/m/03p19w", "/m/01b82r", "/m/02p01q", "/m/023vsd", "/m/0_ksk"], + "restrictions": [] + }, + { + "id": "/m/03l9g", + "name": "Hammer", + "description": "Sounds made by a tool or device that delivers a blow to an object. Most hammers are hand tools used to drive nails, fit parts, forge metal, and break apart objects.", + "citation_uri": "http://en.wikipedia.org/wiki/Hammer", + "positive_examples": ["youtu.be/DPWTr_fAamU?start=50&end=60", "youtu.be/gwz2RRFSp_I?start=70&end=80", "youtu.be/UJubaPeUu58?start=100&end=110", "youtu.be/rUyjioYts8k?start=220&end=230", "youtu.be/hqeM4MaM8AY?start=270&end=280", "youtu.be/QnKuwMUYhIo?start=120&end=130", "youtu.be/oy_ZYPJ_i4U?start=130&end=140"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/03p19w", + "name": "Jackhammer", + "description": "The sound of a pneumatic or electro-mechanical tool that combines a hammer directly with a chisel, usually used to break up rock, pavement, and concrete.", + "citation_uri": "http://en.wikipedia.org/wiki/Jackhammer", + "positive_examples": ["youtu.be/D8ZGG2r9rw4?start=70&end=80", "youtu.be/9PXlmLwYdSU?start=30&end=40", "youtu.be/2OVIxKp0tDY?start=30&end=40", "youtu.be/4qWfY_x6hpA?start=20&end=30", "youtu.be/2_MZIyjeHkg?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01b82r", + "name": "Sawing", + "description": "Sounds of a tool consisting of a tough blade, wire, or chain with a hard toothed edge, used to cut through material, most often wood. Includes both manual and motorized saws.", + "citation_uri": "http://en.wikipedia.org/wiki/Saw", + "positive_examples": ["youtu.be/2cA6U33DhmU?start=21&end=31", "youtu.be/IIrxuKxXgbI?start=50&end=60", "youtu.be/4UhoP_5tq-8?start=40&end=50", "youtu.be/CZbwyN1DAUU?start=30&end=40", "youtu.be/dSdfeqU-CHE?start=390&end=400", "youtu.be/tHaynxVvo0k?start=30&end=40", "youtu.be/KkO1_TZkPmA?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02p01q", + "name": "Filing (rasp)", + "description": "Sounds of a common wood- and metal-working tool used to remove fine amounts of material from a workpiece.", + "citation_uri": "http://en.wikipedia.org/wiki/File_(tool)", + "positive_examples": ["youtu.be/WvLRozv0VO8?start=480&end=490", "youtu.be/xobSVmWsS3M?start=230&end=240", "youtu.be/bmlAZ217IeQ?start=390&end=400", "youtu.be/QsuVk4wgxVA?start=210&end=220", "youtu.be/1lkHxuNt0ys?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/023vsd", + "name": "Sanding", + "description": "The sounds of a surface being scraped by an abrasive material affixed to sheets of paper or cloth, either to make the surface smoother, remove a layer of material, or sometimes to make the surface rougher.", + "citation_uri": "http://en.wikipedia.org/wiki/Sandpaper", + "positive_examples": ["youtu.be/5f2ygkdBhXQ?start=180&end=190", "youtu.be/gq3mrdhrUco?start=370&end=380", "youtu.be/6KPXNKlaAo8?start=60&end=70", "youtu.be/6SjftAv04VM?start=440&end=450", "youtu.be/l4raHxFuVIU?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0_ksk", + "name": "Power tool", + "description": "Sounds of tools that employ a power source other than the manual labor used with hand tools. 
The most common types of power tools use electric motors.", + "citation_uri": "http://en.wikipedia.org/wiki/Power_tool", + "positive_examples": ["youtu.be/xcqLleK10js?start=50&end=60", "youtu.be/awcVolHHBHA?start=100&end=110", "youtu.be/AVst_gI8r0Y?start=130&end=140", "youtu.be/c7WBUf2Rslg?start=300&end=310", "youtu.be/iXJ6cCkogvA?start=130&end=140", "youtu.be/w7Ar6TrvBiE?start=460&end=470", "youtu.be/ggP0GzUehZ8?start=100&end=110", "youtu.be/pPbaKEYpP-4?start=60&end=70", "youtu.be/oQA78VCxa2U?start=60&end=70"], + "child_ids": ["/m/01d380"], + "restrictions": [] + }, + { + "id": "/m/01d380", + "name": "Drill", + "description": "Sounds of a tool used for making holes in various materials via a rotating drill bit.", + "citation_uri": "http://en.wikipedia.org/wiki/Drill", + "positive_examples": ["youtu.be/2hYNQDynVrY?start=20&end=30", "youtu.be/g2Q3U2nID1o?start=50&end=60", "youtu.be/0zfWGqDMT04?start=60&end=70", "youtu.be/g_tNcWk39d8?start=20&end=30", "youtu.be/MDWok2Z9eK8?start=100&end=110", "youtu.be/-XoJN3kzimY?start=90&end=100"], + "child_ids": ["/m/08j51y"], + "restrictions": [] + }, + { + "id": "/m/014zdl", + "name": "Explosion", + "description": "The sound of a rapid increase in volume and release of energy in an extreme manner, usually with the generation of high temperatures and the release of gases.", + "citation_uri": "http://en.wikipedia.org/wiki/Explosion", + "positive_examples": ["youtu.be/PrLxgTuUFMM?start=30&end=40", "youtu.be/kQpyLZEenDs?start=280&end=290", "youtu.be/ir1XTdyt4IY?start=18&end=28", "youtu.be/T_ZCW1gW9MY?start=21&end=31", "youtu.be/V6oQYxXbUGs?start=30&end=40", "youtu.be/52P29_VwBT0?start=30&end=40", "youtu.be/1LYnGafjSfs?start=30&end=40", "youtu.be/BG7QfANwqsw?start=170&end=180"], + "child_ids": ["/m/032s66", "/m/0g6b5", "/m/07qsvvw", "/m/07pxg6y", "/m/07qqyl4"], + "restrictions": [] + }, + { + "id": "/m/032s66", + "name": "Gunshot, gunfire", + "description": "The sound of the discharge of a firearm, or multiple such discharges.", + "citation_uri": "http://en.wikipedia.org/wiki/Gunshot", + "positive_examples": ["youtu.be/--PG66A3lo4?start=80&end=90", "youtu.be/PKEhOxE-Ovs?start=130&end=140", "youtu.be/c9030y4sJo0?start=140&end=150", "youtu.be/6slrju_ar9U?start=290&end=300", "youtu.be/K1cnDXbkPu0?start=170&end=180", "youtu.be/AjIQf3HK_Vc?start=130&end=140", "youtu.be/klCJfirqUF8?start=30&end=40", "youtu.be/Ma65O2T_hN0?start=10&end=20", "youtu.be/-Ho5tDtuah0?start=50&end=60"], + "child_ids": ["/m/04zjc", "/m/02z32qm", "/m/0_1c", "/m/073cg4"], + "restrictions": [] + }, + { + "id": "/m/04zjc", + "name": "Machine gun", + "description": "Sounds of a fully automatic mounted or portable firearm, designed to fire bullets in quick succession from an ammunition belt or magazine, typically at a rate of 300 to 1800 rounds per minute.", + "citation_uri": "http://en.wikipedia.org/wiki/Machine_gun", + "positive_examples": ["youtu.be/CMsuBmtxj1c?start=50&end=60", "youtu.be/DFVVjacIPJc?start=40&end=50", "youtu.be/HrQ_2pDcXgk?start=200&end=210", "youtu.be/YGzn9Xn-hxU?start=540&end=550"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/02z32qm", + "name": "Fusillade", + "description": "The sound of the simultaneous and continuous firing of a group of firearms on command.", + "citation_uri": "http://en.wikipedia.org/wiki/Fusillade", + "positive_examples": ["youtu.be/7ZrDUuR6jhQ?start=50&end=60"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0_1c", + "name": "Artillery fire", + "description": "The sound of large military weapons built to fire 
munitions far beyond the range and power of infantry's small arms.", + "citation_uri": "http://en.wikipedia.org/wiki/Artillery", + "positive_examples": ["youtu.be/PqXCkCGMd_Y?start=30&end=40", "youtu.be/pEDafzZbNwE?start=30&end=40", "youtu.be/RaSAyTj1oQ8?start=22&end=32", "youtu.be/SoDJkvxOYKA?start=30&end=40", "youtu.be/ZtK9HZGMyDo?start=30&end=40", "youtu.be/7G0L7iR1J30?start=30&end=40", "youtu.be/v6BTFTJnkN8?start=30&end=40", "youtu.be/DmfwH6q4nSA?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/073cg4", + "name": "Cap gun", + "description": "The sound of a toy gun that creates a loud sound simulating a gunshot and a puff of smoke when a small percussion cap is exploded.", + "citation_uri": "http://en.wikipedia.org/wiki/Cap_gun", + "positive_examples": ["youtu.be/0Hnrufry54I?start=11&end=21", "youtu.be/a4nkA0NL4Hw?start=0&end=9", "youtu.be/s14pZzKGsdE?start=30&end=40", "youtu.be/FTPLIbmvoSs?start=140&end=150"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0g6b5", + "name": "Fireworks", + "description": "Sounds of a class of low explosive pyrotechnic devices used for aesthetic and entertainment purposes.", + "citation_uri": "http://en.wikipedia.org/wiki/Fireworks", + "positive_examples": ["youtu.be/cWgHxozrrVg?start=30&end=40", "youtu.be/PtNDWxRjBwU?start=90&end=100", "youtu.be/e6GFJxUv6WQ?start=450&end=460", "youtu.be/3Eexn6Q-S0g?start=4&end=14", "youtu.be/Y0P1S-mEMR4?start=180&end=190", "youtu.be/AW8HjiOEwjk?start=20&end=30", "youtu.be/SFdFVOilebk?start=10&end=20", "youtu.be/JUBUL74vtjs?start=270&end=280", "youtu.be/H8K9xauYZSM?start=340&end=350"], + "child_ids": ["/g/122z_qxw"], + "restrictions": [] + }, + { + "id": "/g/122z_qxw", + "name": "Firecracker", + "description": "The sound of a small explosive device primarily designed to produce a large amount of noise, especially in the form of a loud bang.", + "citation_uri": "http://en.wikipedia.org/wiki/Firecracker", + "positive_examples": ["youtu.be/HA1QdosEauQ?start=230&end=240", "youtu.be/H6xfwBVAk8k?start=10&end=20", "youtu.be/2He2w7av5Zc?start=50&end=60", "youtu.be/eUIUHMSS870?start=20&end=30", "youtu.be/AfSs1ULO6so?start=40&end=50", "youtu.be/7_GIrU0H3FA?start=290&end=300"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qsvvw", + "name": "Burst, pop", + "description": "The sound of an enclosure holding gas or liquid coming open suddenly and violently.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=burst", + "positive_examples": ["youtu.be/RpFDag_WTi4?start=30&end=40", "youtu.be/s-tz_tCHzWo?start=10&end=20", "youtu.be/Ozzhq8roq54?start=15&end=25", "youtu.be/MXOe4NZQCvg?start=8&end=18"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pxg6y", + "name": "Eruption", + "description": "The sound of a volcano violently ejecting ash and lava, or a similar event in which large volumes of material are explosively released.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=eruption", + "positive_examples": ["youtu.be/2b4c8PGpqhI?start=30&end=40", "youtu.be/0bD6alaQ8p4?start=540&end=550"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qqyl4", + "name": "Boom", + "description": "A deep prolonged loud noise.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=boom", + "positive_examples": ["youtu.be/txZ0Ka2JQxo?start=60&end=70", "youtu.be/j2iXdCbRD1w?start=110&end=120", "youtu.be/CrurGAJGFEo?start=570&end=580", "youtu.be/mPA-o61cmsU?start=530&end=540", "youtu.be/bdLPcIBg1jQ?start=70&end=80", 
"youtu.be/HJrMrPOFwMw?start=270&end=280", "youtu.be/rRft1-LuYFE?start=370&end=380", "youtu.be/EUk1c1LphAU?start=130&end=140"], + "child_ids": ["/m/0193bn"], + "restrictions": [] + }, + { + "id": "/m/0193bn", + "name": "Sonic boom", + "description": "The sound associated with the shock waves created by an object traveling through the air faster than the speed of sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Sonic_boom", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/083vt", + "name": "Wood", + "description": "Sounds associated with the porous and fibrous structural tissue found in the stems and roots of trees, and other woody plants.", + "citation_uri": "http://en.wikipedia.org/wiki/Wood", + "positive_examples": ["youtu.be/6uw4VuJaiyU?start=200&end=210", "youtu.be/0xiCiydk7Z8?start=50&end=60", "youtu.be/ZiWvhIbB7JU?start=110&end=120", "youtu.be/Hy2zg9K4iWw?start=230&end=240", "youtu.be/Dc7Uago6dTo?start=120&end=130", "youtu.be/DPpeWsDmMqM?start=430&end=440"], + "child_ids": ["/m/07pczhz", "/m/07pl1bw", "/m/07qs1cx", "/m/07pc8l3"], + "restrictions": [] + }, + { + "id": "/m/07pczhz", + "name": "Chop", + "description": "The sound of cutting a solid material such as wood into smaller pieces by using an ax or other hacking tool.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=chop", + "positive_examples": ["youtu.be/TgnBMRMoqiU?start=30&end=40", "youtu.be/GSNkf-1GPSQ?start=30&end=40", "youtu.be/RswPwZBfRi4?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pl1bw", + "name": "Splinter", + "description": "The sound of a rigid fibrous material, typically wood, breaking up into splinters or slivers.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=splinter", + "positive_examples": ["youtu.be/KXhwErBuMgY?start=30&end=40", "youtu.be/2zKFMrXOIIM?start=30&end=40", "youtu.be/1vTcgIFYqS0?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qs1cx", + "name": "Crack", + "description": "Abrupt sounds of individual ruptures in a rigid substance under stress.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=crack", + "positive_examples": ["youtu.be/7uOf0j0P8HU?start=30&end=40", "youtu.be/RnwN5cfvtm4?start=30&end=40", "youtu.be/3bGqkpfyk6k?start=230&end=240", "youtu.be/oRINjxOZKt8?start=150&end=160"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pc8l3", + "name": "Snap", + "description": "The sound of a thin and rigid element breaking suddenly and abruptly.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=snap", + "positive_examples": ["youtu.be/NuMmJD2JrBI?start=30&end=40", "youtu.be/aJP8tmE_77E?start=3&end=13", "youtu.be/Cw1OYx8S4-Y?start=30&end=40", "youtu.be/FCkRdsB6a-U?start=0&end=9", "youtu.be/GxtlETpVSYU?start=8&end=18"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/039jq", + "name": "Glass", + "description": "Sounds associated with the non-crystalline amorphous solid that is often transparent and has widespread practical, technological, and decorative uses.", + "citation_uri": "http://en.wikipedia.org/wiki/Glass", + "positive_examples": ["youtu.be/PN10dchsywY?start=70&end=80", "youtu.be/LBbNXGVJCWw?start=110&end=120", "youtu.be/uN7Hp-Yk8OQ?start=330&end=340", "youtu.be/VWPIFiqTHWs?start=180&end=190", "youtu.be/n3cOIGJUlPQ?start=530&end=540", "youtu.be/723KuaTtsVc?start=50&end=60", "youtu.be/E1QaQHjCAv4?start=230&end=240", "youtu.be/qlnoKIMMWm4?start=110&end=120", 
"youtu.be/LT5wHi8Atmg?start=70&end=80"], + "child_ids": ["/m/07q7njn", "/m/07rn7sz"], + "restrictions": [] + }, + { + "id": "/m/07q7njn", + "name": "Chink, clink", + "description": "A short light metallic sound.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=chink", + "positive_examples": ["youtu.be/MtzyuvdvfpQ?start=2&end=12", "youtu.be/N8YbmQ_YvW4?start=30&end=40", "youtu.be/aWsU1yATn5U?start=8&end=18", "youtu.be/5YhaZSsAQ6M?start=70&end=80", "youtu.be/8-qDda-c6yo?start=30&end=40", "youtu.be/8efvnCaFuzQ?start=17&end=27"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rn7sz", + "name": "Shatter", + "description": "The sound of a brittle rigid substance rapidly breaking into many pieces.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=shatter", + "positive_examples": ["youtu.be/rBUWlPPHunk?start=50&end=60", "youtu.be/gkac73Rq7dU?start=350&end=360", "youtu.be/PDtzyKfzAc8?start=180&end=190", "youtu.be/_O0n0l_tL1Q?start=150&end=160", "youtu.be/1d4ASeSEXVI?start=220&end=230"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04k94", + "name": "Liquid", + "description": "Sounds of a fluid that conforms to the shape of its container but retains a constant volume independent of pressure.", + "citation_uri": "http://en.wikipedia.org/wiki/Liquid", + "positive_examples": ["youtu.be/5ntpW4iXtP8?start=70&end=80", "youtu.be/ES8lAqq8cSg?start=270&end=280", "youtu.be/EvNPF1oGjPQ?start=10&end=20", "youtu.be/vKLoUE_DBVQ?start=230&end=240", "youtu.be/1QkOu15rjXM?start=40&end=50", "youtu.be/EyOMSnIc9SY?start=440&end=450", "youtu.be/qtKiqaRZ9x8?start=10&end=20"], + "child_ids": ["/m/07rrlb6", "/m/07qlwh6", "/m/07r5v4s", "/m/07prgkl", "/m/07p7b8y", "/m/07qlf79", "/m/07ptzwd", "/m/07ptfmf", "/m/0dv3j"], + "restrictions": [] + }, + { + "id": "/m/07rrlb6", + "name": "Splash, splatter", + "description": "The sound of a liquid colliding with a hard surface.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=splash", + "positive_examples": ["youtu.be/rFF12MllDhg?start=210&end=220", "youtu.be/whfUQXL3luQ?start=10&end=20", "youtu.be/IOwEFjW1ReU?start=50&end=60", "youtu.be/S-p4PzFB_mI?start=20&end=30", "youtu.be/XdSCT_cQDbE?start=10&end=20", "youtu.be/sW4wBBH37qA?start=10&end=20"], + "child_ids": ["/m/07p6mqd"], + "restrictions": [] + }, + { + "id": "/m/07p6mqd", + "name": "Slosh", + "description": "The sound of small waves of liquid held in a container breaking against the edges.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=slosh", + "positive_examples": ["youtu.be/1vvxtLJFbzk?start=140&end=150", "youtu.be/0D_D7pd2Auk?start=210&end=220", "youtu.be/0y5ZMfoo3EI?start=30&end=40", "youtu.be/8BmJ_BAnf-k?start=100&end=110", "youtu.be/i_IwowmdAOg?start=30&end=40", "youtu.be/5CjCEFSCFuw?start=30&end=40", "youtu.be/DlMgXSJq1vg?start=180&end=190"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qlwh6", + "name": "Squish", + "description": "Sounds made by a liquid-saturated material being squeezed.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=squish", + "positive_examples": ["youtu.be/3EmVk33-7ng?start=30&end=40", "youtu.be/H8COAreN7CI?start=30&end=40", "youtu.be/1oH7EmcORMk?start=30&end=40", "youtu.be/Wf2-FDI22Xg?start=220&end=230", "youtu.be/0Lr0d7HoFRQ?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r5v4s", + "name": "Drip", + "description": "The sound of liquid falling in drops.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=drip", + 
"positive_examples": ["youtu.be/egSo8OqYcak?start=190&end=200", "youtu.be/RiIo5obGWgQ?start=70&end=80", "youtu.be/WUOXH-9cfbM?start=30&end=40", "youtu.be/mqSklGvTBo0?start=60&end=70"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07prgkl", + "name": "Pour", + "description": "The sound of liquid being transferred in a steady stream.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=pour", + "positive_examples": ["youtu.be/vSwJupCXq6g?start=10&end=20", "youtu.be/klF5jHsp2rs?start=150&end=160", "youtu.be/7NBoEcVv0U8?start=70&end=80", "youtu.be/_WHA4Awjq1Q?start=200&end=210", "youtu.be/IAP0Hr4zJcE?start=570&end=580", "youtu.be/lIWoEaiHEV8?start=380&end=390", "youtu.be/qavftbteuzU?start=570&end=580", "youtu.be/laey-lg_NQ4?start=470&end=480"], + "child_ids": ["/m/07pqc89", "/t/dd00088"], + "restrictions": [] + }, + { + "id": "/m/07pqc89", + "name": "Trickle, dribble", + "description": "The sound of a liquid flowing slowly in an unsteady stream or drops.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=trickle", + "positive_examples": ["youtu.be/KatU7SuscEU?start=130&end=140", "youtu.be/5-26yY_4M6c?start=590&end=600", "youtu.be/zMEi4gB_m_0?start=30&end=40", "youtu.be/j8TblDlV7ms?start=120&end=130"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00088", + "name": "Gush", + "description": "The sound of a liquid coming out of a pipe or other opening rapidly and in large volumes.", + "citation_uri": "", + "positive_examples": ["youtu.be/nb-qnjlexk4?start=200&end=210", "youtu.be/es8e2wKAyAg?start=110&end=120"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07p7b8y", + "name": "Fill (with liquid)", + "description": "The sound of a container being filled with liquid.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=fill", + "positive_examples": ["youtu.be/XDEoGiMbmtc?start=270&end=280", "youtu.be/ZoW4eL72fYA?start=210&end=220", "youtu.be/KRXChlhyrrw?start=120&end=130", "youtu.be/A8up3toh8Js?start=250&end=260", "youtu.be/ZM-nZ8Yh9b0?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qlf79", + "name": "Spray", + "description": "The sound of a liquid being transferred in a scatter or jet of droplets.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=spray", + "positive_examples": ["youtu.be/YDbxVJEFyvI?start=14&end=24", "youtu.be/wKByBE9Gy_8?start=500&end=510", "youtu.be/yq6m58KXxcs?start=30&end=40", "youtu.be/FYW1xkJBWBE?start=10&end=20", "youtu.be/UTsDmkLU7Og?start=30&end=40", "youtu.be/04EocSVWMoM?start=30&end=40", "youtu.be/6sYf3-kmw0k?start=30&end=40", "youtu.be/nsEhTV5-e2s?start=27&end=37"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07ptzwd", + "name": "Pump (liquid)", + "description": "The sound of a liquid being drawn or poured with a pump.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=pump", + "positive_examples": ["youtu.be/7bhWazNlIAY?start=370&end=380", "youtu.be/yQLgFJ3T51Q?start=180&end=190", "youtu.be/8W7uDxY5nB4?start=30&end=40", "youtu.be/k2CCMOJ2ubY?start=90&end=100", "youtu.be/sfkMAdEyB4c?start=140&end=150", "youtu.be/hZ9a8YVFOmo?start=30&end=40", "youtu.be/5Fc4wjBLQeM?start=300&end=310", "youtu.be/JGmCVlieEj0?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07ptfmf", + "name": "Stir", + "description": "The sound of moving an implement through a fluid substance.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=stir", + "positive_examples": 
["youtu.be/USAby5e7hwI?start=30&end=40", "youtu.be/5hUK9ubSMZU?start=300&end=310", "youtu.be/KHSsK-2wDIo?start=240&end=250", "youtu.be/6DXtiywoWyQ?start=300&end=310", "youtu.be/3AmLt4xnIg8?start=200&end=210", "youtu.be/cNcj8z_wv7E?start=270&end=280"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0dv3j", + "name": "Boiling", + "description": "The sound of a liquid that is reaching the temperature at which it transitions to a vapor, resulting in bubbles emerging at the surface.", + "citation_uri": "http://en.wikipedia.org/wiki/Boiling", + "positive_examples": ["youtu.be/inTsmf3BslU?start=180&end=190", "youtu.be/PPadmzjMODM?start=30&end=40", "youtu.be/7asRT4sLUMw?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00089", + "name": "Miscellaneous sources", + "description": "Portmanteau class for sound categories that clearly suggest specific sources not covered in the other categories.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/0790c", "/m/04179zz", "/m/0dl83", "/t/dd00091"], + "restrictions": ["abstract"] + }, + { + "id": "/m/0790c", + "name": "Sonar", + "description": "Sounds associated with underwater sensing via acoustic signals. A classic sonar sound involves reverberant reflections of an emitted tone pip.", + "citation_uri": "http://en.wikipedia.org/wiki/Sonar", + "positive_examples": ["youtu.be/_l2qH62qG-Y?start=30&end=40", "youtu.be/rbN9akpEJh4?start=40&end=50", "youtu.be/f47LeLK_ucU?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/04179zz", + "name": "Duck call (hunting tool)", + "description": "The sound of a tool which imitates the communication calls of waterfowl and is used primarily by hunters to lure prey.", + "citation_uri": "http://en.wikipedia.org/wiki/Duck_call", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0dl83", + "name": "Arrow", + "description": "Sounds of a shafted projectile that is shot with a bow.", + "citation_uri": "http://en.wikipedia.org/wiki/Arrow", + "positive_examples": ["youtu.be/STeG7lU2seQ?start=70&end=80", "youtu.be/uFROlibANvw?start=40&end=50", "youtu.be/AAdoBK3LG_s?start=110&end=120", "youtu.be/SgZSBDALO1k?start=210&end=220", "youtu.be/sb4VwqX1Wss?start=130&end=140", "youtu.be/lPbA3GnfWDQ?start=30&end=40", "youtu.be/KP74BeNnPjU?start=60&end=70", "youtu.be/FfMwEVBCmaQ?start=360&end=370"], + "child_ids": ["/m/07rqsjt", "/m/07qnq_y", "/m/07pk7mg"], + "restrictions": [] + }, + { + "id": "/m/07rqsjt", + "name": "Whoosh, swoosh, swish", + "description": "A sibilant sound caused by a rigid object sweeping through the air.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=whoosh", + "positive_examples": ["youtu.be/jRocgDvVV6U?start=30&end=40", "youtu.be/-xuq3kOwGGw?start=30&end=40", "youtu.be/HuoE9pT4DFw?start=0&end=5", "youtu.be/DdMAiFT2dIY?start=30&end=40", "youtu.be/1SDz3QpoPRk?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qnq_y", + "name": "Thump, thud", + "description": "A heavy dull sound, as made by impact of heavy objects with a relatively soft point of contact.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=thump", + "positive_examples": ["youtu.be/t6rniZNuoM4?start=60&end=70", "youtu.be/t4Q8i_J-Fos?start=50&end=60", "youtu.be/gNMsHIMAHXs?start=280&end=290", "youtu.be/StjeR1EIv0A?start=410&end=420", "youtu.be/b4a1SE8pJtw?start=150&end=160", "youtu.be/cx8hGMuV5Ok?start=420&end=430", "youtu.be/NLilfM4RVOA?start=50&end=60", 
"youtu.be/_FDKLplMbf4?start=320&end=330"], + "child_ids": ["/m/07rrh0c", "/t/dd00108"], + "restrictions": [] + }, + { + "id": "/m/07rrh0c", + "name": "Thunk", + "description": "A partly dull hollow sound with a damped resonance.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=thunk", + "positive_examples": ["youtu.be/Ze1hkRWoz-M?start=110&end=120", "youtu.be/txiZk4ap_4c?start=100&end=110", "youtu.be/e57je4YO9cM?start=230&end=240", "youtu.be/DPI_-objvnY?start=110&end=120", "youtu.be/Q4mvcscgRdA?start=30&end=40", "youtu.be/HzQphRe4bgM?start=130&end=140", "youtu.be/qD9Uvxrq63U?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00108", + "name": "Clunk", + "description": "A predominantly low-frequency impact noise that starts with a cleaner, high-frequency click, suggesting the contact of rigid surfaces (as of a car door closing).", + "citation_uri": "", + "positive_examples": ["youtu.be/x-dLBbkPEKA?start=26&end=36", "youtu.be/fdD2UwvJdC4?start=20&end=30", "youtu.be/p-nnE3kbpCQ?start=30&end=40", "youtu.be/KULfVYWeYgw?start=30&end=40", "youtu.be/d3-_V1rijLo?start=20&end=30", "youtu.be/otLsSpCumO8?start=30&end=40", "youtu.be/aQ7m0KUB0xY?start=30&end=40", "youtu.be/HTQ7u3pcdYs?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pk7mg", + "name": "Wobble", + "description": "The sound of a structure shaking unsteadily back and forth, as in response to an impact.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=wobble", + "positive_examples": ["youtu.be/AesYVC3ej1g?start=340&end=350"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00091", + "name": "Sound equipment", + "description": "Sounds that suggest the involvement of electronic audio processing equipment.", + "citation_uri": "", + "positive_examples": ["youtu.be/a_eC1dvlFyQ?start=60&end=70", "youtu.be/yZJCg_RBPpg?start=290&end=300", "youtu.be/wJsvxEHIUwc?start=280&end=290", "youtu.be/UEDM20GlRlA?start=10&end=20", "youtu.be/oEr9yy3dx-w?start=100&end=110"], + "child_ids": ["/m/0hg7b", "/m/0b_fwt", "/m/01vfsf", "/m/02rr_"], + "restrictions": ["abstract"] + }, + { + "id": "/m/0hg7b", + "name": "Microphone", + "description": "Sounds whose origin lie with the microphone used in the recording, rather than the external sound source that was intended to be recorded.", + "citation_uri": "http://en.wikipedia.org/wiki/Microphone", + "positive_examples": [], + "child_ids": ["/t/dd00092"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0b_fwt", + "name": "Electronic tuner", + "description": "Sounds produced by an electronic device whose purpose is to aid in the tuning of musical instruments.", + "citation_uri": "http://en.wikipedia.org/wiki/Electronic_tuner", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01vfsf", + "name": "Guitar amplifier", + "description": "Sounds specifically associated with an audio amplifier intended for use with an electric guitar, such as overdrive distortion.", + "citation_uri": "http://en.wikipedia.org/wiki/Guitar_amplifier", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/02rr_", + "name": "Effects unit", + "description": "Sounds associated with an electronic device that alters how a musical instrument or other audio source sounds. 
Some effects subtly \"color\" a sound while others transform it dramatically.", + "citation_uri": "http://en.wikipedia.org/wiki/Effects_unit", + "positive_examples": ["youtu.be/aAmyLNfCbyo?start=580&end=590", "youtu.be/7lTBoMcuy10?start=190&end=200", "youtu.be/ND-Fs2VaFL0?start=30&end=40", "youtu.be/8FiXfyDdaZo?start=160&end=170", "youtu.be/thvxJisK52g?start=50&end=60", "youtu.be/PFr8GwUVzfw?start=300&end=310", "youtu.be/GsAUcq1V2HY?start=30&end=40", "youtu.be/UMkFu4MsDjc?start=160&end=170"], + "child_ids": ["/m/07m2kt"], + "restrictions": [] + }, + { + "id": "/m/07m2kt", + "name": "Chorus effect", + "description": "The ensemble sound produced by an electronic effects unit which outputs simultaneous versions of the input source with small variations in pitch and timbre.", + "citation_uri": "http://en.wikipedia.org/wiki/Chorus_effect", + "positive_examples": ["youtu.be/QK200gvvi68?start=60&end=70", "youtu.be/XXzpCKOVswg?start=200&end=210", "youtu.be/vKNGqQ3GRB8?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00133", + "name": "Specific impact sounds", + "description": "Transient contact sounds that are strongly suggestive of the particular objects involved in the collision.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/018w8"], + "restrictions": ["abstract"] + }, + { + "id": "/m/018w8", + "name": "Basketball bounce", + "description": "The sound of a basketball bouncing on a hard surface, such as a basketball court.", + "citation_uri": "http://en.wikipedia.org/wiki/Basketball", + "positive_examples": ["youtu.be/-64NRxM2YLg?start=60&end=70", "youtu.be/2Tg8LHzMOgs?start=150&end=160", "youtu.be/WXtRxqxRIHQ?start=50&end=60", "youtu.be/zjxL61AXNTI?start=60&end=70", "youtu.be/ZGYGusrK84E?start=220&end=230", "youtu.be/GUSqE3a1P3I?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00098", + "name": "Source-ambiguous sounds", + "description": "Portmanteau class for sounds that do not immediately suggest specific source objects, but which are more likely to be perceived and described according to their acoustic properties.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/t/dd00099", "/t/dd00109", "/t/dd00110", "/m/05n1m", "/m/028v0c", "/t/dd00122"], + "restrictions": ["abstract"] + }, + { + "id": "/t/dd00099", + "name": "Generic impact sounds", + "description": "Sounds of impacts or collisions preferentially described by their acoustic properties, rather than by immediately referring to their assumed source object(s).", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/07pws3f", "/m/07ryjzk", "/m/07rdhzs", "/m/07pjjrj", "/m/07pc8lb", "/m/07pqn27", "/m/07r4wb8", "/m/07qcpgn", "/m/07qnq_y", "/m/07rbp7_", "/m/07pyf11"], + "restrictions": ["abstract"] + }, + { + "id": "/m/07pws3f", + "name": "Bang", + "description": "A brief and loud noise.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=bang", + "positive_examples": ["youtu.be/0zsRRg6NZDQ?start=21&end=31", "youtu.be/ayhkvgG1Cvw?start=210&end=220", "youtu.be/Oe6sYXWAM7E?start=90&end=100", "youtu.be/rhG_ie69Nwc?start=300&end=310"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07ryjzk", + "name": "Slap, smack", + "description": "The sharp sound of a flat object striking another (e.g., an open hand hitting a face).", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=slap", + "positive_examples": ["youtu.be/-gfaj2Evqus?start=290&end=300", "youtu.be/KDQV3eSkhZ4?start=70&end=80", 
"youtu.be/toWBIJlu2NM?start=80&end=90", "youtu.be/yjEZx7TvwcA?start=60&end=70", "youtu.be/kasvqBhDBC0?start=240&end=250", "youtu.be/ls_QsNFNDbg?start=10&end=20", "youtu.be/v-Al9LqV9Mo?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rdhzs", + "name": "Whack, thwack", + "description": "The sound of a partially rigid object forcefully striking another object, which may be preceded by the sound of the rapid, driven motion of one object (as of a baseball bat hitting a ball).", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=whack", + "positive_examples": ["youtu.be/redtHVmxs7s?start=30&end=40", "youtu.be/dVi2QoAYsAQ?start=180&end=190", "youtu.be/aB5CD4u16gM?start=40&end=50", "youtu.be/zruJV_pq4z0?start=270&end=280", "youtu.be/e-lOchEQ1i0?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pjjrj", + "name": "Smash, crash", + "description": "The sound of a rigid structure violently breaking into many pieces.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=smash", + "positive_examples": ["youtu.be/-3MV6T3u9Ig?start=30&end=40", "youtu.be/-DTgp1cbyug?start=0&end=2", "youtu.be/Ywc4_D3sKHQ?start=30&end=40", "youtu.be/PBTPWOf_E-w?start=250&end=260", "youtu.be/c0nVFint5bk?start=160&end=170", "youtu.be/5lLMWk_-fQw?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pc8lb", + "name": "Breaking", + "description": "The sound of a rigid object become separated into pieces or fragments.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=break", + "positive_examples": ["youtu.be/frZUEXzWE5Q?start=30&end=40", "youtu.be/1SAdEjaSf3g?start=6&end=16", "youtu.be/mKTTCj3WrLg?start=30&end=40", "youtu.be/LF-070rGEbQ?start=230&end=240", "youtu.be/FeaKHpQ4UhE?start=90&end=100", "youtu.be/wyv4blKX3No?start=390&end=400"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pqn27", + "name": "Bouncing", + "description": "The sound of an elastic, non-destructive collision in which one or both objects spring back. 
Depending on the situation, there may be multiple such collisions in a characteristic pattern, as of a ball bouncing.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=bounce", + "positive_examples": ["youtu.be/Tk2v1UaTgmk?start=40&end=50", "youtu.be/Od85JwO7ChQ?start=10&end=20", "youtu.be/FSVfPifghQ4?start=50&end=60", "youtu.be/00XaUxjGuX8?start=170&end=180", "youtu.be/BKnuN5hoZnM?start=40&end=50", "youtu.be/QErefA2Ms4c?start=390&end=400", "youtu.be/tHcW8l5ifLQ?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rbp7_", + "name": "Whip", + "description": "The sound of whipping, i.e., the greatly accelerated motion of the tip of a flexible structure, as the result of concentrated angular momentum.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=whip", + "positive_examples": ["youtu.be/LhbzdtNRTKs?start=30&end=40", "youtu.be/ivoEsA_OErY?start=30&end=40", "youtu.be/9qMbQFMHA5g?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pyf11", + "name": "Flap", + "description": "The noisy sound of a flexible surface, such as a flag, moving back-and-forth in the air and optionally coming into contact with an adjacent hard surface.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=flap", + "positive_examples": ["youtu.be/-1MpcvB9PAI?start=30&end=40", "youtu.be/M3301jxTkto?start=5&end=15", "youtu.be/7dIIVzcYhD4?start=310&end=320"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00109", + "name": "Surface contact", + "description": "Sounds produced by a more extended interaction between two objects or surfaces instead of a collision or bounce.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/07qb_dv", "/m/07qv4k0", "/m/07pdjhy", "/m/07s8j8t", "/m/07pt6mm"], + "restrictions": ["abstract"] + }, + { + "id": "/m/07qb_dv", + "name": "Scratch", + "description": "The sound of a hard, sharp point of contact leaving a permanent groove in a hard surface.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=scratch", + "positive_examples": ["youtu.be/tgHySwhil9s?start=50&end=60", "youtu.be/Y9R0M2gQE04?start=30&end=40", "youtu.be/yUEDrDKIYV0?start=450&end=460", "youtu.be/7GapruwcCO0?start=300&end=310", "youtu.be/NG_Pa0_Rc74?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qv4k0", + "name": "Scrape", + "description": "The sound of a hard edge dragging across a surface.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=scrape", + "positive_examples": ["youtu.be/CDXoMKXrn58?start=30&end=40", "youtu.be/SAVRtzEodx4?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pdjhy", + "name": "Rub", + "description": "The sound of two surfaces moving smoothly over each other with pressure but without excessive friction.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=rub", + "positive_examples": ["youtu.be/LSEjPSU7wSA?start=210&end=220", "youtu.be/hngBJnxLVmQ?start=180&end=190", "youtu.be/AXsR125JDEA?start=520&end=530", "youtu.be/fHmROtp_U00?start=410&end=420", "youtu.be/MDPCZrtDl5c?start=450&end=460", "youtu.be/cLuDhdm-1tc?start=20&end=30", "youtu.be/H5nQnPCk0Uo?start=100&end=110", "youtu.be/4ugXkiWI-7A?start=25&end=35"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s8j8t", + "name": "Roll", + "description": "The sound of an object moving by turning over or rotating.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=roll", + 
"positive_examples": ["youtu.be/2TE4hAMo9EE?start=4&end=14", "youtu.be/3250CBWhvWE?start=30&end=40", "youtu.be/45fwQe0d62o?start=30&end=40", "youtu.be/Fx0tr98DzN8?start=0&end=8", "youtu.be/OoAtJAR6qto?start=0&end=5"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07pt6mm", + "name": "Grind", + "description": "The sound of a substance being reduced to small pieces or particles by pounding or abrading, or the sound of two surfaces rubbing with high friction that suggests that one or both surfaces are being disintergrated.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=grind", + "positive_examples": ["youtu.be/6jborgl8I1c?start=30&end=40", "youtu.be/KcNqclVaRCg?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00110", + "name": "Deformable shell", + "description": "Sounds that are associated with shells, rigid sheets, or other predominantly two-dimensional surfaces.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/07plct2", "/t/dd00112", "/m/07qcx4z"], + "restrictions": ["abstract"] + }, + { + "id": "/m/07plct2", + "name": "Crushing", + "description": "The sound of a brittle, rigid structure breaking into small pieces or catastrophically deforming.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=crush", + "positive_examples": ["youtu.be/-LRl10YdS-U?start=30&end=40", "youtu.be/R9aRI18jBsg?start=40&end=50", "youtu.be/0bbgeqkDm6U?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00112", + "name": "Crumpling, crinkling", + "description": "The sound of a rigid but flexible sheet, such as paper or aluminum, buckling into a smaller volume.", + "citation_uri": "", + "positive_examples": ["youtu.be/R5Sa5FuBghs?start=80&end=90", "youtu.be/A70nDKXBvd0?start=70&end=80", "youtu.be/6oxdgmtwhPo?start=50&end=60", "youtu.be/OZete7K0H9Q?start=100&end=110", "youtu.be/Oceb2JMUNWE?start=40&end=50", "youtu.be/al801VjCvn0?start=150&end=160", "youtu.be/4q8KuEMFfhA?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qcx4z", + "name": "Tearing", + "description": "The sound of a flat or smooth surface separating abruptly.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=tear", + "positive_examples": ["youtu.be/OVt4xBfulms?start=40&end=50", "youtu.be/BIluBHK8DO0?start=70&end=80", "youtu.be/oS8kziiAWTk?start=40&end=50", "youtu.be/xgrsFb35uew?start=440&end=450", "youtu.be/vwze3eMulhs?start=50&end=60"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/05n1m", + "name": "Onomatopoeia", + "description": "A class of sounds named to phonetically imitate, resemble, or suggest the sound it describes.", + "citation_uri": "http://en.wikipedia.org/wiki/Onomatopoeia", + "positive_examples": [], + "child_ids": ["/t/dd000138", "/m/07rjwbb", "/m/07qh7jl", "/m/07qn4z3", "/m/07rqsjt", "/m/07qwyj0", "/m/07s34ls", "/m/07qmpdm", "/m/07p9k1k", "/m/07qc9xj", "/m/07phhsh", "/m/07r_r9m", "/m/07qyrcz", "/m/07qfgpx", "/t/dd00118", "/m/07q34h3", "/m/07rcgpl", "/m/07qlwh6", "/m/07p78v5", "/t/dd00121", "/m/07pzfmf", "/m/07s12q4", "/m/07qs1cx", "/m/07pc8l3"], + "restrictions": ["abstract"] + }, + { + "id": "/t/dd000138", + "name": "Brief tone", + "description": "Short-duration sounds composed of strong harmonics with a distinct pitch.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/02fs_r", "/m/07qwdck", "/m/07phxs1", "/m/07rv4dm", "/m/07rpk1c", "/m/07pggtn", "/m/07pjwq1", "/m/07q6cd_", "/m/07s02z0", "/m/07q8k13"], + 
"restrictions": ["abstract"] + }, + { + "id": "/m/02fs_r", + "name": "Beep, bleep", + "description": "A short, single tone, generally made by a computer or other machine and typically high-pitched.", + "citation_uri": "http://en.wikipedia.org/wiki/Beep_(sound)", + "positive_examples": ["youtu.be/iAKUYTMv23Y?start=55&end=65", "youtu.be/EotkYg493i0?start=7&end=17", "youtu.be/XnfPKj7_y4c?start=20&end=30", "youtu.be/l5CEprbZ0OE?start=30&end=40", "youtu.be/ht4cuUz_X14?start=60&end=70", "youtu.be/Q7Ohn9lAM0s?start=8&end=18", "youtu.be/jBVlmCUGv3M?start=30&end=40", "youtu.be/YmtRXXJribE?start=170&end=180", "youtu.be/SlUOFN4uc2I?start=14&end=24"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qwdck", + "name": "Ping", + "description": "A sharp high-pitched resonant sound with a clear decay.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=ping", + "positive_examples": ["youtu.be/FiSh0F3XnxM?start=30&end=40", "youtu.be/N1ZKrgbnGAk?start=10&end=20", "youtu.be/CkfqM_atyeI?start=90&end=100", "youtu.be/Pln4GLIMqKY?start=30&end=40", "youtu.be/2f3h6QRLyDk?start=30&end=40", "youtu.be/rnNdkwGZph0?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07phxs1", + "name": "Ding", + "description": "A high-pitched ringing sound, as of a small metal object being struck.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=ding", + "positive_examples": ["youtu.be/sjW5I8bhufY?start=7&end=17", "youtu.be/9ejLc44MJCY?start=50&end=60", "youtu.be/KiByOwi-V7w?start=110&end=120", "youtu.be/lLaooX_97tM?start=120&end=130", "youtu.be/khT36zSVvtg?start=140&end=150", "youtu.be/93VprpYqN4o?start=160&end=170"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rv4dm", + "name": "Clang", + "description": "A loud, resonant, discordant noise, as of a large and partly hollow metal structure begin struck.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=clang", + "positive_examples": ["youtu.be/eG_CHAeuu0A?start=70&end=80", "youtu.be/sIdGNeRCUHg?start=30&end=40", "youtu.be/10sNKRTjv4Q?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rpk1c", + "name": "Twang", + "description": "A sharp, vibrating sound, as of a branch whipping back into shape, or a bow after releasing an arrow. 
Musical plucked strings are not included.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=twang", + "positive_examples": ["youtu.be/eqox9z6yadA?start=0&end=2", "youtu.be/xlO2Wy86Jys?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s02z0", + "name": "Squeal", + "description": "A short, high-pitched tone without a sharp attack and with a perceptible variation in pitch.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=squeal", + "positive_examples": ["youtu.be/VLcAu0p8NE4?start=210&end=220", "youtu.be/RKUGuaVaW4Q?start=230&end=240", "youtu.be/TAl6LU38u8M?start=30&end=40", "youtu.be/2gx1koonRlE?start=100&end=110"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07q8k13", + "name": "Screech", + "description": "A high-pitched noise with a rough quality resembling the cry of a human or other animal.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=screech", + "positive_examples": ["youtu.be/0nCj9XmrqGQ?start=2&end=12"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qh7jl", + "name": "Creak", + "description": "A high-pitched noise with a perceptible variation in pitch as a result of pressure being shifted or applied on a surface, most commonly on wood.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=creak", + "positive_examples": ["youtu.be/572i4h3JcnU?start=210&end=220", "youtu.be/Cl_EaIRJRtM?start=29&end=39", "youtu.be/1z1sXjQ6Szs?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qwyj0", + "name": "Rustle", + "description": "A sound of light, thin objects, such as dried leaves, rubbing against one another.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=rustle", + "positive_examples": ["youtu.be/-8vMtYGbkN0?start=30&end=40", "youtu.be/8EhFLlt7Rnk?start=30&end=40", "youtu.be/84uFe8TgQmY?start=80&end=90", "youtu.be/HmbYRIsQFRE?start=0&end=8", "youtu.be/JwA0-o35yBQ?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s34ls", + "name": "Whir", + "description": "A quiet sound of a small object, typically mechanical, in rapid, cyclic motion.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=whir", + "positive_examples": ["youtu.be/yFpW-T9dDUY?start=30&end=40", "youtu.be/Qf0_UKxpPl8?start=30&end=40", "youtu.be/FBPLX-RS0zY?start=30&end=40", "youtu.be/lXG4YpTn8pE?start=30&end=40", "youtu.be/BI4fjQBZliY?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qmpdm", + "name": "Clatter", + "description": "An irregular rattling noise, often produced by rapid movement, consisting of a cluster of transient sounds.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=clatter", + "positive_examples": ["youtu.be/3cZ1A6osRZg?start=5&end=15", "youtu.be/vrnIFkMSokk?start=17&end=27"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07p9k1k", + "name": "Sizzle", + "description": "The sound of many small bubbles bursting, most often caused by fat vaporizing as it is heated.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=sizzle", + "positive_examples": ["youtu.be/1HGzcj92CEE?start=210&end=220", "youtu.be/AqrSHCCehgY?start=30&end=40", "youtu.be/7rLP0ZG0Fps?start=390&end=400", "youtu.be/quO3yc9wxOg?start=30&end=40", "youtu.be/USAby5e7hwI?start=20&end=30", "youtu.be/pYbP2qDmtlY?start=170&end=180", "youtu.be/J1KvjY6dhOA?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qc9xj", + "name": 
"Clicking", + "description": "A short, pitchless, transient sound, as of a gentle impact between small hard objects.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=click", + "positive_examples": ["youtu.be/U58FSo_rAhI?start=370&end=380", "youtu.be/MR1pVYnWERs?start=180&end=190", "youtu.be/GkpLUcvxKSw?start=430&end=440", "youtu.be/iF8uTeM79LY?start=480&end=490", "youtu.be/7iYtmGtabDk?start=500&end=510", "youtu.be/f3nsAF4s6mY?start=450&end=460"], + "child_ids": ["/m/07qjznt", "/m/07rv9rh", "/m/07rwm0c"], + "restrictions": [] + }, + { + "id": "/m/07rwm0c", + "name": "Clickety-clack", + "description": "The sound of fast and rhythmic clicking of machinery, such as keystrokes on a typewriter or steel wheels running on tracks. Does not include the sound of horses feet (use /m/07rv9rh).", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=clickety-clack", + "positive_examples": ["youtu.be/xIKrqNqSyy4?start=13&end=23", "youtu.be/eM0Xs-NQwLM?start=460&end=470", "youtu.be/BW-0GDDoxFI?start=30&end=40", "youtu.be/1vt9phoCrdY?start=40&end=50", "youtu.be/ZiGOglchf5E?start=170&end=180", "youtu.be/4ZRm2_XWgSo?start=8&end=18"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07phhsh", + "name": "Rumble", + "description": "A loud, low-pitched, dull, continuous noise.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=rumble", + "positive_examples": ["youtu.be/EKsUrMobexk?start=60&end=70", "youtu.be/Ex56cROdjss?start=170&end=180", "youtu.be/nF3DgONcYSQ?start=30&end=40", "youtu.be/dpIMT_E53ss?start=10&end=20", "youtu.be/YhOccauNQtA?start=90&end=100", "youtu.be/O3N2oUBWUjo?start=6&end=16", "youtu.be/Br_tSpJoQSg?start=90&end=100"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07r_r9m", + "name": "Blare", + "description": "A loud harsh or strident noise, often a warning sound.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=blare", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07qyrcz", + "name": "Plop", + "description": "The sound of a small object dropping into a liquid without a splash.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=plop", + "positive_examples": ["youtu.be/1leOTk_mzdA?start=20&end=30", "youtu.be/I2uWNEkXy3o?start=200&end=210"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07qfgpx", + "name": "Jingle, tinkle", + "description": "An irregular pattern of short metallic sounds, especially from small metal objects such as keys or coins shaking against each other.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=jingle", + "positive_examples": ["youtu.be/EW3ZkpsGNBA?start=10&end=20", "youtu.be/q7oAC59AP9c?start=1&end=11", "youtu.be/rcjv5ARPGMc?start=30&end=40", "youtu.be/gS_lDm2zjak?start=0&end=10"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00118", + "name": "Fizz", + "description": "The sound of the popping of many small bubbles, most often as gas is released from solution, such as in a carbonated beverage.", + "citation_uri": "", + "positive_examples": ["youtu.be/bvEA9WionJc?start=20&end=30", "youtu.be/Cy56gEufE7E?start=170&end=180", "youtu.be/X4S4paFuHX4?start=110&end=120"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07q34h3", + "name": "Puff", + "description": "The sound of a gas, such as air, being blown forcefully in a short burst.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=puff", + "positive_examples": 
["youtu.be/igrJbMKwioI?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07rcgpl", + "name": "Hum", + "description": "A continuous, dull tone.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=hum", + "positive_examples": ["youtu.be/WxxeWEBoVys?start=30&end=40", "youtu.be/cDVmKSYc4HY?start=240&end=250", "youtu.be/0R6KIVvlyfQ?start=30&end=40", "youtu.be/tHfn-Hc-kRE?start=120&end=130"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07p78v5", + "name": "Zing", + "description": "A brief high-pitched buzzing or humming sound.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=zing", + "positive_examples": ["youtu.be/rTuTeQVEcDs?start=30&end=40", "youtu.be/F7Kf8aTJbOU?start=30&end=40", "youtu.be/FmcdlDqtrFQ?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00121", + "name": "Boing", + "description": "A bouncing or spring-like sound with a tonal component that sweeps upwards.", + "citation_uri": "", + "positive_examples": ["youtu.be/p1lMVPToEY0?start=210&end=220", "youtu.be/agLvQZD7Wr4?start=590&end=600", "youtu.be/1levp6fUO28?start=260&end=270", "youtu.be/D5fDXY7RxR0?start=20&end=30", "youtu.be/iYbA-P5Y8Rk?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07s12q4", + "name": "Crunch", + "description": "The sound of a brittle substance being crushed.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=crunch", + "positive_examples": ["youtu.be/7iKYHaoItuU?start=30&end=40", "youtu.be/Vx_5MCemF0o?start=540&end=550", "youtu.be/EvR0XweufUs?start=30&end=40", "youtu.be/9ao5CaBafF4?start=210&end=220", "youtu.be/Am1ipsgofdg?start=520&end=530", "youtu.be/k0iBnLQjdxo?start=100&end=110", "youtu.be/J6P5zKL3tu4?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/028v0c", + "name": "Silence", + "description": "The absence of audible sound or presence of sounds of very low intensity.", + "citation_uri": "http://en.wikipedia.org/wiki/Silence", + "positive_examples": ["youtu.be/Jwjk9wxUoPg?start=200&end=210", "youtu.be/JZMj1suEC4g?start=290&end=300", "youtu.be/ITOF0fX8NjY?start=200&end=210", "youtu.be/u7Mq0AX6TRU?start=200&end=210", "youtu.be/Wyoj8hAUAtc?start=290&end=300", "youtu.be/m7jdjVxj4Gc?start=30&end=40", "youtu.be/4N2FV9ZuusU?start=140&end=150", "youtu.be/KrN-TxlSkT8?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00122", + "name": "Other sourceless", + "description": "A class for sound categories not represented by other classes that are peceived by their sound instead of their source.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/m/01v_m0", "/m/0c1dj", "/m/07pt_g0", "/m/01fhw5", "/m/017gp", "/m/05szwhw"], + "restrictions": ["abstract"] + }, + { + "id": "/m/01v_m0", + "name": "Sine wave", + "description": "A continuous, pure tone of fixed pitch and amplitude.", + "citation_uri": "http://en.wikipedia.org/wiki/Sine_wave", + "positive_examples": ["youtu.be/0FOegaxNaqk?start=210&end=220", "youtu.be/Z2SmKZVU-cE?start=60&end=70", "youtu.be/kFz9rZjCFkQ?start=30&end=40", "youtu.be/iVPrW-aRC_s?start=80&end=90", "youtu.be/4mfg3_dVVaU?start=20&end=30"], + "child_ids": ["/m/0b9m1", "/m/0hdsk"], + "restrictions": [] + }, + { + "id": "/m/0b9m1", + "name": "Harmonic", + "description": "A sine tone whose frequency is an integer multiple of some fundamental tone.", + "citation_uri": "http://en.wikipedia.org/wiki/Harmonic", + "positive_examples": ["youtu.be/TWnrcjA6VTk?start=60&end=70"], 
+ "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0hdsk", + "name": "Chirp tone", + "description": "A tone whose frequency increases or decreases with time.", + "citation_uri": "http://en.wikipedia.org/wiki/Chirp", + "positive_examples": ["youtu.be/-kE7IL6pnKU?start=20&end=30"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0c1dj", + "name": "Sound effect", + "description": "An artificially created or enhanced sound used to emphasize artistic or other content of films, television shows, live performances, animation, video games, music, or other media.", + "citation_uri": "http://en.wikipedia.org/wiki/Sound_effect", + "positive_examples": ["youtu.be/-1cGAf0KSvo?start=1&end=11", "youtu.be/-25RvGqa7wQ?start=30&end=40", "youtu.be/-DoFDKsAOo4?start=30&end=40", "youtu.be/-Ia8FPe6GDE?start=0&end=6", "youtu.be/-TDBk9Unpuc?start=240&end=250", "youtu.be/-Th_4dhrDaM?start=9&end=19", "youtu.be/-bX_9jVt6hQ?start=30&end=40", "youtu.be/XgMWhtZ7ugo?start=310&end=320"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07pt_g0", + "name": "Pulse", + "description": "A sound that is modulated in short, regular bursts at approximately 1 Hz. This class is not for heartbeat sounds, use /m/01jg02.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=pulse", + "positive_examples": ["youtu.be/_CM-Opj41xk?start=160&end=170", "youtu.be/SCf5rx_OyYM?start=100&end=110", "youtu.be/dQvTJnthAV8?start=80&end=90", "youtu.be/fRwmCHUBjBQ?start=380&end=390"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01fhw5", + "name": "Infrasound", + "description": "A sound of very low frequency (e.g., below 20 Hz) such that is not perceived, or barely perceived, by human listeners.", + "citation_uri": "http://en.wikipedia.org/wiki/Infrasound", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/017gp", + "name": "Bass (frequency range)", + "description": "Low frequency sound energy in the range from 16-256 Hz.", + "citation_uri": "http://en.wikipedia.org/wiki/Bass_(sound)", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/05szwhw", + "name": "Ringing (of resonator)", + "description": "The sound of a simple linear resonator in free oscillation in response to an impulse of excitation, usually ending in an exponentially decaying sine tone.", + "citation_uri": "http://en.wikipedia.org/wiki/Ringing_(signal)", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/t/dd00123", + "name": "Channel, environment and background", + "description": "A class for sound categories that suggest information about attributes other than the foreground or target objects.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/t/dd00093", "/m/096m7z", "/m/07bm98"], + "restrictions": ["abstract"] + }, + { + "id": "/t/dd00093", + "name": "Acoustic environment", + "description": "A class for sounds that convey the spatiality of the recording.", + "citation_uri": "", + "positive_examples": [], + "child_ids": ["/t/dd00125", "/t/dd00126", "/t/dd00127", "/t/dd00128", "/t/dd00129", "/m/01b9nn", "/m/01jnbd"], + "restrictions": ["abstract"] + }, + { + "id": "/t/dd00125", + "name": "Inside, small room", + "description": "Sounds that appear to have been recorded within a small room, such as one that can accommodate no more than 15 people.", + "citation_uri": "", + "positive_examples": ["youtu.be/eqWF_TDX4NM?start=190&end=200", 
"youtu.be/rLqyFZBYlD4?start=410&end=420", "youtu.be/k3_EmfJcNaY?start=100&end=110", "youtu.be/-vxWG4YoKZM?start=120&end=130", "youtu.be/ONXKc7AVfNY?start=310&end=320", "youtu.be/QZ9oN_zqEdc?start=310&end=320", "youtu.be/L7GtX6Kvr4s?start=60&end=70", "youtu.be/oIoaqgtM9NU?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00126", + "name": "Inside, large room or hall", + "description": "Sounds that appear to have been recorded in a large room or hall, often with clearly perceptible reverberation.", + "citation_uri": "", + "positive_examples": ["youtu.be/2Jddu5vI4QY?start=70&end=80", "youtu.be/avlwY9gdjV0?start=240&end=250", "youtu.be/a4jcJ7QZ-OQ?start=150&end=160", "youtu.be/KPq-sqU58-Y?start=100&end=110", "youtu.be/zPMchtKMXkc?start=100&end=110", "youtu.be/cbirYMm7Vzk?start=420&end=430"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00127", + "name": "Inside, public space", + "description": "Sounds that appear to have been recorded in a public space such as store, restaurant, or travel terminus, often characterized by both reverberation and continuous background noise.", + "citation_uri": "", + "positive_examples": ["youtu.be/d5PnDZ0LMDw?start=510&end=520", "youtu.be/Qtm98j24sQU?start=160&end=170", "youtu.be/GAXY3-A4EU0?start=10&end=20", "youtu.be/6kBgDyL1IPM?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00128", + "name": "Outside, urban or manmade", + "description": "Sounds that appear to have been recorded outdoors in a human-constructed environment.", + "citation_uri": "", + "positive_examples": ["youtu.be/BzVGMdWM27I?start=400&end=410", "youtu.be/n878F0QbyYk?start=90&end=100", "youtu.be/Buf__Z-1cfQ?start=120&end=130", "youtu.be/fdR79DRDZKA?start=30&end=40", "youtu.be/H7V96qPFZ3g?start=40&end=50", "youtu.be/rhtw3CBDw3I?start=10&end=20", "youtu.be/LmsnecVb_GU?start=330&end=340"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/t/dd00129", + "name": "Outside, rural or natural", + "description": "Sounds that appear to have been recorded outdoors in a natural, non-manmade environment.", + "citation_uri": "", + "positive_examples": ["youtu.be/9oeGmqm6g8E?start=60&end=70", "youtu.be/-RbwX4LX-2U?start=170&end=180", "youtu.be/PRpUxcQtvNo?start=300&end=310", "youtu.be/FyhUO5Luq6Y?start=290&end=300", "youtu.be/W8qYyw3F5kA?start=520&end=530", "youtu.be/4Ohhodom94c?start=70&end=80", "youtu.be/TtXooTU9_6s?start=100&end=110", "youtu.be/TM9EFeC9ZUk?start=380&end=390", "youtu.be/w2_oQgcIpqI?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01b9nn", + "name": "Reverberation", + "description": "The sound of an acoustic environment that includes substantial reflection from multiple walls or other hard surfaces. Reverberation is generally perceived as being able to 'hear' the presence and size of the room or space in which the sound is made. 
The reflected sounds should remain audible for hundreds of milliseconds or longer.", + "citation_uri": "http://en.wikipedia.org/wiki/Reverberation", + "positive_examples": ["youtu.be/DqM1zpXPUgs?start=520&end=530", "youtu.be/QARJ4o82SrM?start=430&end=440", "youtu.be/f3HnJ2DhIyQ?start=180&end=190", "youtu.be/MsnMRiqmouI?start=170&end=180", "youtu.be/HpLcGBgPh5s?start=110&end=120", "youtu.be/FcHvkDWXPno?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01jnbd", + "name": "Echo", + "description": "A sound which is a delayed and attenuated repetition of an earlier sound, for example because the sound wave has been reflected back from a distant, planar surface. To be perceived as an echo, the repetion must be delayed by around 100 milliseconds or more.", + "citation_uri": "http://en.wikipedia.org/wiki/Echo", + "positive_examples": ["youtu.be/OplvFkbmN-8?start=190&end=200", "youtu.be/P6AQcAJG8NU?start=160&end=170", "youtu.be/I3X2Yl0snS4?start=30&end=40", "youtu.be/ijsyajsdSoU?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/096m7z", + "name": "Noise", + "description": "A sound that has no perceptible structure and that typically interferes with the perception of more interesting or important sounds.", + "citation_uri": "http://en.wikipedia.org/wiki/Noise_(radio)", + "positive_examples": ["youtu.be/lTiF59wDZ5o?start=100&end=110", "youtu.be/Q_rZIH5_5C0?start=80&end=90", "youtu.be/RjF9D8xDYFg?start=30&end=40"], + "child_ids": ["/m/093_4n", "/m/07qfr4h", "/m/07szfh9", "/m/0chx_", "/m/0cj0r", "/m/07p_0gm", "/m/01jwx6"], + "restrictions": [] + }, + { + "id": "/m/093_4n", + "name": "Background noise", + "description": "Sounds present alongside a sound of interest but typically weaker in energy and generally not of interest.", + "citation_uri": "http://en.wikipedia.org/wiki/Background_noise", + "positive_examples": ["youtu.be/Ea0mjqDst9E?start=80&end=90", "youtu.be/utMYmua1SkE?start=30&end=40", "youtu.be/hRJM4JH3rjE?start=410&end=420", "youtu.be/qQBAvAZHvAg?start=10&end=20", "youtu.be/tQYVJlOnx8s?start=60&end=70", "youtu.be/f8AwggqxxVM?start=140&end=150"], + "child_ids": ["/m/06_y0by", "/m/09d1b1", "/m/07rgkc5", "/m/06xkwv", "/m/0g12c5", "/m/08p9q4"], + "restrictions": [] + }, + { + "id": "/m/06_y0by", + "name": "Environmental noise", + "description": "The combined sounds of transport, industrial, and recreational activities.", + "citation_uri": "http://en.wikipedia.org/wiki/Environmental_noise", + "positive_examples": ["youtu.be/LmafBI1fKw0?start=340&end=350", "youtu.be/8_M3lTg3_oU?start=10&end=20", "youtu.be/OEVLtm3IzWw?start=40&end=50"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/09d1b1", + "name": "Tape hiss", + "description": "A high frequency noise present on analog magnetic tape recordings caused by the size of the magnetic particles used to make the tape.", + "citation_uri": "http://en.wikipedia.org/wiki/Tape_hiss", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07rgkc5", + "name": "Static", + "description": "A crackling or hissing noise caused by electrical interference.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=static", + "positive_examples": ["youtu.be/zQv5mqOaysE?start=30&end=40"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06xkwv", + "name": "Mains hum", + "description": "A low, continuous tone caused by electromagnetic interference from the alternating current of the mains electricity. 
The fundamental frequency of this sound is usually 50 Hz or 60 Hz, depending on the local power-line frequency.", + "citation_uri": "http://en.wikipedia.org/wiki/Mains_hum", + "positive_examples": ["youtu.be/GH-sJJ04axg?start=520&end=530", "youtu.be/fd3DJ0rY_RU?start=270&end=280", "youtu.be/xf0TIS26BUE?start=70&end=80"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0g12c5", + "name": "Distortion", + "description": "A sound that has been modified in a nonlinear fashion, usually in electronic equipment, to introduce sound correlated with the original signal. In many cases, notably with amplified electric musical instruments, distortion is introduced as an effect by increasing the gain and clipping the source signal.", + "citation_uri": "http://en.wikipedia.org/wiki/Distortion_(music)", + "positive_examples": ["youtu.be/EAHYCGhoyM4?start=30&end=40", "youtu.be/MXtktxGEdOY?start=30&end=40", "youtu.be/L2ImCbQsRUk?start=60&end=70", "youtu.be/PaIuJvwzZW8?start=40&end=50", "youtu.be/F1PUkKI4sR0?start=40&end=50", "youtu.be/VCEOxg9ZX24?start=230&end=240"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/08p9q4", + "name": "Sidetone", + "description": "The audible feedback on a telecommunications channel that allows the speaker to know that the channel is operational.", + "citation_uri": "http://en.wikipedia.org/wiki/Sidetone", + "positive_examples": ["youtu.be/Qmw5vuWbdkM?start=160&end=170", "youtu.be/afnR1C7HTh8?start=30&end=40", "youtu.be/j-8EgxtL-Tw?start=510&end=520"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07szfh9", + "name": "Cacophony", + "description": "A loud, confusing, disagreeable sound, such as many voices shouting simultaneously.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=cacophony", + "positive_examples": ["youtu.be/-r4PDh9kiuo?start=90&end=100", "youtu.be/YGTJ2LlaSAQ?start=30&end=40", "youtu.be/ruaeTvNFGTQ?start=530&end=540", "youtu.be/Y-Zxw19hwZQ?start=80&end=90"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0chx_", + "name": "White noise", + "description": "A random, unstructured sound in which the value at any moment provides no information about the value at any other moment. White noise has equal energy in all frequency bands.", + "citation_uri": "http://en.wikipedia.org/wiki/White_noise", + "positive_examples": ["youtu.be/IEdT5i6ogWI?start=360&end=370", "youtu.be/4oIp7EgT7B4?start=80&end=90", "youtu.be/G_iWfCQ3fOU?start=110&end=120", "youtu.be/9DKPBZ00Q3A?start=17&end=27"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0cj0r", + "name": "Pink noise", + "description": "Unstructured noise whose energy decreases with frequency such that equal amounts of energy are distributed in logarithmic bands of frequency, typically octaves. 
Pink noise sounds more like a rumble where white noise sounds like a hiss.", + "citation_uri": "http://en.wikipedia.org/wiki/Pink_noise", + "positive_examples": ["youtu.be/XKNWjAq3xUk?start=162&end=172"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/07p_0gm", + "name": "Throbbing", + "description": "A dull, continunous sound with a strong rhythmic modulation.", + "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=throbbing", + "positive_examples": ["youtu.be/K2E-WWG597Q?start=30&end=40", "youtu.be/8Py7bmA96xA?start=30&end=40", "youtu.be/fTE53ntPQf4?start=40&end=50", "youtu.be/nD3ZyFWYlLM?start=10&end=20"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/01jwx6", + "name": "Vibration", + "description": "The general class of sounds caused by a periodic mechanical oscillation.", + "citation_uri": "http://en.wikipedia.org/wiki/Vibration", + "positive_examples": ["youtu.be/-FpGTCaiioc?start=30&end=40", "youtu.be/T_g9P-K-9Ac?start=30&end=40", "youtu.be/LriokFTJZt4?start=40&end=50", "youtu.be/OR2yfr-Cb4Y?start=3&end=13", "youtu.be/CBX4Kl8ibRM?start=30&end=40"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07bm98", + "name": "Sound reproduction", + "description": "A class for sounds that reveal the particular equipment used to capture or reproduce the sound.", + "citation_uri": "http://en.wikipedia.org/wiki/Sound_recording_and_reproduction", + "positive_examples": [], + "child_ids": ["/m/07c52", "/m/06bz3", "/m/0cfpc", "/m/01b7fy", "/m/025l19", "/m/0174nj", "/m/01www", "/m/04zc0"], + "restrictions": ["abstract"] + }, + { + "id": "/m/07c52", + "name": "Television", + "description": "A sound that appears to have been produced by a television set.", + "citation_uri": "http://en.wikipedia.org/wiki/Television", + "positive_examples": ["youtu.be/VrUCCto21RU?start=20&end=30", "youtu.be/zdkybSg9Ddw?start=130&end=140", "youtu.be/irMaO2tcCns?start=260&end=270"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/06bz3", + "name": "Radio", + "description": "A sound that appears to have been produced by a radio receiver.", + "citation_uri": "http://en.wikipedia.org/wiki/Radio", + "positive_examples": ["youtu.be/soyKe4M8EUM?start=180&end=190", "youtu.be/E0Q6l3_VTLY?start=590&end=600", "youtu.be/UAT_j1ma23E?start=50&end=60", "youtu.be/MdjoPGi_A2Y?start=110&end=120", "youtu.be/3m2i2PQubjc?start=30&end=40", "youtu.be/AQXT9gOjT4s?start=30&end=40", "youtu.be/ogIv5VmM71Y?start=200&end=210"], + "child_ids": [], + "restrictions": [] + }, + { + "id": "/m/0cfpc", + "name": "Loudspeaker", + "description": "A sound that appears to have been produced by a loudspeaker.", + "citation_uri": "http://en.wikipedia.org/wiki/Loudspeaker", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01b7fy", + "name": "Headphones", + "description": "A sound that appears to have been produced by headphones.", + "citation_uri": "http://en.wikipedia.org/wiki/Headphones", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/025l19", + "name": "Recording", + "description": "A sound that appears to come from a recording of a sound, rather than the original sound itself.", + "citation_uri": "http://en.wikipedia.org/wiki/Recording", + "positive_examples": [], + "child_ids": ["/m/07hvw1"], + "restrictions": ["blacklist"] + }, + { + "id": "/m/07hvw1", + "name": "Field recording", + "description": "A sound that appears to have been recorded in some specific environment, usually with 
the intention of capturing and representing that environment.", + "citation_uri": "http://en.wikipedia.org/wiki/Field_recording", + "positive_examples": ["youtu.be/Gxc0gxJpato?start=490&end=500", "youtu.be/2HxnUDbhgFM?start=30&end=40", "youtu.be/6KVFGJVJ6QY?start=18&end=28", "youtu.be/wDGzCV1lJJY?start=190&end=200", "youtu.be/1en9fyEUfYU?start=10&end=20"], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/0174nj", + "name": "Gramophone record", + "description": "A sound which appears to come from a gramophone or vinyl record disc being played on a turntable.", + "citation_uri": "http://en.wikipedia.org/wiki/Gramophone_record", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/01www", + "name": "Compact disc", + "description": "A sound which appears to come from a digital audio Compact Disc.", + "citation_uri": "http://en.wikipedia.org/wiki/Compact_disc", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + }, + { + "id": "/m/04zc0", + "name": "MP3", + "description": "A sound which appears to come from a compressed digital file using the MP3 format, discernible by containing nearly imperceptible compression artifacts.", + "citation_uri": "", + "positive_examples": [], + "child_ids": [], + "restrictions": ["blacklist"] + } +] diff --git a/train.py b/train.py index 7849ccd..e8bd42c 100644 --- a/train.py +++ b/train.py @@ -1,4 +1,5 @@ import argparse +import os.path from l3embedding.train import * @@ -21,37 +22,69 @@ def parse_arguments(): default=150, help='Maximum number of training epochs') - parser.add_argument('-es', - '--epoch-size', - dest='epoch_size', + parser.add_argument('-tes', + '--train-epoch-size', + dest='train_epoch_size', action='store', type=int, default=512, help='Number of training batches per epoch') - parser.add_argument('-bs', - '--batch-size', + parser.add_argument('-ves', + '--validation-epoch-size', + dest='validation_size', + action='store', + type=int, + default=1024, + help='Number of validation batches per epoch') + + parser.add_argument('-tbs', + '--train-batch-size', + dest='train_batch_size', + action='store', + type=int, + default=64, + help='Number of examples per training batch') + + parser.add_argument('-vbs', + '--validation-batch-size', dest='batch_size', action='store', type=int, default=64, - help='Number of training examples per batch') + help='Number of examples per batch') - parser.add_argument('-vs', - '--validation-size', - dest='validation_size', + parser.add_argument('-ts', + '--train-num-streamers', + dest='train_num_streamers', action='store', type=int, - default=1024, - help='Number of trianing examples in the validation set') + default=32, + help='Number of training pescador streamers that can be open concurrently') - parser.add_argument('-s', - '--num-streamers', - dest='num_streamers', + parser.add_argument('-vs', + '--validation-num-streamers', + dest='validation_num_streamers', action='store', type=int, default=32, - help='Number of pescador streamers that can be open concurrently') + help='Number of validation pescador streamers that can be open concurrently') + + parser.add_argument('-tmr', + '--train-mux-rate', + dest='train_mux_rate', + action='store', + type=float, + default=16.0, + help='Poisson distribution parameter for determining number of training samples to take from a streamer') + + parser.add_argument('-vmr', + '--validation-mux-rate', + dest='validation_mux_rate', + action='store', + type=float, + default=16.0, + help='Poisson 
distribution parameter for determining number of validation samples to take from a streamer') parser.add_argument('-lr', '--learning-rate', @@ -69,6 +102,14 @@ def parse_arguments(): default=10, help='The number of epochs between model checkpoints') + parser.add_argument('-o' + '--ontology-path', + dest='ontology_path', + action='store', + type=str, + default=os.path.join(os.path.dirname(__file__), 'resources/ontology.json'), + help='Path to AudioSet ontology') + parser.add_argument('-r', '--random-state', dest='random_state', @@ -97,12 +138,31 @@ def parse_arguments(): default=False, help='If True, print detailed messages') - """ - parser.add_argument('train_csv_path', + parser.add_argument('-tmp', + '--train-metadata-path', + dest='train_metadata_path', action='store', type=str, - help='Path to training csv file') - """ + help='Path to training csv file(s). Accepts a glob string.') + + parser.add_argument('-vmp', + '--validation-metadata-path', + action='store', + type=str, + help='Path to validation csv file. Accepts a glob string.') + + parser.add_argument('-tfp', + '--train-filter-path', + action='store', + type=str, + help='Path to training csv file(s). Accepts a glob string.') + + parser.add_argument('-vcp', + '--validation-csv-path', + action='store', + type=str, + help='Path to validation csv file. Accepts a glob string.') + parser.add_argument('train_data_dir', action='store', type=str, From 81c4b9fb4cce37d615e4aa82b2e9608083669799 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 25 Nov 2017 14:03:04 -0500 Subject: [PATCH 037/201] Fix typos and add missing arguments --- l3embedding/train.py | 5 ++++- train.py | 33 ++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 9aa0485..646faca 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -19,7 +19,7 @@ from .image import * from .model import construct_cnn_L3_orig from .training_utils import multi_gpu_model -from ..audioset.ontology import ASOntology +from audioset.ontology import ASOntology #TODO: Consider putting the sampling functionality into another file @@ -480,6 +480,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, train_epoch_size=512, validation_epoch_size=1024, train_batch_size=64, validation_batch_size=64, train_num_streamers=16, validation_num_streamers=16, + train_num_distractors=1, validation_num_distractors=2, train_mux_rate=16, validation_mux_rate=16, learning_rate=1e-4, random_state=20171021, verbose=False, checkpoint_interval=10, ontology_path=None, @@ -545,6 +546,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, random_state=random_state, k=train_num_streamers, augment=augment, + num_distractors=train_num_distractors, rate=train_mux_rate) train_gen = pescador.maps.keras_tuples(train_gen, @@ -559,6 +561,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, batch_size=validation_batch_size, random_state=random_state, k=validation_num_streamers, + num_distractors=validation_num_distractors, rate=validation_mux_rate) val_gen = pescador.maps.keras_tuples(val_gen, diff --git a/train.py b/train.py index e8bd42c..4f05674 100644 --- a/train.py +++ b/train.py @@ -32,7 +32,7 @@ def parse_arguments(): parser.add_argument('-ves', '--validation-epoch-size', - dest='validation_size', + dest='validation_epoch_size', action='store', type=int, default=1024, @@ -48,13 +48,13 @@ def parse_arguments(): parser.add_argument('-vbs', '--validation-batch-size', - 
dest='batch_size', + dest='validation_batch_size', action='store', type=int, default=64, help='Number of examples per batch') - parser.add_argument('-ts', + parser.add_argument('-tns', '--train-num-streamers', dest='train_num_streamers', action='store', @@ -62,7 +62,7 @@ def parse_arguments(): default=32, help='Number of training pescador streamers that can be open concurrently') - parser.add_argument('-vs', + parser.add_argument('-vns', '--validation-num-streamers', dest='validation_num_streamers', action='store', @@ -70,6 +70,22 @@ def parse_arguments(): default=32, help='Number of validation pescador streamers that can be open concurrently') + parser.add_argument('-tnd', + '--train-num-distractors', + dest='train_num_distractors', + action='store', + type=int, + default=1, + help='Number of distractors for generating training examples') + + parser.add_argument('-vnd', + '--validation-num-distractors', + dest='validation_num_distractors', + action='store', + type=int, + default=2, + help='Number of distractors for generating validation examples') + parser.add_argument('-tmr', '--train-mux-rate', dest='train_mux_rate', @@ -147,21 +163,24 @@ def parse_arguments(): parser.add_argument('-vmp', '--validation-metadata-path', + dest='validation_metadata_path', action='store', type=str, help='Path to validation csv file. Accepts a glob string.') parser.add_argument('-tfp', '--train-filter-path', + dest='train_filter_path', action='store', type=str, help='Path to training csv file(s). Accepts a glob string.') - parser.add_argument('-vcp', - '--validation-csv-path', + parser.add_argument('-vfp', + '--validation-filter-path', + dest='validation_filter_path', action='store', type=str, - help='Path to validation csv file. Accepts a glob string.') + help='Path to validationing csv file(s). 
Accepts a glob string.') parser.add_argument('train_data_dir', action='store', From a7d5c54f6e632d32966c1c4736daa28bdb51d598 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 25 Nov 2017 14:07:43 -0500 Subject: [PATCH 038/201] Make sure different video file is chosen as a distractor --- l3embedding/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 646faca..ba6702d 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -433,7 +433,10 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path seeds = [] for video_file_1 in tqdm(video_files): for _ in range(num_distractors): - video_file_2 = random.choice(video_files) + video_file_2 = video_file_1 + # Make sure we sample a different file + while video_file_2 == video_file_1: + video_file_2 = random.choice(video_files) seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, augment=augment)) # TODO: From 9e8bfa663db7fec7e438d411212df3efbff5a9c2 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 28 Nov 2017 20:13:38 -0500 Subject: [PATCH 039/201] Reduce memory used by streamers * Load audio data as np.float32 * Load video data as np.float32 (cast after loading) * Rescale video before sampling frames, which takes longer but uses less memory --- l3embedding/train.py | 59 ++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index ba6702d..ca5c7e5 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -199,28 +199,45 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): return audio_data, start / sampling_frequency, audio_aug_params -def l3_frame_scaling(frame_data): +def rescale_video(video_data): """ - Scale and crop an video frame, using the method from Look, Listen and Learn + Rescales video such that the minimum dimension of the video becomes 256, + as is down in Look, Listen and Learn Args: - frame_data: video frame data array + video_data: video data array Returns: - scaled_frame_data: scaled and cropped frame data - bbox: bounding box for the cropped image + rescaled_video_data: rescaled video data array """ - nx, ny, nc = frame_data.shape + num_frames, nx, ny, nc = video_data.shape + scaling = 256.0 / min(nx, ny) new_nx, new_ny = math.ceil(scaling * nx), math.ceil(scaling * ny) assert 256 in (new_nx, new_ny), str((new_nx, new_ny)) + resized_video_data = np.array([scipy.misc.imresize(frame, (new_nx, new_ny, nc)) + for frame in video_data]) - resized_frame_data = scipy.misc.imresize(frame_data, (new_nx, new_ny, nc)) + return resized_video_data - start_x, start_y = random.randrange(new_nx - 224), random.randrange(new_ny - 224) + +def sample_cropped_frame(frame_data): + """ + Randomly crop a video frame, using the method from Look, Listen and Learn + + + Args: + frame_data: video frame data array + + Returns: + scaled_frame_data: scaled and cropped frame data + bbox: bounding box for the cropped image + """ + nx, ny, nc = frame_data.shape + start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) end_x, end_y = start_x + 224, start_y + 224 bbox = { @@ -230,25 +247,22 @@ def l3_frame_scaling(frame_data): 'end_y': end_y } - return resized_frame_data[start_x:end_x, start_y:end_y, :], bbox + return frame_data[start_x:end_x, start_y:end_y, :], bbox -def sample_one_frame(video_data, start=None, fps=30, scaling_func=None, augment=False): +def sample_one_frame(video_data, 
start=None, fps=30, augment=False): """Return one frame randomly and time (seconds). Args: video_data: video data to sample from start: start time of a one second window from which to sample fps: frame per second - scaling_func: function that rescales the sampled video frame augment: if True, perturb the data in some fashion Returns: One frame sampled randomly, start time in seconds, and augmentation parameters """ - if not scaling_func: - scaling_func = l3_frame_scaling num_frames = video_data.shape[0] if start is not None: @@ -273,7 +287,7 @@ def sample_one_frame(video_data, start=None, fps=30, scaling_func=None, augment= frame = random.randrange(num_frames) frame_data = video_data[frame, :, :, :] - frame_data, bbox = scaling_func(frame_data) + frame_data, bbox = sample_cropped_frame(frame_data) video_aug_params = {'bounding_box': bbox} @@ -331,14 +345,14 @@ def sampler(video_file_1, video_file_2, augment=False): """ try: - video_data_1 = vread(video_file_1) + video_data_1 = rescale_video(vread(video_file_1).astype('float32')) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' warnings.warn(warn_msg.format(video_file_1, type(e), e)) raise StopIteration() try: - video_data_2 = vread(video_file_2) + video_data_2 = rescale_video(vread(video_file_2).astype('float32')) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' warnings.warn(warn_msg.format(video_file_2, type(e), e)) @@ -348,14 +362,14 @@ def sampler(video_file_1, video_file_2, augment=False): audio_file_2 = video_to_audio(video_file_2) try: - audio_data_1, sampling_frequency = sf.read(audio_file_1, always_2d=True) + audio_data_1, sampling_frequency = sf.read(audio_file_1, dtype='float32', always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' warnings.warn(warn_msg.format(audio_file_1, type(e), e)) raise StopIteration() try: - audio_data_2, sampling_frequency = sf.read(audio_file_2, always_2d=True) + audio_data_2, sampling_frequency = sf.read(audio_file_2, dtype='float32', always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' warnings.warn(warn_msg.format(audio_file_2, type(e), e)) @@ -424,12 +438,6 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path audio_files, video_files = get_file_list(data_dir, metadata_path=metadata_path, filter_path=filter_path, ontology_path=ontology_path) - # Randomly shuffle the files - ordering = list(range(len(audio_files))) - random.shuffle(ordering) - audio_files = [audio_files[idx] for idx in ordering] - video_files = [video_files[idx] for idx in ordering] - seeds = [] for video_file_1 in tqdm(video_files): for _ in range(num_distractors): @@ -439,6 +447,9 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path video_file_2 = random.choice(video_files) seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, augment=augment)) + # Randomly shuffle the seeds + random.shuffle(seeds) + # TODO: # Order 1024 streamers open # Set rate 16? 
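The memory reduction in the patch above comes down to two choices: decode audio directly as float32 rather than soundfile's float64 default, and rescale each video to a 256-pixel minimum dimension once, up front, so only the downscaled frames stay resident while a streamer is open. A minimal standalone sketch of the audio half of that idea is shown below; it is not part of the patch series, and 'clip.flac' is a placeholder path.

    import soundfile as sf

    # soundfile decodes to float64 unless a dtype is requested explicitly.
    audio_f64, sr = sf.read('clip.flac', always_2d=True)
    audio_f32, sr = sf.read('clip.flac', dtype='float32', always_2d=True)

    # Same number of samples, 4 bytes each instead of 8: the float32 copy
    # occupies half the memory of the float64 one.
    print(audio_f64.dtype, audio_f64.nbytes)
    print(audio_f32.dtype, audio_f32.nbytes)

The video half follows the same pattern in the diff: the result of vread() is cast to float32 and passed through rescale_video() before any frames are sampled, trading a slower up-front resize for a smaller per-streamer footprint.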
From 42bc8458c5fad73acd8c9cccdd0383fb9be1ead4 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 4 Dec 2017 11:06:22 -0500 Subject: [PATCH 040/201] Add model loading functions and add training procedure for urban sound classification models --- classifier/__init__.py | 0 classifier/train.py | 537 ++++++++++++++++++++++++++++++++++++ l3embedding/audio_model.py | 1 + l3embedding/model.py | 55 +++- l3embedding/vision_model.py | 1 + train_classifier.py | 135 +++++++++ 6 files changed, 727 insertions(+), 2 deletions(-) create mode 100644 classifier/__init__.py create mode 100644 classifier/train.py create mode 100644 train_classifier.py diff --git a/classifier/__init__.py b/classifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/classifier/train.py b/classifier/train.py new file mode 100644 index 0000000..6b1c268 --- /dev/null +++ b/classifier/train.py @@ -0,0 +1,537 @@ +import os +import scipy as sp +import numpy as np +import soundfile as sf +import csv +import librosa +from itertools import islice +from sklearn.externals import joblib +from sklearn.svm import SVC +from sklearn.preprocessing import StandardScaler +from l3embedding.train import sample_one_second +from l3embedding.model import load_embedding + + +def load_us8k_metadata(path): + """ + Load UrbanSound8K metadata + + Args: + path: Path to metadata csv file + (Type: str) + + Returns: + metadata: Metadata dictionary + (Type: dict[str, *]) + """ + metadata = {} + with open(path) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + fname = row['slice_file_name'] + row['start'] = float(row['start']) + row['end'] = float(row['end']) + row['salience'] = float(row['salience']) + row['fold'] = int(row['fold']) + row['classID'] = int(row['classID']) + metadata[fname] = row + + return metadata + + +def one_hot(idx, n_classes=10): + """ + Creates a one hot encoding vector + + Args: + idx: Class index + (Type: int) + + Keyword Args: + n_classes: Number of classes + (Type: int) + + Returns: + one_hot_vector: One hot encoded vector + (Type: np.ndarray) + """ + y = np.zeros((n_classes,)) + y[idx] = 1 + return y + + +def get_l3_stack_features(audio_path, l3embedding_model): + """ + Get stacked L3 embedding features, i.e. 
stack embedding features for each + 1 second (overlapping) window of the given audio + + + Args: + audio_path: Path to audio file + (Type: str) + + l3embedding_model: Audio embedding model + (keras.engine.training.Model) + + Returns: + features: Feature vector + (Type: np.ndarray) + """ + sr = 48000 + audio, _ = librosa.load(audio_path, sr=sr, mono=True) + audio_length = len(audio) + + # Zero pad to 4 seconds + target_len = 48000 * 4 + if audio_length < target_len: + pad_length = target_len - audio_length + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio = np.pad(audio, (left_pad, right_pad), mode='constant') + elif audio_length > target_len: + # Take center of audio (ASSUMES NOT MUCH GREATER THAN TARGET LENGTH) + center_sample = audio_length // 2 + half_len = target_len // 2 + audio = audio[center_sample-half_len:center_sample+half_len] + + + + # Divide into overlapping 1 second frames + x = librosa.util.utils.frame(audio, frame_length=sr * 1, hop_length=sr // 2).T + + # Add a channel dimension + x = x.reshape((x.shape[0], 1, x.shape[-1])) + + # Get the L3 embedding for each frame + l3embedding = l3embedding_model.predict(x) + + # Return a flattened vector of the embeddings + return l3embedding.flatten() + + +def get_l3_stats_features(audio_path, l3embedding_model): + """ + Get L3 embedding stats features, i.e. compute statistics for each of the + embedding features across 1 second (overlapping) window of the given audio + + Args: + audio_path: Path to audio file + (Type: str) + + l3embedding_model: Audio embedding model + (keras.engine.training.Model) + + Returns: + features: Feature vector + (Type: np.ndarray) + """ + sr = 48000 + audio = librosa.load(audio_path, sr=sr, mono=True) + + hop_size = 10e-3 + hop_length = int(hop_size * sr) + frame_length = 48000 * 1 + + # Zero pad so we compute embedding on all samples + pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ + - (audio_length - frame_length) + + if pad_length > 0: + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio = np.pad(audio, (left_pad, right_pad), mode='constant') + + + # Divide into overlapping 1 second frames + x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length) + + # Add a channel dimension + x = x.reshape((x.shape[0], 1, x.shape[-1])) + + # Get the L3 embedding for each frame + l3embedding = l3embedding_model.predict(x) + + + # Compute statistics on the time series of embeddings + minimum = np.min(l3embedding, axis=0) + maximum = np.max(l3embedding, axis=0) + median = np.median(l3embedding, axis=0) + mean = np.mean(l3embedding, axis=0) + var = np.var(l3embedding, axis=0) + skewness = sp.stats.skew(l3embedding, axis=0) + kurtosis = sp.stats.kurtosis(l3embedding, axis=0) + + # Compute statistics on the first and second derivatives of time series of embeddings + + # Use finite differences to approximate the derivatives + d1 = np.gradient(l3embedding, 1/sr, edge_order=1, axis=0) + d2 = np.gradient(l3embedding, 1/sr, edge_order=2, axis=0) + + d1_mean = np.mean(d1, axis=0) + d1_var = np.var(d1, axis=0) + + d2_mean = np.mean(d2, axis=0) + d2_var = np.var(d2, axis=0) + + return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis, + d1_mean, d1_var, d2_mean, d2_var)) + + +def get_us8k_folds(metadata, data_dir, l3embedding_model=None, + features='l3_stack', label_format='int'): + """ + Load all of the data for each fold + + Args: + 
metadata: Metadata dictionary + (Type: dict[str,*]) + + data_dir: Path to data directory + (Type: str) + + Keyword Args: + l3embedding_model: L3 embedding model, used if L3 features are used + (Type: keras.engine.training.Model or None) + + features: Type of features to be computed + (Type: str) + + label_format: Type of format used for encoding outputs + (Type: str) + + Returns: + fold_data: List of data for each fold + (Type: list[tuple[np.ndarray, np.ndarray]]) + + """ + fold_data = [] + for fold_idx in range(10): + fold_data.append(get_fold_data(metadata, data_dir, fold_idx, + l3embedding_model=l3embedding_model, + features=features, label_format=label_format)) + + return fold_data + + +def get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, features='l3_stack', label_format='int'): + """ + Load all of the data for a specific fold + + Args: + metadata: Metadata dictionary + (Type: dict[str,*]) + + data_dir: Path to data directory + (Type: str) + + fold_idx: Index of fold to load + (Type: int) + + Keyword Args: + l3embedding_model: L3 embedding model, used if L3 features are used + (Type: keras.engine.training.Model or None) + + features: Type of features to be computed + (Type: str) + + label_format: Type of format used for encoding outputs + (Type: str) + + Returns: + X: Feature data + (Type: np.ndarray) + y: Label data + (Type: np.ndarray) + + """ + X = [] + y = [] + + for idx, (fname, example_metadata) in enumerate(metadata.items()): + if example_metadata['fold'] != (fold_idx+1): + continue + + path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) + + if features.startswith('l3') and not l3embedding_model: + raise ValueError('Must provide L3 embedding model to use {} features'.format(features)) + + if features == 'l3_stack': + X.append(get_l3_stack_features(path, l3embedding_model)) + elif features == 'l3_stats': + X.append(get_l3_stats_features(path, l3embedding_model)) + else: + raise ValueError('Invalid feature type: {}'.format(features)) + + + class_label = example_metadata['classID'] + if label_format == 'int': + y.append(class_label) + elif label_format == 'one_hot': + y.append(one_hot(class_label)) + else: + raise ValueError('Invalid label format: {}'.format(label_format)) + + return np.array(X), np.array(y) + + +def get_fold_split(fold_data, fold_idx): + """ + Given the fold to use as held out, return the data split between training + and testing + + Args: + fold_data: List of data for each fold + (Type: list[tuple[np.ndarray, np.ndarray]]) + + fold_idx: Fold to use as held out data + (Type: int) + + Returns: + X_train: Training feature data + (Type: np.ndarray) + y_train: Training label data + (Type: np.ndarray) + X_test: Testing feature data + (Type: np.ndarray) + y_test: Testing label data + (Type: np.ndarray) + """ + X_test, y_test = fold_data[fold_idx] + train = fold_data[:fold_idx] + fold_data[fold_idx+1:] + + X_train, y_train = zip(*train) + X_train = np.concatenate(X_train, axis=0) + y_train = np.concatenate(y_train, axis=0) + + return X_train, y_train, X_test, y_test + + +def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs): + """ + Train a Support Vector Machine model on the given data + + Args: + X_train: Training feature data + (Type: np.ndarray) + y_train: Training label data + (Type: np.ndarray) + X_test: Testing feature data + (Type: np.ndarray) + y_test: Testing label data + (Type: np.ndarray) + + Keyword Args: + C: SVM regularization hyperparameter + (Type: float) + + verbose: If True, print verbose 
messages + (Type: bool) + + Returns: + clf: Classifier object + (Type: sklearn.svm.SVC) + + y_test_pred: Predicted test output of classifier + (Type: np.ndarray) + """ + # Standardize + stdizer = StandardScaler() + X_train = stdizer.fit_transform(X_train) + + clf = SVC(C=C, verbose=verbose) + clf.fit(X_train, y_train) + + + X_test = stdizer.transform(X_test) + y_test_pred = clf.predict(X_test) + + return clf, y_test_pred + + +def train_mlp(X_train, y_train, X_test, y_test, verbose=False, **kwargs): + """ + Train a Multi-layer perceptron model on the given data + + Args: + X_train: Training feature data + (Type: np.ndarray) + y_train: Training label data + (Type: np.ndarray) + X_test: Testing feature data + (Type: np.ndarray) + y_test: Testing label data + (Type: np.ndarray) + + Keyword Args: + verbose: If True, print verbose messages + (Type: bool) + """ + raise NotImplementedError() + + +def compute_metrics(y, pred): + """ + Compute perfomance metrics given the predicted labels and the true labels + + Args: + y: True label vector + (Type: np.ndarray) + + pred: Predicted label vector + (Type: np.ndarray) + + Returns: + metrics: Metrics dictionary + (Type: dict[str, *]) + """ + # Convert from one-hot to integer encoding if necessary + if y.ndim == 2: + y = np.argmax(y, axis=1) + if pred.ndim == 2: + pred = np.argmax(pred, axis=1) + + acc = (y == pred).mean() + + sum_class_acc = 0.0 + for class_idx in range(10): + idxs = (y == class_idx) + sum_class_acc += (y[idxs] == pred[idxs]).mean() + + ave_class_acc = sum_class_acc / 10 + + return { + 'accuracy': acc, + 'average_class_accuracy': ave_class_acc + } + + +def aggregate_metrics(fold_metrics): + """ + Aggregate fold metrics using different stats + + Args: + fold_metrics: List of fold metrics dictionaries + (Type: list[dict[str, *]]) + """ + metric_keys = list(fold_metrics[0].keys()) + fold_metrics_list = {k: [fold[k] for fold in fold_metrics] + for k in metric_keys} + aggr_metrics = {} + + for metric in metric_keys: + metric_list = fold_metrics_list[metric] + aggr_metrics[metric] = { + 'mean': np.mean(metric_list), + 'var': np.var(metric_list), + 'min': np.min(metric_list), + '25_%ile': np.percentile(metric_list, 25), + '75_%ile': np.percentile(metric_list, 75), + 'median': np.median(metric_list), + 'max': np.max(metric_list) + } + + return aggr_metrics + + +def print_metrics(metrics): + """ + Print classifier metrics + + Args: + metrics: Metrics dictionary + (Type: dict[str, *]) + """ + print("Results") + print("=====================================================") + for metric, metric_stats in metrics.items(): + print("* " + metric) + for stat_name, stat_val in metric_stats.items(): + print("\t- {}: {}".format(stat_name, stat_val)) + print() + + +def train(metadata_path, data_dir, model_id, output_dir, + model_type='svm', features='l3_stack', label_format='int', + l3embedding_model_path=None, l3embedding_model_type='cnn_L3_orig', + random_state=20171021, verbose=False, **model_args): + + # Make sure the directories we need exist + model_dir = os.path.join(output_dir, model_id) + if not os.path.isdir(model_dir): + os.makedirs(model_dir) + + if features.startswith('l3'): + print('Loading embedding model...') + l3embedding_model = load_embedding(l3embedding_model_path, + l3embedding_model_type, 'audio') + else: + l3embedding_model = None + + + print('Loading data...') + # Load metadata + metadata = load_us8k_metadata(metadata_path) + + # Get fold data + fold_data = get_us8k_folds(metadata, data_dir, l3embedding_model=l3embedding_model, + 
features=features, label_format=label_format) + + model_output_path = os.path.join(model_dir, "model_fold{}.{}") + fold_metrics = [] + fold_preds = [] + # Fit the model + print('Fitting model...') + for fold_idx in range(10): + print('\t* Training with fold {} held out'.format(fold_idx+1)) + X_train, y_train, X_test, y_test = get_fold_split(fold_data, fold_idx) + if model_type == 'svm': + model, y_test_pred = train_svm(X_train, y_train, X_test, y_test, + verbose=verbose, **model_args) + + # Save the model for this fold + joblib.dump(model, model_output_path.format(fold_idx+1, 'pkl')) + + elif model_type == 'mlp': + model, y_test_pred = train_mlp(X_train, y_train, X_test, y_test, + verbose=verbose, **model_args) + + # TODO: Save MLP model + + else: + raise ValueError('Invalid model type: {}'.format(model_type)) + + fold_preds.append(y_test_pred) + + # Compute metrics for this fold + test_metrics = compute_metrics(y_test, y_test_pred) + fold_metrics.append(test_metrics) + + print(fold_preds) + print(fold_metrics) + + metrics = aggregate_metrics(fold_metrics) + print_metrics(metrics) + + print('Done training. Saving results to disk...') + + # Evaluate model + # print('Evaluate model...') + # Load best params + # m.load_weights(weight_path) + # with open(os.path.join(output_dir, 'index_test.json'), 'r') as fp: + # test_idx = json.load(fp)['id'] + + # Compute eval scores + # results = score_model(output_dir, pump, model, test_idx, working, + # strong_label_file, duration, modelid, + # use_orig_duration=True) + + # Save results to disk + # results_file = os.path.join(model_dir, 'results.json') + # with open(results_file, 'w') as fp: + # json.dump(results, fp, indent=2) + + print('Done!') diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 410752d..49c1525 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -96,5 +96,6 @@ def construct_cnn_L3_orig_audio_model(): y_a = Flatten(name='audio_embedding')(y_a) m = Model(inputs=x_a, outputs=y_a) + m.name = 'audio_embedding' return m, x_a, y_a diff --git a/l3embedding/model.py b/l3embedding/model.py index 023f80b..470504f 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -5,7 +5,7 @@ from .audio_model import construct_cnn_L3_orig_audio_model -def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a): +def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name): """ Merges the audio and vision subnetworks and adds additional fully connected layers in the fashion of the model used in Look, Listen and Learn @@ -28,10 +28,60 @@ def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a): y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) y = Dense(2, activation='softmax', kernel_regularizer=l2_weight_decay)(y) m = Model(inputs=[x_i, x_a], outputs=y) + m.name = model_name return m, [x_i, x_a], y +def load_model(weights_path, model_type): + """ + Loads an audio-visual correspondence model + + Args: + weights_path: Path to Keras weights file + (Type: str) + model_type: Name of model type + (Type: str) + + Returns: + model: Loaded model object + (Type: keras.engine.training.Model) + """ + if model_type not in MODELS: + raise ValueError('Invalid model type: "{}"'.format(model_type)) + + m, inputs, output = MODELS[model_type]() + m.load_weights(weights_path) + return m + + +def load_embedding(weights_path, model_type, embedding_type): + """ + Loads an embedding model + + Args: + weights_path: Path to Keras weights file + (Type: str) + 
model_type: Name of model type + (Type: str) + embedding_type: Type of embedding to load ('audio' or 'vision') + (Type: str) + + Returns: + model: Loaded model object + (Type: keras.engine.training.Model) + """ + m = load_model(weights_path, model_type) + if embedding_type == 'vision': + m_emb = m.get_layer('vision_embedding') + elif embedding_type == 'audio': + m_emb = m.get_layer('audio_embedding') + else: + raise ValueError('Invalid embedding type: "{}"'.format(embedding_type)) + + return m_emb + + def construct_cnn_L3_orig(): """ Constructs a model that replicates that used in Look, Listen and Learn @@ -50,7 +100,8 @@ def construct_cnn_L3_orig(): vision_model, x_i, y_i = construct_cnn_L3_orig_vision_model() audio_model, x_a, y_a = construct_cnn_L3_orig_audio_model() - return L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a) + m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_orig') + return m MODELS = { diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 2d0696c..8a29513 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -86,5 +86,6 @@ def construct_cnn_L3_orig_vision_model(): y_i = Flatten(name='vision_embedding')(y_i) m = Model(inputs=x_i, outputs=y_i) + m.name = 'vision_embedding' return m, x_i, y_i diff --git a/train_classifier.py b/train_classifier.py new file mode 100644 index 0000000..fcf1de5 --- /dev/null +++ b/train_classifier.py @@ -0,0 +1,135 @@ +import argparse +from classifier.train import * + + +def parse_arguments(): + """ + Parse arguments from the command line + + + Returns: + args: Argument dictionary + (Type: dict[str, *]) + """ + parser = argparse.ArgumentParser(description='Train an urban sound classification model') + + parser.add_argument('-e', + '--num-epochs', + dest='num_epochs', + action='store', + type=int, + default=150, + help='(MLP) Maximum number of training epochs') + + parser.add_argument('-es', + '--epoch-size', + dest='epoch_size', + action='store', + type=int, + default=512, + help='(MLP) Number of training batches per epoch') + + parser.add_argument('-bs', + '--batch-size', + dest='batch_size', + action='store', + type=int, + default=64, + help='(MLP) Number of training examples per batch') + + parser.add_argument('-vs', + '--validation-size', + dest='validation_size', + action='store', + type=int, + default=1024, + help='(MLP) Number of trianing examples in the validation set') + + parser.add_argument('-lr', + '--learning-rate', + dest='learning_rate', + action='store', + type=float, + default=1e-4, + help='(MLP) Optimization learning rate') + + parser.add_argument('-r', + '--random-state', + dest='random_state', + action='store', + type=int, + default=20171021, + help='Random seed used to set the RNG state') + + parser.add_argument('-v', + '--verbose', + dest='verbose', + action='store_true', + default=False, + help='If True, print detailed messages') + + parser.add_argument('-f', + '--features', + dest='features', + action='store', + type=str, + default='l3_stack', + help='Type of features to be used in training') + + parser.add_argument('-lf', + '--label-format', + dest='label_format', + action='store', + type=str, + default='int', + help='Type of format used for encoding outputs') + + parser.add_argument('-mt', + '--model-type', + dest='model_type', + action='store', + type=str, + default='svm', + help='Type of model used for training classifier') + + parser.add_argument('-lmp', + '--l3embedding-model-path', + dest='l3embedding_model_path', + 
action='store', + type=str, + help='Path to L3 embedding model weights file') + + parser.add_argument('-lmt', + '--l3embedding-model-type', + dest='l3embedding_model_type', + action='store', + type=str, + default='cnn_L3_orig', + help='Type of L3 embedding model') + + parser.add_argument('metadata_path', + action='store', + type=str, + help='Path to UrbanSound8K metadata file') + + parser.add_argument('data_dir', + action='store', + type=str, + help='Path to directory where training set files are stored') + + parser.add_argument('model_id', + action='store', + type=str, + help='Identifier for this model') + + parser.add_argument('output_dir', + action='store', + type=str, + help='Path to directory where output files will be stored') + + + return vars(parser.parse_args()) + +if __name__ == '__main__': + args = parse_arguments() + train(**args) From ed8c7386aefff05e2b8b6af8c671e4599a5b7bb6 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 4 Dec 2017 22:00:44 -0500 Subject: [PATCH 041/201] Compute metrics for both train and test sets and save results to disk --- classifier/train.py | 66 +++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 6b1c268..ac2df60 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -334,6 +334,9 @@ def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs) clf: Classifier object (Type: sklearn.svm.SVC) + y_train_pred: Predicted train output of classifier + (Type: np.ndarray) + y_test_pred: Predicted test output of classifier (Type: np.ndarray) """ @@ -342,13 +345,13 @@ def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs) X_train = stdizer.fit_transform(X_train) clf = SVC(C=C, verbose=verbose) - clf.fit(X_train, y_train) + y_train_pred = clf.fit_transform(X_train, y_train) X_test = stdizer.transform(X_test) y_test_pred = clf.predict(X_test) - return clf, y_test_pred + return clf, y_train_pred, y_test_pred def train_mlp(X_train, y_train, X_test, y_test, verbose=False, **kwargs): @@ -436,7 +439,7 @@ def aggregate_metrics(fold_metrics): return aggr_metrics -def print_metrics(metrics): +def print_metrics(metrics, subset_name): """ Print classifier metrics @@ -444,7 +447,7 @@ def print_metrics(metrics): metrics: Metrics dictionary (Type: dict[str, *]) """ - print("Results") + print("Results metrics for {}".format(subset_name)) print("=====================================================") for metric, metric_stats in metrics.items(): print("* " + metric) @@ -480,40 +483,63 @@ def train(metadata_path, data_dir, model_id, output_dir, features=features, label_format=label_format) model_output_path = os.path.join(model_dir, "model_fold{}.{}") - fold_metrics = [] - fold_preds = [] + + results = { + 'folds': [] + } # Fit the model print('Fitting model...') for fold_idx in range(10): print('\t* Training with fold {} held out'.format(fold_idx+1)) X_train, y_train, X_test, y_test = get_fold_split(fold_data, fold_idx) if model_type == 'svm': - model, y_test_pred = train_svm(X_train, y_train, X_test, y_test, - verbose=verbose, **model_args) + model, y_train_pred, y_test_pred = train_svm( + X_train, y_train, X_test, y_test, + verbose=verbose, **model_args) # Save the model for this fold joblib.dump(model, model_output_path.format(fold_idx+1, 'pkl')) elif model_type == 'mlp': - model, y_test_pred = train_mlp(X_train, y_train, X_test, y_test, - verbose=verbose, **model_args) + model, y_train_pred, y_test_pred = train_mlp( 
+ X_train, y_train, X_test, y_test, + verbose=verbose, **model_args) # TODO: Save MLP model else: raise ValueError('Invalid model type: {}'.format(model_type)) - fold_preds.append(y_test_pred) - # Compute metrics for this fold + train_metrics = compute_metrics(y_train, y_train_pred) test_metrics = compute_metrics(y_test, y_test_pred) + + results['folds'].append({ + 'train': { + 'metrics': train_metrics, + 'target': y_train.tolist(), + 'prediction': y_train_pred.tolist() + } + 'test': { + 'metrics': test_metrics, + 'target': y_test.tolist(), + 'prediction': y_test_pred.tolist() + } + }) fold_metrics.append(test_metrics) - print(fold_preds) - print(fold_metrics) + train_metrics = aggregate_metrics([fold['train']['metrics'] + for fold in results['folds']]) + print_metrics(train_metrics, 'train') - metrics = aggregate_metrics(fold_metrics) - print_metrics(metrics) + test_metrics = aggregate_metrics([fold['test']['metrics'] + for fold in results['folds']]) + print_metrics(test_metrics, 'test') + + results['summary'] = { + 'train': train_metrics, + 'test': test_metrics + } print('Done training. Saving results to disk...') @@ -529,9 +555,9 @@ def train(metadata_path, data_dir, model_id, output_dir, # strong_label_file, duration, modelid, # use_orig_duration=True) - # Save results to disk - # results_file = os.path.join(model_dir, 'results.json') - # with open(results_file, 'w') as fp: - # json.dump(results, fp, indent=2) + Save results to disk + results_file = os.path.join(model_dir, 'results.json') + with open(results_file, 'w') as fp: + json.dump(results, fp, indent=2) print('Done!') From 1345150b3579e27ad410636cbf614a0369892404 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 6 Dec 2017 18:34:30 -0500 Subject: [PATCH 042/201] Fix issues and add logging * Add MLP training code * Fix issues with feature computation * Fix issues with AudioSet filtering * Add logging to embedding and classifier training code --- classifier/train.py | 169 ++++++++++++++++++++++++++++++++----------- l3embedding/train.py | 87 +++++++++++++++------- log.py | 46 ++++++++++++ train.py | 17 ++++- train_classifier.py | 55 ++++++++++---- 5 files changed, 289 insertions(+), 85 deletions(-) create mode 100644 log.py diff --git a/classifier/train.py b/classifier/train.py index ac2df60..7d0b655 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -2,14 +2,25 @@ import scipy as sp import numpy as np import soundfile as sf +import json import csv +import logging import librosa +import keras +from keras.optimizers import Adam +import keras.regularizers as regularizers +from keras.models import Model +from keras.layers import Input, Dense, Activation from itertools import islice from sklearn.externals import joblib from sklearn.svm import SVC from sklearn.preprocessing import StandardScaler -from l3embedding.train import sample_one_second +from l3embedding.train import sample_one_second, LossHistory from l3embedding.model import load_embedding +from log import * + +LOGGER = logging.getLogger('classifier') +LOGGER.setLevel(logging.DEBUG) def load_us8k_metadata(path): @@ -21,10 +32,10 @@ def load_us8k_metadata(path): (Type: str) Returns: - metadata: Metadata dictionary - (Type: dict[str, *]) + metadata: List of metadata dictionaries + (Type: list[dict[str, *]]) """ - metadata = {} + metadata = [{} for _ in range(10)] with open(path) as csvfile: reader = csv.DictReader(csvfile) for row in reader: @@ -32,9 +43,9 @@ def load_us8k_metadata(path): row['start'] = float(row['start']) row['end'] = float(row['end']) 
row['salience'] = float(row['salience']) - row['fold'] = int(row['fold']) + fold_num = row['fold'] = int(row['fold']) row['classID'] = int(row['classID']) - metadata[fname] = row + metadata[fold_num-1][fname] = row return metadata @@ -127,15 +138,21 @@ def get_l3_stats_features(audio_path, l3embedding_model): (Type: np.ndarray) """ sr = 48000 - audio = librosa.load(audio_path, sr=sr, mono=True) + audio, _ = librosa.load(audio_path, sr=sr, mono=True) - hop_size = 10e-3 + hop_size = 0.25 # REVISIT hop_length = int(hop_size * sr) frame_length = 48000 * 1 - # Zero pad so we compute embedding on all samples - pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ - - (audio_length - frame_length) + audio_length = len(audio) + if audio_length < (frame_length + 2*hop_length): + # Make sure we can have at least three frames so that we can compute + # all of the stats. + pad_length = frame_length + 2*hop_length - audio_length + else: + # Zero pad so we compute embedding on all samples + pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ + - (audio_length - frame_length) if pad_length > 0: # Use (roughly) symmetric padding @@ -145,7 +162,7 @@ def get_l3_stats_features(audio_path, l3embedding_model): # Divide into overlapping 1 second frames - x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length) + x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T # Add a channel dimension x = x.reshape((x.shape[0], 1, x.shape[-1])) @@ -153,7 +170,6 @@ def get_l3_stats_features(audio_path, l3embedding_model): # Get the L3 embedding for each frame l3embedding = l3embedding_model.predict(x) - # Compute statistics on the time series of embeddings minimum = np.min(l3embedding, axis=0) maximum = np.max(l3embedding, axis=0) @@ -185,7 +201,7 @@ def get_us8k_folds(metadata, data_dir, l3embedding_model=None, Load all of the data for each fold Args: - metadata: Metadata dictionary + metadata: List of metadata dictionaries (Type: dict[str,*]) data_dir: Path to data directory @@ -208,6 +224,7 @@ def get_us8k_folds(metadata, data_dir, l3embedding_model=None, """ fold_data = [] for fold_idx in range(10): + LOGGER.info("Loading fold {}...".format(fold_idx+1)) fold_data.append(get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=l3embedding_model, features=features, label_format=label_format)) @@ -220,8 +237,8 @@ def get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, features Load all of the data for a specific fold Args: - metadata: Metadata dictionary - (Type: dict[str,*]) + metadata: List of metadata dictionaries + (Type: list[dict[str,*]]) data_dir: Path to data directory (Type: str) @@ -249,22 +266,25 @@ def get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, features X = [] y = [] - for idx, (fname, example_metadata) in enumerate(metadata.items()): - if example_metadata['fold'] != (fold_idx+1): - continue - + for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) if features.startswith('l3') and not l3embedding_model: raise ValueError('Must provide L3 embedding model to use {} features'.format(features)) if features == 'l3_stack': - X.append(get_l3_stack_features(path, l3embedding_model)) + feature_vector = get_l3_stack_features(path, l3embedding_model) elif features == 'l3_stats': - X.append(get_l3_stats_features(path, l3embedding_model)) + feature_vector = 
get_l3_stats_features(path, l3embedding_model) else: raise ValueError('Invalid feature type: {}'.format(features)) + # If we were not able to compute the features, skip this file + if feature_vector is None: + continue + + X.append(feature_vector) + class_label = example_metadata['classID'] if label_format == 'int': @@ -341,20 +361,38 @@ def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs) (Type: np.ndarray) """ # Standardize + LOGGER.debug('Standardizing data...') stdizer = StandardScaler() X_train = stdizer.fit_transform(X_train) clf = SVC(C=C, verbose=verbose) - y_train_pred = clf.fit_transform(X_train, y_train) - + LOGGER.debug('Fitting model to data...') + clf.fit(X_train, y_train) + y_train_pred = clf.predict(X_train) X_test = stdizer.transform(X_test) y_test_pred = clf.predict(X_test) return clf, y_train_pred, y_test_pred +def construct_mlp_model(input_shape, weight_decay=1e-5): + weight_decay = 1e-5 + l2_weight_decay = regularizers.l2(weight_decay) + inp = Input(shape=input_shape, dtype='float32') + y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(inp) + y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) + y = Dense(10, activation='softmax', kernel_regularizer=l2_weight_decay)(y) + m = Model(inputs=inp, outputs=y) + m.name = 'urban_sound_classifier' + + return m, inp, y + -def train_mlp(X_train, y_train, X_test, y_test, verbose=False, **kwargs): +def train_mlp(X_train, y_train, X_test, y_test, model_dir, + batch_size=64, num_epochs=100, train_epoch_size=None, + validation_epoch_size=None, validation_split=0.1, + learning_rate=1e-4, weight_decay=1e-5, + verbose=False, **kwargs): """ Train a Multi-layer perceptron model on the given data @@ -367,12 +405,51 @@ def train_mlp(X_train, y_train, X_test, y_test, verbose=False, **kwargs): (Type: np.ndarray) y_test: Testing label data (Type: np.ndarray) + model_dir: Path to model directory + (Type: str) Keyword Args: verbose: If True, print verbose messages (Type: bool) """ - raise NotImplementedError() + loss = 'categorical_crossentropy' + metrics = ['accuracy'] + monitor = 'val_loss' + + m, inp, out = construct_mlp_model(X_train.shape[1:], weight_decay=weight_decay) + weight_path = os.path.join(model_dir, 'model.h5') + + cb = [] + cb.append(keras.callbacks.ModelCheckpoint(weight_path, + save_weights_only=True, + save_best_only=True, + monitor=monitor)) + + history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') + cb.append(LossHistory(history_checkpoint)) + + history_csvlog = os.path.join(model_dir, 'history_csvlog.csv') + cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True, + separator=',')) + + LOGGER.debug('Compiling model...') + + m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) + + LOGGER.debug('Fitting model to data...') + + history = m.fit(x=X_train, y=y_train, batch_size=batch_size, + epochs=num_epochs, + steps_per_epoch=train_epoch_size, + validation_split=validation_split, + validation_steps=validation_epoch_size, + callbacks=cb, + verbose=10 if verbose else 1) + + y_train_pred = m.predict(X_train) + y_test_pred = m.predict(X_test) + + return m, y_train_pred, y_test_pred def compute_metrics(y, pred): @@ -447,19 +524,25 @@ def print_metrics(metrics, subset_name): metrics: Metrics dictionary (Type: dict[str, *]) """ - print("Results metrics for {}".format(subset_name)) - print("=====================================================") + LOGGER.info("Results metrics for {}".format(subset_name)) + 
LOGGER.info("=====================================================") for metric, metric_stats in metrics.items(): - print("* " + metric) + LOGGER.info("* " + metric) for stat_name, stat_val in metric_stats.items(): - print("\t- {}: {}".format(stat_name, stat_val)) - print() + LOGGER.info("\t- {}: {}".format(stat_name, stat_val)) + LOGGER.info() def train(metadata_path, data_dir, model_id, output_dir, model_type='svm', features='l3_stack', label_format='int', l3embedding_model_path=None, l3embedding_model_type='cnn_L3_orig', - random_state=20171021, verbose=False, **model_args): + random_state=20171021, verbose=False, log_path=None, + disable_logging=False, **model_args): + + init_console_logger(LOGGER, verbose=verbose) + if not disable_logging: + init_file_logger(LOGGER, log_path=log_path) + LOGGER.debug('Initialized logging.') # Make sure the directories we need exist model_dir = os.path.join(output_dir, model_id) @@ -467,14 +550,14 @@ def train(metadata_path, data_dir, model_id, output_dir, os.makedirs(model_dir) if features.startswith('l3'): - print('Loading embedding model...') + LOGGER.info('Loading embedding model...') l3embedding_model = load_embedding(l3embedding_model_path, l3embedding_model_type, 'audio') else: l3embedding_model = None - print('Loading data...') + LOGGER.info('Loading data...') # Load metadata metadata = load_us8k_metadata(metadata_path) @@ -488,25 +571,24 @@ def train(metadata_path, data_dir, model_id, output_dir, 'folds': [] } # Fit the model - print('Fitting model...') + LOGGER.info('Preparing to fit models...') for fold_idx in range(10): - print('\t* Training with fold {} held out'.format(fold_idx+1)) + LOGGER.info('\t* Training with fold {} held out'.format(fold_idx+1)) X_train, y_train, X_test, y_test = get_fold_split(fold_data, fold_idx) if model_type == 'svm': model, y_train_pred, y_test_pred = train_svm( X_train, y_train, X_test, y_test, verbose=verbose, **model_args) + LOGGER.info('Saving model...') # Save the model for this fold joblib.dump(model, model_output_path.format(fold_idx+1, 'pkl')) elif model_type == 'mlp': model, y_train_pred, y_test_pred = train_mlp( - X_train, y_train, X_test, y_test, + X_train, y_train, X_test, y_test, model_dir, verbose=verbose, **model_args) - # TODO: Save MLP model - else: raise ValueError('Invalid model type: {}'.format(model_type)) @@ -519,14 +601,13 @@ def train(metadata_path, data_dir, model_id, output_dir, 'metrics': train_metrics, 'target': y_train.tolist(), 'prediction': y_train_pred.tolist() - } + }, 'test': { 'metrics': test_metrics, 'target': y_test.tolist(), 'prediction': y_test_pred.tolist() } }) - fold_metrics.append(test_metrics) train_metrics = aggregate_metrics([fold['train']['metrics'] for fold in results['folds']]) @@ -541,7 +622,7 @@ def train(metadata_path, data_dir, model_id, output_dir, 'test': test_metrics } - print('Done training. Saving results to disk...') + LOGGER.info('Done training. 
Saving results to disk...') # Evaluate model # print('Evaluate model...') @@ -555,9 +636,9 @@ def train(metadata_path, data_dir, model_id, output_dir, # strong_label_file, duration, modelid, # use_orig_duration=True) - Save results to disk + # Save results to disk results_file = os.path.join(model_dir, 'results.json') with open(results_file, 'w') as fp: json.dump(results, fp, indent=2) - print('Done!') + LOGGER.info('Done!') diff --git a/l3embedding/train.py b/l3embedding/train.py index ca5c7e5..675f80e 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -20,6 +20,10 @@ from .model import construct_cnn_L3_orig from .training_utils import multi_gpu_model from audioset.ontology import ASOntology +from log import * + +LOGGER = logging.getLogger('l3embedding') +LOGGER.setLevel(logging.DEBUG) #TODO: Consider putting the sampling functionality into another file @@ -37,22 +41,25 @@ def get_filename(path): def load_metadata(metadata_path): metadata = {} - with open(metadata_path, 'r') as f: - for idx, line in enumerate(f): - if idx in (0, 1): - continue - elif idx == 2: - fields = [field.strip() for field in line.lstrip('# ').rstrip().split(',')] - else: - row = [val.strip() for val in line.strip().split(',')] - ytid = row[0] - - entry = {field: val - for field, val in zip(fields, row[1:])} - - entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') - - metadata[ytid] = entry + for path in glob.glob(metadata_path): + with open(path, 'r') as f: + for idx, line in enumerate(f): + if idx in (0, 1): + continue + elif idx == 2: + fields = [field.strip() for field in line.lstrip('# ').rstrip().split(', ')] + else: + row = [val.strip() for val in line.strip().split(', ')] + ytid = row[0] + + entry = {field: val + for field, val in zip(fields[1:], row[1:])} + + entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') + entry['start_seconds'] = float(entry['start_seconds']) + entry['end_seconds'] = float(entry['end_seconds']) + + metadata[ytid] = entry return metadata @@ -61,7 +68,7 @@ def load_filters(filter_path): filters = [] with open(filter_path, 'r') as f: - reader = csv.DictReader(filter_path) + reader = csv.DictReader(f) for row in reader: filters.append(row) @@ -69,7 +76,7 @@ def load_filters(filter_path): def get_ytid_from_filename(filename): first_us_idx = filename.rindex('_') - second_us_idx = filename.rindeX('_', 0, first_us_idx) + second_us_idx = filename.rindex('_', 0, first_us_idx) return filename[:second_us_idx] @@ -99,8 +106,11 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= valid_filenames = audio_filenames & video_filenames if metadata_path and filter_path: + LOGGER.info('Filtering examples...') if not ontology_path: - raise ValueError('Must provide ontology path to filter') + err_msg = 'Must provide ontology path to filter' + LOGGER.error(err_msg) + raise ValueError(err_msg) ontology = ASOntology(ontology_path) @@ -113,6 +123,9 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= ytid = get_ytid_from_filename(filename) video_metadata = metadata[ytid] + video_labels = [ontology.get_node(label_id).name.lower() + for label_id in video_metadata['positive_labels']] + accept = True for _filter in filters: filter_type = _filter['filter_type'] @@ -123,18 +136,19 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= match = ytid == string elif filter_type == 'label': - label = ontology.get_node(string).name - match = label in 
video_metadata['positive_labels'] + match = string.lower() in video_labels # TODO: check this logic if match == filter_accept: accept = False if accept: + LOGGER.debug('Using video: "{}"'.format(filename)) filtered_filenames.append(filename) valid_filenames = filtered_filenames + LOGGER.info('Total videos used: {}'.format(len(valid_filenames))) audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] video_files = [path for path in video_files if get_filename(path) in valid_filenames] @@ -343,11 +357,14 @@ def sampler(video_file_1, video_file_2, augment=False): and label (0: not from corresponding files, 1: from corresponding files) """ + debug_msg = 'Initializing streamer with videos "{}" and "{}"' + LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) try: video_data_1 = rescale_video(vread(video_file_1).astype('float32')) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + LOGGER.warning(warn_msg) warnings.warn(warn_msg.format(video_file_1, type(e), e)) raise StopIteration() @@ -355,6 +372,7 @@ def sampler(video_file_1, video_file_2, augment=False): video_data_2 = rescale_video(vread(video_file_2).astype('float32')) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + LOGGER.warning(warn_msg) warnings.warn(warn_msg.format(video_file_2, type(e), e)) raise StopIteration() @@ -365,6 +383,7 @@ def sampler(video_file_1, video_file_2, augment=False): audio_data_1, sampling_frequency = sf.read(audio_file_1, dtype='float32', always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' + LOGGER.warning(warn_msg) warnings.warn(warn_msg.format(audio_file_1, type(e), e)) raise StopIteration() @@ -372,6 +391,7 @@ def sampler(video_file_1, video_file_2, augment=False): audio_data_2, sampling_frequency = sf.read(audio_file_2, dtype='float32', always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
+ LOGGER.warning(warn_msg) warnings.warn(warn_msg.format(audio_file_2, type(e), e)) raise StopIteration() @@ -435,9 +455,11 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path random.seed(random_state) + LOGGER.info("Getting file list...") audio_files, video_files = get_file_list(data_dir, metadata_path=metadata_path, filter_path=filter_path, ontology_path=ontology_path) + LOGGER.info("Creating streamers...") seeds = [] for video_file_1 in tqdm(video_files): for _ in range(num_distractors): @@ -445,6 +467,9 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path # Make sure we sample a different file while video_file_2 == video_file_1: video_file_2 = random.choice(video_files) + + debug_msg = 'Created streamer for videos "{}" and "{}' + LOGGER.info(debug_msg.format(video_file_1, video_file_2)) seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, augment=augment)) # Randomly shuffle the seeds @@ -498,7 +523,13 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, train_mux_rate=16, validation_mux_rate=16, learning_rate=1e-4, random_state=20171021, verbose=False, checkpoint_interval=10, ontology_path=None, - augment=False, gpus=1): + log_path=None, disable_logging=False, augment=False, gpus=1): + + init_console_logger(LOGGER, verbose=verbose) + if not disable_logging: + init_file_logger(LOGGER, log_path=log_path) + LOGGER.debug('Initialized logging.') + m, inputs, outputs = construct_cnn_L3_orig() if gpus > 1: m = multi_gpu_model(m, gpus=gpus) @@ -513,7 +544,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, if not os.path.isdir(model_dir): os.makedirs(model_dir) - print('Compile model...') + LOGGER.info('Compile model...') m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) @@ -551,10 +582,11 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, separator=',')) - print('Setting up train data generator...') + LOGGER.info('Setting up train data generator...') train_gen = data_generator( train_data_dir, metadata_path=train_metadata_path, + ontology_path=ontology_path, filter_path=train_filter_path, batch_size=train_batch_size, random_state=random_state, @@ -567,10 +599,11 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, ['video', 'audio'], 'label') - print('Setting up validation data generator...') + LOGGER.info('Setting up validation data generator...') val_gen = data_generator( validation_data_dir, metadata_path=validation_metadata_path, + ontology_path=ontology_path, filter_path=validation_filter_path, batch_size=validation_batch_size, random_state=random_state, @@ -585,7 +618,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, # Fit the model - print('Fit model...') + LOGGER.info('Fit model...') if verbose: verbosity = 1 else: @@ -596,7 +629,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, callbacks=cb, verbose=verbosity) - print('Done training. Saving results to disk...') + LOGGER.info('Done training. Saving results to disk...') # Save history with open(os.path.join(model_dir, 'history.pkl'), 'wb') as fd: pickle.dump(history.history, fd) diff --git a/log.py b/log.py new file mode 100644 index 0000000..76d1ca1 --- /dev/null +++ b/log.py @@ -0,0 +1,46 @@ +import logging +import logging.handlers + + +def init_file_logger(logger, log_path=None): + """ + Initializes logging to a file. 
+ + Saves log to "audiosetdl.log" in the current directory, and rotates them + after they reach 1MiB. + + Args: + logger: Logger object + (Type: logging.Logger) + """ + # Set up file handler + if not log_path: + log_path = './l3embedding.log' + handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=2**20) + handler.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + +def init_console_logger(logger, verbose=False): + """ + Initializes logging to stdout + + Args: + logger: Logger object + (Type: logging.Logger) + + Kwargs: + verbose: If true, prints verbose information to stdout. False by default. + (Type: bool) + """ + # Log to stderr also + stream_handler = logging.StreamHandler() + if verbose: + stream_handler.setLevel(logging.DEBUG) + else: + stream_handler.setLevel(logging.ERROR) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) diff --git a/train.py b/train.py index 4f05674..f81b029 100644 --- a/train.py +++ b/train.py @@ -118,7 +118,7 @@ def parse_arguments(): default=10, help='The number of epochs between model checkpoints') - parser.add_argument('-o' + parser.add_argument('-o', '--ontology-path', dest='ontology_path', action='store', @@ -182,6 +182,21 @@ def parse_arguments(): type=str, help='Path to validationing csv file(s). Accepts a glob string.') + parser.add_argument('-lp', + '--log-path', + dest='log_path', + action='store', + default=None, + help='Path to log file generated by this script. ' \ + 'By default, the path is "./l3embedding.log".') + + parser.add_argument('-nl', + '--no-logging', + dest='disable_logging', + action='store_true', + default=False, + help='Disables logging if flag enabled') + parser.add_argument('train_data_dir', action='store', type=str, diff --git a/train_classifier.py b/train_classifier.py index fcf1de5..7d2b001 100644 --- a/train_classifier.py +++ b/train_classifier.py @@ -21,14 +21,6 @@ def parse_arguments(): default=150, help='(MLP) Maximum number of training epochs') - parser.add_argument('-es', - '--epoch-size', - dest='epoch_size', - action='store', - type=int, - default=512, - help='(MLP) Number of training batches per epoch') - parser.add_argument('-bs', '--batch-size', dest='batch_size', @@ -37,13 +29,27 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-vs', - '--validation-size', - dest='validation_size', + parser.add_argument('-tes', + '--train-epoch-size', + dest='train_epoch_size', action='store', type=int, - default=1024, - help='(MLP) Number of trianing examples in the validation set') + help='(MLP) Number of training batches per epoch') + + parser.add_argument('-ves', + '--validation-epoch-size', + dest='validation_epoch_size', + action='store', + type=int, + help='(MLP) Number of validation batches per epoch') + + parser.add_argument('-vs', + '--validation-split', + dest='validation_split', + action='store', + type=float, + default=0.1, + help='(MLP) Percentage of training data used for validation') parser.add_argument('-lr', '--learning-rate', @@ -53,6 +59,14 @@ def parse_arguments(): default=1e-4, help='(MLP) Optimization learning rate') + parser.add_argument('-wd', + '--weight-decay', + dest='weight_decay', + action='store', + type=float, + default=1e-5, + help='(MLP) L2 regularization penalty factor') + 
parser.add_argument('-r', '--random-state', dest='random_state', @@ -107,6 +121,21 @@ def parse_arguments(): default='cnn_L3_orig', help='Type of L3 embedding model') + parser.add_argument('-nl', + '--no-logging', + dest='disable_logging', + action='store_true', + default=False, + help='Disables logging if flag enabled') + + parser.add_argument('-lp', + '--log-path', + dest='log_path', + action='store', + default=None, + help='Path to log file generated by this script. ' \ + 'By default, the path is "./audiosetdl.log".') + parser.add_argument('metadata_path', action='store', type=str, From 89cadf81304959a6037ccc82cb68185978ae5a07 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 6 Dec 2017 20:46:37 -0500 Subject: [PATCH 043/201] Make sure valid_filenames after filtering is a set so membership checks are fast --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 675f80e..00917c9 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -146,7 +146,7 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= LOGGER.debug('Using video: "{}"'.format(filename)) filtered_filenames.append(filename) - valid_filenames = filtered_filenames + valid_filenames = set(filtered_filenames) LOGGER.info('Total videos used: {}'.format(len(valid_filenames))) audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] From 7b4de29d132fa349e4f1fd369becd67f5838c793 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 9 Dec 2017 20:04:49 -0500 Subject: [PATCH 044/201] Add missing newline in logging --- classifier/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifier/train.py b/classifier/train.py index 7d0b655..f3dc91f 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -530,7 +530,7 @@ def print_metrics(metrics, subset_name): LOGGER.info("* " + metric) for stat_name, stat_val in metric_stats.items(): LOGGER.info("\t- {}: {}".format(stat_name, stat_val)) - LOGGER.info() + LOGGER.info("\n") def train(metadata_path, data_dir, model_id, output_dir, From cd82a922e3744dff3d780587055f3d1db725f2a9 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 9 Dec 2017 20:07:51 -0500 Subject: [PATCH 045/201] Change video sampling and fix loss type * Use img_to_float32 to convert dtype rather than just casting to float * Change loss to categorical cross-entropy loss * Add option to scale individual frames in sample_one_frame * Make logging less verbose * NOTE: scikit-image 0.14dev0 was installed from source * NOTE: pillow-simd was installed to make scipy.misc.resize slightly faster --- l3embedding/train.py | 89 ++++++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 23 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 00917c9..4f64082 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,6 +1,7 @@ import glob import json import math +import time import os import pickle import random @@ -12,6 +13,7 @@ import numpy as np import pescador import scipy.misc +import skimage from skvideo.io import vread import soundfile as sf from tqdm import tqdm @@ -143,7 +145,7 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= accept = False if accept: - LOGGER.debug('Using video: "{}"'.format(filename)) + #LOGGER.debug('Using video: "{}"'.format(filename)) filtered_filenames.append(filename) valid_filenames = set(filtered_filenames) @@ -238,7 +240,31 @@ def 
rescale_video(video_data): return resized_video_data -def sample_cropped_frame(frame_data): +def rescale_frame(frame_data): + """ + Rescales frame such that the minimum dimension of the frame becomes 256, + as is down in Look, Listen and Learn + + + Args: + frame_data: frame data array + + Returns: + rescaled_frame_data: rescaled frame data array + """ + nx, ny, nc = frame_data.shape + + scaling = 256.0 / min(nx, ny) + + new_nx, new_ny = math.ceil(scaling * nx), math.ceil(scaling * ny) + assert 256 in (new_nx, new_ny), str((new_nx, new_ny)) + + resized_frame_data = scipy.misc.imresize(frame, (new_nx, new_ny, nc)) + + return resized_frame_data + + +def sample_cropped_frame(frame_data, rescale=True): """ Randomly crop a video frame, using the method from Look, Listen and Learn @@ -250,6 +276,8 @@ def sample_cropped_frame(frame_data): scaled_frame_data: scaled and cropped frame data bbox: bounding box for the cropped image """ + if rescale: + frame_data = rescale_frame(frame_data) nx, ny, nc = frame_data.shape start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) end_x, end_y = start_x + 224, start_y + 224 @@ -264,7 +292,7 @@ def sample_cropped_frame(frame_data): return frame_data[start_x:end_x, start_y:end_y, :], bbox -def sample_one_frame(video_data, start=None, fps=30, augment=False): +def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True): """Return one frame randomly and time (seconds). Args: @@ -301,7 +329,7 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): frame = random.randrange(num_frames) frame_data = video_data[frame, :, :, :] - frame_data, bbox = sample_cropped_frame(frame_data) + frame_data, bbox = sample_cropped_frame(frame_data, rescale=rescale) video_aug_params = {'bounding_box': bbox} @@ -359,9 +387,11 @@ def sampler(video_file_1, video_file_2, augment=False): """ debug_msg = 'Initializing streamer with videos "{}" and "{}"' LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) + audio_file_1 = video_to_audio(video_file_1) + audio_file_2 = video_to_audio(video_file_2) try: - video_data_1 = rescale_video(vread(video_file_1).astype('float32')) + video_data_1 = skimage.img_as_float32(rescale_video(vread(video_file_1))) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' LOGGER.warning(warn_msg) @@ -369,16 +399,13 @@ def sampler(video_file_1, video_file_2, augment=False): raise StopIteration() try: - video_data_2 = rescale_video(vread(video_file_2).astype('float32')) + video_data_2 = skimage.img_as_float32(rescale_video(vread(video_file_2))) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' 
LOGGER.warning(warn_msg) warnings.warn(warn_msg.format(video_file_2, type(e), e)) raise StopIteration() - audio_file_1 = video_to_audio(video_file_1) - audio_file_2 = video_to_audio(video_file_2) - try: audio_data_1, sampling_frequency = sf.read(audio_file_1, dtype='float32', always_2d=True) except Exception as e: @@ -416,12 +443,12 @@ def sampler(video_file_1, video_file_2, augment=False): label = int(video_choice != audio_choice) - sample_audio_data, audio_start, audio_aug_params = sample_one_second(audio_data, - sampling_frequency, - augment=augment) - sample_video_data, video_start, video_aug_params = sample_one_frame(video_data, - start=audio_start, - augment=augment) + sample_audio_data, audio_start, audio_aug_params \ + = sample_one_second(audio_data, sampling_frequency, augment=augment) + + sample_video_data, video_start, video_aug_params \ + = sample_one_frame(video_data, start=audio_start, augment=augment, rescale=False) + sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) sample = { @@ -468,8 +495,8 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path while video_file_2 == video_file_1: video_file_2 = random.choice(video_files) - debug_msg = 'Created streamer for videos "{}" and "{}' - LOGGER.info(debug_msg.format(video_file_1, video_file_2)) + #debug_msg = 'Created streamer for videos "{}" and "{}' + #LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, augment=augment)) # Randomly shuffle the seeds @@ -511,6 +538,20 @@ def on_epoch_end(self, epoch, logs=None): pickle.dump(loss_dict, fp) +class TimeHistory(keras.callbacks.Callback): + # Copied from https://stackoverflow.com/a/43186440/1260544 + def on_train_begin(self, logs={}): + self.times = [] + + def on_epoch_begin(self, batch, logs={}): + self.epoch_time_start = time.time() + + def on_epoch_end(self, batch, logs={}): + t = time.time() - self.epoch_time_start + LOGGER.info('Epoch took {} seconds'.format(t)) + self.times.append(t) + + #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, @@ -533,7 +574,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, m, inputs, outputs = construct_cnn_L3_orig() if gpus > 1: m = multi_gpu_model(m, gpus=gpus) - loss = 'binary_crossentropy' + loss = 'categorical_crossentropy' metrics = ['accuracy'] monitor = 'val_loss' @@ -544,7 +585,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, if not os.path.isdir(model_dir): os.makedirs(model_dir) - LOGGER.info('Compile model...') + LOGGER.info('Compiling model...') m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) @@ -574,6 +615,9 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, monitor=monitor, period=checkpoint_interval)) + timer_cb = TimeHistory() + cb.append(timer_cb) + history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') cb.append(LossHistory(history_checkpoint)) @@ -581,7 +625,6 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True, separator=',')) - LOGGER.info('Setting up train data generator...') train_gen = data_generator( train_data_dir, @@ -618,11 +661,11 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, # Fit the model - LOGGER.info('Fit model...') + LOGGER.info('Fitting model...') if 
verbose: - verbosity = 1 - else: verbosity = 2 + else: + verbosity = 1 history = m.fit_generator(train_gen, train_epoch_size, num_epochs, validation_data=val_gen, validation_steps=validation_epoch_size, From 909de54d8db863809c8f7d0dd3e78b6491e9afb7 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 11 Dec 2017 13:47:53 -0500 Subject: [PATCH 046/201] Change sampling to precompute samples --- l3embedding/train.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 4f64082..5ba4efa 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -259,7 +259,7 @@ def rescale_frame(frame_data): new_nx, new_ny = math.ceil(scaling * nx), math.ceil(scaling * ny) assert 256 in (new_nx, new_ny), str((new_nx, new_ny)) - resized_frame_data = scipy.misc.imresize(frame, (new_nx, new_ny, nc)) + resized_frame_data = scipy.misc.imresize(frame_data, (new_nx, new_ny, nc)) return resized_frame_data @@ -277,7 +277,7 @@ def sample_cropped_frame(frame_data, rescale=True): bbox: bounding box for the cropped image """ if rescale: - frame_data = rescale_frame(frame_data) + frame_data = skimage.img_as_float32(rescale_frame(frame_data)) nx, ny, nc = frame_data.shape start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) end_x, end_y = start_x + 224, start_y + 224 @@ -372,7 +372,7 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True return frame_data, frame / fps, video_aug_params -def sampler(video_file_1, video_file_2, augment=False): +def sampler(video_file_1, video_file_2, rate=32, augment=False): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. @@ -390,8 +390,13 @@ def sampler(video_file_1, video_file_2, augment=False): audio_file_1 = video_to_audio(video_file_1) audio_file_2 = video_to_audio(video_file_2) + # Hack: choose a number of samples such that we with high probability, we + # won't run out of samples, but is also less than the entire length of + # the video so we don't have to resize all of the frames + num_samples = int(scipy.stats.poisson.ppf(0.999, rate)) + try: - video_data_1 = skimage.img_as_float32(rescale_video(vread(video_file_1))) + video_data_1 = vread(video_file_1) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' LOGGER.warning(warn_msg) @@ -399,7 +404,7 @@ def sampler(video_file_1, video_file_2, augment=False): raise StopIteration() try: - video_data_2 = skimage.img_as_float32(rescale_video(vread(video_file_2))) + video_data_2 = vread(video_file_2) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' 
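        # Note (editorial): the `raise StopIteration()` a few lines below is
        # meant to end this streamer early when a video cannot be decoded;
        # pescador's Mux then treats the stream as exhausted and can activate
        # a different seed streamer in its place, so one bad file does not
        # halt training.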
LOGGER.warning(warn_msg) @@ -422,7 +427,8 @@ def sampler(video_file_1, video_file_2, augment=False): warnings.warn(warn_msg.format(audio_file_2, type(e), e)) raise StopIteration() - while True: + samples = [] + for _ in range(num_samples): video_choice = random.random() < 0.5 audio_choice = random.random() < 0.5 @@ -447,7 +453,7 @@ def sampler(video_file_1, video_file_2, augment=False): = sample_one_second(audio_data, sampling_frequency, augment=augment) sample_video_data, video_start, video_aug_params \ - = sample_one_frame(video_data, start=audio_start, augment=augment, rescale=False) + = sample_one_frame(video_data, start=audio_start, augment=augment, rescale=True) sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) @@ -462,8 +468,20 @@ def sampler(video_file_1, video_file_2, augment=False): 'audio_augment_params': audio_aug_params, 'video_augment_params': video_aug_params } + samples.append(sample) + + del audio_data + del video_data + del audio_data_1 + del audio_data_2 + del video_data_1 + del video_data_2 + + for sample in samples: yield sample + raise StopIteration() + def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, k=32, batch_size=64, random_state=20171021, @@ -497,7 +515,7 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path #debug_msg = 'Created streamer for videos "{}" and "{}' #LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) - seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, augment=augment)) + seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, rate=rate, augment=augment)) # Randomly shuffle the seeds random.shuffle(seeds) From 9227665ebc3cb041a1487be7450ee10e3207ff07 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 12 Dec 2017 14:13:13 -0500 Subject: [PATCH 047/201] Fix warnings and add ability to use a random subsample of videos --- l3embedding/train.py | 41 +++++++++++++++++++++++++++-------------- train.py | 14 ++++++++++++++ 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 5ba4efa..89ceb17 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,6 +1,7 @@ import glob import json import math +import datetime import time import os import pickle @@ -399,32 +400,36 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): video_data_1 = vread(video_file_1) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(video_file_1, type(e), e) LOGGER.warning(warn_msg) - warnings.warn(warn_msg.format(video_file_1, type(e), e)) + warnings.warn(warn_msg) raise StopIteration() try: video_data_2 = vread(video_file_2) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(video_file_2, type(e), e) LOGGER.warning(warn_msg) - warnings.warn(warn_msg.format(video_file_2, type(e), e)) + warnings.warn(warn_msg) raise StopIteration() try: audio_data_1, sampling_frequency = sf.read(audio_file_1, dtype='float32', always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
+ warn_msg = warn_msg.format(audio_file_1, type(e), e) LOGGER.warning(warn_msg) - warnings.warn(warn_msg.format(audio_file_1, type(e), e)) + warnings.warn(warn_msg) raise StopIteration() try: audio_data_2, sampling_frequency = sf.read(audio_file_2, dtype='float32', always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(audio_file_2, type(e), e) LOGGER.warning(warn_msg) - warnings.warn(warn_msg.format(audio_file_2, type(e), e)) + warnings.warn(warn_msg) raise StopIteration() samples = [] @@ -485,7 +490,7 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, k=32, batch_size=64, random_state=20171021, - num_distractors=1, augment=False, rate=32): + num_distractors=1, augment=False, rate=32, max_videos=None): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -501,10 +506,15 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path random.seed(random_state) LOGGER.info("Getting file list...") - audio_files, video_files = get_file_list(data_dir, metadata_path=metadata_path, + _, video_files = get_file_list(data_dir, metadata_path=metadata_path, filter_path=filter_path, ontology_path=ontology_path) LOGGER.info("Creating streamers...") + if max_videos is not None and max_videos < len(video_files): + LOGGER.info("Using a subset of {} videos".format(max_videos)) + random.shuffle(video_files) + video_files = video_files[:max_videos] + seeds = [] for video_file_1 in tqdm(video_files): for _ in range(num_distractors): @@ -580,9 +590,10 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, train_num_streamers=16, validation_num_streamers=16, train_num_distractors=1, validation_num_distractors=2, train_mux_rate=16, validation_mux_rate=16, - learning_rate=1e-4, random_state=20171021, - verbose=False, checkpoint_interval=10, ontology_path=None, - log_path=None, disable_logging=False, augment=False, gpus=1): + learning_rate=1e-4, random_state=20171021, train_max_videos=None, + validation_max_videos=None, verbose=False, checkpoint_interval=10, + ontology_path=None, log_path=None, disable_logging=False, + augment=False, gpus=1): init_console_logger(LOGGER, verbose=verbose) if not disable_logging: @@ -597,9 +608,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, monitor = 'val_loss' # Make sure the directories we need exist - model_dir = os.path.join(output_dir, model_id) - if not os.path.isdir(output_dir): - os.makedirs(output_dir) + model_dir = os.path.join(output_dir, model_id, datetime.datetime.now().strftime("%Y%m%d%H%M%S")) if not os.path.isdir(model_dir): os.makedirs(model_dir) @@ -608,6 +617,8 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, loss=loss, metrics=metrics) + LOGGER.info('Model files can be found in "{}"'.format(model_dir)) + # Save the model model_spec_path = os.path.join(model_dir, 'model_spec.pkl') model_spec = keras.utils.serialize_keras_object(m) @@ -654,6 +665,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, k=train_num_streamers, augment=augment, num_distractors=train_num_distractors, + max_videos=train_max_videos, rate=train_mux_rate) train_gen = pescador.maps.keras_tuples(train_gen, @@ -670,6 +682,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, random_state=random_state, k=validation_num_streamers, 
num_distractors=validation_num_distractors, + max_videos=validation_max_videos, rate=validation_mux_rate) val_gen = pescador.maps.keras_tuples(val_gen, @@ -681,9 +694,9 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, # Fit the model LOGGER.info('Fitting model...') if verbose: - verbosity = 2 - else: verbosity = 1 + else: + verbosity = 2 history = m.fit_generator(train_gen, train_epoch_size, num_epochs, validation_data=val_gen, validation_steps=validation_epoch_size, diff --git a/train.py b/train.py index f81b029..73b6215 100644 --- a/train.py +++ b/train.py @@ -126,6 +126,20 @@ def parse_arguments(): default=os.path.join(os.path.dirname(__file__), 'resources/ontology.json'), help='Path to AudioSet ontology') + parser.add_argument('-tmv', + '--train-max-videos', + dest='train_max_videos', + action='store', + type=int, + help='Maximum number of videos to use for training. If not specified, all videos will be used') + + parser.add_argument('-vmv', + '--validation-max-videos', + dest='validation_max_videos', + action='store', + type=int, + help='Maximum number of videos to use for validation. If not specified, all videos will be used') + parser.add_argument('-r', '--random-state', dest='random_state', From 7d1c3aab5cc43ed8573549ee6ba789dea1a12a27 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 18 Dec 2017 16:59:42 -0500 Subject: [PATCH 048/201] Add various timing, efficiency, etc changes and fixes * Add timing * Create separate weight decay regularization objects * Fix embedding extraction from the model to be consistent with paper * Make model type configurable via command line option * Add a relatively small model for debugging * Load videos using skvideo.io.FFmpegReader to load slightly more efficiently and rescale video more efficiently * Add timing for verbose logging * Make precomputing samples configurable via command line option * Add batch timing to TimeHistory --- l3embedding/audio_model.py | 91 ++++++++-- l3embedding/image.py | 3 +- l3embedding/model.py | 56 +++++-- l3embedding/train.py | 321 ++++++++++++++++++++---------------- l3embedding/vision_model.py | 84 ++++++++-- log.py | 93 ++++++++++- train.py | 15 ++ 7 files changed, 478 insertions(+), 185 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 49c1525..330d224 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -20,8 +20,7 @@ def construct_cnn_L3_orig_audio_model(): outputs: Model outputs (Type: keras.layers.Layer) """ - weight_decay = 1e-5 - l2_weight_decay = regularizers.l2(weight_decay) + weight_decay = 1e-8 #### # Audio subnetwork #### @@ -42,11 +41,11 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_1 = (3, 3) pool_size_a_1 = (2, 2) y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) @@ -56,11 +55,11 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_2 = (3, 3) pool_size_a_2 = (2, 2) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = 
Activation('relu')(y_a) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) @@ -70,11 +69,11 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_3 = (3, 3) pool_size_a_3 = (2, 2) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) @@ -84,18 +83,84 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_4 = (3, 3) pool_size_a_4 = (32, 24) y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) - y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_a) + y_a = Conv2D(name='audio_embedding_layer', + n_filter_a_4, filt_size_a_4, padding='same', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) - y_a = Flatten(name='audio_embedding')(y_a) + y_a = Flatten()(y_a) + + m = Model(inputs=x_a, outputs=y_a) + m.name = 'audio_model' + + return m, x_a, y_a + + +def construct_cnn_l3_orig_audio_embedding(audio_model, x_a): + pool_size = (8, 8) + embed_layer = audio_model.get_layer('audio_embedding_layer') + y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer) + y_a = Flatten()(y_a) + + m = Model(inputs=x_a, outputs=y_a) + + +def construct_tiny_L3_audio_model(): + """ + Constructs a model that implements a small L3 audio subnetwork + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + #### + # Audio subnetwork + #### + n_dft = 512 + n_win = 480 + n_hop = n_win//2 + asr = 48000 + audio_window_dur = 1 + # INPUT + x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') + + # SPECTROGRAM PREPROCESSING + y_a = Spectrogram(n_dft=n_dft, n_win=n_win, n_hop=n_hop, + return_decibel_spectrogram=True, padding='valid')(x_a) + y_a = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=(3,3), strides=3)(y_a) + y_a = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=(3,3), strides=3)(y_a) + y_a = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=(3,3), strides=3)(y_a) + y_a = Flatten(name='embedding')(y_a) m = Model(inputs=x_a, outputs=y_a) - m.name = 'audio_embedding' + m.name = 'audio_model' return m, 
x_a, y_a + + +EMBEDDING_MODELS = { + 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding +} diff --git a/l3embedding/image.py b/l3embedding/image.py index f14486e..c3619be 100644 --- a/l3embedding/image.py +++ b/l3embedding/image.py @@ -14,7 +14,8 @@ def adjust_saturation(rgb_img, factor): adjusted_img: RGB image with adjusted saturation """ hsv_img = skimage.color.rgb2hsv(rgb_img) - hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, 0.0, 1.0) + imin, imax = skimage.dtype_limits(hsv_img) + hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, imin, imax) return skimage.color.hsv2rgb(hsv_img) diff --git a/l3embedding/model.py b/l3embedding/model.py index 470504f..49be181 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -1,11 +1,11 @@ from keras.models import Model -from keras.layers import concatenate, Dense +from keras.layers import concatenate, Dense, MaxPooling2D, Flatten import keras.regularizers as regularizers -from .vision_model import construct_cnn_L3_orig_vision_model -from .audio_model import construct_cnn_L3_orig_audio_model +from .vision_model import * +from .audio_model import * -def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name): +def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name, layer_size=128): """ Merges the audio and vision subnetworks and adds additional fully connected layers in the fashion of the model used in Look, Listen and Learn @@ -23,17 +23,18 @@ def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name """ # Merge the subnetworks weight_decay = 1e-5 - l2_weight_decay = regularizers.l2(weight_decay) y = concatenate([vision_model(x_i), audio_model(x_a)]) - y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) - y = Dense(2, activation='softmax', kernel_regularizer=l2_weight_decay)(y) + y = Dense(layer_size, activation='relu', + kernel_regularizer=regularizers.l2(weight_decay))(y) + y = Dense(2, activation='softmax', + kernel_regularizer=regularizers.l2(weight_decay))(y) m = Model(inputs=[x_i, x_a], outputs=y) m.name = model_name return m, [x_i, x_a], y -def load_model(weights_path, model_type): +def load_model(weights_path, model_type, return_io=False): """ Loads an audio-visual correspondence model @@ -52,7 +53,10 @@ def load_model(weights_path, model_type): m, inputs, output = MODELS[model_type]() m.load_weights(weights_path) - return m + if return_io: + return m, inputs, outputs + else: + return m def load_embedding(weights_path, model_type, embedding_type): @@ -71,15 +75,21 @@ def load_embedding(weights_path, model_type, embedding_type): model: Loaded model object (Type: keras.engine.training.Model) """ - m = load_model(weights_path, model_type) + m, inputs, output = load_model(weights_path, model_type, return_io=True) + x_i, x_a = inputs if embedding_type == 'vision': - m_emb = m.get_layer('vision_embedding') + m_embed_model = m.get_layer('vision_model') + m_embed_layer = m_embed.get_layer('vision_embedding_layer') + m_embed = EMBEDDING_MODELS[model_type](m_embed, x_i) + elif embedding_type == 'audio': - m_emb = m.get_layer('audio_embedding') + m_embed_model = m.get_layer('audio_model') + m_embed_layer = m_embed.get_layer('audio_embedding_layer') + m_embed = EMBEDDING_MODELS[model_type](m_embed, x_a) else: raise ValueError('Invalid embedding type: "{}"'.format(embedding_type)) - return m_emb + return m_embed def construct_cnn_L3_orig(): @@ -103,7 +113,27 @@ def construct_cnn_L3_orig(): m = L3_merge_audio_vision_models(vision_model, x_i, 
audio_model, x_a, 'cnn_L3_orig') return m +def construct_tiny_L3(): + """ + Constructs a model that implements a small L3 model for validation purposes + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + vision_model, x_i, y_i = construct_tiny_L3_vision_model() + audio_model, x_a, y_a = construct_tiny_L3_audio_model() + + m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'tiny_L3', layer_size=64) + return m + MODELS = { 'cnn_L3_orig': construct_cnn_L3_orig, + 'tiny_L3': construct_tiny_L3 } diff --git a/l3embedding/train.py b/l3embedding/train.py index 89ceb17..f44c16a 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -15,12 +15,12 @@ import pescador import scipy.misc import skimage -from skvideo.io import vread +from skvideo.io import FFmpegReader, ffprobe import soundfile as sf from tqdm import tqdm from .image import * -from .model import construct_cnn_L3_orig +from .model import MODELS from .training_utils import multi_gpu_model from audioset.ontology import ASOntology from log import * @@ -192,13 +192,16 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): else: start = 0 - audio_data = audio_data[start:start+sampling_frequency] + with LogTimer(LOGGER, 'Slicing audio'): + audio_data = audio_data[start:start+sampling_frequency] + if audio_data.shape[0] != sampling_frequency: # Pad audio that isn't one second warnings.warn('Got audio that is less than one second', UserWarning) - audio_data = np.pad(audio_data, - ((0, sampling_frequency - audio_data.shape[0]), (0,0)), - mode='constant') + with LogTimer(LOGGER, 'Slicing audio'): + audio_data = np.pad(audio_data, + ((0, sampling_frequency - audio_data.shape[0]), (0,0)), + mode='constant') if augment: # Make sure we don't clip if np.abs(audio_data).max(): @@ -208,7 +211,8 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): warnings.warn('Got audio sample with all zeros', UserWarning) max_gain = 0.1 gain = 1 + random.uniform(-0.1, max_gain) - audio_data *= gain + with LogTimer(LOGGER, 'Applying gain to audio'): + audio_data *= gain audio_aug_params = {'gain': gain} else: audio_aug_params = {} @@ -216,56 +220,7 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): return audio_data, start / sampling_frequency, audio_aug_params -def rescale_video(video_data): - """ - Rescales video such that the minimum dimension of the video becomes 256, - as is down in Look, Listen and Learn - - - Args: - video_data: video data array - - Returns: - rescaled_video_data: rescaled video data array - """ - num_frames, nx, ny, nc = video_data.shape - - scaling = 256.0 / min(nx, ny) - - new_nx, new_ny = math.ceil(scaling * nx), math.ceil(scaling * ny) - assert 256 in (new_nx, new_ny), str((new_nx, new_ny)) - - resized_video_data = np.array([scipy.misc.imresize(frame, (new_nx, new_ny, nc)) - for frame in video_data]) - - return resized_video_data - - -def rescale_frame(frame_data): - """ - Rescales frame such that the minimum dimension of the frame becomes 256, - as is down in Look, Listen and Learn - - - Args: - frame_data: frame data array - - Returns: - rescaled_frame_data: rescaled frame data array - """ - nx, ny, nc = frame_data.shape - - scaling = 256.0 / min(nx, ny) - - new_nx, new_ny = math.ceil(scaling * nx), math.ceil(scaling * ny) - assert 256 in (new_nx, new_ny), str((new_nx, new_ny)) - - resized_frame_data = 
scipy.misc.imresize(frame_data, (new_nx, new_ny, nc)) - - return resized_frame_data - - -def sample_cropped_frame(frame_data, rescale=True): +def sample_cropped_frame(frame_data): """ Randomly crop a video frame, using the method from Look, Listen and Learn @@ -277,8 +232,6 @@ def sample_cropped_frame(frame_data, rescale=True): scaled_frame_data: scaled and cropped frame data bbox: bounding box for the cropped image """ - if rescale: - frame_data = skimage.img_as_float32(rescale_frame(frame_data)) nx, ny, nc = frame_data.shape start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) end_x, end_y = start_x + 224, start_y + 224 @@ -290,10 +243,13 @@ def sample_cropped_frame(frame_data, rescale=True): 'end_y': end_y } - return frame_data[start_x:end_x, start_y:end_y, :], bbox + with LogTimer(LOGGER, 'Cropping frame'): + frame_data = frame_data[start_x:end_x, start_y:end_y, :] + + return frame_data, bbox -def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True): +def sample_one_frame(video_data, start=None, fps=30, augment=False): """Return one frame randomly and time (seconds). Args: @@ -307,7 +263,7 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True """ - num_frames = video_data.shape[0] + num_frames = len(video_data) if start is not None: start_frame = int(start * fps) # Sample frame from a one second window, or until the end of the video @@ -329,8 +285,8 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True else: frame = random.randrange(num_frames) - frame_data = video_data[frame, :, :, :] - frame_data, bbox = sample_cropped_frame(frame_data, rescale=rescale) + frame_data = video_data[frame] + frame_data, bbox = sample_cropped_frame(frame_data) video_aug_params = {'bounding_box': bbox} @@ -338,30 +294,36 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True # Randomly horizontally flip the image horizontal_flip = False if random.random() < 0.5: - frame_data = horiz_flip(frame_data) + with LogTimer(LOGGER, 'Flipping frame'): + frame_data = horiz_flip(frame_data) horizontal_flip = True + # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py # Randomize the order of saturation jitter and brightness jitter if random.random() < 0.5: # Add saturation jitter - saturation_factor = random.random() + 0.5 - frame_data = adjust_saturation(frame_data, saturation_factor) + saturation_factor = np.float32(random.random() + 0.5) + with LogTimer(LOGGER, 'Adjusting saturation'): + frame_data = adjust_saturation(frame_data, saturation_factor) # Add brightness jitter max_delta = 32. / 255. - brightness_delta = (2*random.random() - 1) * max_delta - frame_data = adjust_brightness(frame_data, brightness_delta) + brightness_delta = np.float32((2*random.random() - 1) * max_delta) + with LogTimer(LOGGER, 'Adjusting brightness'): + frame_data = adjust_brightness(frame_data, brightness_delta) else: # Add brightness jitter max_delta = 32. / 255. 
- brightness_delta = (2*random.random() - 1) * max_delta - frame_data = adjust_brightness(frame_data, brightness_delta) + brightness_delta = np.float32((2*random.random() - 1) * max_delta) + with LogTimer(LOGGER, 'Adjusting brightness'): + frame_data = adjust_brightness(frame_data, brightness_delta) # Add saturation jitter - saturation_factor = random.random() + 0.5 - frame_data = adjust_saturation(frame_data, saturation_factor) + saturation_factor = np.float32(random.random() + 0.5) + with LogTimer(LOGGER, 'Adjusting saturation'): + frame_data = adjust_saturation(frame_data, saturation_factor) video_aug_params.update({ 'horizontal_flip': horizontal_flip, @@ -369,11 +331,79 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False, rescale=True 'brightness_delta': brightness_delta }) + frame_data = skimage.img_as_float32(frame_data) + return frame_data, frame / fps, video_aug_params -def sampler(video_file_1, video_file_2, rate=32, augment=False): +def read_video(video_path): + vinfo = ffprobe(video_path)['video'] + width = int(vinfo['@width']) + height = int(vinfo['@height']) + + scaling = 256.0 / min(width, height) + new_width = math.ceil(scaling * width) + new_height = math.ceil(scaling * height) + + # Resize frames + reader = FFmpegReader(video_path, + outputdict={'-s': "{}x{}".format(new_width, + new_height) }) + + frames = [] + for frame in reader.nextFrame(): + frames.append(frame) + + return frames + + +def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + audio_sampling_frequency, augment=False): + video_choice = random.random() < 0.5 + audio_choice = random.random() < 0.5 + + if audio_choice: + audio_file = audio_file_1 + audio_data = audio_data_1 + else: + audio_file = audio_file_2 + audio_data = audio_data_2 + + if video_choice: + video_file = video_file_1 + video_data = video_data_1 + else: + video_file = video_file_2 + video_data = video_data_2 + + label = int(video_choice != audio_choice) + + sample_audio_data, audio_start, audio_aug_params \ + = sample_one_second(audio_data, audio_sampling_frequency, augment=augment) + + sample_video_data, video_start, video_aug_params \ + = sample_one_frame(video_data, start=audio_start, augment=augment) + + sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) + + sample = { + 'video': np.ascontiguousarray(sample_video_data), + 'audio': np.ascontiguousarray(sample_audio_data), + 'label': np.ascontiguousarray(np.array([label, 1 - label])), +# 'audio_file': audio_file, +# 'video_file': video_file, +# 'audio_start': audio_start, +# 'video_start': video_start, +# 'audio_augment_params': audio_aug_params, +# 'video_augment_params': video_aug_params + } + + return sample + + +def sampler(video_file_1, video_file_2, rate=32, augment=False, precompute=False): """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, 50% chance sample one second from another audio_file in the list of audio_files. @@ -396,8 +426,10 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): # the video so we don't have to resize all of the frames num_samples = int(scipy.stats.poisson.ppf(0.999, rate)) + try: - video_data_1 = vread(video_file_1) + with LogTimer(LOGGER, 'Reading video'): + video_data_1 = read_video(video_file_1) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' 
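        # Note (editorial): LogTimer (defined in log.py below) logs
        # "Reading video took <n> seconds" at DEBUG level when the timed block
        # finishes normally; if read_video raises, LogTimer.__exit__ sees the
        # exception info and skips logging, and control reaches this handler.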
warn_msg = warn_msg.format(video_file_1, type(e), e) @@ -406,7 +438,8 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): raise StopIteration() try: - video_data_2 = vread(video_file_2) + with LogTimer(LOGGER, 'Reading video'): + video_data_2 = read_video(video_file_2) except Exception as e: warn_msg = 'Could not open video file {} - {}: {}; Skipping...' warn_msg = warn_msg.format(video_file_2, type(e), e) @@ -415,7 +448,10 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): raise StopIteration() try: - audio_data_1, sampling_frequency = sf.read(audio_file_1, dtype='float32', always_2d=True) + with LogTimer(LOGGER, 'Reading audio'): + audio_data_1, sampling_frequency = sf.read(audio_file_1, + dtype='float32', + always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' warn_msg = warn_msg.format(audio_file_1, type(e), e) @@ -424,7 +460,10 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): raise StopIteration() try: - audio_data_2, sampling_frequency = sf.read(audio_file_2, dtype='float32', always_2d=True) + with LogTimer(LOGGER, 'Reading audio'): + audio_data_2, sampling_frequency = sf.read(audio_file_2, + dtype='float32', + always_2d=True) except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' warn_msg = warn_msg.format(audio_file_2, type(e), e) @@ -432,64 +471,46 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False): warnings.warn(warn_msg) raise StopIteration() - samples = [] - for _ in range(num_samples): - - video_choice = random.random() < 0.5 - audio_choice = random.random() < 0.5 - - if audio_choice: - audio_file = audio_file_1 - audio_data = audio_data_1 - else: - audio_file = audio_file_2 - audio_data = audio_data_2 - - if video_choice: - video_file = video_file_1 - video_data = video_data_1 - else: - video_file = video_file_2 - video_data = video_data_2 - - label = int(video_choice != audio_choice) - - sample_audio_data, audio_start, audio_aug_params \ - = sample_one_second(audio_data, sampling_frequency, augment=augment) - - sample_video_data, video_start, video_aug_params \ - = sample_one_frame(video_data, start=audio_start, augment=augment, rescale=True) - - sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) - - sample = { - 'video': sample_video_data, - 'audio': sample_audio_data, - 'label': np.array([label, 1 - label]), - 'audio_file': audio_file, - 'video_file': video_file, - 'audio_start': audio_start, - 'video_start': video_start, - 'audio_augment_params': audio_aug_params, - 'video_augment_params': video_aug_params - } - samples.append(sample) - - del audio_data - del video_data - del audio_data_1 - del audio_data_2 - del video_data_1 - del video_data_2 - - for sample in samples: - yield sample + if precompute: + samples = [] + for _ in range(num_samples): + sample = generate_sample( + audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + sampling_frequency, augment=augment) + + samples.append(sample) + + # Clear the data from memory + video_data_1 = None + video_data_2 = None + audio_data_1 = None + audio_data_2 = None + video_data = None + audio_data = None + del video_data_1 + del video_data_2 + del audio_data_1 + del audio_data_2 + del video_data + del audio_data + + while samples: + # Yield the sample, and remove from the list to free up some memory + yield samples.pop() + else: + while True: + yield generate_sample( + 
audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + sampling_frequency, augment=augment) raise StopIteration() + def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, - k=32, batch_size=64, random_state=20171021, + k=32, batch_size=64, random_state=20171021, precompute=False, num_distractors=1, augment=False, rate=32, max_videos=None): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. @@ -525,16 +546,16 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path #debug_msg = 'Created streamer for videos "{}" and "{}' #LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) - seeds.append(pescador.Streamer(sampler, video_file_1, video_file_2, rate=rate, augment=augment)) + streamer = pescador.Streamer(sampler, video_file_1, video_file_2, + rate=rate, augment=augment, + precompute=precompute) + # ZMQ streamers make the script stall inexplicably + #streamer = pescador.ZMQStreamer(streamer) + seeds.append(streamer) # Randomly shuffle the seeds random.shuffle(seeds) - # TODO: - # Order 1024 streamers open - # Set rate 16? - # Set larger rate for validation (32) for stability - # Sampling is very delicate! mux = pescador.Mux(seeds, k, rate=rate) if batch_size == 1: return mux @@ -569,15 +590,24 @@ def on_epoch_end(self, epoch, logs=None): class TimeHistory(keras.callbacks.Callback): # Copied from https://stackoverflow.com/a/43186440/1260544 def on_train_begin(self, logs={}): - self.times = [] + self.epoch_times = [] + self.batch_times = [] - def on_epoch_begin(self, batch, logs={}): + def on_epoch_begin(self, batch, logs=None): self.epoch_time_start = time.time() - def on_epoch_end(self, batch, logs={}): + def on_epoch_end(self, batch, logs=None): t = time.time() - self.epoch_time_start LOGGER.info('Epoch took {} seconds'.format(t)) - self.times.append(t) + self.epoch_times.append(t) + + def on_batch_begin(self, batch, logs=None): + self.batch_time_start = time.time() + + def on_batch_end(self, batch, logs=None): + t = time.time() - self.batch_time_start + LOGGER.info('Batch took {} seconds'.format(t)) + self.batch_times.append(t) #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, @@ -589,18 +619,18 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, train_batch_size=64, validation_batch_size=64, train_num_streamers=16, validation_num_streamers=16, train_num_distractors=1, validation_num_distractors=2, - train_mux_rate=16, validation_mux_rate=16, + model_type='cnn_L3_orig', train_mux_rate=16, validation_mux_rate=16, learning_rate=1e-4, random_state=20171021, train_max_videos=None, validation_max_videos=None, verbose=False, checkpoint_interval=10, ontology_path=None, log_path=None, disable_logging=False, - augment=False, gpus=1): + precompute=False, augment=False, gpus=1): init_console_logger(LOGGER, verbose=verbose) if not disable_logging: init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') - m, inputs, outputs = construct_cnn_L3_orig() + m, inputs, outputs = MODELS[model_type]() if gpus > 1: m = multi_gpu_model(m, gpus=gpus) loss = 'categorical_crossentropy' @@ -666,6 +696,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, augment=augment, num_distractors=train_num_distractors, max_videos=train_max_videos, + precompute=precompute, rate=train_mux_rate) train_gen = pescador.maps.keras_tuples(train_gen, @@ -683,6 +714,7 @@ def 
train(train_data_dir, validation_data_dir, model_id, output_dir, k=validation_num_streamers, num_distractors=validation_num_distractors, max_videos=validation_max_videos, + precompute=precompute, rate=validation_mux_rate) val_gen = pescador.maps.keras_tuples(val_gen, @@ -700,6 +732,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, history = m.fit_generator(train_gen, train_epoch_size, num_epochs, validation_data=val_gen, validation_steps=validation_epoch_size, + #use_multiprocessing=True, callbacks=cb, verbose=verbosity) diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 8a29513..5c365b4 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -20,8 +20,7 @@ def construct_cnn_L3_orig_vision_model(): outputs: Model outputs (Type: keras.layers.Layer) """ - weight_decay = 1e-5 - l2_weight_decay = regularizers.l2(weight_decay) + weight_decay = 1e-8 #### # Image subnetwork #### @@ -33,11 +32,11 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_1 = (3, 3) pool_size_i_1 = (2, 2) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', - kernel_regularizer=l2_weight_decay)(x_i) + kernel_regularizer=regularizers.l2(weight_decay))(x_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = Activation('relu')(y_i) y_i = BatchNormalization()(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i) @@ -47,11 +46,11 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_2 = (3, 3) pool_size_i_2 = (2, 2) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i) @@ -61,11 +60,11 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_3 = (3, 3) pool_size_i_3 = (2, 2) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i) @@ -75,17 +74,76 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_4 = (3, 3) pool_size_i_4 = (28, 28) y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) - y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', - kernel_regularizer=l2_weight_decay)(y_i) + y_i = Conv2D(name='vision_embedding_layer', + n_filter_i_4, filt_size_i_4, padding='same', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=pool_size_i_4, padding='same')(y_i) - y_i = 
Flatten(name='vision_embedding')(y_i) + y_i = Flatten()(y_i) m = Model(inputs=x_i, outputs=y_i) - m.name = 'vision_embedding' + m.name = 'vision_model' return m, x_i, y_i + + +def construct_cnn_l3_orig_vision_embedding(vision_model, x_i): + pool_size = (7, 7) + embed_layer = vision_model.get_layer('vision_embedding_layer') + y_i = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer) + y_i = Flatten()(y_i) + + m = Model(inputs=x_i, outputs=y_i) + + +def construct_tiny_L3_vision_model(): + """ + Constructs a model that implements a small L3 audio subnetwork + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + #### + # Image subnetwork + #### + # INPUT + x_i = Input(shape=(224, 224, 3), dtype='float32') + + y_i = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_regularizer=regularizers.l2(weight_decay))(x_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=(3,3), strides=3)(y_i) + y_i = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=(3,3), strides=3)(y_i) + y_i = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=(3,3), strides=3)(y_i) + y_i = Flatten(name='vision_embedding')(y_i) + m = Model(inputs=x_i, outputs=y_i) + m.name = 'vision_model' + + return m, x_i, y_i + + +EMBEDDING_MODELS = { + 'cnn_L3_orig': construct_cnn_l3_orig_vision_embedding +} diff --git a/log.py b/log.py index 76d1ca1..3fbd63c 100644 --- a/log.py +++ b/log.py @@ -1,7 +1,98 @@ +import time import logging import logging.handlers +class LogTimer(object): + """ + A context manager that times the execution of a block of code and logs it. + """ + + def __init__(self, logger, desc, log_level=logging.DEBUG): + """ + Creates an instance of a log timer context manager. + + The log events will be logged to the given logger with the given level + in the format: + + '' took seconds + + + Args: + logger: Logger to log to + (Type: logging.Logger) + + desc: Description of the code to time + (Type: str) + + log_level: Logging level to log to. By default, the level is + set to logging.DEBUG. + (Type: int, logging.{NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL}) + """ + ## Logger used to log timing information + ## (Type: logging.Logger) + self.logger = logger + ## Logging level to log to + ## (Type: int, logging.{NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL}) + self.log_level = log_level + ## Description of the code block being timed + ## (Type: str) + self.desc = desc + self._start_time = None + + if log_level == logging.NOTSET: + raise ValueError('Cannot use NOTSET logging level.') + + def __enter__(self): + """ + Get the execution start time before the code block starts executing. + """ + self._start_time = time.time() + + def __exit__(self, type_, value, tb): + """ + Compute the runtime duration of the code block, and log it to the given logger. + + If an error occurred during the code block, nothing is logged. + + + Args: + type_: Type of Exception thrown, if one was thrown before the + context was exited. 
+ (Type: type) + + value: The instance of the Exception thrown if one was thrown + before the context was exited. + (Type: Exception) + + tb: The traceback of the error that occurred. + (Type: str) + """ + # Compute the duration that the block of code took to execute + end_time = time.time() + duration = end_time - self._start_time + self._start_time = None + + # We don't need to time if something went wrong + if type_ or value or tb: + return + + # Make the message out of the given description and the duration + msg = "{0} took {1} seconds".format(self.desc, duration) + + # Log at the appropriate level + if self.log_level == logging.DEBUG: + self.logger.debug(msg) + elif self.log_level == logging.INFO: + self.logger.info(msg) + elif self.log_level == logging.WARNING: + self.logger.warning(msg) + elif self.log_level == logging.ERROR: + self.logger.error(msg) + elif self.log_level == logging.CRITICAL: + self.logger.critical(msg) + + def init_file_logger(logger, log_path=None): """ Initializes logging to a file. @@ -40,7 +131,7 @@ def init_console_logger(logger, verbose=False): if verbose: stream_handler.setLevel(logging.DEBUG) else: - stream_handler.setLevel(logging.ERROR) + stream_handler.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') stream_handler.setFormatter(formatter) logger.addHandler(stream_handler) diff --git a/train.py b/train.py index 73b6215..7a2c60b 100644 --- a/train.py +++ b/train.py @@ -110,6 +110,14 @@ def parse_arguments(): default=1e-4, help='Optimization learning rate') + parser.add_argument('-mt', + '--model-type', + dest='model_type', + action='store', + type=str, + default='cnn_L3_orig', + help='Name of model type to train') + parser.add_argument('-ci', '--checkpoint-interval', dest='checkpoint_interval', @@ -155,6 +163,13 @@ def parse_arguments(): default=False, help='If True, performs data augmentation on audio and images') + parser.add_argument('-pc', + '--precompute', + dest='precompute', + action='store_true', + default=False, + help='If True, streamer precompute samples') + parser.add_argument('--gpus', dest='gpus', type=int, From a10f213638e9785b62353f39d698b0c86541916b Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 18 Dec 2017 17:07:05 -0500 Subject: [PATCH 049/201] Add slurm scripts --- jobs/classifier-train.sbatch | 41 ++++++++++++++++++++++++++ jobs/l3embedding-train.sbatch | 55 +++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 jobs/classifier-train.sbatch create mode 100644 jobs/l3embedding-train.sbatch diff --git a/jobs/classifier-train.sbatch b/jobs/classifier-train.sbatch new file mode 100644 index 0000000..e2455ee --- /dev/null +++ b/jobs/classifier-train.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +##SBATCH --gres=gpu:1 +#SBATCH --job-name=us8k-classifier-train +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.org +#SBATCH --output="classifier-train-%j.out" +#SBATCH --err="classifier-train-%j.err" + + +source ~/.bashrc +source activate l3embedding-cpu + +SRCDIR='' +US8K_PATH='' +METADATA_PATH='' +DATA_DIR='' +OUTPUT_DIR='' +MODEL_PATH='' +MODEL_ID='mycls' + +module purge +#module load cuda/8.0.44 +#module load cudnn/8.0v6.0 + +python $SRCDIR/train_classifier.py \ + --random-state 20171021 \ + --features l3_stats \ + --label-format int \ + --model-type svm \ + --l3embedding-model-path $MODEL_PATH \ + --l3embedding-model-type cnn_L3_orig \ + --verbose 
\ + $METADATA_PATH \ + $DATA_DIR \ + $MODEL_ID \ + $OUTPUT_DIR diff --git a/jobs/l3embedding-train.sbatch b/jobs/l3embedding-train.sbatch new file mode 100644 index 0000000..8bee837 --- /dev/null +++ b/jobs/l3embedding-train.sbatch @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +##SBATCH --gres=gpu:1 +#SBATCH --job-name=l3embedding-train +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.org +#SBATCH --output="l3embedding-train-%j.out" +#SBATCH --err="l3embedding-train-%j.err" + + +source ~/.bashrc +source activate l3embedding + +SRCDIR='' +TRAIN_DATA_DIR='' +VAL_DATA_DIR='' +MODEL_ID='mymodel' +OUTPUT_DIR='' + +module purge +module load cuda/8.0.44 +module load cudnn/8.0v6.0 + +python $SRCDIR/train.py \ + --num-epochs 150 \ + --train-epoch-size 128 \ + --train-batch-size 16 \ + --train-num-streamers 32 \ + --train-num-distractors 1 \ + --train-mux-rate 256 \ + --train-filter-path $SRCDIR/audioset_filter.csv \ + --train-metadata-path "$TRAIN_DATA_DIR/../csv/*.csv" \ + --ontology-path $SRCDIR/resources/ontology.json \ + --model-type tiny_L3 \ + --validation-epoch-size 128 \ + --validation-batch-size 16 \ + --validation-num-streamers 32 \ + --validation-num-distractors 1 \ + --validation-mux-rate 256 \ + --train-max-videos 100 \ + --validation-max-videos 100 \ + --checkpoint-interval 10 \ + --gpus 1 \ + --learning-rate 0.001 \ + --random-state 20171021 \ + --augment \ + --verbose \ + $TRAIN_DATA_DIR \ + $VAL_DATA_DIR \ + $MODEL_ID \ + $OUTPUT_DIR From 9b6863084cbaf9b98f0d574bbfdeed9314f67e2d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 18 Dec 2017 17:07:39 -0500 Subject: [PATCH 050/201] Add .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bc8a670..398f615 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -.idea/* \ No newline at end of file +.idea/* *.pyc *.err *.out \ No newline at end of file From e3db51a4435d1205bffed878322d0fe54aa0cbce Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 18 Dec 2017 22:18:55 -0500 Subject: [PATCH 051/201] Add docstrings, README, and fix some minor typos --- README.md | 13 ++++- classifier/train.py | 43 +++++++++++++++-- l3embedding/audio_model.py | 22 +++++++-- l3embedding/model.py | 34 +++++++++---- l3embedding/train.py | 95 ++++++++++++++++++++++++++----------- l3embedding/vision_model.py | 22 +++++++-- 6 files changed, 181 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index b0dc68b..b4f7149 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,13 @@ # l3embedding -Learn and L3 embedding from audio/video pairs +This is an implementation of the proposed model in Look, Listen and Learn ([Arandjelović, R., Zisserman, A. 2017](https://arxiv.org/pdf/1705.08168.pdf)). This model uses videos to learn vision and audio features in an unsupervised fashion by training the model for the proposed Audio-Visual Correspondence (AVC) task. This task tries to determine whether a piece of audio and an image frame come from the same video and occur simulatneously. + +The code for the model and training implementation can be found in `l3embedding/`. Note that the metadata format expected is the same used in [AudioSet](https://research.google.com/audioset/download.html) ([Gemmeke, J., Ellis, D., et al. 2017](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45857.pdf)), as training this model on AudioSet was one of the goals for this implementation. 
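For concreteness, a rough sketch of that metadata layout follows (the segments-style CSV parsed by the `load_metadata()` helper added later in this series): two free-form comment lines, a third comment line naming the fields, then one row per clip. The video ID, times, and output filename below are placeholders chosen only for illustration; this snippet is an editorial sketch, not part of the patches.

```python
# Illustrative sketch only: write a tiny metadata file in the AudioSet
# segments CSV layout that load_metadata() expects (first two lines skipped,
# field names read from the third line, label IDs quoted and comma-separated).
example_csv = (
    '# Segments csv (skipped by the parser)\n'
    '# (also skipped by the parser)\n'
    '# YTID, start_seconds, end_seconds, positive_labels\n'
    'abc123XYZ-0, 30.000, 40.000, "/m/04rlf,/m/0jbk"\n'
)

with open('example_segments.csv', 'w') as f:
    f.write(example_csv)
```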
+
+You can train an AVC/embedding model using `train.py`. Run `python train.py -h` to read the help message regarding how to use the script.
+
+There is also a module `classifier/` which contains code to train a classifier that uses embeddings extracted from new audio by the embedding model. Currently this only supports using the [UrbanSound8K dataset](https://serv.cusp.nyu.edu/projects/urbansounddataset/urbansound8k.html) ([Salamon, J., Jacoby, C., Bello, J. 2014](https://serv.cusp.nyu.edu/projects/urbansounddataset/salamon_urbansound_acmmm14.pdf)).
+
+You can train an urban sound classification model using `train_classifier.py`. Run `python train_classifier.py -h` to read the help message regarding how to use the script.
+
+
+If you use a SLURM environment, `sbatch` scripts are available in `jobs/`.
diff --git a/classifier/train.py b/classifier/train.py
index f3dc91f..582e893 100644
--- a/classifier/train.py
+++ b/classifier/train.py
@@ -1,7 +1,6 @@
 import os
 import scipy as sp
 import numpy as np
-import soundfile as sf
 import json
 import csv
 import logging
@@ -11,7 +10,6 @@
 import keras.regularizers as regularizers
 from keras.models import Model
 from keras.layers import Input, Dense, Activation
-from itertools import islice
 from sklearn.externals import joblib
 from sklearn.svm import SVC
 from sklearn.preprocessing import StandardScaler
@@ -232,7 +230,8 @@ def get_us8k_folds(metadata, data_dir, l3embedding_model=None,
     return fold_data
 
 
-def get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, features='l3_stack', label_format='int'):
+def get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None,
+                  features='l3_stack', label_format='int'):
     """
     Load all of the data for a specific fold
 
@@ -375,7 +374,25 @@ def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs)
 
     return clf, y_train_pred, y_test_pred
 
+
 def construct_mlp_model(input_shape, weight_decay=1e-5):
+    """
+    Constructs a multi-layer perceptron model
+
+    Args:
+        input_shape: Shape of input data
+                     (Type: tuple[int])
+        weight_decay: L2 regularization factor
+                      (Type: float)
+
+    Returns:
+        model: MLP model
+               (Type: keras.models.Model)
+        input: Model input
+               (Type: list[keras.layers.Input])
+        output: Model output
+                (Type: keras.layers.Layer)
+    """
     weight_decay = 1e-5
     l2_weight_decay = regularizers.l2(weight_decay)
     inp = Input(shape=input_shape, dtype='float32')
@@ -411,6 +428,20 @@ def train_mlp(X_train, y_train, X_test, y_test, model_dir,
     Keyword Args:
         verbose: If True, print verbose messages
                  (Type: bool)
+        batch_size: Number of samples per batch
+                    (Type: int)
+        num_epochs: Number of training epochs
+                    (Type: int)
+        train_epoch_size: Number of training batches per training epoch
+                          (Type: int)
+        validation_epoch_size: Number of validation batches per validation epoch
+                               (Type: int)
+        validation_split: Percentage of training data used for validation
+                          (Type: float)
+        learning_rate: Learning rate value
+                       (Type: float)
+        weight_decay: L2 regularization factor
+                      (Type: float)
     """
     loss = 'categorical_crossentropy'
     metrics = ['accuracy']
@@ -495,6 +526,10 @@ def aggregate_metrics(fold_metrics):
     Args:
         fold_metrics: List of fold metrics dictionaries
                       (Type: list[dict[str, *]])
+
+    Returns:
+        aggr_metrics: Statistics averaged across epochs
+                      (Type: dict[str, dict[str,float]])
     """
     metric_keys = list(fold_metrics[0].keys())
     fold_metrics_list = {k: [fold[k] for fold in fold_metrics]
@@ -523,6 +558,8 @@ def print_metrics(metrics, subset_name):
     Args:
         metrics: Metrics dictionary
                  (Type: dict[str, *])
+        
subset_name: Name of subset to print + (Type: str) """ LOGGER.info("Results metrics for {}".format(subset_name)) LOGGER.info("=====================================================") diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 330d224..b20a424 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -86,8 +86,8 @@ def construct_cnn_L3_orig_audio_model(): kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) - y_a = Conv2D(name='audio_embedding_layer', - n_filter_a_4, filt_size_a_4, padding='same', + y_a = Conv2D(n_filter_a_4, filt_size_a_4, + name='audio_embedding_layer', padding='same', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) @@ -101,13 +101,27 @@ def construct_cnn_L3_orig_audio_model(): return m, x_a, y_a -def construct_cnn_l3_orig_audio_embedding(audio_model, x_a): +def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): + """ + Constructs a model that produces the learned audio embedding + + Args: + audio_model: audio subnetwork + x_a: Image data input Tensor + + Returns: + m: Model object + x_a: Image data input Tensor + y_a: Embedding output Tensor + + """ pool_size = (8, 8) embed_layer = audio_model.get_layer('audio_embedding_layer') y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer) y_a = Flatten()(y_a) m = Model(inputs=x_a, outputs=y_a) + return m, x_a, y_a def construct_tiny_L3_audio_model(): @@ -162,5 +176,5 @@ def construct_tiny_L3_audio_model(): EMBEDDING_MODELS = { - 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding + 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding_model } diff --git a/l3embedding/model.py b/l3embedding/model.py index 49be181..0c31278 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -1,6 +1,5 @@ from keras.models import Model from keras.layers import concatenate, Dense, MaxPooling2D, Flatten -import keras.regularizers as regularizers from .vision_model import * from .audio_model import * @@ -44,9 +43,17 @@ def load_model(weights_path, model_type, return_io=False): model_type: Name of model type (Type: str) + Keyword Args: + return_io: If True, return input and output tensors + (Type: bool) + Returns: model: Loaded model object (Type: keras.engine.training.Model) + x_i: Input Tensor. Not returned if return_io is False. + (Type: keras.layers.Input) + y_i: Embedding output Tensor/Layer. Not returned if return_io is False. + (Type: keras.layers.Layer) """ if model_type not in MODELS: raise ValueError('Invalid model type: "{}"'.format(model_type)) @@ -54,12 +61,12 @@ def load_model(weights_path, model_type, return_io=False): m, inputs, output = MODELS[model_type]() m.load_weights(weights_path) if return_io: - return m, inputs, outputs + return m, inputs, output else: return m -def load_embedding(weights_path, model_type, embedding_type): +def load_embedding(weights_path, model_type, embedding_type, return_io=False): """ Loads an embedding model @@ -71,25 +78,34 @@ def load_embedding(weights_path, model_type, embedding_type): embedding_type: Type of embedding to load ('audio' or 'vision') (Type: str) + Keyword Args: + return_io: If True, return input and output tensors + (Type: bool) + Returns: - model: Loaded model object + model: Embedding model object (Type: keras.engine.training.Model) + x_i: Input Tensor. Not returned if return_io is False. + (Type: keras.layers.Input) + y_i: Embedding output Tensor/Layer. 
Not returned if return_io is False. + (Type: keras.layers.Layer) """ m, inputs, output = load_model(weights_path, model_type, return_io=True) x_i, x_a = inputs if embedding_type == 'vision': m_embed_model = m.get_layer('vision_model') - m_embed_layer = m_embed.get_layer('vision_embedding_layer') - m_embed = EMBEDDING_MODELS[model_type](m_embed, x_i) + m_embed, x_embed, y_embed = EMBEDDING_MODELS[model_type](m_embed_model, x_i) elif embedding_type == 'audio': m_embed_model = m.get_layer('audio_model') - m_embed_layer = m_embed.get_layer('audio_embedding_layer') - m_embed = EMBEDDING_MODELS[model_type](m_embed, x_a) + m_embed, x_embed, y_embed = EMBEDDING_MODELS[model_type](m_embed_model, x_a) else: raise ValueError('Invalid embedding type: "{}"'.format(embedding_type)) - return m_embed + if return_io: + return m_embed, x_embed, y_embed + else: + return m_embed def construct_cnn_L3_orig(): diff --git a/l3embedding/train.py b/l3embedding/train.py index f44c16a..dd46333 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -11,10 +11,8 @@ import keras from keras.optimizers import Adam -import numpy as np import pescador import scipy.misc -import skimage from skvideo.io import FFmpegReader, ffprobe import soundfile as sf from tqdm import tqdm @@ -89,6 +87,11 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= Args: data_dir: input directory that contains audio/ and video/ + Keyword Args: + metadata_path: Path to audioset metadata file + filter_path: Path to filter specification file + ontology_path: Path to AudioSet ontology file + Returns: audio_files: list of audio files video_files: list of video files @@ -254,6 +257,8 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): Args: video_data: video data to sample from + + Keyword Args: start: start time of a one second window from which to sample fps: frame per second augment: if True, perturb the data in some fashion @@ -338,6 +343,18 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): def read_video(video_path): + """ + Read a video file as a numpy array + + Resizes frames so that the minimum side is 256 pixels + + Args: + video_path: Path to video file + + Returns: + video: Numpy data array + + """ vinfo = ffprobe(video_path)['video'] width = int(vinfo['@width']) height = int(vinfo['@height']) @@ -361,6 +378,30 @@ def read_video(video_path): def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, video_file_1, video_data_1, video_file_2, video_data_2, audio_sampling_frequency, augment=False): + """ + Generate a sample from the given audio and video files + + The video from which the audio come from is chosen with a fair coin, as is + the video frame. Thus, there is a 50% chance of producing a positive or + negative example. 
+ + Args: + audio_file_1: audio filename + audio_data_1: audio data array + audio_file_2: audio filename + audio_data_2: audio data array + video_file_1: video filename + video_data_1: video data array + video_file_2: video filename + video_data_2: video data array + audio_sampling_frequency: audio sample rate + + Keyword Args + augment: If True, perform data augmention + + Returns: + sample: sample dictionary + """ video_choice = random.random() < 0.5 audio_choice = random.random() < 0.5 @@ -392,12 +433,12 @@ def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, 'video': np.ascontiguousarray(sample_video_data), 'audio': np.ascontiguousarray(sample_audio_data), 'label': np.ascontiguousarray(np.array([label, 1 - label])), -# 'audio_file': audio_file, -# 'video_file': video_file, -# 'audio_start': audio_start, -# 'video_start': video_start, -# 'audio_augment_params': audio_aug_params, -# 'video_augment_params': video_aug_params +# 'audio_file': audio_file, +# 'video_file': video_file, +# 'audio_start': audio_start, +# 'video_start': video_start, +# 'audio_augment_params': audio_aug_params, +# 'video_augment_params': video_aug_params } return sample @@ -411,6 +452,12 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False, precompute=False video_file_1: video_file to sample from video_file_2: candidate audio_files to sample from + Keyword Args: + rate: Poisson rate parameter. Used for precomputing samples + augment: If True, perform data augmention + precompute: If True, precompute samples during initialization so that + memory can be discarded + Returns: A generator that yields dictionary of video sample, audio sample, and label (0: not from corresponding files, 1: from corresponding files) @@ -516,8 +563,13 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path Args: data_dir: directory to sample video and audio from + metadata_path: Path to audioset metadata file + filter_path: Path to filter specification file + ontology_path: Path to AudioSet ontology file k: number of concurrent open streamer batch_size: batch size + random_state: Value used to initialize state of RNG + num_distractors: Number of pairs to generate a stream for each video Returns: A generator that yield infinite video and audio samples from data_dir @@ -564,6 +616,9 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path class LossHistory(keras.callbacks.Callback): + """ + Keras callback to record loss history + """ def __init__(self, outfile): super().__init__() @@ -588,8 +643,11 @@ def on_epoch_end(self, epoch, logs=None): class TimeHistory(keras.callbacks.Callback): + """ + Keras callback to log epoch and batch running time + """ # Copied from https://stackoverflow.com/a/43186440/1260544 - def on_train_begin(self, logs={}): + def on_train_begin(self, logs=None): self.epoch_times = [] self.batch_times = [] @@ -741,21 +799,4 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, with open(os.path.join(model_dir, 'history.pkl'), 'wb') as fd: pickle.dump(history.history, fd) - # Evaluate model - # print('Evaluate model...') - # Load best params - # m.load_weights(weight_path) - # with open(os.path.join(output_dir, 'index_test.json'), 'r') as fp: - # test_idx = json.load(fp)['id'] - - # Compute eval scores - # results = score_model(output_dir, pump, model, test_idx, working, - # strong_label_file, duration, modelid, - # use_orig_duration=True) - - # Save results to disk - # results_file = os.path.join(model_dir, 
'results.json') - # with open(results_file, 'w') as fp: - # json.dump(results, fp, indent=2) - - print('Done!') + LOGGER.info('Done!') diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 5c365b4..150289c 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -77,8 +77,8 @@ def construct_cnn_L3_orig_vision_model(): kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) - y_i = Conv2D(name='vision_embedding_layer', - n_filter_i_4, filt_size_i_4, padding='same', + y_i = Conv2D(n_filter_i_4, filt_size_i_4, + name='vision_embedding_layer', padding='same', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) @@ -91,13 +91,27 @@ def construct_cnn_L3_orig_vision_model(): return m, x_i, y_i -def construct_cnn_l3_orig_vision_embedding(vision_model, x_i): +def construct_cnn_l3_orig_vision_embedding_model(vision_model, x_i): + """ + Constructs a model that produces the learned vision embedding + + Args: + vision_model: Vision subnetwork + x_i: Image data input Tensor + + Returns: + m: Model object + x_i: Image data input Tensor + y_i: Embedding output Tensor + + """ pool_size = (7, 7) embed_layer = vision_model.get_layer('vision_embedding_layer') y_i = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer) y_i = Flatten()(y_i) m = Model(inputs=x_i, outputs=y_i) + return m, x_i, y_i def construct_tiny_L3_vision_model(): @@ -145,5 +159,5 @@ def construct_tiny_L3_vision_model(): EMBEDDING_MODELS = { - 'cnn_L3_orig': construct_cnn_l3_orig_vision_embedding + 'cnn_L3_orig': construct_cnn_l3_orig_vision_embedding_model } From b4dbcb1f2d8f2e9b0987de484d6a0f0867f4b4f3 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 18 Dec 2017 22:56:29 -0500 Subject: [PATCH 052/201] Add requirements --- README.md | 5 +++++ requirements.txt | 13 +++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 requirements.txt diff --git a/README.md b/README.md index b4f7149..9be2656 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # l3embedding This is an implementation of the proposed model in Look, Listen and Learn ([Arandjelović, R., Zisserman, A. 2017](https://arxiv.org/pdf/1705.08168.pdf)). This model uses videos to learn vision and audio features in an unsupervised fashion by training the model for the proposed Audio-Visual Correspondence (AVC) task. This task tries to determine whether a piece of audio and an image frame come from the same video and occur simulatneously. +Dependencies +* Python 3 +* [ffmpeg](http://www.ffmpeg.org) +* [sox](http://sox.sourceforge.net) + The code for the model and training implementation can be found in `l3embedding/`. Note that the metadata format expected is the same used in [AudioSet](https://research.google.com/audioset/download.html) ([Gemmeke, J., Ellis, D., et al. 2017](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45857.pdf)), as training this model on AudioSet was one of the goals for this implementation. You can train an AVC/embedding model using `train.py`. Run `python train.py -h` to read the help message regarding how to use the script. 
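Once `train.py` has produced a model checkpoint, the embedding loader added in the previous patch (`load_embedding` in `l3embedding/model.py`) is the intended way to pull out the audio (or vision) branch as a standalone feature extractor. A minimal sketch follows, assuming a saved weights file; the path used here is only a placeholder, and the snippet is an editorial illustration rather than part of the patch series.

```python
# Intended-usage sketch (placeholder path): load the audio branch of a trained
# 'cnn_L3_orig' AVC model as a standalone embedding extractor.
from l3embedding.model import load_embedding

audio_embedding = load_embedding('path/to/model_checkpoint.h5',
                                 'cnn_L3_orig',
                                 'audio')
audio_embedding.summary()  # Keras Model that outputs the flattened audio embedding
```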
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4df454a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +Keras==2.1.0 +librosa==0.5.1 +numpy==1.13.3 +pescador==1.1.0 +Pillow-SIMD==4.3.0.post0 +PySoundFile==0.9.0.post1 +-e git+https://github.com/jtcramer/kapre.git@7057f06a722113341e149516c143a4ecb00ceebd#egg=kapre +-e git+https://github.com/scikit-image/scikit-image.git@e2a609415f17230549b845c38511315659f29f1e#egg=scikit_image +scikit-learn==0.19.0 +scipy==0.19.1 +sk-video==1.1.8 +tensorflow==1.3.0 +tqdm==4.19.4 From f8c844b132362c1e6eb4ad2f558ba9ad6af2d9d8 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 17 Jan 2018 18:30:20 -0500 Subject: [PATCH 053/201] Add script for pre sampling --- sample.py | 740 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 740 insertions(+) create mode 100644 sample.py diff --git a/sample.py b/sample.py new file mode 100644 index 0000000..863df31 --- /dev/null +++ b/sample.py @@ -0,0 +1,740 @@ +import argparse +import csv +import datetime +import glob +import json +import math +from multiprocessing import Pool +import os +import pickle +import random +import time +import warnings + +import h5py +import numpy as np +import pescador +import scipy.misc +import skimage +import skimage.color +from skvideo.io import FFmpegReader, ffprobe +import soundfile as sf +from tqdm import tqdm + +from log import * + +LOGGER = logging.getLogger('l3embedding') +LOGGER.setLevel(logging.ERROR) + + +def adjust_saturation(rgb_img, factor): + """ + Adjust the saturation of an RGB image + + Args: + rgb_img: RGB image data array + factor: Multiplicative scaling factor to be applied to saturation + + Returns: + adjusted_img: RGB image with adjusted saturation + """ + hsv_img = skimage.color.rgb2hsv(rgb_img) + imin, imax = skimage.dtype_limits(hsv_img) + hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, imin, imax) + return skimage.color.hsv2rgb(hsv_img) + + +def adjust_brightness(rgb_img, delta): + """ + Adjust the brightness of an RGB image + + Args: + rgb_img: RGB image data array + delta: Additive (normalized) gain factor applied to each pixel + + Returns: + adjusted_img: RGB image with adjusted saturation + """ + imin, imax = skimage.dtype_limits(rgb_img) + # Convert delta into the range of the image data + delta = rgb_img.dtype.type((imax - imin) * delta) + + return np.clip(rgb_img + delta, imin, imax) + + +def horiz_flip(rgb_img): + """ + Horizontally flip the given image + + Args: + rgb_img: RGB image data array + + Returns: + flipped_img: Horizontally flipped image + """ + return rgb_img[:,::-1,:] + + +def get_filename(path): + """Return the filename of a path + + Args: path: path to file + + Returns: + filename: name of file (without extension) + """ + return os.path.splitext(os.path.basename(path))[0] + + +def load_metadata(metadata_path): + metadata = {} + for path in glob.glob(metadata_path): + with open(path, 'r') as f: + for idx, line in enumerate(f): + if idx in (0, 1): + continue + elif idx == 2: + fields = [field.strip() for field in line.lstrip('# ').rstrip().split(', ')] + else: + row = [val.strip() for val in line.strip().split(', ')] + ytid = row[0] + + entry = {field: val + for field, val in zip(fields[1:], row[1:])} + + entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') + entry['start_seconds'] = float(entry['start_seconds']) + entry['end_seconds'] = float(entry['end_seconds']) + + metadata[ytid] = entry + + return metadata + + +def load_filters(filter_path): + filters = 
[] + + with open(filter_path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + filters.append(row) + + return filters + +def get_ytid_from_filename(filename): + first_us_idx = filename.rindex('_') + second_us_idx = filename.rindex('_', 0, first_us_idx) + return filename[:second_us_idx] + + +def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None): + """Return audio and video file list. + + Args: + data_dir: input directory that contains audio/ and video/ + + Keyword Args: + metadata_path: Path to audioset metadata file + filter_path: Path to filter specification file + ontology_path: Path to AudioSet ontology file + + Returns: + audio_files: list of audio files + video_files: list of video files + + """ + data_dir_contents = set(os.listdir(data_dir)) + if 'audio' in data_dir_contents and 'video' in data_dir_contents: + audio_files = glob.glob('{}/audio/*'.format(data_dir)) + video_files = glob.glob('{}/video/*'.format(data_dir)) + else: + audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) + video_files = glob.glob('{}/**/video/*'.format(data_dir)) + + # Make sure that audio files and video files correspond to each other + audio_filenames = set([get_filename(path) for path in audio_files]) + video_filenames = set([get_filename(path) for path in video_files]) + + valid_filenames = audio_filenames & video_filenames + + if metadata_path and filter_path: + LOGGER.info('Filtering examples...') + if not ontology_path: + err_msg = 'Must provide ontology path to filter' + LOGGER.error(err_msg) + raise ValueError(err_msg) + + ontology = ASOntology(ontology_path) + + metadata = load_metadata(metadata_path) + filters = load_filters(filter_path) + + filtered_filenames = [] + + for filename in valid_filenames: + ytid = get_ytid_from_filename(filename) + video_metadata = metadata[ytid] + + video_labels = [ontology.get_node(label_id).name.lower() + for label_id in video_metadata['positive_labels']] + + accept = True + for _filter in filters: + filter_type = _filter['filter_type'] + filter_accept = _filter['accept_reject'] == 'accept' + string = _filter['string'] + + if filter_type == 'ytid': + match = ytid == string + + elif filter_type == 'label': + match = string.lower() in video_labels + + # TODO: check this logic + if match == filter_accept: + accept = False + + if accept: + #LOGGER.debug('Using video: "{}"'.format(filename)) + filtered_filenames.append(filename) + + valid_filenames = set(filtered_filenames) + + LOGGER.info('Total videos used: {}'.format(len(valid_filenames))) + audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] + video_files = [path for path in video_files if get_filename(path) in valid_filenames] + + return audio_files, video_files + + +def video_to_audio(video_file): + """Return corresponding audio_file. 
+ + Args: + video_file: video_file + + Returns: + audio_file + + """ + + *path, _, name = video_file.split('/') + name = name.split('.')[0] + '.flac' + return '/'.join(path + ['audio', name]) + + +def sample_one_second(audio_data, sampling_frequency, augment=False): + """Return one second audio samples randomly + + Args: + audio_data: audio data to sample from + sampling_frequency: audio sample rate + augment: if True, perturb the data in some fashion + + Returns: + One second samples, start time, and augmentation parameters + + """ + sampling_frequency = int(sampling_frequency) + if len(audio_data) > sampling_frequency: + start = random.randrange(len(audio_data) - sampling_frequency) + else: + start = 0 + + with LogTimer(LOGGER, 'Slicing audio'): + audio_data = audio_data[start:start+sampling_frequency] + + if audio_data.shape[0] != sampling_frequency: + # Pad audio that isn't one second + warnings.warn('Got audio that is less than one second', UserWarning) + with LogTimer(LOGGER, 'Slicing audio'): + audio_data = np.pad(audio_data, + ((0, sampling_frequency - audio_data.shape[0]), (0,0)), + mode='constant') + if augment: + # Make sure we don't clip + if np.abs(audio_data).max(): + max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) + else: + # TODO: Handle audio with all zeros + warnings.warn('Got audio sample with all zeros', UserWarning) + max_gain = 0.1 + gain = 1 + random.uniform(-0.1, max_gain) + with LogTimer(LOGGER, 'Applying gain to audio'): + audio_data *= gain + audio_aug_params = {'gain': gain} + else: + audio_aug_params = {} + + return audio_data, start / sampling_frequency, audio_aug_params + + +def sample_cropped_frame(frame_data): + """ + Randomly crop a video frame, using the method from Look, Listen and Learn + + + Args: + frame_data: video frame data array + + Returns: + scaled_frame_data: scaled and cropped frame data + bbox: bounding box for the cropped image + """ + nx, ny, nc = frame_data.shape + start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) + end_x, end_y = start_x + 224, start_y + 224 + + bbox = { + 'start_x': start_x, + 'start_y': start_y, + 'end_x': end_x, + 'end_y': end_y + } + + with LogTimer(LOGGER, 'Cropping frame'): + frame_data = frame_data[start_x:end_x, start_y:end_y, :] + + return frame_data, bbox + + +def sample_one_frame(video_data, start=None, fps=30, augment=False): + """Return one frame randomly and time (seconds). + + Args: + video_data: video data to sample from + + Keyword Args: + start: start time of a one second window from which to sample + fps: frame per second + augment: if True, perturb the data in some fashion + + Returns: + One frame sampled randomly, start time in seconds, and augmentation parameters + + """ + + num_frames = len(video_data) + if start is not None: + start_frame = int(start * fps) + # Sample frame from a one second window, or until the end of the video + # if the video is less than a second for some reason + # Audio should always be sampled one second from the end of the audio, + # so video frames we're sampling from should also be a second. 
If it's + # not, then our video is probably less than a second + duration = min(fps, num_frames - start_frame) + if duration != fps: + warnings.warn('Got video that is less than one second', UserWarning) + + if duration > 0: + frame = start_frame + random.randrange(duration) + else: + warnings.warn('Got video with only a single frame', UserWarning) + # For robustness, use the last frame if the start_frame goes past + # the end of video frame + frame = min(start_frame, num_frames - 1) + else: + frame = random.randrange(num_frames) + + frame_data = video_data[frame] + frame_data, bbox = sample_cropped_frame(frame_data) + + video_aug_params = {'bounding_box': bbox} + + if augment: + # Randomly horizontally flip the image + horizontal_flip = False + if random.random() < 0.5: + with LogTimer(LOGGER, 'Flipping frame'): + frame_data = horiz_flip(frame_data) + horizontal_flip = True + + + # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py + + # Randomize the order of saturation jitter and brightness jitter + if random.random() < 0.5: + # Add saturation jitter + saturation_factor = np.float32(random.random() + 0.5) + with LogTimer(LOGGER, 'Adjusting saturation'): + frame_data = adjust_saturation(frame_data, saturation_factor) + + # Add brightness jitter + max_delta = 32. / 255. + brightness_delta = np.float32((2*random.random() - 1) * max_delta) + with LogTimer(LOGGER, 'Adjusting brightness'): + frame_data = adjust_brightness(frame_data, brightness_delta) + else: + # Add brightness jitter + max_delta = 32. / 255. + brightness_delta = np.float32((2*random.random() - 1) * max_delta) + with LogTimer(LOGGER, 'Adjusting brightness'): + frame_data = adjust_brightness(frame_data, brightness_delta) + + # Add saturation jitter + saturation_factor = np.float32(random.random() + 0.5) + with LogTimer(LOGGER, 'Adjusting saturation'): + frame_data = adjust_saturation(frame_data, saturation_factor) + + video_aug_params.update({ + 'horizontal_flip': horizontal_flip, + 'saturation_factor': saturation_factor, + 'brightness_delta': brightness_delta + }) + + frame_data = skimage.img_as_float32(frame_data) + + + return frame_data, frame / fps, video_aug_params + + +def read_video(video_path): + """ + Read a video file as a numpy array + + Resizes frames so that the minimum side is 256 pixels + + Args: + video_path: Path to video file + + Returns: + video: Numpy data array + + """ + vinfo = ffprobe(video_path)['video'] + width = int(vinfo['@width']) + height = int(vinfo['@height']) + + scaling = 256.0 / min(width, height) + new_width = math.ceil(scaling * width) + new_height = math.ceil(scaling * height) + + # Resize frames + reader = FFmpegReader(video_path, + outputdict={'-s': "{}x{}".format(new_width, + new_height) }) + + frames = [] + for frame in reader.nextFrame(): + frames.append(frame) + + return frames + + +def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + audio_sampling_frequency, augment=False): + """ + Generate a sample from the given audio and video files + + The video from which the audio come from is chosen with a fair coin, as is + the video frame. Thus, there is a 50% chance of producing a positive or + negative example. 
+ + Args: + audio_file_1: audio filename + audio_data_1: audio data array + audio_file_2: audio filename + audio_data_2: audio data array + video_file_1: video filename + video_data_1: video data array + video_file_2: video filename + video_data_2: video data array + audio_sampling_frequency: audio sample rate + + Keyword Args + augment: If True, perform data augmention + + Returns: + sample: sample dictionary + """ + video_choice = random.random() < 0.5 + audio_choice = random.random() < 0.5 + + if audio_choice: + audio_file = audio_file_1 + audio_data = audio_data_1 + else: + audio_file = audio_file_2 + audio_data = audio_data_2 + + if video_choice: + video_file = video_file_1 + video_data = video_data_1 + else: + video_file = video_file_2 + video_data = video_data_2 + + label = int(video_choice != audio_choice) + + sample_audio_data, audio_start, audio_aug_params \ + = sample_one_second(audio_data, audio_sampling_frequency, augment=augment) + + sample_video_data, video_start, video_aug_params \ + = sample_one_frame(video_data, start=audio_start, augment=augment) + + sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) + + sample = { + 'video': np.ascontiguousarray(sample_video_data), + 'audio': np.ascontiguousarray(sample_audio_data), + 'label': np.ascontiguousarray(np.array([label, 1 - label])), + } + + return sample + + +def sampler(video_file_1, video_file_2, rate=32, augment=False, precompute=False): + """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, + 50% chance sample one second from another audio_file in the list of audio_files. + + Args: + video_file_1: video_file to sample from + video_file_2: candidate audio_files to sample from + + Keyword Args: + rate: Poisson rate parameter. Used for precomputing samples + augment: If True, perform data augmention + precompute: If True, precompute samples during initialization so that + memory can be discarded + + Returns: + A generator that yields dictionary of video sample, audio sample, + and label (0: not from corresponding files, 1: from corresponding files) + + """ + debug_msg = 'Initializing streamer with videos "{}" and "{}"' + LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) + audio_file_1 = video_to_audio(video_file_1) + audio_file_2 = video_to_audio(video_file_2) + + # Hack: choose a number of samples such that we with high probability, we + # won't run out of samples, but is also less than the entire length of + # the video so we don't have to resize all of the frames + num_samples = int(scipy.stats.poisson.ppf(0.999, rate)) + + + try: + with LogTimer(LOGGER, 'Reading video'): + video_data_1 = read_video(video_file_1) + except Exception as e: + warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(video_file_1, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + try: + with LogTimer(LOGGER, 'Reading video'): + video_data_2 = read_video(video_file_2) + except Exception as e: + warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(video_file_2, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + try: + with LogTimer(LOGGER, 'Reading audio'): + audio_data_1, sampling_frequency = sf.read(audio_file_1, + dtype='float32', + always_2d=True) + except Exception as e: + warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
+ warn_msg = warn_msg.format(audio_file_1, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + try: + with LogTimer(LOGGER, 'Reading audio'): + audio_data_2, sampling_frequency = sf.read(audio_file_2, + dtype='float32', + always_2d=True) + except Exception as e: + warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(audio_file_2, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + if precompute: + samples = [] + for _ in range(num_samples): + sample = generate_sample( + audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + sampling_frequency, augment=augment) + + samples.append(sample) + + # Clear the data from memory + video_data_1 = None + video_data_2 = None + audio_data_1 = None + audio_data_2 = None + video_data = None + audio_data = None + del video_data_1 + del video_data_2 + del audio_data_1 + del audio_data_2 + del video_data + del audio_data + + while samples: + # Yield the sample, and remove from the list to free up some memory + yield samples.pop() + else: + while True: + yield generate_sample( + audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + sampling_frequency, augment=augment) + + raise StopIteration() + + + +def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, + k=32, batch_size=64, random_state=20171021, precompute=False, + num_distractors=1, augment=False, rate=32, max_videos=None): + """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. + + Args: + data_dir: directory to sample video and audio from + metadata_path: Path to audioset metadata file + filter_path: Path to filter specification file + ontology_path: Path to AudioSet ontology file + k: number of concurrent open streamer + batch_size: batch size + random_state: Value used to initialize state of RNG + num_distractors: Number of pairs to generate a stream for each video + + Returns: + A generator that yield infinite video and audio samples from data_dir + + """ + + random.seed(random_state) + + LOGGER.info("Getting file list...") + _, video_files = get_file_list(data_dir, metadata_path=metadata_path, + filter_path=filter_path, ontology_path=ontology_path) + + LOGGER.info("Creating streamers...") + if max_videos is not None and max_videos < len(video_files): + LOGGER.info("Using a subset of {} videos".format(max_videos)) + random.shuffle(video_files) + video_files = video_files[:max_videos] + + seeds = [] + for video_file_1 in tqdm(video_files): + for _ in range(num_distractors): + video_file_2 = video_file_1 + # Make sure we sample a different file + while video_file_2 == video_file_1: + video_file_2 = random.choice(video_files) + + streamer = pescador.Streamer(sampler, video_file_1, video_file_2, + rate=rate, augment=augment, + precompute=precompute) + seeds.append(streamer) + + # Randomly shuffle the seeds + random.shuffle(seeds) + + mux = pescador.Mux(seeds, k, rate=rate) + if batch_size == 1: + return mux + else: + return pescador.maps.buffer_stream(mux, batch_size) + + +def write_to_h5(path, batch): + with h5py.File(path, 'w') as f: + f.create_dataset('video', data=batch['video'], compression='gzip') + f.create_dataset('audio', data=batch['audio'], compression='gzip') + f.create_dataset('label', data=batch['label'], compression='gzip') + + +def sample_and_save(index): + train_gen = 
data_generator( + train_data_dir, + metadata_path=None, + ontology_path=None, + filter_path=None, + batch_size=train_batch_size, + random_state=index, + k=train_num_streamers, + augment=True, + num_distractors=1, + max_videos=None, + precompute=False, + rate=train_mux_rate) + + for sub_index, batch in enumerate(train_gen): + write_to_h5('{}_{}_{}.h5'.format(output_dir, index, sub_index), batch) + + +from multiprocessing import Pool + +def map_iterate_in_parallel(iterable, function, processes=8): + pool = Pool(processes=processes) + output = pool.map(function, iterable) + return list(output) + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Pre-sample videos and audios for L3 model.') + parser.add_argument('-tbs', + '--train-batch-size', + dest='train_batch_size', + action='store', + type=int, + default=64, + help='Number of examples per training batch') + + parser.add_argument('-tns', + '--train-num-streamers', + dest='train_num_streamers', + action='store', + type=int, + default=64, + help='Number of training pescador streamers that can be open concurrently') + + parser.add_argument('-tmr', + '--train-mux-rate', + dest='train_mux_rate', + action='store', + type=float, + default=2.0, + help='Poisson distribution parameter for determining number of training samples to take from a streamer') + + parser.add_argument('train_data_dir', + action='store', + type=str, + help='Path to directory where training set files are stored') + + parser.add_argument('output_dir', + action='store', + type=str, + help='Path to directory where output files will be stored') + + args = parser.parse_args() + + # train_data_dir = '/beegfs/work/AudioSet/data' + # train_batch_size = 64 + # train_num_streamers = 64 + # train_mux_rate = 2 + # output_dir = '/scratch/hhw230/train' + + train_data_dir = args.train_data_dir + train_batch_size = args.train_batch_size + train_num_streamers = args.train_num_streamers + train_mux_rate = args.train_mux_rate + output_dir = args.output_dir + + number_proc = 2 + map_iterate_in_parallel(range(number_proc), sample_and_save, + processes=number_proc) From 2abdd7331958350f590a02d9107af8a3c4fade2b Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 17 Jan 2018 18:32:26 -0500 Subject: [PATCH 054/201] make num of workers args --- sample.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sample.py b/sample.py index 863df31..2957db5 100644 --- a/sample.py +++ b/sample.py @@ -711,6 +711,14 @@ def map_iterate_in_parallel(iterable, function, processes=8): default=2.0, help='Poisson distribution parameter for determining number of training samples to take from a streamer') + parser.add_argument('-n', + '--num-workers', + dest='num_workers', + action='store', + type=int, + default=4, + help='Number of multiprocessing workers used to download videos') + parser.add_argument('train_data_dir', action='store', type=str, @@ -735,6 +743,5 @@ def map_iterate_in_parallel(iterable, function, processes=8): train_mux_rate = args.train_mux_rate output_dir = args.output_dir - number_proc = 2 - map_iterate_in_parallel(range(number_proc), sample_and_save, - processes=number_proc) + map_iterate_in_parallel(range(args.num_workers), sample_and_save, + processes=args.num_workers) From 171f4152acf62c3602ba698abeb1d7feb0428052 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 17 Jan 2018 18:49:18 -0500 Subject: [PATCH 055/201] Rearrange args --- sample.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sample.py 
b/sample.py index 2957db5..a433c60 100644 --- a/sample.py +++ b/sample.py @@ -23,7 +23,7 @@ from log import * -LOGGER = logging.getLogger('l3embedding') +LOGGER = logging.getLogger('sampling') LOGGER.setLevel(logging.ERROR) @@ -677,8 +677,6 @@ def sample_and_save(index): write_to_h5('{}_{}_{}.h5'.format(output_dir, index, sub_index), batch) -from multiprocessing import Pool - def map_iterate_in_parallel(iterable, function, processes=8): pool = Pool(processes=processes) output = pool.map(function, iterable) @@ -736,12 +734,14 @@ def map_iterate_in_parallel(iterable, function, processes=8): # train_num_streamers = 64 # train_mux_rate = 2 # output_dir = '/scratch/hhw230/train' + # num_workers = 2 train_data_dir = args.train_data_dir train_batch_size = args.train_batch_size train_num_streamers = args.train_num_streamers train_mux_rate = args.train_mux_rate output_dir = args.output_dir + num_workers = args.num_workers - map_iterate_in_parallel(range(args.num_workers), sample_and_save, - processes=args.num_workers) + map_iterate_in_parallel(range(num_workers), sample_and_save, + processes=num_workers) From f5eae536055573364f15ab081f0e901cedbb0061 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Wed, 17 Jan 2018 18:56:26 -0500 Subject: [PATCH 056/201] output_dir --- sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample.py b/sample.py index a433c60..9b7fb6d 100644 --- a/sample.py +++ b/sample.py @@ -674,7 +674,7 @@ def sample_and_save(index): rate=train_mux_rate) for sub_index, batch in enumerate(train_gen): - write_to_h5('{}_{}_{}.h5'.format(output_dir, index, sub_index), batch) + write_to_h5('{}/{}_{}.h5'.format(output_dir, index, sub_index), batch) def map_iterate_in_parallel(iterable, function, processes=8): From b3d41a07ab127ed8180cb65c9a10728fbf82e163 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 18 Jan 2018 23:13:58 -0500 Subject: [PATCH 057/201] Create code and scripts to create data subsets and fixed label filtering bug --- 01_create_subsets.py | 125 ++++++++++++++++++++++++ audioset_filter.csv | 111 ++++++++++++++++++++++ data/__init__.py | 0 data/subsets.py | 198 +++++++++++++++++++++++++++++++++++++++ jobs/data-subsets.sbatch | 31 ++++++ 5 files changed, 465 insertions(+) create mode 100644 01_create_subsets.py create mode 100644 audioset_filter.csv create mode 100644 data/__init__.py create mode 100644 data/subsets.py create mode 100644 jobs/data-subsets.sbatch diff --git a/01_create_subsets.py b/01_create_subsets.py new file mode 100644 index 0000000..30fad08 --- /dev/null +++ b/01_create_subsets.py @@ -0,0 +1,125 @@ +import argparse +import logging +import os +from log import init_console_logger +from data.subsets import get_subset_split +from csv import DictWriter + +LOGGER = logging.getLogger('data') +LOGGER.setLevel(logging.DEBUG) + + +def write_subset_file(path, subset_list): + with open(path, 'w') as f: + field_names = list(subset_list[0].keys()) + writer = DictWriter(f, field_names) + writer.writeheader() + + for item in subset_list: + item = dict(item) + item['labels'] = ';'.join(item['labels']) + writer.writerow(item) + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Creates CSVs containing a train-valid-test split for the given dataset') + + parser.add_argument('-vr', + '--valid-ratio', + dest='valid_ratio', + action='store', + type=float, + default=0.1, + help='Ratio of dataset used for validation set') + + parser.add_argument('-tr', + '--test-ratio', + dest='test_ratio', + action='store', + type=float, + 
default=0.1, + help='Ratio of dataset used for test set') + + parser.add_argument('-rs', + '--random-seed', + dest='random_seed', + action='store', + type=int, + default=12345678, + help='Random seed used for generating split') + + parser.add_argument('-o', + '--ontology-path', + dest='ontology_path', + action='store', + type=str, + default=os.path.join(os.path.dirname(__file__), 'resources/ontology.json'), + help='Path to AudioSet ontology') + + parser.add_argument('-mp', + '--metadata-path', + dest='metadata_path', + action='store', + type=str, + help='Path to metadata csv file(s). Accepts a glob string.') + + parser.add_argument('-fp', + '--filter-path', + dest='filter_path', + action='store', + type=str, + help='Path to filter csv file(s).') + + + parser.add_argument('-r', + '--random-state', + dest='random_state', + action='store', + type=int, + default=20171021, + help='Random seed used to set the RNG state') + + parser.add_argument('data_dir', + action='store', + type=str, + help='Path to directory where data files are stored') + + parser.add_argument('output_dir', + action='store', + type=str, + help='Path to directory where output files will be stored') + + parser.add_argument('filename_prefix', + action='store', + type=str, + help='Path to directory where output files will be stored') + + return parser.parse_args() + + +if __name__ == '__main__': + init_console_logger(LOGGER, verbose=True) + + args = parse_arguments() + + train_list, valid_list, test_list \ + = get_subset_split(args.data_dir, + valid_ratio=args.valid_ratio, + test_ratio=args.test_ratio, + random_state=args.random_state, + metadata_path=args.metadata_path, + filter_path=args.filter_path, + ontology_path=args.ontology_path) + + output_dir = args.output_dir + filename_prefix = args.filename_prefix + train_subset_path = os.path.join(output_dir, filename_prefix + '_train.csv') + valid_subset_path = os.path.join(output_dir, filename_prefix + '_valid.csv') + test_subset_path = os.path.join(output_dir, filename_prefix + '_test.csv') + + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + write_subset_file(train_subset_path, train_list) + write_subset_file(valid_subset_path, valid_list) + write_subset_file(test_subset_path, test_list) diff --git a/audioset_filter.csv b/audioset_filter.csv new file mode 100644 index 0000000..86ebe62 --- /dev/null +++ b/audioset_filter.csv @@ -0,0 +1,111 @@ +filter_type,accept_reject,string +label,accept,Accordion +label,accept,Acoustic guitar +label,accept,Alto saxophone +label,accept,Bagpipes +label,accept,Banjo +label,accept,Bass (instrument role) +label,accept,Bass drum +label,accept,Bass guitar +label,accept,Bassoon +label,accept,Bell +label,accept,Bicycle bell +label,accept,Bowed string instrument +label,accept,Brass instrument +label,accept,Bugle +label,accept,Cello +label,accept,Change ringing (campanology) +label,accept,Chant +label,accept,Child singing +label,accept,Chime +label,accept,Choir +label,accept,Church bell +label,accept,Clarinet +label,accept,Clavinet +label,accept,Cornet +label,accept,Cowbell +label,accept,Crash cymbal +label,accept,Cymbal +label,accept,"Dental drill, dentist's drill" +label,accept,Didgeridoo +label,accept,Double bass +label,accept,Drill +label,accept,Drum +label,accept,Drum kit +label,accept,Drum machine +label,accept,Drum roll +label,accept,Electric guitar +label,accept,Electric piano +label,accept,Electronic organ +label,accept,Female singing +label,accept,Filing (rasp) +label,accept,Flute +label,accept,French horn 
+label,accept,Glockenspiel +label,accept,Gong +label,accept,Guitar +label,accept,Hammer +label,accept,Hammond organ +label,accept,Harmonica +label,accept,Harp +label,accept,Harpsichord +label,accept,Hi-hat +label,accept,Jackhammer +label,accept,Jingle bell +label,accept,Keyboard (musical) +label,accept,Male singing +label,accept,Mallet percussion +label,accept,Mandolin +label,accept,Mantra +label,accept,Maraca +label,accept,"Marimba, xylophone" +label,accept,Mellotron +label,accept,Musical ensemble +label,accept,Musical instrument +label,accept,Oboe +label,accept,Orchestra +label,accept,Organ +label,accept,Percussion +label,accept,Piano +label,accept,Pizzicato +label,accept,Plucked string instrument +label,accept,Power tool +label,accept,Rapping +label,accept,Rattle (instrument) +label,accept,Rhodes piano +label,accept,Rimshot +label,accept,Sampler +label,accept,Sanding +label,accept,Sawing +label,accept,Saxophone +label,accept,Scratching (performance technique) +label,accept,Shofar +label,accept,Singing +label,accept,Singing bowl +label,accept,Sitar +label,accept,Snare drum +label,accept,Soprano saxophone +label,accept,"Steel guitar, slide guitar" +label,accept,Steelpan +label,accept,String section +label,accept,Strum +label,accept,Synthesizer +label,accept,Synthetic singing +label,accept,Tabla +label,accept,Tambourine +label,accept,Tapping (guitar technique) +label,accept,Theremin +label,accept,Timpani +label,accept,Tools +label,accept,Trombone +label,accept,Trumpet +label,accept,Tubular bells +label,accept,Tuning fork +label,accept,Ukulele +label,accept,Vibraphone +label,accept,"Violin, fiddle" +label,accept,Wind chime +label,accept,"Wind instrument, woodwind instrument" +label,accept,Wood block +label,accept,Yodeling +label,accept,Zither diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/subsets.py b/data/subsets.py new file mode 100644 index 0000000..9e57377 --- /dev/null +++ b/data/subsets.py @@ -0,0 +1,198 @@ +import csv +import os +import glob +import logging +import random +from audioset.ontology import ASOntology +from collections import OrderedDict + +LOGGER = logging.getLogger('data') +LOGGER.setLevel(logging.DEBUG) + +def get_filename(path): + """Return the filename of a path + + Args: path: path to file + + Returns: + filename: name of file (without extension) + """ + return os.path.splitext(os.path.basename(path))[0] + + +def load_filters(filter_path): + filters = [] + + with open(filter_path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + filters.append(row) + + return filters + + +def get_ytid_from_filename(filename): + first_us_idx = filename.rindex('_') + second_us_idx = filename.rindex('_', 0, first_us_idx) + return filename[:second_us_idx] + + +def load_metadata(metadata_path): + metadata = {} + for path in glob.glob(metadata_path): + with open(path, 'r') as f: + for idx, line in enumerate(f): + if idx in (0, 1): + continue + elif idx == 2: + fields = [field.strip() for field in line.lstrip('# ').rstrip().split(', ')] + else: + row = [val.strip() for val in line.strip().split(', ')] + ytid = row[0] + + entry = {field: val + for field, val in zip(fields[1:], row[1:])} + + entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') + entry['start_seconds'] = float(entry['start_seconds']) + entry['end_seconds'] = float(entry['end_seconds']) + + metadata[ytid] = entry + + return metadata + + +def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None): + 
"""Return audio and video file list. + + Args: + data_dir: input directory that contains audio/ and video/ + + Keyword Args: + metadata_path: Path to audioset metadata file + filter_path: Path to filter specification file + ontology_path: Path to AudioSet ontology file + + Returns: + audio_files: list of audio files + video_files: list of video files + + """ + data_dir_contents = set(os.listdir(data_dir)) + LOGGER.info('Getting list of files...') + if 'audio' in data_dir_contents and 'video' in data_dir_contents: + audio_files = glob.glob('{}/audio/*'.format(data_dir)) + video_files = glob.glob('{}/video/*'.format(data_dir)) + else: + audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) + video_files = glob.glob('{}/**/video/*'.format(data_dir)) + + # Make sure that audio files and video files correspond to each other + audio_filenames = set([get_filename(path) for path in audio_files]) + video_filenames = set([get_filename(path) for path in video_files]) + + valid_filenames = audio_filenames & video_filenames + + # Get map from filename to full audio and video paths + audio_paths = {get_filename(path): path for path in audio_files + if get_filename(path) in valid_filenames} + video_paths = {get_filename(path): path for path in video_files + if get_filename(path) in valid_filenames} + + if metadata_path: + LOGGER.info('Loading metadata...') + if not ontology_path: + err_msg = 'Must provide ontology path to filter' + LOGGER.error(err_msg) + raise ValueError(err_msg) + + metadata = load_metadata(metadata_path) + ontology = ASOntology(ontology_path) + + file_list = [] + for filename in valid_filenames: + ytid = get_ytid_from_filename(filename) + video_metadata = metadata[ytid] + + audio_path = audio_paths[filename] + video_path = video_paths[filename] + + item = OrderedDict() + item['ytid'] = ytid + item['audio_filepath'] = audio_path + item['video_filepath'] = video_path + + # Add the labels if the metadata is provided + if metadata_path: + item['labels'] = [ontology.get_node(label_id).name.lower() + for label_id in video_metadata['positive_labels']] + + file_list.append(item) + + + if metadata_path and filter_path: + LOGGER.info('Filtering examples...') + filters = load_filters(filter_path) + + filtered_file_list = [] + + for item in file_list: + ytid = item['ytid'] + label_list = item['labels'] + + accept = None + for _filter in filters: + filter_type = _filter['filter_type'] + filter_accept = _filter['accept_reject'] == 'accept' + string = _filter['string'] + + if filter_type == 'ytid': + match = ytid == string + + elif filter_type == 'label': + match = string.lower() in label_list + + if filter_accept: + # If "accept" has not been set, and there is a match with an accept filter, set "accept" to True + # -> If "accept" has already been set (which is True if another accept filter has already approved the + # file or if a reject filter has rejected the file already, though this ) + if accept is None and match: + accept = True + elif not filter_accept and match: + # If a reject filter gets a match, do not accept file and break from the loop + accept = False + break + + if accept: + #LOGGER.debug('Using video: "{}"'.format(filename)) + filtered_file_list.append(item) + + + file_list = filtered_file_list + + LOGGER.info('Total videos used: {}'.format(len(file_list))) + return file_list + + +def get_subset_split(data_dir, valid_ratio=0.1, test_ratio=0.1, random_state=12345678, + metadata_path=None, filter_path=None, ontology_path=None): + # Set random seed for reproducability + 
random.seed(random_state) + + file_list = get_file_list(data_dir, metadata_path=metadata_path, + filter_path=filter_path, ontology_path=ontology_path) + + # Shuffle file list + random.shuffle(file_list) + + # Figure out number of files for each subset + num_files = len(file_list) + num_valid = int(num_files * valid_ratio) + num_test = int(num_files * test_ratio) + + # Get subset lists + valid_list = file_list[:num_valid] + test_list = file_list[num_valid:num_valid+num_test] + train_list = file_list[num_valid+num_test:] + + return train_list, valid_list, test_list diff --git a/jobs/data-subsets.sbatch b/jobs/data-subsets.sbatch new file mode 100644 index 0000000..77c3235 --- /dev/null +++ b/jobs/data-subsets.sbatch @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=data-subsets-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=1 +#SBATCH --mem=32GB +#SBATCH --time=1-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.org +#SBATCH --output="data-subsets-audioset-%j.out" +#SBATCH --err="data-subsets-audioset-%j.err" + + +source ~/.bashrc +source activate l3embedding + +SRCDIR='' +DATA_DIR='' +OUTPUT_DIR='' + +module purge + +python $SRCDIR/01_create_subsets.py \ + --valid-ratio 0.1 \ + --test-ratio 0.1 \ + --metadata-path "$DATA_DIR/../csv/*.csv" \ + --ontology-path $SRCDIR/resources/ontology.json \ + --random-state 20180118 \ + $DATA_DIR \ + $OUTPUT_DIR \ + audioset_filtered From e6d991200b360771a17b823dd9a0f19701c71988 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Fri, 19 Jan 2018 09:31:09 -0500 Subject: [PATCH 058/201] modify sampling script to take filter file --- sample.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/sample.py b/sample.py index 9b7fb6d..a739896 100644 --- a/sample.py +++ b/sample.py @@ -13,6 +13,7 @@ import h5py import numpy as np +import pandas as pd import pescador import scipy.misc import skimage @@ -86,6 +87,12 @@ def get_filename(path): return os.path.splitext(os.path.basename(path))[0] +def get_filter_video_list(filter_file): + df = pd.read_csv(filter_file) + filter_video_list = list(df['video_filepath']) + return set([get_filename(video) for video in filter_video_list]) + + def load_metadata(metadata_path): metadata = {} for path in glob.glob(metadata_path): @@ -127,7 +134,7 @@ def get_ytid_from_filename(filename): return filename[:second_us_idx] -def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None): +def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None, filter_file=None): """Return audio and video file list. 
Args: @@ -155,6 +162,10 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= audio_filenames = set([get_filename(path) for path in audio_files]) video_filenames = set([get_filename(path) for path in video_files]) + if filter_file: + filter_video_list = get_filter_video_list(filter_file) + video_filenames = video_filenames & filter_video_list + valid_filenames = audio_filenames & video_filenames if metadata_path and filter_path: @@ -598,7 +609,7 @@ def sampler(video_file_1, video_file_2, rate=32, augment=False, precompute=False def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, k=32, batch_size=64, random_state=20171021, precompute=False, - num_distractors=1, augment=False, rate=32, max_videos=None): + num_distractors=1, augment=False, rate=32, max_videos=None, filter_file=None): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -620,7 +631,9 @@ def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path LOGGER.info("Getting file list...") _, video_files = get_file_list(data_dir, metadata_path=metadata_path, - filter_path=filter_path, ontology_path=ontology_path) + filter_path=filter_path, + ontology_path=ontology_path, + filter_file=filter_file) LOGGER.info("Creating streamers...") if max_videos is not None and max_videos < len(video_files): @@ -717,6 +730,13 @@ def map_iterate_in_parallel(iterable, function, processes=8): default=4, help='Number of multiprocessing workers used to download videos') + parser.add_argument('-tff', + '--train-filter-file', + dest='train_filter_file', + action='store', + type=str, + default=None) + parser.add_argument('train_data_dir', action='store', type=str, @@ -730,13 +750,15 @@ def map_iterate_in_parallel(iterable, function, processes=8): args = parser.parse_args() # train_data_dir = '/beegfs/work/AudioSet/data' - # train_batch_size = 64 + # train_filter_file = '/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv' + # train_batch_size = 512 # train_num_streamers = 64 # train_mux_rate = 2 # output_dir = '/scratch/hhw230/train' - # num_workers = 2 + # num_workers = 16 train_data_dir = args.train_data_dir + train_filter_file = args.train_filter_file train_batch_size = args.train_batch_size train_num_streamers = args.train_num_streamers train_mux_rate = args.train_mux_rate From 41b966228b3ecd20c4d1bc2fc188952f3906d81c Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Fri, 19 Jan 2018 09:51:15 -0500 Subject: [PATCH 059/201] Add filter_file in sample function --- sample.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sample.py b/sample.py index a739896..c9c8c9c 100644 --- a/sample.py +++ b/sample.py @@ -684,7 +684,8 @@ def sample_and_save(index): num_distractors=1, max_videos=None, precompute=False, - rate=train_mux_rate) + rate=train_mux_rate, + filter_file=train_filter_file) for sub_index, batch in enumerate(train_gen): write_to_h5('{}/{}_{}.h5'.format(output_dir, index, sub_index), batch) From e58b0e3357fc761d8d6f137aeca2a9ee95435971 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 22 Jan 2018 00:03:29 -0500 Subject: [PATCH 060/201] Handle case where there are no accept filters and add utils --- data/subsets.py | 21 +++++++++------------ data/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 12 deletions(-) create mode 100644 data/utils.py diff --git a/data/subsets.py b/data/subsets.py index 9e57377..66be378 100644 --- a/data/subsets.py +++ 
b/data/subsets.py @@ -5,6 +5,7 @@ import random from audioset.ontology import ASOntology from collections import OrderedDict +from .utils import read_csv_as_dicts LOGGER = logging.getLogger('data') LOGGER.setLevel(logging.DEBUG) @@ -20,17 +21,6 @@ def get_filename(path): return os.path.splitext(os.path.basename(path))[0] -def load_filters(filter_path): - filters = [] - - with open(filter_path, 'r') as f: - reader = csv.DictReader(f) - for row in reader: - filters.append(row) - - return filters - - def get_ytid_from_filename(filename): first_us_idx = filename.rindex('_') second_us_idx = filename.rindex('_', 0, first_us_idx) @@ -132,7 +122,7 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= if metadata_path and filter_path: LOGGER.info('Filtering examples...') - filters = load_filters(filter_path) + filters = read_csv_as_dicts(filter_path) filtered_file_list = [] @@ -141,11 +131,15 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= label_list = item['labels'] accept = None + has_accept_filter = False for _filter in filters: filter_type = _filter['filter_type'] filter_accept = _filter['accept_reject'] == 'accept' string = _filter['string'] + if filter_accept: + has_accept_filter = True + if filter_type == 'ytid': match = ytid == string @@ -163,6 +157,9 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= accept = False break + if accept is None: + accept = not has_accept_filter + if accept: #LOGGER.debug('Using video: "{}"'.format(filename)) filtered_file_list.append(item) diff --git a/data/utils.py b/data/utils.py new file mode 100644 index 0000000..8460489 --- /dev/null +++ b/data/utils.py @@ -0,0 +1,33 @@ +import csv +from multiprocessing import Pool + + +def read_csv_as_dicts(path): + items = [] + + with open(path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + items.append(row) + + return items + +def map_iterate_in_parallel(iterable, function, processes=8): + pool = Pool(processes=processes) + output = pool.map(function, iterable) + return list(output) + + +def flatten_dict(dct, parent_key=None): + new_dct = type(dct)() + for k,v in dct.items(): + if parent_key is not None: + k = '{}_{}'.format(parent_key, k) + + if isinstance(v, dict): + new_dct.update(flatten_dict(v, parent_key=k)) + else: + new_dct[k] = v + + return new_dct + From aa6b796b8ab0bd1f6a828d5f93df6d9450ca89b0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 22 Jan 2018 00:06:48 -0500 Subject: [PATCH 061/201] Refactor sample script --- 02_generate_samples.py | 119 ++++++ data/sample.py | 578 +++++++++++++++++++++++++++++ sample.py | 799 ++++------------------------------------- 3 files changed, 771 insertions(+), 725 deletions(-) create mode 100644 02_generate_samples.py create mode 100644 data/sample.py diff --git a/02_generate_samples.py b/02_generate_samples.py new file mode 100644 index 0000000..47c1992 --- /dev/null +++ b/02_generate_samples.py @@ -0,0 +1,119 @@ +import argparse +import logging +from functools import partial +from log import init_console_logger +from data.sample import sample_and_save +from data.utils import map_iterate_in_parallel + + +LOGGER = logging.getLogger('sampling') +LOGGER.setLevel(logging.ERROR) + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Pre-sample videos and audios for L3 model.') + parser.add_argument('-bs', + '--batch-size', + dest='batch_size', + action='store', + type=int, + default=64, + help='Number of examples per training 
batch') + + parser.add_argument('-ns', + '--num-streamers', + dest='num_streamers', + action='store', + type=int, + default=64, + help='Number of training pescador streamers that can be open concurrently') + + parser.add_argument('-mr', + '--mux-rate', + dest='mux_rate', + action='store', + type=float, + default=2.0, + help='Poisson distribution parameter for determining number of training samples to take from a streamer') + + parser.add_argument('-a', + '--augment', + dest='augment', + action='store_true', + default=False, + help='If True, performs data augmentation on audio and images') + + parser.add_argument('-pc', + '--precompute', + dest='precompute', + action='store_true', + default=False, + help='If True, streamer precompute samples') + + parser.add_argument('-nd', + '--num-distractors', + dest='num_distractors', + action='store', + type=int, + default=1, + help='Number of distractors for generating examples') + + parser.add_argument('-im', + '--include-metadata', + dest='include_metadata', + action='store_true', + help='If True, includes additional metadata in h5 files') + + parser.add_argument('-mv', + '--max-videos', + dest='max_videos', + action='store', + type=int, + help='Maximum number of videos to use for generating examples. If not specified, all videos will be used') + + parser.add_argument('-r', + '--random-state', + dest='random_state', + action='store', + type=int, + default=20171021, + help='Random seed used to set the RNG state') + + parser.add_argument('-n', + '--num-workers', + dest='num_workers', + action='store', + type=int, + default=4, + help='Number of multiprocessing workers used to download videos') + + parser.add_argument('subset_path', + action='store', + type=str, + help='Path to subset file') + + parser.add_argument('output_dir', + action='store', + type=str, + help='Path to directory where output files will be stored') + + args = parser.parse_args() + + init_console_logger(LOGGER, verbose=True) + + worker_func = partial(sample_and_save, + subset_path=args.subset_path, + output_dir=args.output_dir, + num_streamers=args.num_streamers, + batch_size=args.batch_size, + random_state=args.random_state, + precompute=args.precompute, + num_distractors=args.num_distractors, + augment=args.augment, + rate=args.mux_rate, + max_videos=args.max_videos, + include_metadata=args.include_metadata) + + num_workers = args.num_workers + map_iterate_in_parallel(range(num_workers), worker_func, + processes=num_workers) diff --git a/data/sample.py b/data/sample.py new file mode 100644 index 0000000..3c6bf00 --- /dev/null +++ b/data/sample.py @@ -0,0 +1,578 @@ +import argparse +import csv +import datetime +import glob +import json +import math +import logging +from multiprocessing import Pool +import os +import pickle +import random +import time +import warnings + +import h5py +import numpy as np +import pandas as pd +import pescador +import scipy.misc +import skimage +import skimage.color +from skvideo.io import FFmpegReader, ffprobe +import soundfile as sf +from tqdm import tqdm +from .utils import read_csv_as_dicts, flatten_dict +from log import LogTimer + +LOGGER = logging.getLogger('sampling') +LOGGER.setLevel(logging.ERROR) + + +def adjust_saturation(rgb_img, factor): + """ + Adjust the saturation of an RGB image + + Args: + rgb_img: RGB image data array + factor: Multiplicative scaling factor to be applied to saturation + + Returns: + adjusted_img: RGB image with adjusted saturation + """ + hsv_img = skimage.color.rgb2hsv(rgb_img) + imin, imax = 
skimage.dtype_limits(hsv_img) + hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, imin, imax) + return skimage.color.hsv2rgb(hsv_img) + + +def adjust_brightness(rgb_img, delta): + """ + Adjust the brightness of an RGB image + + Args: + rgb_img: RGB image data array + delta: Additive (normalized) gain factor applied to each pixel + + Returns: + adjusted_img: RGB image with adjusted saturation + """ + imin, imax = skimage.dtype_limits(rgb_img) + # Convert delta into the range of the image data + delta = rgb_img.dtype.type((imax - imin) * delta) + + return np.clip(rgb_img + delta, imin, imax) + + +def horiz_flip(rgb_img): + """ + Horizontally flip the given image + + Args: + rgb_img: RGB image data array + + Returns: + flipped_img: Horizontally flipped image + """ + return rgb_img[:,::-1,:] + + +def get_filename(path): + """Return the filename of a path + + Args: path: path to file + + Returns: + filename: name of file (without extension) + """ + return os.path.splitext(os.path.basename(path))[0] + + +def load_metadata(metadata_path): + metadata = {} + for path in glob.glob(metadata_path): + with open(path, 'r') as f: + for idx, line in enumerate(f): + if idx in (0, 1): + continue + elif idx == 2: + fields = [field.strip() for field in line.lstrip('# ').rstrip().split(', ')] + else: + row = [val.strip() for val in line.strip().split(', ')] + ytid = row[0] + + entry = {field: val + for field, val in zip(fields[1:], row[1:])} + + entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') + entry['start_seconds'] = float(entry['start_seconds']) + entry['end_seconds'] = float(entry['end_seconds']) + + metadata[ytid] = entry + + return metadata + + +def sample_one_second(audio_data, sampling_frequency, augment=False): + """Return one second audio samples randomly + + Args: + audio_data: audio data to sample from + sampling_frequency: audio sample rate + augment: if True, perturb the data in some fashion + + Returns: + One second samples, start time, and augmentation parameters + + """ + sampling_frequency = int(sampling_frequency) + if len(audio_data) > sampling_frequency: + start = random.randrange(len(audio_data) - sampling_frequency) + else: + start = 0 + + with LogTimer(LOGGER, 'Slicing audio'): + audio_data = audio_data[start:start+sampling_frequency] + + if audio_data.shape[0] != sampling_frequency: + # Pad audio that isn't one second + warnings.warn('Got audio that is less than one second', UserWarning) + with LogTimer(LOGGER, 'Slicing audio'): + audio_data = np.pad(audio_data, + ((0, sampling_frequency - audio_data.shape[0]), (0,0)), + mode='constant') + if augment: + # Make sure we don't clip + if np.abs(audio_data).max(): + max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) + else: + # TODO: Handle audio with all zeros + warnings.warn('Got audio sample with all zeros', UserWarning) + max_gain = 0.1 + gain = 1 + random.uniform(-0.1, max_gain) + with LogTimer(LOGGER, 'Applying gain to audio'): + audio_data *= gain + audio_aug_params = {'gain': gain} + else: + audio_aug_params = {} + + return audio_data, start / sampling_frequency, audio_aug_params + + +def sample_cropped_frame(frame_data): + """ + Randomly crop a video frame, using the method from Look, Listen and Learn + + + Args: + frame_data: video frame data array + + Returns: + scaled_frame_data: scaled and cropped frame data + bbox: bounding box for the cropped image + """ + nx, ny, nc = frame_data.shape + start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) + end_x, end_y = start_x + 224, 
start_y + 224 + + bbox = { + 'start_x': start_x, + 'start_y': start_y, + 'end_x': end_x, + 'end_y': end_y + } + + with LogTimer(LOGGER, 'Cropping frame'): + frame_data = frame_data[start_x:end_x, start_y:end_y, :] + + return frame_data, bbox + + +def sample_one_frame(video_data, start=None, fps=30, augment=False): + """Return one frame randomly and time (seconds). + + Args: + video_data: video data to sample from + + Keyword Args: + start: start time of a one second window from which to sample + fps: frame per second + augment: if True, perturb the data in some fashion + + Returns: + One frame sampled randomly, start time in seconds, and augmentation parameters + + """ + + num_frames = len(video_data) + if start is not None: + start_frame = int(start * fps) + # Sample frame from a one second window, or until the end of the video + # if the video is less than a second for some reason + # Audio should always be sampled one second from the end of the audio, + # so video frames we're sampling from should also be a second. If it's + # not, then our video is probably less than a second + duration = min(fps, num_frames - start_frame) + if duration != fps: + warnings.warn('Got video that is less than one second', UserWarning) + + if duration > 0: + frame = start_frame + random.randrange(duration) + else: + warnings.warn('Got video with only a single frame', UserWarning) + # For robustness, use the last frame if the start_frame goes past + # the end of video frame + frame = min(start_frame, num_frames - 1) + else: + frame = random.randrange(num_frames) + + frame_data = video_data[frame] + frame_data, bbox = sample_cropped_frame(frame_data) + + video_aug_params = {'bounding_box': bbox} + + if augment: + # Randomly horizontally flip the image + horizontal_flip = False + if random.random() < 0.5: + with LogTimer(LOGGER, 'Flipping frame'): + frame_data = horiz_flip(frame_data) + horizontal_flip = True + + + # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py + + # Randomize the order of saturation jitter and brightness jitter + if random.random() < 0.5: + # Add saturation jitter + saturation_factor = np.float32(random.random() + 0.5) + with LogTimer(LOGGER, 'Adjusting saturation'): + frame_data = adjust_saturation(frame_data, saturation_factor) + + # Add brightness jitter + max_delta = 32. / 255. + brightness_delta = np.float32((2*random.random() - 1) * max_delta) + with LogTimer(LOGGER, 'Adjusting brightness'): + frame_data = adjust_brightness(frame_data, brightness_delta) + else: + # Add brightness jitter + max_delta = 32. / 255. 
+ brightness_delta = np.float32((2*random.random() - 1) * max_delta) + with LogTimer(LOGGER, 'Adjusting brightness'): + frame_data = adjust_brightness(frame_data, brightness_delta) + + # Add saturation jitter + saturation_factor = np.float32(random.random() + 0.5) + with LogTimer(LOGGER, 'Adjusting saturation'): + frame_data = adjust_saturation(frame_data, saturation_factor) + + video_aug_params.update({ + 'horizontal_flip': horizontal_flip, + 'saturation_factor': saturation_factor, + 'brightness_delta': brightness_delta + }) + + frame_data = skimage.img_as_float32(frame_data) + + + return frame_data, frame / fps, video_aug_params + + +def read_video(video_path): + """ + Read a video file as a numpy array + + Resizes frames so that the minimum side is 256 pixels + + Args: + video_path: Path to video file + + Returns: + video: Numpy data array + + """ + vinfo = ffprobe(video_path)['video'] + width = int(vinfo['@width']) + height = int(vinfo['@height']) + + scaling = 256.0 / min(width, height) + new_width = math.ceil(scaling * width) + new_height = math.ceil(scaling * height) + + # Resize frames + reader = FFmpegReader(video_path, + outputdict={'-s': "{}x{}".format(new_width, + new_height) }) + + frames = [] + for frame in reader.nextFrame(): + frames.append(frame) + + return frames + + +def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + audio_sampling_frequency, augment=False, include_metadata=False): + """ + Generate a sample from the given audio and video files + + The video from which the audio come from is chosen with a fair coin, as is + the video frame. Thus, there is a 50% chance of producing a positive or + negative example. + + Args: + audio_file_1: audio filename + audio_data_1: audio data array + audio_file_2: audio filename + audio_data_2: audio data array + video_file_1: video filename + video_data_1: video data array + video_file_2: video filename + video_data_2: video data array + audio_sampling_frequency: audio sample rate + + Keyword Args + augment: If True, perform data augmention + + Returns: + sample: sample dictionary + """ + video_choice = random.random() < 0.5 + audio_choice = random.random() < 0.5 + + if audio_choice: + audio_file = audio_file_1 + audio_data = audio_data_1 + else: + audio_file = audio_file_2 + audio_data = audio_data_2 + + if video_choice: + video_file = video_file_1 + video_data = video_data_1 + else: + video_file = video_file_2 + video_data = video_data_2 + + label = int(video_choice != audio_choice) + + sample_audio_data, audio_start, audio_aug_params \ + = sample_one_second(audio_data, audio_sampling_frequency, augment=augment) + + sample_video_data, video_start, video_aug_params \ + = sample_one_frame(video_data, start=audio_start, augment=augment) + + sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) + + sample = { + 'video': np.ascontiguousarray(sample_video_data), + 'audio': np.ascontiguousarray(sample_audio_data), + 'label': np.ascontiguousarray(np.array([label, 1 - label])), + } + + if include_metadata: + sample['audio_file'] = audio_file + sample['video_file'] = video_file + sample['audio_start'] = audio_start + sample['video_start'] = video_start + sample.update(flatten_dict(audio_aug_params)) + sample.update(flatten_dict(video_aug_params)) + + return sample + + +def sampler(video_1, video_2, rate=32, augment=False, precompute=False, include_metadata=False): + """Sample one frame from video_file, with 50% chance 
sample one second from corresponding audio_file, + 50% chance sample one second from another audio_file in the list of audio_files. + + Args: + video_1: dict for candidate video to sample from + video_2: dict for candidate video to sample from + + Keyword Args: + rate: Poisson rate parameter. Used for precomputing samples + augment: If True, perform data augmention + precompute: If True, precompute samples during initialization so that + memory can be discarded + + Returns: + A generator that yields dictionary of video sample, audio sample, + and label (0: not from corresponding files, 1: from corresponding files) + + """ + video_file_1 = video_1['video_filepath'] + video_file_2 = video_2['video_filepath'] + audio_file_1 = video_1['audio_filepath'] + audio_file_2 = video_2['audio_filepath'] + + debug_msg = 'Initializing streamer with videos "{}" and "{}"' + LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) + + # Hack: choose a number of samples such that we with high probability, we + # won't run out of samples, but is also less than the entire length of + # the video so we don't have to resize all of the frames + num_samples = int(scipy.stats.poisson.ppf(0.999, rate)) + + + try: + with LogTimer(LOGGER, 'Reading video'): + video_data_1 = read_video(video_file_1) + except Exception as e: + warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(video_file_1, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + try: + with LogTimer(LOGGER, 'Reading video'): + video_data_2 = read_video(video_file_2) + except Exception as e: + warn_msg = 'Could not open video file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(video_file_2, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + try: + with LogTimer(LOGGER, 'Reading audio'): + audio_data_1, sampling_frequency = sf.read(audio_file_1, + dtype='float32', + always_2d=True) + except Exception as e: + warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' + warn_msg = warn_msg.format(audio_file_1, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + try: + with LogTimer(LOGGER, 'Reading audio'): + audio_data_2, sampling_frequency = sf.read(audio_file_2, + dtype='float32', + always_2d=True) + except Exception as e: + warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
+ warn_msg = warn_msg.format(audio_file_2, type(e), e) + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + raise StopIteration() + + if precompute: + samples = [] + for _ in range(num_samples): + sample = generate_sample( + audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + sampling_frequency, augment=augment, include_metadata=include_metadata) + + samples.append(sample) + + # Clear the data from memory + video_data_1 = None + video_data_2 = None + audio_data_1 = None + audio_data_2 = None + video_data = None + audio_data = None + del video_data_1 + del video_data_2 + del audio_data_1 + del audio_data_2 + del video_data + del audio_data + + while samples: + # Yield the sample, and remove from the list to free up some memory + yield samples.pop() + else: + while True: + yield generate_sample( + audio_file_1, audio_data_1, audio_file_2, audio_data_2, + video_file_1, video_data_1, video_file_2, video_data_2, + sampling_frequency, augment=augment, include_metadata=include_metadata) + + raise StopIteration() + + + +def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, + precompute=False, num_distractors=1, augment=False, rate=32, + max_videos=None, include_metadata=False): + """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. + + Args: + subset_path: path to subset file + k: number of concurrent open streamer + batch_size: batch size + random_state: Value used to initialize state of RNG + num_distractors: Number of pairs to generate a stream for each video + + Returns: + A generator that yield infinite video and audio samples from data_dir + + """ + + random.seed(random_state) + + LOGGER.info("Loading subset list") + file_list = read_csv_as_dicts(subset_path) + + LOGGER.info("Creating streamers...") + if max_videos is not None and max_videos < len(video_files): + LOGGER.info("Using a subset of {} videos".format(max_videos)) + random.shuffle(file_list) + file_list = file_list[:max_videos] + + seeds = [] + for video_1 in tqdm(file_list): + for _ in range(num_distractors): + video_2 = video_1 + # Make sure we sample a different file + while video_2 == video_1: + video_2 = random.choice(file_list) + + streamer = pescador.Streamer(sampler, video_1, video_2, + rate=rate, augment=augment, + precompute=precompute, + include_metadata=include_metadata) + seeds.append(streamer) + + # Randomly shuffle the seeds + random.shuffle(seeds) + + mux = pescador.Mux(seeds, k, rate=rate) + if batch_size == 1: + return mux + else: + return pescador.maps.buffer_stream(mux, batch_size) + + +def write_to_h5(path, batch): + with h5py.File(path, 'w') as f: + for key in batch.keys(): + f.create_dataset(key, data=batch[key], compression='gzip') + + +def sample_and_save(index, subset_path, output_dir, num_streamers=32, batch_size=64, + random_state=20171021, precompute=False, num_distractors=1, + augment=False, rate=32, max_videos=None, include_metadata=False): + data_gen = data_generator( + subset_path, + batch_size=batch_size, + random_state=random_state + index, + k=num_streamers, + augment=augment, + num_distractors=num_distractors, + max_videos=max_videos, + precompute=precompute, + rate=rate, + include_metadata=include_metadata) + + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + for sub_index, batch in enumerate(data_gen): + batch_path = os.path.join(output_dir, '{}_{}.h5'.format(index, sub_index)) + write_to_h5(batch_path, batch) diff --git a/sample.py b/sample.py index 
c9c8c9c..47c1992 100644 --- a/sample.py +++ b/sample.py @@ -1,728 +1,84 @@ import argparse -import csv -import datetime -import glob -import json -import math -from multiprocessing import Pool -import os -import pickle -import random -import time -import warnings +import logging +from functools import partial +from log import init_console_logger +from data.sample import sample_and_save +from data.utils import map_iterate_in_parallel -import h5py -import numpy as np -import pandas as pd -import pescador -import scipy.misc -import skimage -import skimage.color -from skvideo.io import FFmpegReader, ffprobe -import soundfile as sf -from tqdm import tqdm - -from log import * LOGGER = logging.getLogger('sampling') LOGGER.setLevel(logging.ERROR) - -def adjust_saturation(rgb_img, factor): - """ - Adjust the saturation of an RGB image - - Args: - rgb_img: RGB image data array - factor: Multiplicative scaling factor to be applied to saturation - - Returns: - adjusted_img: RGB image with adjusted saturation - """ - hsv_img = skimage.color.rgb2hsv(rgb_img) - imin, imax = skimage.dtype_limits(hsv_img) - hsv_img[:,:,1] = np.clip(hsv_img[:,:,1] * factor, imin, imax) - return skimage.color.hsv2rgb(hsv_img) - - -def adjust_brightness(rgb_img, delta): - """ - Adjust the brightness of an RGB image - - Args: - rgb_img: RGB image data array - delta: Additive (normalized) gain factor applied to each pixel - - Returns: - adjusted_img: RGB image with adjusted saturation - """ - imin, imax = skimage.dtype_limits(rgb_img) - # Convert delta into the range of the image data - delta = rgb_img.dtype.type((imax - imin) * delta) - - return np.clip(rgb_img + delta, imin, imax) - - -def horiz_flip(rgb_img): - """ - Horizontally flip the given image - - Args: - rgb_img: RGB image data array - - Returns: - flipped_img: Horizontally flipped image - """ - return rgb_img[:,::-1,:] - - -def get_filename(path): - """Return the filename of a path - - Args: path: path to file - - Returns: - filename: name of file (without extension) - """ - return os.path.splitext(os.path.basename(path))[0] - - -def get_filter_video_list(filter_file): - df = pd.read_csv(filter_file) - filter_video_list = list(df['video_filepath']) - return set([get_filename(video) for video in filter_video_list]) - - -def load_metadata(metadata_path): - metadata = {} - for path in glob.glob(metadata_path): - with open(path, 'r') as f: - for idx, line in enumerate(f): - if idx in (0, 1): - continue - elif idx == 2: - fields = [field.strip() for field in line.lstrip('# ').rstrip().split(', ')] - else: - row = [val.strip() for val in line.strip().split(', ')] - ytid = row[0] - - entry = {field: val - for field, val in zip(fields[1:], row[1:])} - - entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') - entry['start_seconds'] = float(entry['start_seconds']) - entry['end_seconds'] = float(entry['end_seconds']) - - metadata[ytid] = entry - - return metadata - - -def load_filters(filter_path): - filters = [] - - with open(filter_path, 'r') as f: - reader = csv.DictReader(f) - for row in reader: - filters.append(row) - - return filters - -def get_ytid_from_filename(filename): - first_us_idx = filename.rindex('_') - second_us_idx = filename.rindex('_', 0, first_us_idx) - return filename[:second_us_idx] - - -def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None, filter_file=None): - """Return audio and video file list. 
- - Args: - data_dir: input directory that contains audio/ and video/ - - Keyword Args: - metadata_path: Path to audioset metadata file - filter_path: Path to filter specification file - ontology_path: Path to AudioSet ontology file - - Returns: - audio_files: list of audio files - video_files: list of video files - - """ - data_dir_contents = set(os.listdir(data_dir)) - if 'audio' in data_dir_contents and 'video' in data_dir_contents: - audio_files = glob.glob('{}/audio/*'.format(data_dir)) - video_files = glob.glob('{}/video/*'.format(data_dir)) - else: - audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) - video_files = glob.glob('{}/**/video/*'.format(data_dir)) - - # Make sure that audio files and video files correspond to each other - audio_filenames = set([get_filename(path) for path in audio_files]) - video_filenames = set([get_filename(path) for path in video_files]) - - if filter_file: - filter_video_list = get_filter_video_list(filter_file) - video_filenames = video_filenames & filter_video_list - - valid_filenames = audio_filenames & video_filenames - - if metadata_path and filter_path: - LOGGER.info('Filtering examples...') - if not ontology_path: - err_msg = 'Must provide ontology path to filter' - LOGGER.error(err_msg) - raise ValueError(err_msg) - - ontology = ASOntology(ontology_path) - - metadata = load_metadata(metadata_path) - filters = load_filters(filter_path) - - filtered_filenames = [] - - for filename in valid_filenames: - ytid = get_ytid_from_filename(filename) - video_metadata = metadata[ytid] - - video_labels = [ontology.get_node(label_id).name.lower() - for label_id in video_metadata['positive_labels']] - - accept = True - for _filter in filters: - filter_type = _filter['filter_type'] - filter_accept = _filter['accept_reject'] == 'accept' - string = _filter['string'] - - if filter_type == 'ytid': - match = ytid == string - - elif filter_type == 'label': - match = string.lower() in video_labels - - # TODO: check this logic - if match == filter_accept: - accept = False - - if accept: - #LOGGER.debug('Using video: "{}"'.format(filename)) - filtered_filenames.append(filename) - - valid_filenames = set(filtered_filenames) - - LOGGER.info('Total videos used: {}'.format(len(valid_filenames))) - audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] - video_files = [path for path in video_files if get_filename(path) in valid_filenames] - - return audio_files, video_files - - -def video_to_audio(video_file): - """Return corresponding audio_file. 
- - Args: - video_file: video_file - - Returns: - audio_file - - """ - - *path, _, name = video_file.split('/') - name = name.split('.')[0] + '.flac' - return '/'.join(path + ['audio', name]) - - -def sample_one_second(audio_data, sampling_frequency, augment=False): - """Return one second audio samples randomly - - Args: - audio_data: audio data to sample from - sampling_frequency: audio sample rate - augment: if True, perturb the data in some fashion - - Returns: - One second samples, start time, and augmentation parameters - - """ - sampling_frequency = int(sampling_frequency) - if len(audio_data) > sampling_frequency: - start = random.randrange(len(audio_data) - sampling_frequency) - else: - start = 0 - - with LogTimer(LOGGER, 'Slicing audio'): - audio_data = audio_data[start:start+sampling_frequency] - - if audio_data.shape[0] != sampling_frequency: - # Pad audio that isn't one second - warnings.warn('Got audio that is less than one second', UserWarning) - with LogTimer(LOGGER, 'Slicing audio'): - audio_data = np.pad(audio_data, - ((0, sampling_frequency - audio_data.shape[0]), (0,0)), - mode='constant') - if augment: - # Make sure we don't clip - if np.abs(audio_data).max(): - max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) - else: - # TODO: Handle audio with all zeros - warnings.warn('Got audio sample with all zeros', UserWarning) - max_gain = 0.1 - gain = 1 + random.uniform(-0.1, max_gain) - with LogTimer(LOGGER, 'Applying gain to audio'): - audio_data *= gain - audio_aug_params = {'gain': gain} - else: - audio_aug_params = {} - - return audio_data, start / sampling_frequency, audio_aug_params - - -def sample_cropped_frame(frame_data): - """ - Randomly crop a video frame, using the method from Look, Listen and Learn - - - Args: - frame_data: video frame data array - - Returns: - scaled_frame_data: scaled and cropped frame data - bbox: bounding box for the cropped image - """ - nx, ny, nc = frame_data.shape - start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) - end_x, end_y = start_x + 224, start_y + 224 - - bbox = { - 'start_x': start_x, - 'start_y': start_y, - 'end_x': end_x, - 'end_y': end_y - } - - with LogTimer(LOGGER, 'Cropping frame'): - frame_data = frame_data[start_x:end_x, start_y:end_y, :] - - return frame_data, bbox - - -def sample_one_frame(video_data, start=None, fps=30, augment=False): - """Return one frame randomly and time (seconds). - - Args: - video_data: video data to sample from - - Keyword Args: - start: start time of a one second window from which to sample - fps: frame per second - augment: if True, perturb the data in some fashion - - Returns: - One frame sampled randomly, start time in seconds, and augmentation parameters - - """ - - num_frames = len(video_data) - if start is not None: - start_frame = int(start * fps) - # Sample frame from a one second window, or until the end of the video - # if the video is less than a second for some reason - # Audio should always be sampled one second from the end of the audio, - # so video frames we're sampling from should also be a second. 
If it's - # not, then our video is probably less than a second - duration = min(fps, num_frames - start_frame) - if duration != fps: - warnings.warn('Got video that is less than one second', UserWarning) - - if duration > 0: - frame = start_frame + random.randrange(duration) - else: - warnings.warn('Got video with only a single frame', UserWarning) - # For robustness, use the last frame if the start_frame goes past - # the end of video frame - frame = min(start_frame, num_frames - 1) - else: - frame = random.randrange(num_frames) - - frame_data = video_data[frame] - frame_data, bbox = sample_cropped_frame(frame_data) - - video_aug_params = {'bounding_box': bbox} - - if augment: - # Randomly horizontally flip the image - horizontal_flip = False - if random.random() < 0.5: - with LogTimer(LOGGER, 'Flipping frame'): - frame_data = horiz_flip(frame_data) - horizontal_flip = True - - - # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py - - # Randomize the order of saturation jitter and brightness jitter - if random.random() < 0.5: - # Add saturation jitter - saturation_factor = np.float32(random.random() + 0.5) - with LogTimer(LOGGER, 'Adjusting saturation'): - frame_data = adjust_saturation(frame_data, saturation_factor) - - # Add brightness jitter - max_delta = 32. / 255. - brightness_delta = np.float32((2*random.random() - 1) * max_delta) - with LogTimer(LOGGER, 'Adjusting brightness'): - frame_data = adjust_brightness(frame_data, brightness_delta) - else: - # Add brightness jitter - max_delta = 32. / 255. - brightness_delta = np.float32((2*random.random() - 1) * max_delta) - with LogTimer(LOGGER, 'Adjusting brightness'): - frame_data = adjust_brightness(frame_data, brightness_delta) - - # Add saturation jitter - saturation_factor = np.float32(random.random() + 0.5) - with LogTimer(LOGGER, 'Adjusting saturation'): - frame_data = adjust_saturation(frame_data, saturation_factor) - - video_aug_params.update({ - 'horizontal_flip': horizontal_flip, - 'saturation_factor': saturation_factor, - 'brightness_delta': brightness_delta - }) - - frame_data = skimage.img_as_float32(frame_data) - - - return frame_data, frame / fps, video_aug_params - - -def read_video(video_path): - """ - Read a video file as a numpy array - - Resizes frames so that the minimum side is 256 pixels - - Args: - video_path: Path to video file - - Returns: - video: Numpy data array - - """ - vinfo = ffprobe(video_path)['video'] - width = int(vinfo['@width']) - height = int(vinfo['@height']) - - scaling = 256.0 / min(width, height) - new_width = math.ceil(scaling * width) - new_height = math.ceil(scaling * height) - - # Resize frames - reader = FFmpegReader(video_path, - outputdict={'-s': "{}x{}".format(new_width, - new_height) }) - - frames = [] - for frame in reader.nextFrame(): - frames.append(frame) - - return frames - - -def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, - video_file_1, video_data_1, video_file_2, video_data_2, - audio_sampling_frequency, augment=False): - """ - Generate a sample from the given audio and video files - - The video from which the audio come from is chosen with a fair coin, as is - the video frame. Thus, there is a 50% chance of producing a positive or - negative example. 
- - Args: - audio_file_1: audio filename - audio_data_1: audio data array - audio_file_2: audio filename - audio_data_2: audio data array - video_file_1: video filename - video_data_1: video data array - video_file_2: video filename - video_data_2: video data array - audio_sampling_frequency: audio sample rate - - Keyword Args - augment: If True, perform data augmention - - Returns: - sample: sample dictionary - """ - video_choice = random.random() < 0.5 - audio_choice = random.random() < 0.5 - - if audio_choice: - audio_file = audio_file_1 - audio_data = audio_data_1 - else: - audio_file = audio_file_2 - audio_data = audio_data_2 - - if video_choice: - video_file = video_file_1 - video_data = video_data_1 - else: - video_file = video_file_2 - video_data = video_data_2 - - label = int(video_choice != audio_choice) - - sample_audio_data, audio_start, audio_aug_params \ - = sample_one_second(audio_data, audio_sampling_frequency, augment=augment) - - sample_video_data, video_start, video_aug_params \ - = sample_one_frame(video_data, start=audio_start, augment=augment) - - sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) - - sample = { - 'video': np.ascontiguousarray(sample_video_data), - 'audio': np.ascontiguousarray(sample_audio_data), - 'label': np.ascontiguousarray(np.array([label, 1 - label])), - } - - return sample - - -def sampler(video_file_1, video_file_2, rate=32, augment=False, precompute=False): - """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, - 50% chance sample one second from another audio_file in the list of audio_files. - - Args: - video_file_1: video_file to sample from - video_file_2: candidate audio_files to sample from - - Keyword Args: - rate: Poisson rate parameter. Used for precomputing samples - augment: If True, perform data augmention - precompute: If True, precompute samples during initialization so that - memory can be discarded - - Returns: - A generator that yields dictionary of video sample, audio sample, - and label (0: not from corresponding files, 1: from corresponding files) - - """ - debug_msg = 'Initializing streamer with videos "{}" and "{}"' - LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) - audio_file_1 = video_to_audio(video_file_1) - audio_file_2 = video_to_audio(video_file_2) - - # Hack: choose a number of samples such that we with high probability, we - # won't run out of samples, but is also less than the entire length of - # the video so we don't have to resize all of the frames - num_samples = int(scipy.stats.poisson.ppf(0.999, rate)) - - - try: - with LogTimer(LOGGER, 'Reading video'): - video_data_1 = read_video(video_file_1) - except Exception as e: - warn_msg = 'Could not open video file {} - {}: {}; Skipping...' - warn_msg = warn_msg.format(video_file_1, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - try: - with LogTimer(LOGGER, 'Reading video'): - video_data_2 = read_video(video_file_2) - except Exception as e: - warn_msg = 'Could not open video file {} - {}: {}; Skipping...' - warn_msg = warn_msg.format(video_file_2, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - try: - with LogTimer(LOGGER, 'Reading audio'): - audio_data_1, sampling_frequency = sf.read(audio_file_1, - dtype='float32', - always_2d=True) - except Exception as e: - warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
- warn_msg = warn_msg.format(audio_file_1, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - try: - with LogTimer(LOGGER, 'Reading audio'): - audio_data_2, sampling_frequency = sf.read(audio_file_2, - dtype='float32', - always_2d=True) - except Exception as e: - warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' - warn_msg = warn_msg.format(audio_file_2, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - if precompute: - samples = [] - for _ in range(num_samples): - sample = generate_sample( - audio_file_1, audio_data_1, audio_file_2, audio_data_2, - video_file_1, video_data_1, video_file_2, video_data_2, - sampling_frequency, augment=augment) - - samples.append(sample) - - # Clear the data from memory - video_data_1 = None - video_data_2 = None - audio_data_1 = None - audio_data_2 = None - video_data = None - audio_data = None - del video_data_1 - del video_data_2 - del audio_data_1 - del audio_data_2 - del video_data - del audio_data - - while samples: - # Yield the sample, and remove from the list to free up some memory - yield samples.pop() - else: - while True: - yield generate_sample( - audio_file_1, audio_data_1, audio_file_2, audio_data_2, - video_file_1, video_data_1, video_file_2, video_data_2, - sampling_frequency, augment=augment) - - raise StopIteration() - - - -def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, - k=32, batch_size=64, random_state=20171021, precompute=False, - num_distractors=1, augment=False, rate=32, max_videos=None, filter_file=None): - """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. - - Args: - data_dir: directory to sample video and audio from - metadata_path: Path to audioset metadata file - filter_path: Path to filter specification file - ontology_path: Path to AudioSet ontology file - k: number of concurrent open streamer - batch_size: batch size - random_state: Value used to initialize state of RNG - num_distractors: Number of pairs to generate a stream for each video - - Returns: - A generator that yield infinite video and audio samples from data_dir - - """ - - random.seed(random_state) - - LOGGER.info("Getting file list...") - _, video_files = get_file_list(data_dir, metadata_path=metadata_path, - filter_path=filter_path, - ontology_path=ontology_path, - filter_file=filter_file) - - LOGGER.info("Creating streamers...") - if max_videos is not None and max_videos < len(video_files): - LOGGER.info("Using a subset of {} videos".format(max_videos)) - random.shuffle(video_files) - video_files = video_files[:max_videos] - - seeds = [] - for video_file_1 in tqdm(video_files): - for _ in range(num_distractors): - video_file_2 = video_file_1 - # Make sure we sample a different file - while video_file_2 == video_file_1: - video_file_2 = random.choice(video_files) - - streamer = pescador.Streamer(sampler, video_file_1, video_file_2, - rate=rate, augment=augment, - precompute=precompute) - seeds.append(streamer) - - # Randomly shuffle the seeds - random.shuffle(seeds) - - mux = pescador.Mux(seeds, k, rate=rate) - if batch_size == 1: - return mux - else: - return pescador.maps.buffer_stream(mux, batch_size) - - -def write_to_h5(path, batch): - with h5py.File(path, 'w') as f: - f.create_dataset('video', data=batch['video'], compression='gzip') - f.create_dataset('audio', data=batch['audio'], compression='gzip') - f.create_dataset('label', data=batch['label'], compression='gzip') - - -def 
sample_and_save(index): - train_gen = data_generator( - train_data_dir, - metadata_path=None, - ontology_path=None, - filter_path=None, - batch_size=train_batch_size, - random_state=index, - k=train_num_streamers, - augment=True, - num_distractors=1, - max_videos=None, - precompute=False, - rate=train_mux_rate, - filter_file=train_filter_file) - - for sub_index, batch in enumerate(train_gen): - write_to_h5('{}/{}_{}.h5'.format(output_dir, index, sub_index), batch) - - -def map_iterate_in_parallel(iterable, function, processes=8): - pool = Pool(processes=processes) - output = pool.map(function, iterable) - return list(output) - if __name__ == '__main__': parser = argparse.ArgumentParser(description='Pre-sample videos and audios for L3 model.') - parser.add_argument('-tbs', - '--train-batch-size', - dest='train_batch_size', + parser.add_argument('-bs', + '--batch-size', + dest='batch_size', action='store', type=int, default=64, help='Number of examples per training batch') - parser.add_argument('-tns', - '--train-num-streamers', - dest='train_num_streamers', + parser.add_argument('-ns', + '--num-streamers', + dest='num_streamers', action='store', type=int, default=64, help='Number of training pescador streamers that can be open concurrently') - parser.add_argument('-tmr', - '--train-mux-rate', - dest='train_mux_rate', + parser.add_argument('-mr', + '--mux-rate', + dest='mux_rate', action='store', type=float, default=2.0, help='Poisson distribution parameter for determining number of training samples to take from a streamer') + parser.add_argument('-a', + '--augment', + dest='augment', + action='store_true', + default=False, + help='If True, performs data augmentation on audio and images') + + parser.add_argument('-pc', + '--precompute', + dest='precompute', + action='store_true', + default=False, + help='If True, streamer precompute samples') + + parser.add_argument('-nd', + '--num-distractors', + dest='num_distractors', + action='store', + type=int, + default=1, + help='Number of distractors for generating examples') + + parser.add_argument('-im', + '--include-metadata', + dest='include_metadata', + action='store_true', + help='If True, includes additional metadata in h5 files') + + parser.add_argument('-mv', + '--max-videos', + dest='max_videos', + action='store', + type=int, + help='Maximum number of videos to use for generating examples. 
If not specified, all videos will be used') + + parser.add_argument('-r', + '--random-state', + dest='random_state', + action='store', + type=int, + default=20171021, + help='Random seed used to set the RNG state') + parser.add_argument('-n', '--num-workers', dest='num_workers', @@ -731,17 +87,10 @@ def map_iterate_in_parallel(iterable, function, processes=8): default=4, help='Number of multiprocessing workers used to download videos') - parser.add_argument('-tff', - '--train-filter-file', - dest='train_filter_file', + parser.add_argument('subset_path', action='store', type=str, - default=None) - - parser.add_argument('train_data_dir', - action='store', - type=str, - help='Path to directory where training set files are stored') + help='Path to subset file') parser.add_argument('output_dir', action='store', @@ -750,21 +99,21 @@ def map_iterate_in_parallel(iterable, function, processes=8): args = parser.parse_args() - # train_data_dir = '/beegfs/work/AudioSet/data' - # train_filter_file = '/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv' - # train_batch_size = 512 - # train_num_streamers = 64 - # train_mux_rate = 2 - # output_dir = '/scratch/hhw230/train' - # num_workers = 16 + init_console_logger(LOGGER, verbose=True) + + worker_func = partial(sample_and_save, + subset_path=args.subset_path, + output_dir=args.output_dir, + num_streamers=args.num_streamers, + batch_size=args.batch_size, + random_state=args.random_state, + precompute=args.precompute, + num_distractors=args.num_distractors, + augment=args.augment, + rate=args.mux_rate, + max_videos=args.max_videos, + include_metadata=args.include_metadata) - train_data_dir = args.train_data_dir - train_filter_file = args.train_filter_file - train_batch_size = args.train_batch_size - train_num_streamers = args.train_num_streamers - train_mux_rate = args.train_mux_rate - output_dir = args.output_dir num_workers = args.num_workers - - map_iterate_in_parallel(range(num_workers), sample_and_save, + map_iterate_in_parallel(range(num_workers), worker_func, processes=num_workers) From cd4ec7317acd1c8f2603669794db1b72212e7343 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 22 Jan 2018 17:38:27 -0500 Subject: [PATCH 062/201] Allow user to choose number of samples generated --- 02_generate_samples.py | 31 +++++++++-- data/sample.py | 15 ++++-- sample.py | 119 ----------------------------------------- 3 files changed, 38 insertions(+), 127 deletions(-) delete mode 100644 sample.py diff --git a/02_generate_samples.py b/02_generate_samples.py index 47c1992..362fe34 100644 --- a/02_generate_samples.py +++ b/02_generate_samples.py @@ -1,5 +1,7 @@ import argparse import logging +import multiprocessing_logging +import math from functools import partial from log import init_console_logger from data.sample import sample_and_save @@ -7,7 +9,7 @@ LOGGER = logging.getLogger('sampling') -LOGGER.setLevel(logging.ERROR) +LOGGER.setLevel(logging.DEBUG) if __name__ == '__main__': @@ -87,11 +89,24 @@ default=4, help='Number of multiprocessing workers used to download videos') + parser.add_argument('-v', + '--verbose', + dest='verbose', + action='store_true', + default=False, + help='Logs verbose info') + + parser.add_argument('subset_path', action='store', type=str, help='Path to subset file') + parser.add_argument('num_samples', + action='store', + type=int, + help='(Minimum) number of samples to generate') + parser.add_argument('output_dir', action='store', type=str, @@ -99,13 +114,20 @@ args = parser.parse_args() - init_console_logger(LOGGER, 
verbose=True) + init_console_logger(LOGGER, verbose=args.verbose) + multiprocessing_logging.install_mp_handler() + + # Just round up for now + num_workers = args.num_workers + batch_size = args.batch_size + batches_per_worker = int(math.ceil(args.num_samples / (num_workers * batch_size))) worker_func = partial(sample_and_save, subset_path=args.subset_path, + num_batches=batches_per_worker, output_dir=args.output_dir, num_streamers=args.num_streamers, - batch_size=args.batch_size, + batch_size=batch_size, random_state=args.random_state, precompute=args.precompute, num_distractors=args.num_distractors, @@ -114,6 +136,7 @@ max_videos=args.max_videos, include_metadata=args.include_metadata) - num_workers = args.num_workers map_iterate_in_parallel(range(num_workers), worker_func, processes=num_workers) + + LOGGER.info('Done!') diff --git a/data/sample.py b/data/sample.py index 3c6bf00..a351986 100644 --- a/data/sample.py +++ b/data/sample.py @@ -499,7 +499,7 @@ def sampler(video_1, video_2, rate=32, augment=False, precompute=False, include_ def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, precompute=False, num_distractors=1, augment=False, rate=32, - max_videos=None, include_metadata=False): + max_videos=None, include_metadata=False, cycle=False): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -543,6 +543,9 @@ def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, random.shuffle(seeds) mux = pescador.Mux(seeds, k, rate=rate) + if cycle: + mux = mux.cycle() + if batch_size == 1: return mux else: @@ -555,9 +558,10 @@ def write_to_h5(path, batch): f.create_dataset(key, data=batch[key], compression='gzip') -def sample_and_save(index, subset_path, output_dir, num_streamers=32, batch_size=64, - random_state=20171021, precompute=False, num_distractors=1, - augment=False, rate=32, max_videos=None, include_metadata=False): +def sample_and_save(index, subset_path, num_batches, output_dir, + num_streamers=32, batch_size=64, random_state=20171021, + precompute=False, num_distractors=1, augment=False, rate=32, + max_videos=None, include_metadata=False): data_gen = data_generator( subset_path, batch_size=batch_size, @@ -576,3 +580,6 @@ def sample_and_save(index, subset_path, output_dir, num_streamers=32, batch_size for sub_index, batch in enumerate(data_gen): batch_path = os.path.join(output_dir, '{}_{}.h5'.format(index, sub_index)) write_to_h5(batch_path, batch) + + if sub_index == (num_batches - 1): + break diff --git a/sample.py b/sample.py deleted file mode 100644 index 47c1992..0000000 --- a/sample.py +++ /dev/null @@ -1,119 +0,0 @@ -import argparse -import logging -from functools import partial -from log import init_console_logger -from data.sample import sample_and_save -from data.utils import map_iterate_in_parallel - - -LOGGER = logging.getLogger('sampling') -LOGGER.setLevel(logging.ERROR) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Pre-sample videos and audios for L3 model.') - parser.add_argument('-bs', - '--batch-size', - dest='batch_size', - action='store', - type=int, - default=64, - help='Number of examples per training batch') - - parser.add_argument('-ns', - '--num-streamers', - dest='num_streamers', - action='store', - type=int, - default=64, - help='Number of training pescador streamers that can be open concurrently') - - parser.add_argument('-mr', - '--mux-rate', - dest='mux_rate', - action='store', - type=float, - default=2.0, - help='Poisson 
distribution parameter for determining number of training samples to take from a streamer') - - parser.add_argument('-a', - '--augment', - dest='augment', - action='store_true', - default=False, - help='If True, performs data augmentation on audio and images') - - parser.add_argument('-pc', - '--precompute', - dest='precompute', - action='store_true', - default=False, - help='If True, streamer precompute samples') - - parser.add_argument('-nd', - '--num-distractors', - dest='num_distractors', - action='store', - type=int, - default=1, - help='Number of distractors for generating examples') - - parser.add_argument('-im', - '--include-metadata', - dest='include_metadata', - action='store_true', - help='If True, includes additional metadata in h5 files') - - parser.add_argument('-mv', - '--max-videos', - dest='max_videos', - action='store', - type=int, - help='Maximum number of videos to use for generating examples. If not specified, all videos will be used') - - parser.add_argument('-r', - '--random-state', - dest='random_state', - action='store', - type=int, - default=20171021, - help='Random seed used to set the RNG state') - - parser.add_argument('-n', - '--num-workers', - dest='num_workers', - action='store', - type=int, - default=4, - help='Number of multiprocessing workers used to download videos') - - parser.add_argument('subset_path', - action='store', - type=str, - help='Path to subset file') - - parser.add_argument('output_dir', - action='store', - type=str, - help='Path to directory where output files will be stored') - - args = parser.parse_args() - - init_console_logger(LOGGER, verbose=True) - - worker_func = partial(sample_and_save, - subset_path=args.subset_path, - output_dir=args.output_dir, - num_streamers=args.num_streamers, - batch_size=args.batch_size, - random_state=args.random_state, - precompute=args.precompute, - num_distractors=args.num_distractors, - augment=args.augment, - rate=args.mux_rate, - max_videos=args.max_videos, - include_metadata=args.include_metadata) - - num_workers = args.num_workers - map_iterate_in_parallel(range(num_workers), worker_func, - processes=num_workers) From b39a724f5f81b2d060bca1ccedc1632f061095f7 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 23 Jan 2018 12:59:53 -0500 Subject: [PATCH 063/201] Add example sbatch script for generating samples --- jobs/generate_samples.sbatch | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 jobs/generate_samples.sbatch diff --git a/jobs/generate_samples.sbatch b/jobs/generate_samples.sbatch new file mode 100644 index 0000000..678fc9d --- /dev/null +++ b/jobs/generate_samples.sbatch @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-samples-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=64GB +#SBATCH --time=1-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-samples-audioset-%j.out" +#SBATCH --err="generate-samples-audioset-%j.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR='' +DATA_DIR='' +OUTPUT_DIR='' +SUBSET_PATH='' + +module purge + +python $SRCDIR/02_generate_samples.py \ + --batch-size 16 \ + --num-streamers 64 \ + --mux-rate 2 \ + --augment \ + --precompute \ + --num-workers 2 \ + --num-distractors 1 \ + --random-state 20180118 \ + $SUBSET_PATH \ + 1000 \ + $OUTPUT_DIR From 95fb2a0ae9c4556aa10495153191b37abe87b414 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 23 Jan 2018 13:04:48 -0500 Subject: [PATCH 064/201] Add 
random state to batch file name --- data/sample.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/sample.py b/data/sample.py index a351986..6b35c19 100644 --- a/data/sample.py +++ b/data/sample.py @@ -272,7 +272,8 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): 'brightness_delta': brightness_delta }) - frame_data = skimage.img_as_float32(frame_data) + # Make in range [-1,1] + frame_data = skimage.img_as_float32(frame_data) * 2 - 1 return frame_data, frame / fps, video_aug_params @@ -578,7 +579,7 @@ def sample_and_save(index, subset_path, num_batches, output_dir, os.makedirs(output_dir) for sub_index, batch in enumerate(data_gen): - batch_path = os.path.join(output_dir, '{}_{}.h5'.format(index, sub_index)) + batch_path = os.path.join(output_dir, '{}_{}_{}.h5'.format(random_state, index, sub_index)) write_to_h5(batch_path, batch) if sub_index == (num_batches - 1): From 443c8c58b0b3999b5de4419d7d44b01fa9db8ad6 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Tue, 23 Jan 2018 13:39:57 -0500 Subject: [PATCH 065/201] output exact random_state used --- data/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/sample.py b/data/sample.py index 6b35c19..7cde690 100644 --- a/data/sample.py +++ b/data/sample.py @@ -579,7 +579,7 @@ def sample_and_save(index, subset_path, num_batches, output_dir, os.makedirs(output_dir) for sub_index, batch in enumerate(data_gen): - batch_path = os.path.join(output_dir, '{}_{}_{}.h5'.format(random_state, index, sub_index)) + batch_path = os.path.join(output_dir, '{}_{}_{}.h5'.format(random_state, random_state + index, sub_index)) write_to_h5(batch_path, batch) if sub_index == (num_batches - 1): From ac4e413fc1f9c9ab0dc18043e98d85401a91beed Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 23 Jan 2018 16:54:15 -0500 Subject: [PATCH 066/201] Make mux cycling True by default, fix unicode issue when writing batches to H5 files, remove image preprocessing (this will be done when loading the data), and change some field names --- data/sample.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/data/sample.py b/data/sample.py index 7cde690..3346c9c 100644 --- a/data/sample.py +++ b/data/sample.py @@ -272,10 +272,6 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): 'brightness_delta': brightness_delta }) - # Make in range [-1,1] - frame_data = skimage.img_as_float32(frame_data) * 2 - 1 - - return frame_data, frame / fps, video_aug_params @@ -373,12 +369,12 @@ def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, } if include_metadata: - sample['audio_file'] = audio_file - sample['video_file'] = video_file - sample['audio_start'] = audio_start - sample['video_start'] = video_start - sample.update(flatten_dict(audio_aug_params)) - sample.update(flatten_dict(video_aug_params)) + sample['audio_file'] = audio_file.encode('utf-8') + sample['video_file'] = video_file.encode('utf-8') + sample['audio_start_ts'] = audio_start + sample['video_start_ts'] = video_start + sample.update(flatten_dict(audio_aug_params, 'audio')) + sample.update(flatten_dict(video_aug_params, 'video')) return sample @@ -500,7 +496,7 @@ def sampler(video_1, video_2, rate=32, augment=False, precompute=False, include_ def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, precompute=False, num_distractors=1, augment=False, rate=32, - max_videos=None, include_metadata=False, cycle=False): + max_videos=None, 
include_metadata=False, cycle=True): """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. Args: @@ -521,7 +517,7 @@ def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, file_list = read_csv_as_dicts(subset_path) LOGGER.info("Creating streamers...") - if max_videos is not None and max_videos < len(video_files): + if max_videos is not None and max_videos < len(file_list): LOGGER.info("Using a subset of {} videos".format(max_videos)) random.shuffle(file_list) file_list = file_list[:max_videos] @@ -579,7 +575,7 @@ def sample_and_save(index, subset_path, num_batches, output_dir, os.makedirs(output_dir) for sub_index, batch in enumerate(data_gen): - batch_path = os.path.join(output_dir, '{}_{}_{}.h5'.format(random_state, random_state + index, sub_index)) + batch_path = os.path.join(output_dir, '{}_{}.h5'.format(random_state + index, index, sub_index)) write_to_h5(batch_path, batch) if sub_index == (num_batches - 1): From 9f60e0d188e729af909a41ae79052c4d6abc493c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 23 Jan 2018 19:40:52 -0500 Subject: [PATCH 067/201] Change generator to load from files, allow continuing previous training runs, clean up train functions and module, rename train script --- train.py => 03_train_embedding.py | 114 +---- l3embedding/train.py | 793 ++++++------------------------ 2 files changed, 158 insertions(+), 749 deletions(-) rename train.py => 03_train_embedding.py (51%) diff --git a/train.py b/03_train_embedding.py similarity index 51% rename from train.py rename to 03_train_embedding.py index 7a2c60b..6d6025f 100644 --- a/train.py +++ b/03_train_embedding.py @@ -1,4 +1,5 @@ import argparse +import logging import os.path from l3embedding.train import * @@ -54,54 +55,6 @@ def parse_arguments(): default=64, help='Number of examples per batch') - parser.add_argument('-tns', - '--train-num-streamers', - dest='train_num_streamers', - action='store', - type=int, - default=32, - help='Number of training pescador streamers that can be open concurrently') - - parser.add_argument('-vns', - '--validation-num-streamers', - dest='validation_num_streamers', - action='store', - type=int, - default=32, - help='Number of validation pescador streamers that can be open concurrently') - - parser.add_argument('-tnd', - '--train-num-distractors', - dest='train_num_distractors', - action='store', - type=int, - default=1, - help='Number of distractors for generating training examples') - - parser.add_argument('-vnd', - '--validation-num-distractors', - dest='validation_num_distractors', - action='store', - type=int, - default=2, - help='Number of distractors for generating validation examples') - - parser.add_argument('-tmr', - '--train-mux-rate', - dest='train_mux_rate', - action='store', - type=float, - default=16.0, - help='Poisson distribution parameter for determining number of training samples to take from a streamer') - - parser.add_argument('-vmr', - '--validation-mux-rate', - dest='validation_mux_rate', - action='store', - type=float, - default=16.0, - help='Poisson distribution parameter for determining number of validation samples to take from a streamer') - parser.add_argument('-lr', '--learning-rate', dest='learning_rate', @@ -126,28 +79,6 @@ def parse_arguments(): default=10, help='The number of epochs between model checkpoints') - parser.add_argument('-o', - '--ontology-path', - dest='ontology_path', - action='store', - type=str, - default=os.path.join(os.path.dirname(__file__), 'resources/ontology.json'), - 
help='Path to AudioSet ontology') - - parser.add_argument('-tmv', - '--train-max-videos', - dest='train_max_videos', - action='store', - type=int, - help='Maximum number of videos to use for training. If not specified, all videos will be used') - - parser.add_argument('-vmv', - '--validation-max-videos', - dest='validation_max_videos', - action='store', - type=int, - help='Maximum number of videos to use for validation. If not specified, all videos will be used') - parser.add_argument('-r', '--random-state', dest='random_state', @@ -156,20 +87,6 @@ def parse_arguments(): default=20171021, help='Random seed used to set the RNG state') - parser.add_argument('-a', - '--augment', - dest='augment', - action='store_true', - default=False, - help='If True, performs data augmentation on audio and images') - - parser.add_argument('-pc', - '--precompute', - dest='precompute', - action='store_true', - default=False, - help='If True, streamer precompute samples') - parser.add_argument('--gpus', dest='gpus', type=int, @@ -183,33 +100,12 @@ def parse_arguments(): default=False, help='If True, print detailed messages') - parser.add_argument('-tmp', - '--train-metadata-path', - dest='train_metadata_path', - action='store', - type=str, - help='Path to training csv file(s). Accepts a glob string.') - - parser.add_argument('-vmp', - '--validation-metadata-path', - dest='validation_metadata_path', - action='store', - type=str, - help='Path to validation csv file. Accepts a glob string.') - - parser.add_argument('-tfp', - '--train-filter-path', - dest='train_filter_path', - action='store', - type=str, - help='Path to training csv file(s). Accepts a glob string.') - - parser.add_argument('-vfp', - '--validation-filter-path', - dest='validation_filter_path', + parser.add_argument('-cmd', + '--continue-model-dir', + dest='continue_model_dir', action='store', type=str, - help='Path to validationing csv file(s). 
Accepts a glob string.') + help='Path to directory containing a model with which to resume training') parser.add_argument('-lp', '--log-path', diff --git a/l3embedding/train.py b/l3embedding/train.py index dd46333..e9a9d1b 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,4 +1,3 @@ -import glob import json import math import datetime @@ -7,8 +6,10 @@ import pickle import random import warnings +import itertools import csv +import numpy as np import keras from keras.optimizers import Adam import pescador @@ -17,604 +18,15 @@ import soundfile as sf from tqdm import tqdm -from .image import * from .model import MODELS from .training_utils import multi_gpu_model -from audioset.ontology import ASOntology from log import * +import h5py LOGGER = logging.getLogger('l3embedding') LOGGER.setLevel(logging.DEBUG) -#TODO: Consider putting the sampling functionality into another file - -def get_filename(path): - """Return the filename of a path - - Args: path: path to file - - Returns: - filename: name of file (without extension) - """ - return os.path.splitext(os.path.basename(path))[0] - - -def load_metadata(metadata_path): - metadata = {} - for path in glob.glob(metadata_path): - with open(path, 'r') as f: - for idx, line in enumerate(f): - if idx in (0, 1): - continue - elif idx == 2: - fields = [field.strip() for field in line.lstrip('# ').rstrip().split(', ')] - else: - row = [val.strip() for val in line.strip().split(', ')] - ytid = row[0] - - entry = {field: val - for field, val in zip(fields[1:], row[1:])} - - entry['positive_labels'] = entry['positive_labels'].strip('"').split(',') - entry['start_seconds'] = float(entry['start_seconds']) - entry['end_seconds'] = float(entry['end_seconds']) - - metadata[ytid] = entry - - return metadata - - -def load_filters(filter_path): - filters = [] - - with open(filter_path, 'r') as f: - reader = csv.DictReader(f) - for row in reader: - filters.append(row) - - return filters - -def get_ytid_from_filename(filename): - first_us_idx = filename.rindex('_') - second_us_idx = filename.rindex('_', 0, first_us_idx) - return filename[:second_us_idx] - - -def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path=None): - """Return audio and video file list. 
- - Args: - data_dir: input directory that contains audio/ and video/ - - Keyword Args: - metadata_path: Path to audioset metadata file - filter_path: Path to filter specification file - ontology_path: Path to AudioSet ontology file - - Returns: - audio_files: list of audio files - video_files: list of video files - - """ - data_dir_contents = set(os.listdir(data_dir)) - if 'audio' in data_dir_contents and 'video' in data_dir_contents: - audio_files = glob.glob('{}/audio/*'.format(data_dir)) - video_files = glob.glob('{}/video/*'.format(data_dir)) - else: - audio_files = glob.glob('{}/**/audio/*'.format(data_dir)) - video_files = glob.glob('{}/**/video/*'.format(data_dir)) - - # Make sure that audio files and video files correspond to each other - audio_filenames = set([get_filename(path) for path in audio_files]) - video_filenames = set([get_filename(path) for path in video_files]) - - valid_filenames = audio_filenames & video_filenames - - if metadata_path and filter_path: - LOGGER.info('Filtering examples...') - if not ontology_path: - err_msg = 'Must provide ontology path to filter' - LOGGER.error(err_msg) - raise ValueError(err_msg) - - ontology = ASOntology(ontology_path) - - metadata = load_metadata(metadata_path) - filters = load_filters(filter_path) - - filtered_filenames = [] - - for filename in valid_filenames: - ytid = get_ytid_from_filename(filename) - video_metadata = metadata[ytid] - - video_labels = [ontology.get_node(label_id).name.lower() - for label_id in video_metadata['positive_labels']] - - accept = True - for _filter in filters: - filter_type = _filter['filter_type'] - filter_accept = _filter['accept_reject'] == 'accept' - string = _filter['string'] - - if filter_type == 'ytid': - match = ytid == string - - elif filter_type == 'label': - match = string.lower() in video_labels - - # TODO: check this logic - if match == filter_accept: - accept = False - - if accept: - #LOGGER.debug('Using video: "{}"'.format(filename)) - filtered_filenames.append(filename) - - valid_filenames = set(filtered_filenames) - - LOGGER.info('Total videos used: {}'.format(len(valid_filenames))) - audio_files = [path for path in audio_files if get_filename(path) in valid_filenames] - video_files = [path for path in video_files if get_filename(path) in valid_filenames] - - return audio_files, video_files - - -def video_to_audio(video_file): - """Return corresponding audio_file. 
- - Args: - video_file: video_file - - Returns: - audio_file - - """ - - *path, _, name = video_file.split('/') - name = name.split('.')[0] + '.flac' - return '/'.join(path + ['audio', name]) - - -def sample_one_second(audio_data, sampling_frequency, augment=False): - """Return one second audio samples randomly - - Args: - audio_data: audio data to sample from - sampling_frequency: audio sample rate - augment: if True, perturb the data in some fashion - - Returns: - One second samples, start time, and augmentation parameters - - """ - sampling_frequency = int(sampling_frequency) - if len(audio_data) > sampling_frequency: - start = random.randrange(len(audio_data) - sampling_frequency) - else: - start = 0 - - with LogTimer(LOGGER, 'Slicing audio'): - audio_data = audio_data[start:start+sampling_frequency] - - if audio_data.shape[0] != sampling_frequency: - # Pad audio that isn't one second - warnings.warn('Got audio that is less than one second', UserWarning) - with LogTimer(LOGGER, 'Slicing audio'): - audio_data = np.pad(audio_data, - ((0, sampling_frequency - audio_data.shape[0]), (0,0)), - mode='constant') - if augment: - # Make sure we don't clip - if np.abs(audio_data).max(): - max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) - else: - # TODO: Handle audio with all zeros - warnings.warn('Got audio sample with all zeros', UserWarning) - max_gain = 0.1 - gain = 1 + random.uniform(-0.1, max_gain) - with LogTimer(LOGGER, 'Applying gain to audio'): - audio_data *= gain - audio_aug_params = {'gain': gain} - else: - audio_aug_params = {} - - return audio_data, start / sampling_frequency, audio_aug_params - - -def sample_cropped_frame(frame_data): - """ - Randomly crop a video frame, using the method from Look, Listen and Learn - - - Args: - frame_data: video frame data array - - Returns: - scaled_frame_data: scaled and cropped frame data - bbox: bounding box for the cropped image - """ - nx, ny, nc = frame_data.shape - start_x, start_y = random.randrange(nx - 224), random.randrange(ny - 224) - end_x, end_y = start_x + 224, start_y + 224 - - bbox = { - 'start_x': start_x, - 'start_y': start_y, - 'end_x': end_x, - 'end_y': end_y - } - - with LogTimer(LOGGER, 'Cropping frame'): - frame_data = frame_data[start_x:end_x, start_y:end_y, :] - - return frame_data, bbox - - -def sample_one_frame(video_data, start=None, fps=30, augment=False): - """Return one frame randomly and time (seconds). - - Args: - video_data: video data to sample from - - Keyword Args: - start: start time of a one second window from which to sample - fps: frame per second - augment: if True, perturb the data in some fashion - - Returns: - One frame sampled randomly, start time in seconds, and augmentation parameters - - """ - - num_frames = len(video_data) - if start is not None: - start_frame = int(start * fps) - # Sample frame from a one second window, or until the end of the video - # if the video is less than a second for some reason - # Audio should always be sampled one second from the end of the audio, - # so video frames we're sampling from should also be a second. 
If it's - # not, then our video is probably less than a second - duration = min(fps, num_frames - start_frame) - if duration != fps: - warnings.warn('Got video that is less than one second', UserWarning) - - if duration > 0: - frame = start_frame + random.randrange(duration) - else: - warnings.warn('Got video with only a single frame', UserWarning) - # For robustness, use the last frame if the start_frame goes past - # the end of video frame - frame = min(start_frame, num_frames - 1) - else: - frame = random.randrange(num_frames) - - frame_data = video_data[frame] - frame_data, bbox = sample_cropped_frame(frame_data) - - video_aug_params = {'bounding_box': bbox} - - if augment: - # Randomly horizontally flip the image - horizontal_flip = False - if random.random() < 0.5: - with LogTimer(LOGGER, 'Flipping frame'): - frame_data = horiz_flip(frame_data) - horizontal_flip = True - - - # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py - - # Randomize the order of saturation jitter and brightness jitter - if random.random() < 0.5: - # Add saturation jitter - saturation_factor = np.float32(random.random() + 0.5) - with LogTimer(LOGGER, 'Adjusting saturation'): - frame_data = adjust_saturation(frame_data, saturation_factor) - - # Add brightness jitter - max_delta = 32. / 255. - brightness_delta = np.float32((2*random.random() - 1) * max_delta) - with LogTimer(LOGGER, 'Adjusting brightness'): - frame_data = adjust_brightness(frame_data, brightness_delta) - else: - # Add brightness jitter - max_delta = 32. / 255. - brightness_delta = np.float32((2*random.random() - 1) * max_delta) - with LogTimer(LOGGER, 'Adjusting brightness'): - frame_data = adjust_brightness(frame_data, brightness_delta) - - # Add saturation jitter - saturation_factor = np.float32(random.random() + 0.5) - with LogTimer(LOGGER, 'Adjusting saturation'): - frame_data = adjust_saturation(frame_data, saturation_factor) - - video_aug_params.update({ - 'horizontal_flip': horizontal_flip, - 'saturation_factor': saturation_factor, - 'brightness_delta': brightness_delta - }) - - frame_data = skimage.img_as_float32(frame_data) - - - return frame_data, frame / fps, video_aug_params - - -def read_video(video_path): - """ - Read a video file as a numpy array - - Resizes frames so that the minimum side is 256 pixels - - Args: - video_path: Path to video file - - Returns: - video: Numpy data array - - """ - vinfo = ffprobe(video_path)['video'] - width = int(vinfo['@width']) - height = int(vinfo['@height']) - - scaling = 256.0 / min(width, height) - new_width = math.ceil(scaling * width) - new_height = math.ceil(scaling * height) - - # Resize frames - reader = FFmpegReader(video_path, - outputdict={'-s': "{}x{}".format(new_width, - new_height) }) - - frames = [] - for frame in reader.nextFrame(): - frames.append(frame) - - return frames - - -def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, - video_file_1, video_data_1, video_file_2, video_data_2, - audio_sampling_frequency, augment=False): - """ - Generate a sample from the given audio and video files - - The video from which the audio come from is chosen with a fair coin, as is - the video frame. Thus, there is a 50% chance of producing a positive or - negative example. 
- - Args: - audio_file_1: audio filename - audio_data_1: audio data array - audio_file_2: audio filename - audio_data_2: audio data array - video_file_1: video filename - video_data_1: video data array - video_file_2: video filename - video_data_2: video data array - audio_sampling_frequency: audio sample rate - - Keyword Args - augment: If True, perform data augmention - - Returns: - sample: sample dictionary - """ - video_choice = random.random() < 0.5 - audio_choice = random.random() < 0.5 - - if audio_choice: - audio_file = audio_file_1 - audio_data = audio_data_1 - else: - audio_file = audio_file_2 - audio_data = audio_data_2 - - if video_choice: - video_file = video_file_1 - video_data = video_data_1 - else: - video_file = video_file_2 - video_data = video_data_2 - - label = int(video_choice != audio_choice) - - sample_audio_data, audio_start, audio_aug_params \ - = sample_one_second(audio_data, audio_sampling_frequency, augment=augment) - - sample_video_data, video_start, video_aug_params \ - = sample_one_frame(video_data, start=audio_start, augment=augment) - - sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) - - sample = { - 'video': np.ascontiguousarray(sample_video_data), - 'audio': np.ascontiguousarray(sample_audio_data), - 'label': np.ascontiguousarray(np.array([label, 1 - label])), -# 'audio_file': audio_file, -# 'video_file': video_file, -# 'audio_start': audio_start, -# 'video_start': video_start, -# 'audio_augment_params': audio_aug_params, -# 'video_augment_params': video_aug_params - } - - return sample - - -def sampler(video_file_1, video_file_2, rate=32, augment=False, precompute=False): - """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file, - 50% chance sample one second from another audio_file in the list of audio_files. - - Args: - video_file_1: video_file to sample from - video_file_2: candidate audio_files to sample from - - Keyword Args: - rate: Poisson rate parameter. Used for precomputing samples - augment: If True, perform data augmention - precompute: If True, precompute samples during initialization so that - memory can be discarded - - Returns: - A generator that yields dictionary of video sample, audio sample, - and label (0: not from corresponding files, 1: from corresponding files) - - """ - debug_msg = 'Initializing streamer with videos "{}" and "{}"' - LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) - audio_file_1 = video_to_audio(video_file_1) - audio_file_2 = video_to_audio(video_file_2) - - # Hack: choose a number of samples such that we with high probability, we - # won't run out of samples, but is also less than the entire length of - # the video so we don't have to resize all of the frames - num_samples = int(scipy.stats.poisson.ppf(0.999, rate)) - - - try: - with LogTimer(LOGGER, 'Reading video'): - video_data_1 = read_video(video_file_1) - except Exception as e: - warn_msg = 'Could not open video file {} - {}: {}; Skipping...' - warn_msg = warn_msg.format(video_file_1, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - try: - with LogTimer(LOGGER, 'Reading video'): - video_data_2 = read_video(video_file_2) - except Exception as e: - warn_msg = 'Could not open video file {} - {}: {}; Skipping...' 
- warn_msg = warn_msg.format(video_file_2, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - try: - with LogTimer(LOGGER, 'Reading audio'): - audio_data_1, sampling_frequency = sf.read(audio_file_1, - dtype='float32', - always_2d=True) - except Exception as e: - warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' - warn_msg = warn_msg.format(audio_file_1, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - try: - with LogTimer(LOGGER, 'Reading audio'): - audio_data_2, sampling_frequency = sf.read(audio_file_2, - dtype='float32', - always_2d=True) - except Exception as e: - warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' - warn_msg = warn_msg.format(audio_file_2, type(e), e) - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - raise StopIteration() - - if precompute: - samples = [] - for _ in range(num_samples): - sample = generate_sample( - audio_file_1, audio_data_1, audio_file_2, audio_data_2, - video_file_1, video_data_1, video_file_2, video_data_2, - sampling_frequency, augment=augment) - - samples.append(sample) - - # Clear the data from memory - video_data_1 = None - video_data_2 = None - audio_data_1 = None - audio_data_2 = None - video_data = None - audio_data = None - del video_data_1 - del video_data_2 - del audio_data_1 - del audio_data_2 - del video_data - del audio_data - - while samples: - # Yield the sample, and remove from the list to free up some memory - yield samples.pop() - else: - while True: - yield generate_sample( - audio_file_1, audio_data_1, audio_file_2, audio_data_2, - video_file_1, video_data_1, video_file_2, video_data_2, - sampling_frequency, augment=augment) - - raise StopIteration() - - - -def data_generator(data_dir, metadata_path=None, filter_path=None, ontology_path=None, - k=32, batch_size=64, random_state=20171021, precompute=False, - num_distractors=1, augment=False, rate=32, max_videos=None): - """Sample video and audio from data_dir, returns a streamer that yield samples infinitely. 
- - Args: - data_dir: directory to sample video and audio from - metadata_path: Path to audioset metadata file - filter_path: Path to filter specification file - ontology_path: Path to AudioSet ontology file - k: number of concurrent open streamer - batch_size: batch size - random_state: Value used to initialize state of RNG - num_distractors: Number of pairs to generate a stream for each video - - Returns: - A generator that yield infinite video and audio samples from data_dir - - """ - - random.seed(random_state) - - LOGGER.info("Getting file list...") - _, video_files = get_file_list(data_dir, metadata_path=metadata_path, - filter_path=filter_path, ontology_path=ontology_path) - - LOGGER.info("Creating streamers...") - if max_videos is not None and max_videos < len(video_files): - LOGGER.info("Using a subset of {} videos".format(max_videos)) - random.shuffle(video_files) - video_files = video_files[:max_videos] - - seeds = [] - for video_file_1 in tqdm(video_files): - for _ in range(num_distractors): - video_file_2 = video_file_1 - # Make sure we sample a different file - while video_file_2 == video_file_1: - video_file_2 = random.choice(video_files) - - #debug_msg = 'Created streamer for videos "{}" and "{}' - #LOGGER.debug(debug_msg.format(video_file_1, video_file_2)) - streamer = pescador.Streamer(sampler, video_file_1, video_file_2, - rate=rate, augment=augment, - precompute=precompute) - # ZMQ streamers make the script stall inexplicably - #streamer = pescador.ZMQStreamer(streamer) - seeds.append(streamer) - - # Randomly shuffle the seeds - random.shuffle(seeds) - - mux = pescador.Mux(seeds, k, rate=rate) - if batch_size == 1: - return mux - else: - return pescador.maps.buffer_stream(mux, batch_size) - - class LossHistory(keras.callbacks.Callback): """ Keras callback to record loss history @@ -668,35 +80,117 @@ def on_batch_end(self, batch, logs=None): self.batch_times.append(t) -#def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512, +def cycle_shuffle(iterable, shuffle=True): + lst = list(iterable) + while True: + yield from lst + if shuffle: + random.shuffle(lst) + + +def data_generator(data_dir, batch_size=512, random_state=20180123, + start_batch_idx=None, keys=None): + random.seed(random_state) + + batch = None + curr_batch_size = 0 + batch_idx = 0 + + # Limit keys to avoid producing batches with all of the metadata fields + if not keys: + keys = ['audio', 'video', 'label'] + + for fname in cycle_shuffle(os.listdir(data_dir)): + batch_path = os.path.join(data_dir, fname) + blob_start_idx = 0 + + blob = h5py.File(batch_path, 'r') + blob_size = len(blob['label']) + + while blob_start_idx < blob_size: + blob_end_idx = min(blob_start_idx + batch_size - curr_batch_size, blob_size) + + print(batch_path, blob_start_idx, blob_end_idx, curr_batch_size) + # If we are starting from a particular batch, skip computing all of + # the prior batches + if start_batch_idx is None or batch_idx >= start_batch_idx: + if batch is None: + batch = {k:blob[k][blob_start_idx:blob_end_idx] + for k in keys} + else: + for k in keys: + batch[k] = np.concatenate([batch[k], + blob[k][blob_start_idx:blob_end_idx]]) + + curr_batch_size += blob_end_idx - blob_start_idx + blob_start_idx = blob_end_idx + + if blob_end_idx == blob_size: + blob.close() + + if curr_batch_size == batch_size: + # If we are starting from a particular batch, skip yielding all + # of the prior batches + if start_batch_idx is None or batch_idx >= start_batch_idx: + # Preprocess video so samples are in [-1,1] + 
batch['video'] = 2 * batch['video'] - 1 + + yield batch + + batch_idx += 1 + curr_batch_size = 0 + batch = None + + +def single_epoch_data_generator(data_dir, epoch_size, **kwargs): + while True: + data_gen = data_generator(data_dir, **kwargs) + for idx, item in enumerate(data_gen): + yield item + # Once we generate all batches for an epoch, restart the generator + if (idx + 1) == epoch_size: + break + + +def get_restart_info(history_path): + last = None + with open(history_path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + last = row + + return int(last['epoch']), float(last['val_acc']), float(last['val_loss']) + + def train(train_data_dir, validation_data_dir, model_id, output_dir, - num_epochs=150, - train_metadata_path=None, validation_metadata_path=None, - train_filter_path=None, validation_filter_path=None, - train_epoch_size=512, validation_epoch_size=1024, + num_epochs=150, train_epoch_size=512, validation_epoch_size=1024, train_batch_size=64, validation_batch_size=64, - train_num_streamers=16, validation_num_streamers=16, - train_num_distractors=1, validation_num_distractors=2, - model_type='cnn_L3_orig', train_mux_rate=16, validation_mux_rate=16, - learning_rate=1e-4, random_state=20171021, train_max_videos=None, - validation_max_videos=None, verbose=False, checkpoint_interval=10, - ontology_path=None, log_path=None, disable_logging=False, - precompute=False, augment=False, gpus=1): + model_type='cnn_L3_orig', random_state=20180123, + learning_rate=1e-4, verbose=False, + checkpoint_interval=10, log_path=None, + disable_logging=False, gpus=1, continue_model_dir=None): init_console_logger(LOGGER, verbose=verbose) if not disable_logging: init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') - m, inputs, outputs = MODELS[model_type]() - if gpus > 1: - m = multi_gpu_model(m, gpus=gpus) + if continue_model_dir: + latest_model_path = os.path.join(continue_model_dir, 'model_latest.h5') + m, inputs, outputs = load_model(latest_model_path, model_type, return_io=True) + else: + m, inputs, outputs = MODELS[model_type]() + if gpus > 1: + m = multi_gpu_model(m, gpus=gpus) + loss = 'categorical_crossentropy' metrics = ['accuracy'] - monitor = 'val_loss' # Make sure the directories we need exist - model_dir = os.path.join(output_dir, model_id, datetime.datetime.now().strftime("%Y%m%d%H%M%S")) + if continue_model_dir: + model_dir = continue_model_dir + else: + model_dir = os.path.join(output_dir, model_id, datetime.datetime.now().strftime("%Y%m%d%H%M%S")) if not os.path.isdir(model_dir): os.makedirs(model_dir) @@ -717,20 +211,46 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, with open(model_json_path, 'w') as fd: json.dump(model_json, fd, indent=2) - weight_path = os.path.join(model_dir, 'model.h5') - checkpoint_weight_path = os.path.join(model_dir, 'model.{epoch:02d}.h5') + latest_weight_path = os.path.join(model_dir, 'model_latest.h5') + best_valid_acc_weight_path = os.path.join(model_dir, 'model_best_valid_accuracy.h5') + best_valid_loss_weight_path = os.path.join(model_dir, 'model_best_valid_loss.h5') + checkpoint_weight_path = os.path.join(model_dir, 'model_checkpoint.{epoch:02d}.h5') + # Load information about last epoch for initializing callbacks and data generators + if continue_model_dir is not None: + prev_train_hist_path = os.path.join(continue_model_dir, 'history_csvlog.csv') + last_epoch_idx, last_val_acc, last_val_loss = get_restart_info(prev_train_hist_path) + + # Set up callbacks cb = [] - 
cb.append(keras.callbacks.ModelCheckpoint(weight_path, + cb.append(keras.callbacks.ModelCheckpoint(latest_weight_path, + save_weights_only=True, + verbose=1)) + + best_val_acc_cb = keras.callbacks.ModelCheckpoint(best_valid_acc_weight_path, + save_weights_only=True, + save_best_only=True, + verbose=1, + monitor='val_acc') + if continue_model_dir is not None: + best_val_acc_cb.best = val_acc + cb.append(best_val_acc_cb) + + best_val_loss_cb = keras.callbacks.ModelCheckpoint(best_valid_loss_weight_path, save_weights_only=True, save_best_only=True, verbose=1, - monitor=monitor)) + monitor='val_loss') + if continue_model_dir is not None: + best_val_loss_cb.best = val_loss + cb.append(best_val_loss_cb) - cb.append(keras.callbacks.ModelCheckpoint(checkpoint_weight_path, + checkpoint_cb = keras.callbacks.ModelCheckpoint(checkpoint_weight_path, save_weights_only=True, - monitor=monitor, - period=checkpoint_interval)) + period=checkpoint_interval) + if continue_model_dir is not None: + checkpoint_cb.epochs_since_last_save = (last_epoch_idx + 1) % checkpoint_interval + cb.append(checkpoint_cb) timer_cb = TimeHistory() cb.append(timer_cb) @@ -743,43 +263,30 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, separator=',')) LOGGER.info('Setting up train data generator...') + if continue_model_dir is not None: + train_start_batch_idx = train_epoch_size * (last_epoch_idx + 1) + else: + train_start_batch_idx = None + train_gen = data_generator( train_data_dir, - metadata_path=train_metadata_path, - ontology_path=ontology_path, - filter_path=train_filter_path, batch_size=train_batch_size, random_state=random_state, - k=train_num_streamers, - augment=augment, - num_distractors=train_num_distractors, - max_videos=train_max_videos, - precompute=precompute, - rate=train_mux_rate) + start_batch_idx=train_start_batch_idx) train_gen = pescador.maps.keras_tuples(train_gen, ['video', 'audio'], 'label') LOGGER.info('Setting up validation data generator...') - val_gen = data_generator( + val_gen = single_epoch_data_generator( validation_data_dir, - metadata_path=validation_metadata_path, - ontology_path=ontology_path, - filter_path=validation_filter_path, - batch_size=validation_batch_size, - random_state=random_state, - k=validation_num_streamers, - num_distractors=validation_num_distractors, - max_videos=validation_max_videos, - precompute=precompute, - rate=validation_mux_rate) + validation_epoch_size, + random_state=random_state) val_gen = pescador.maps.keras_tuples(val_gen, - ['video', 'audio'], - 'label') - - + ['video', 'audio'], + 'label') # Fit the model LOGGER.info('Fitting model...') @@ -787,12 +294,18 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, verbosity = 1 else: verbosity = 2 + + if continue_model_dir is not None: + initial_epoch = last_epoch_idx + 1 + else: + initial_epoch = 0 history = m.fit_generator(train_gen, train_epoch_size, num_epochs, validation_data=val_gen, validation_steps=validation_epoch_size, #use_multiprocessing=True, callbacks=cb, - verbose=verbosity) + verbose=verbosity, + initial_epoch=initial_epoch) LOGGER.info('Done training. 
Saving results to disk...') # Save history From 4a0bad0720ac76088a4467f23406f12fe1717215 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 23 Jan 2018 20:47:28 -0500 Subject: [PATCH 068/201] Update generate sample sbatch script and add corresponding array script --- jobs/generate_samples.sbatch | 5 ++-- jobs/generate_samples_array.sbatch | 38 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 jobs/generate_samples_array.sbatch diff --git a/jobs/generate_samples.sbatch b/jobs/generate_samples.sbatch index 678fc9d..2965d1d 100644 --- a/jobs/generate_samples.sbatch +++ b/jobs/generate_samples.sbatch @@ -16,9 +16,9 @@ cd /home/$USER/dev source activate l3embedding SRCDIR='' -DATA_DIR='' OUTPUT_DIR='' SUBSET_PATH='' +BASE_RANDOM_STATE=20180118 module purge @@ -30,7 +30,8 @@ python $SRCDIR/02_generate_samples.py \ --precompute \ --num-workers 2 \ --num-distractors 1 \ - --random-state 20180118 \ + --random-state $BASE_RANDOM_STATE \ + --include-metadata \ $SUBSET_PATH \ 1000 \ $OUTPUT_DIR diff --git a/jobs/generate_samples_array.sbatch b/jobs/generate_samples_array.sbatch new file mode 100644 index 0000000..722c17b --- /dev/null +++ b/jobs/generate_samples_array.sbatch @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-samples-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=64GB +#SBATCH --time=1-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-samples-audioset-%A-%a.out" +#SBATCH --err="generate-samples-audioset-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR='' +OUTPUT_DIR='' +SUBSET_PATH='' +OFFSET=0 +BASE_RANDOM_STATE=20180118 + +module purge + +python $SRCDIR/02_generate_samples.py \ + --batch-size 16 \ + --num-streamers 64 \ + --mux-rate 2 \ + --augment \ + --precompute \ + --num-workers 2 \ + --num-distractors 1 \ + --random-state $(($BASE_RANDOM_STATE + $OFFSET + $SLURM_ARRAY_TASK_ID)) \ + --include-metadata \ + $SUBSET_PATH \ + 1000 \ + $OUTPUT_DIR From c2bd038f5d3878d1c04c8fe91c24de4838ab64c0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 23 Jan 2018 21:03:44 -0500 Subject: [PATCH 069/201] Add back missing cast to float, and make compatible with pypi version of skimage --- data/sample.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data/sample.py b/data/sample.py index 3346c9c..0308107 100644 --- a/data/sample.py +++ b/data/sample.py @@ -185,6 +185,8 @@ def sample_cropped_frame(frame_data): with LogTimer(LOGGER, 'Cropping frame'): frame_data = frame_data[start_x:end_x, start_y:end_y, :] + frame_data = skimage.img_as_float(frame_data).astype('float32') + return frame_data, bbox From 4b0d1ff67b162886aa3e34e5d2b576fa591a6ab6 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 24 Jan 2018 00:12:55 -0500 Subject: [PATCH 070/201] Fix random state in sbatch array script --- jobs/generate_samples_array.sbatch | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/jobs/generate_samples_array.sbatch b/jobs/generate_samples_array.sbatch index 722c17b..b4d6b17 100644 --- a/jobs/generate_samples_array.sbatch +++ b/jobs/generate_samples_array.sbatch @@ -2,7 +2,7 @@ #SBATCH --job-name=generate-samples-audioset #SBATCH --nodes=1 -#SBATCH --cpus-per-task=16 +#SBATCH --cpus-per-task=8 #SBATCH --mem=64GB #SBATCH --time=1-0 #SBATCH --mail-type=ALL @@ -18,7 +18,9 @@ source activate l3embedding SRCDIR='' OUTPUT_DIR='' SUBSET_PATH='' -OFFSET=0 +USER_IDX=0 +NUM_WORKERS=16 +NUM_TASKS=20 
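The --random-state arithmetic below spaces the base seeds NUM_WORKERS apart, so that random_state + worker index — the prefix sample_and_save writes into each batch filename — stays distinct across workers, array tasks, and users (USER_IDX is 0 here, but the formula allows several). A minimal illustrative sketch of that invariant, assuming the values defined in this script (a sanity check only, not part of the patch):

# Mirrors BASE + NUM_WORKERS * (TASK_ID + NUM_TASKS * USER_IDX) + worker_index
num_workers, num_tasks, base = 16, 20, 20180118
seeds = {base + num_workers * (task + num_tasks * user) + worker
         for user in range(2)               # two hypothetical USER_IDX values
         for task in range(num_tasks)       # SLURM_ARRAY_TASK_ID values
         for worker in range(num_workers)}  # per-task multiprocessing workers
assert len(seeds) == 2 * num_tasks * num_workers  # no two workers share a value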
BASE_RANDOM_STATE=20180118 module purge @@ -29,9 +31,9 @@ python $SRCDIR/02_generate_samples.py \ --mux-rate 2 \ --augment \ --precompute \ - --num-workers 2 \ + --num-workers $NUM_WORKERS \ --num-distractors 1 \ - --random-state $(($BASE_RANDOM_STATE + $OFFSET + $SLURM_ARRAY_TASK_ID)) \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID + $NUM_TASKS * $USER_IDX)] \ --include-metadata \ $SUBSET_PATH \ 1000 \ From f03473ce465331f0b65a524bc4e0da612c0549d5 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 24 Jan 2018 20:00:57 -0500 Subject: [PATCH 071/201] Fix batch filename --- data/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/sample.py b/data/sample.py index 0308107..ef92455 100644 --- a/data/sample.py +++ b/data/sample.py @@ -577,7 +577,7 @@ def sample_and_save(index, subset_path, num_batches, output_dir, os.makedirs(output_dir) for sub_index, batch in enumerate(data_gen): - batch_path = os.path.join(output_dir, '{}_{}.h5'.format(random_state + index, index, sub_index)) + batch_path = os.path.join(output_dir, '{}_{}_{}.h5'.format(random_state + index, index, sub_index)) write_to_h5(batch_path, batch) if sub_index == (num_batches - 1): From 70a78f6f5e4b335cc3d1a0f6e9ec0dbe8548b95c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 24 Jan 2018 20:37:20 -0500 Subject: [PATCH 072/201] Fix number of tasks in sample generation sbatch array script --- jobs/generate_samples_array.sbatch | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jobs/generate_samples_array.sbatch b/jobs/generate_samples_array.sbatch index b4d6b17..b3e8081 100644 --- a/jobs/generate_samples_array.sbatch +++ b/jobs/generate_samples_array.sbatch @@ -4,7 +4,7 @@ #SBATCH --nodes=1 #SBATCH --cpus-per-task=8 #SBATCH --mem=64GB -#SBATCH --time=1-0 +#SBATCH --time=7-0 #SBATCH --mail-type=ALL #SBATCH --mail-user=name@email.com #SBATCH --output="generate-samples-audioset-%A-%a.out" @@ -24,6 +24,7 @@ NUM_TASKS=20 BASE_RANDOM_STATE=20180118 module purge +module load ffmpeg/intel/3.2.2 python $SRCDIR/02_generate_samples.py \ --batch-size 16 \ @@ -36,5 +37,5 @@ python $SRCDIR/02_generate_samples.py \ --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID + $NUM_TASKS * $USER_IDX)] \ --include-metadata \ $SUBSET_PATH \ - 1000 \ + $[1000 / $NUM_TASKS] \ $OUTPUT_DIR From 830ace9ffb0aa378eb92f3e0a1113d2ddae4000e Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 24 Jan 2018 21:49:40 -0500 Subject: [PATCH 073/201] Add sample generation sbatch script for training set --- jobs/generate_samples_array_train.sbatch | 41 ++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 jobs/generate_samples_array_train.sbatch diff --git a/jobs/generate_samples_array_train.sbatch b/jobs/generate_samples_array_train.sbatch new file mode 100644 index 0000000..44043fa --- /dev/null +++ b/jobs/generate_samples_array_train.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-samples-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-samples-audioset-%A-%a.out" +#SBATCH --err="generate-samples-audioset-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +OUTPUT_DIR=/scratch/jtc440/audioset_subsets_data/filtered_train +SUBSET_PATH=/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv +USER_IDX=0 +NUM_WORKERS=8 
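As a rough guide to the workload this script requests: 02_generate_samples.py converts the sample count passed on the command line ($[30000000 / $NUM_TASKS] below) into ceil(num_samples / (num_workers * batch_size)) batches per worker process. A small worked example, assuming the values used in this version of the script:

import math
total_samples = 30000000
num_tasks, num_workers, batch_size = 16, 8, 1024
samples_per_task = total_samples // num_tasks   # bash $[...] does integer division
batches_per_worker = math.ceil(samples_per_task / (num_workers * batch_size))
print(batches_per_worker)  # 229 HDF5 batch files of 1024 examples per worker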
+NUM_TASKS=16 +BASE_RANDOM_STATE=20180118 + +module purge +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/02_generate_samples.py \ + --batch-size 1024 \ + --num-streamers 20 \ + --mux-rate 20 \ + --augment \ + --precompute \ + --num-workers $NUM_WORKERS \ + --num-distractors 2 \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID + $NUM_TASKS * $USER_IDX)] \ + --include-metadata \ + $SUBSET_PATH \ + $[30000000 / $NUM_TASKS] \ + $OUTPUT_DIR From 49737b8836c5429f7e21c17d489b131d8db1567a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 26 Jan 2018 00:04:01 -0500 Subject: [PATCH 074/201] Fix bug where batch size was not given to validation data generator and remove unnecessary print statement --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index e9a9d1b..cd7671b 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -110,7 +110,6 @@ def data_generator(data_dir, batch_size=512, random_state=20180123, while blob_start_idx < blob_size: blob_end_idx = min(blob_start_idx + batch_size - curr_batch_size, blob_size) - print(batch_path, blob_start_idx, blob_end_idx, curr_batch_size) # If we are starting from a particular batch, skip computing all of # the prior batches if start_batch_idx is None or batch_idx >= start_batch_idx: @@ -282,6 +281,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, val_gen = single_epoch_data_generator( validation_data_dir, validation_epoch_size, + batch_size=validation_batch_size, random_state=random_state) val_gen = pescador.maps.keras_tuples(val_gen, From ac84fdaa94a613d0204ebcff106393350274a0ac Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 26 Jan 2018 00:04:33 -0500 Subject: [PATCH 075/201] Remove dependency on kapre fork --- l3embedding/audio_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index b20a424..bdcb5c2 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -25,8 +25,9 @@ def construct_cnn_L3_orig_audio_model(): # Audio subnetwork #### n_dft = 512 - n_win = 480 - n_hop = n_win//2 + #n_win = 480 + #n_hop = n_win//2 + n_hop = 242 asr = 48000 audio_window_dur = 1 # INPUT @@ -34,7 +35,7 @@ def construct_cnn_L3_orig_audio_model(): # SPECTROGRAM PREPROCESSING # 257 x 199 x 1 - y_a = Spectrogram(n_dft=n_dft, n_win=n_win, n_hop=n_hop, + y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop, # n_win=n_win, return_decibel_spectrogram=True, padding='valid')(x_a) # CONV BLOCK 1 n_filter_a_1 = 64 From 90d4481fb93268f38f4fcbc736cda5f7cc7ff022 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 28 Jan 2018 11:03:11 -0500 Subject: [PATCH 076/201] Fix typos related to continuing training previous models --- l3embedding/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index cd7671b..7dac233 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -18,7 +18,7 @@ import soundfile as sf from tqdm import tqdm -from .model import MODELS +from .model import MODELS, load_model from .training_utils import multi_gpu_model from log import * import h5py @@ -232,7 +232,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, verbose=1, monitor='val_acc') if continue_model_dir is not None: - best_val_acc_cb.best = val_acc + best_val_acc_cb.best = last_val_acc cb.append(best_val_acc_cb) best_val_loss_cb = 
keras.callbacks.ModelCheckpoint(best_valid_loss_weight_path, @@ -241,7 +241,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, verbose=1, monitor='val_loss') if continue_model_dir is not None: - best_val_loss_cb.best = val_loss + best_val_loss_cb.best = last_val_loss cb.append(best_val_loss_cb) checkpoint_cb = keras.callbacks.ModelCheckpoint(checkpoint_weight_path, From 04640216277fef25335e696c62615a7ad4f58d86 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 29 Jan 2018 14:34:33 -0500 Subject: [PATCH 077/201] Save images as uint8 to save 3/4 of image space, and remove unnecessary metadata --- data/sample.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/data/sample.py b/data/sample.py index ef92455..c38a61a 100644 --- a/data/sample.py +++ b/data/sample.py @@ -178,15 +178,11 @@ def sample_cropped_frame(frame_data): bbox = { 'start_x': start_x, 'start_y': start_y, - 'end_x': end_x, - 'end_y': end_y } with LogTimer(LOGGER, 'Cropping frame'): frame_data = frame_data[start_x:end_x, start_y:end_y, :] - frame_data = skimage.img_as_float(frame_data).astype('float32') - return frame_data, bbox @@ -231,6 +227,8 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): frame_data = video_data[frame] frame_data, bbox = sample_cropped_frame(frame_data) + frame_data = skimage.img_as_float(frame_data) + video_aug_params = {'bounding_box': bbox} if augment: @@ -241,7 +239,6 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): frame_data = horiz_flip(frame_data) horizontal_flip = True - # Ranges taken from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/inception_preprocessing.py # Randomize the order of saturation jitter and brightness jitter @@ -274,6 +271,8 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): 'brightness_delta': brightness_delta }) + frame_data = skimage.img_as_ubyte(frame_data) + return frame_data, frame / fps, video_aug_params From 70f4ae27b9721e7e3fb582bbf7b54445b27dc07a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 29 Jan 2018 15:56:46 -0500 Subject: [PATCH 078/201] Change sampling to save audio as int16, trim filenames, and return sample/frame indices instead of timestamps --- data/sample.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/data/sample.py b/data/sample.py index c38a61a..c07e1da 100644 --- a/data/sample.py +++ b/data/sample.py @@ -141,7 +141,10 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): audio_data = np.pad(audio_data, ((0, sampling_frequency - audio_data.shape[0]), (0,0)), mode='constant') + if augment: + orig_dtype = audio_data.dtype + audio_data = audio_data.astype(float) # Make sure we don't clip if np.abs(audio_data).max(): max_gain = min(0.1, 1.0/np.abs(audio_data).max() - 1) @@ -152,11 +155,13 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): gain = 1 + random.uniform(-0.1, max_gain) with LogTimer(LOGGER, 'Applying gain to audio'): audio_data *= gain + + audio_data = audio_data.astype(orig_dtype) audio_aug_params = {'gain': gain} else: audio_aug_params = {} - return audio_data, start / sampling_frequency, audio_aug_params + return audio_data, start, audio_aug_params def sample_cropped_frame(frame_data): @@ -193,7 +198,7 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): video_data: video data to sample from Keyword Args: - start: start time of a one second window from which to sample + start: 
start frame of a one second window from which to sample fps: frame per second augment: if True, perturb the data in some fashion @@ -273,7 +278,7 @@ def sample_one_frame(video_data, start=None, fps=30, augment=False): frame_data = skimage.img_as_ubyte(frame_data) - return frame_data, frame / fps, video_aug_params + return frame_data, frame, video_aug_params def read_video(video_path): @@ -361,7 +366,7 @@ def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, sample_video_data, video_start, video_aug_params \ = sample_one_frame(video_data, start=audio_start, augment=augment) - sample_audio_data = sample_audio_data.mean(axis=-1).reshape((1, sample_audio_data.shape[0])) + sample_audio_data = sample_audio_data.reshape((1, sample_audio_data.shape[0])) sample = { 'video': np.ascontiguousarray(sample_video_data), @@ -370,10 +375,10 @@ def generate_sample(audio_file_1, audio_data_1, audio_file_2, audio_data_2, } if include_metadata: - sample['audio_file'] = audio_file.encode('utf-8') - sample['video_file'] = video_file.encode('utf-8') - sample['audio_start_ts'] = audio_start - sample['video_start_ts'] = video_start + sample['audio_file'] = os.path.basename(audio_file).encode('utf-8') + sample['video_file'] = os.path.basename(video_file).encode('utf-8') + sample['audio_start_sample_idx'] = audio_start + sample['video_start_frame_idx'] = video_start sample.update(flatten_dict(audio_aug_params, 'audio')) sample.update(flatten_dict(video_aug_params, 'video')) @@ -436,8 +441,10 @@ def sampler(video_1, video_2, rate=32, augment=False, precompute=False, include_ try: with LogTimer(LOGGER, 'Reading audio'): audio_data_1, sampling_frequency = sf.read(audio_file_1, - dtype='float32', + dtype='int16', always_2d=True) + audio_data_1 = audio_data_1.mean(axis=-1).astype('int16') + except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' warn_msg = warn_msg.format(audio_file_1, type(e), e) @@ -448,8 +455,9 @@ def sampler(video_1, video_2, rate=32, augment=False, precompute=False, include_ try: with LogTimer(LOGGER, 'Reading audio'): audio_data_2, sampling_frequency = sf.read(audio_file_2, - dtype='float32', + dtype='int16', always_2d=True) + audio_data_2 = audio_data_2.mean(axis=-1).astype('int16') except Exception as e: warn_msg = 'Could not open audio file {} - {}: {}; Skipping...' 
warn_msg = warn_msg.format(audio_file_2, type(e), e) From 1c5dbb6955472915f523632af33bccedc398007c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 29 Jan 2018 22:04:39 -0500 Subject: [PATCH 079/201] Update train batch generation sbatch script --- jobs/generate_samples_array_train.sbatch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jobs/generate_samples_array_train.sbatch b/jobs/generate_samples_array_train.sbatch index 44043fa..1324c4a 100644 --- a/jobs/generate_samples_array_train.sbatch +++ b/jobs/generate_samples_array_train.sbatch @@ -16,11 +16,11 @@ cd /home/$USER/dev source activate l3embedding SRCDIR=$HOME/dev/l3embedding -OUTPUT_DIR=/scratch/jtc440/audioset_subsets_data/filtered_train +OUTPUT_DIR=/beegfs/work/AudioSetSamples/filtered_train SUBSET_PATH=/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv USER_IDX=0 NUM_WORKERS=8 -NUM_TASKS=16 +NUM_TASKS=12 BASE_RANDOM_STATE=20180118 module purge From e77b67a753c25b2ca34568c910dc1270ac07f6df Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 1 Feb 2018 00:34:55 -0500 Subject: [PATCH 080/201] Properly set random seed for pescador sampling --- data/sample.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/sample.py b/data/sample.py index c07e1da..0efab07 100644 --- a/data/sample.py +++ b/data/sample.py @@ -521,6 +521,8 @@ def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, """ random.seed(random_state) + np.random.seed(random_state) + LOGGER.info("Loading subset list") file_list = read_csv_as_dicts(subset_path) @@ -548,7 +550,7 @@ def data_generator(subset_path, k=32, batch_size=64, random_state=20171021, # Randomly shuffle the seeds random.shuffle(seeds) - mux = pescador.Mux(seeds, k, rate=rate) + mux = pescador.Mux(seeds, k, rate=rate, random_state=random_state) if cycle: mux = mux.cycle() From 92f4c1349d4be0d4d6d4a71520f2007629ddaf4b Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 1 Feb 2018 00:35:47 -0500 Subject: [PATCH 081/201] Make array task id start at zero from random state computation --- jobs/generate_samples_array.sbatch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/generate_samples_array.sbatch b/jobs/generate_samples_array.sbatch index b3e8081..1ce2a9c 100644 --- a/jobs/generate_samples_array.sbatch +++ b/jobs/generate_samples_array.sbatch @@ -34,7 +34,7 @@ python $SRCDIR/02_generate_samples.py \ --precompute \ --num-workers $NUM_WORKERS \ --num-distractors 1 \ - --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID + $NUM_TASKS * $USER_IDX)] \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID - 1 + $NUM_TASKS * $USER_IDX)] \ --include-metadata \ $SUBSET_PATH \ $[1000 / $NUM_TASKS] \ From 6c30c87e36cdc98bcc6d655c753770a80eeb0751 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 1 Feb 2018 00:36:13 -0500 Subject: [PATCH 082/201] Add second train generation sbatch script --- jobs/generate_samples_array_train_2.sbatch | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 jobs/generate_samples_array_train_2.sbatch diff --git a/jobs/generate_samples_array_train_2.sbatch b/jobs/generate_samples_array_train_2.sbatch new file mode 100644 index 0000000..3bc4733 --- /dev/null +++ b/jobs/generate_samples_array_train_2.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-samples-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH 
--mail-user=name@email.com +#SBATCH --output="generate-samples-audioset-%A-%a.out" +#SBATCH --err="generate-samples-audioset-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +OUTPUT_DIR=/beegfs/work/AudioSetSamples/filtered_train +SUBSET_PATH=/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv +USER_IDX=0 +NUM_WORKERS=4 +NUM_TASKS=12 +BASE_RANDOM_STATE=20183000 + +module purge +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/02_generate_samples.py \ + --batch-size 1024 \ + --num-streamers 20 \ + --mux-rate 20 \ + --augment \ + --precompute \ + --num-workers $NUM_WORKERS \ + --num-distractors 2 \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID - 1 + $NUM_TASKS * $USER_IDX)] \ + --include-metadata \ + $SUBSET_PATH \ + $[16284032 / $NUM_TASKS] \ + $OUTPUT_DIR From 1312d6d896ca2a8af8263411d9958fd18ee8a1a3 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 2 Feb 2018 12:31:28 -0500 Subject: [PATCH 083/201] Fix array indexing in template sample generation sbatch script --- jobs/generate_samples_array_train.sbatch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jobs/generate_samples_array_train.sbatch b/jobs/generate_samples_array_train.sbatch index 1324c4a..cc617b0 100644 --- a/jobs/generate_samples_array_train.sbatch +++ b/jobs/generate_samples_array_train.sbatch @@ -19,9 +19,9 @@ SRCDIR=$HOME/dev/l3embedding OUTPUT_DIR=/beegfs/work/AudioSetSamples/filtered_train SUBSET_PATH=/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv USER_IDX=0 -NUM_WORKERS=8 +NUM_WORKERS=4 NUM_TASKS=12 -BASE_RANDOM_STATE=20180118 +BASE_RANDOM_STATE=20183000 module purge module load ffmpeg/intel/3.2.2 @@ -34,7 +34,7 @@ python $SRCDIR/02_generate_samples.py \ --precompute \ --num-workers $NUM_WORKERS \ --num-distractors 2 \ - --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID + $NUM_TASKS * $USER_IDX)] \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID - 1 + $NUM_TASKS * $USER_IDX)] \ --include-metadata \ $SUBSET_PATH \ $[30000000 / $NUM_TASKS] \ From 1ef0259af61c2e66707038a0601020b530dee5f7 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 2 Feb 2018 12:32:33 -0500 Subject: [PATCH 084/201] Add latest sample generation sbatch array script --- jobs/generate_samples_array_train_3.sbatch | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 jobs/generate_samples_array_train_3.sbatch diff --git a/jobs/generate_samples_array_train_3.sbatch b/jobs/generate_samples_array_train_3.sbatch new file mode 100644 index 0000000..1afeac0 --- /dev/null +++ b/jobs/generate_samples_array_train_3.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-samples-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-samples-audioset-%A-%a.out" +#SBATCH --err="generate-samples-audioset-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +OUTPUT_DIR=/beegfs/work/AudioSetSamples/filtered_train +SUBSET_PATH=/scratch/jtc440/audioset_subsets/audioset_filtered_train.csv +USER_IDX=0 +NUM_WORKERS=4 +NUM_TASKS=12 +BASE_RANDOM_STATE=20185000 + +module purge +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/02_generate_samples.py \ + --batch-size 1024 \ + --num-streamers 20 \ + --mux-rate 20 \ + --augment \ + --precompute \ + 
--num-workers $NUM_WORKERS \ + --num-distractors 2 \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID - 1 + $NUM_TASKS * $USER_IDX)] \ + --include-metadata \ + $SUBSET_PATH \ + $[5834624 / $NUM_TASKS] \ + $OUTPUT_DIR From 1fa53f7b49c121f884360ebc7c5a1e7d1948559a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 5 Feb 2018 21:49:27 -0500 Subject: [PATCH 085/201] Fix zero padding bug introduced when downmixing audio before sampling --- data/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/sample.py b/data/sample.py index 0efab07..805a3a9 100644 --- a/data/sample.py +++ b/data/sample.py @@ -139,7 +139,7 @@ def sample_one_second(audio_data, sampling_frequency, augment=False): warnings.warn('Got audio that is less than one second', UserWarning) with LogTimer(LOGGER, 'Slicing audio'): audio_data = np.pad(audio_data, - ((0, sampling_frequency - audio_data.shape[0]), (0,0)), + ((0, sampling_frequency - audio_data.shape[0]),), mode='constant') if augment: From 002a5443596c722dea504d4051b5fc01de4732eb Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 5 Feb 2018 22:01:51 -0500 Subject: [PATCH 086/201] Convert loaded batch data from int to float --- l3embedding/audio.py | 31 +++++++++++++++++++++++++++++++ l3embedding/train.py | 7 ++++++- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 l3embedding/audio.py diff --git a/l3embedding/audio.py b/l3embedding/audio.py new file mode 100644 index 0000000..46d9b5f --- /dev/null +++ b/l3embedding/audio.py @@ -0,0 +1,31 @@ +# Copied from https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py +import numpy as np + +def pcm2float(sig, dtype='float64'): + """Convert PCM signal to floating point with a range from -1 to 1. + Use dtype='float32' for single precision. + Parameters + ---------- + sig : array_like + Input array, must have integral type. + dtype : data type, optional + Desired (floating point) data type. + Returns + ------- + numpy.ndarray + Normalized floating point data. 
+ See Also + -------- + float2pcm, dtype + """ + sig = np.asarray(sig) + if sig.dtype.kind not in 'iu': + raise TypeError("'sig' must be an array of integers") + dtype = np.dtype(dtype) + if dtype.kind != 'f': + raise TypeError("'dtype' must be a floating point type") + + i = np.iinfo(sig.dtype) + abs_max = 2 ** (i.bits - 1) + offset = i.min + abs_max + return (sig.astype(dtype) - offset) / abs_max diff --git a/l3embedding/train.py b/l3embedding/train.py index 7dac233..6eff62a 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -15,11 +15,13 @@ import pescador import scipy.misc from skvideo.io import FFmpegReader, ffprobe +from skimage import img_as_float import soundfile as sf from tqdm import tqdm from .model import MODELS, load_model from .training_utils import multi_gpu_model +from .audio import pcm2float from log import * import h5py @@ -132,7 +134,10 @@ def data_generator(data_dir, batch_size=512, random_state=20180123, # of the prior batches if start_batch_idx is None or batch_idx >= start_batch_idx: # Preprocess video so samples are in [-1,1] - batch['video'] = 2 * batch['video'] - 1 + batch['video'] = 2 * img_as_float(batch['video']).astype('float32') - 1 + + # Convert audio to float + batch['audio'] = pcm2float(batch['audio'], dtype='float32') yield batch From 4d21d3f2d2a5e1a6577ed2472390ae6d123cb777 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 5 Feb 2018 23:13:30 -0500 Subject: [PATCH 087/201] Add spectrogram normalization from L3 authors --- l3embedding/audio_model.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index bdcb5c2..c2c8b7e 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -1,7 +1,8 @@ from keras.models import Model from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, \ - Flatten, Activation + Flatten, Activation, Lambda from kapre.time_frequency import Spectrogram +import tensorflow as tf import keras.regularizers as regularizers def construct_cnn_L3_orig_audio_model(): @@ -35,8 +36,12 @@ def construct_cnn_L3_orig_audio_model(): # SPECTROGRAM PREPROCESSING # 257 x 199 x 1 - y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop, # n_win=n_win, - return_decibel_spectrogram=True, padding='valid')(x_a) + y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop, power_spectrogram=1.0, # n_win=n_win, + return_decibel_spectrogram=False, padding='valid')(x_a) + + # Apply normalization from L3 paper + y_a = Lambda(lambda x: tf.log(tf.maximum(x, 1e-12)) / 5.0)(y_a) + # CONV BLOCK 1 n_filter_a_1 = 64 filt_size_a_1 = (3, 3) From e30eaf36398f2a27193528a1476e57d26f0e62de Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 5 Feb 2018 23:16:54 -0500 Subject: [PATCH 088/201] Revert weight decay factors --- l3embedding/audio_model.py | 2 +- l3embedding/vision_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index c2c8b7e..827f970 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -21,7 +21,7 @@ def construct_cnn_L3_orig_audio_model(): outputs: Model outputs (Type: keras.layers.Layer) """ - weight_decay = 1e-8 + weight_decay = 1e-5 #### # Audio subnetwork #### diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 150289c..82797a3 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -20,7 +20,7 @@ def construct_cnn_L3_orig_vision_model(): outputs: Model outputs (Type: keras.layers.Layer) """ - 
weight_decay = 1e-8 + weight_decay = 1e-5 #### # Image subnetwork #### From 4f1f26a92ee0e271bf4e9be146461e324b64c42a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 9 Feb 2018 13:08:05 -0500 Subject: [PATCH 089/201] Add latest train sbatch script --- jobs/l3embedding-train-02052018.sbatch | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 jobs/l3embedding-train-02052018.sbatch diff --git a/jobs/l3embedding-train-02052018.sbatch b/jobs/l3embedding-train-02052018.sbatch new file mode 100644 index 0000000..15707a7 --- /dev/null +++ b/jobs/l3embedding-train-02052018.sbatch @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +#SBATCH --gres=gpu:4 +#SBATCH --job-name=l3embedding-train-filtered +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="l3embedding-train-filtered-test-%j.out" +#SBATCH --err="l3embedding-train-filtered-test-%j.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +TRAIN_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_train +VAL_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_valid +MODEL_ID='new_train_test' +OUTPUT_DIR=/scratch/myname/l3_output + +module purge +module load cuda/8.0.44 +module load cudnn/8.0v6.0 + +python $SRCDIR/03_train_embedding.py \ + --num-epochs 150 \ + --train-epoch-size 4096 \ + --train-batch-size 72 \ + --model-type cnn_L3_orig \ + --validation-epoch-size 4096 \ + --validation-batch-size 16 \ + --checkpoint-interval 10 \ + --gpus 1 \ + --learning-rate 0.0001 \ + --random-state 20171021 \ + --verbose \ + $TRAIN_DATA_DIR \ + $VAL_DATA_DIR \ + $MODEL_ID \ + $OUTPUT_DIR From 901df5884bab7288938e1b242e15ef090d923071 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 12 Feb 2018 20:14:09 -0500 Subject: [PATCH 090/201] Add script for plotting training history --- plot_training_history.py | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 plot_training_history.py diff --git a/plot_training_history.py b/plot_training_history.py new file mode 100644 index 0000000..34376b1 --- /dev/null +++ b/plot_training_history.py @@ -0,0 +1,73 @@ +import argparse +import os.path +import matplotlib.pyplot as plt +import csv + + +def generate_plot(csv_path, display=True, output_path=None): + """ + Generate a plot of the accuracy and loss of training and validation sets + + Args: + csv_path: path to training history CSV file + display: if True, display plot in a window + output_path: if provided, save plot to the path + """ + epochs = [] + train_acc = [] + train_loss = [] + val_acc = [] + val_loss = [] + + csv_path = os.path.abspath(csv_path) + + # Get model ID and timestamp for plot title + dir_parts = os.path.dirname(csv_path).split('/') + model_id, timestamp = dir_parts[-2], dir_parts[-1] + + # Read CSV file + with open(csv_path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + epochs.append(int(row['epoch']) + 1) + train_acc.append(float(row['acc'])) + train_loss.append(float(row['loss'])) + val_acc.append(float(row['val_acc'])) + val_loss.append(float(row['val_loss'])) + + # Plot accuracy + plt.figure() + plt.subplot(1, 2, 1) + plt.plot(epochs, train_acc) + plt.plot(epochs, val_acc) + plt.legend(['Train', 'Validation']) + plt.xlabel('Epoch') + plt.ylabel('Accuracy') + plt.title('Subset Accuracy') + + # Plot loss + plt.subplot(1, 2, 2) + plt.plot(epochs, train_loss) + plt.plot(epochs, val_loss) + 
plt.legend(['Train', 'Validation']) + plt.xlabel('Epoch') + plt.ylabel('Loss') + plt.title('Subset Loss') + + plt.suptitle('{}/{}'.format(model_id, timestamp)) + + if output_path: + plt.savefig(output_path) + + if display: + plt.show() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Plot the training and validation accuracy and loss for a given training run') + parser.add_argument('csv_path', type=str, help='path to training csv file') + parser.add_argument('--hide-display', '-hd', dest='display', action='store_false', help='if set, do not display plot') + parser.add_argument('--output-path', '-o', dest='output_path', type=str, help='optional output path for plot image') + + args = parser.parse_args() + generate_plot(args.csv_path, display=args.display, output_path=args.output_path) From 92c5e5d56f6b61ae9273feeac7aa503c8af831c0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Feb 2018 16:05:16 -0500 Subject: [PATCH 091/201] Add He-Normal initialization to the model --- l3embedding/audio_model.py | 11 +++++++++++ l3embedding/model.py | 2 ++ l3embedding/vision_model.py | 11 +++++++++++ 3 files changed, 24 insertions(+) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 827f970..8783093 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -47,10 +47,12 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_1 = (3, 3) pool_size_a_1 = (2, 2) y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) @@ -61,10 +63,12 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_2 = (3, 3) pool_size_a_2 = (2, 2) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) @@ -75,10 +79,12 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_3 = (3, 3) pool_size_a_3 = (2, 2) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) @@ -89,10 +95,12 @@ def construct_cnn_L3_orig_audio_model(): filt_size_a_4 = (3, 3) pool_size_a_4 = (32, 24) y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = Conv2D(n_filter_a_4, filt_size_a_4, + kernel_initializer='he_normal', name='audio_embedding_layer', padding='same', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) @@ -160,16 +168,19 @@ def construct_tiny_L3_audio_model(): return_decibel_spectrogram=True, 
padding='valid')(x_a) y_a = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=(3,3), strides=3)(y_a) y_a = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) y_a = MaxPooling2D(pool_size=(3,3), strides=3)(y_a) y_a = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_a) y_a = BatchNormalization()(y_a) y_a = Activation('relu')(y_a) diff --git a/l3embedding/model.py b/l3embedding/model.py index 0c31278..46e679f 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -24,8 +24,10 @@ def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name weight_decay = 1e-5 y = concatenate([vision_model(x_i), audio_model(x_a)]) y = Dense(layer_size, activation='relu', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y) y = Dense(2, activation='softmax', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y) m = Model(inputs=[x_i, x_a], outputs=y) m.name = model_name diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 82797a3..2928c06 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -32,10 +32,12 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_1 = (3, 3) pool_size_i_1 = (2, 2) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(x_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = Activation('relu')(y_i) y_i = BatchNormalization()(y_i) @@ -46,10 +48,12 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_2 = (3, 3) pool_size_i_2 = (2, 2) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) @@ -60,10 +64,12 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_3 = (3, 3) pool_size_i_3 = (2, 2) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) @@ -74,11 +80,13 @@ def construct_cnn_L3_orig_vision_model(): filt_size_i_4 = (3, 3) pool_size_i_4 = (28, 28) y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_4, filt_size_i_4, name='vision_embedding_layer', padding='same', + 
kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) @@ -137,16 +145,19 @@ def construct_tiny_L3_vision_model(): x_i = Input(shape=(224, 224, 3), dtype='float32') y_i = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(x_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=(3,3), strides=3)(y_i) y_i = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = MaxPooling2D(pool_size=(3,3), strides=3)(y_i) y_i = Conv2D(10, (5,5), padding='valid', strides=(1,1), + kernel_initializer='he_normal', kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) From 474748dc3ff688c69d6b3e6d86a7066ec3ebcb19 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Feb 2018 16:06:45 -0500 Subject: [PATCH 092/201] Log train arguments, and change loss to binary_crossentropy (which should not matter, but arguably the language is more appropriate) --- l3embedding/train.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 6eff62a..de8a87f 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -179,6 +179,28 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') + # TODO: Maybe refactor so this logging can be done more robustly + LOGGER.info('Training with the following arguments: {}'.format({ + 'train_data_dir': train_data_dir, + 'validation_data_dir': validation_data_dir, + 'model_id': model_id, + 'output_dir': output_dir, + 'num_epochs': num_epochs, + 'train_epoch_size': train_epoch_size, + 'validation_epoch_size': validation_epoch_size, + 'train_batch_size': train_batch_size, + 'validation_batch_size': validation_batch_size, + 'model_type': model_type, + 'random_state': random_state, + 'learning_rate': learning_rate, + 'verbose': verbose, + 'checkpoint_interval': checkpoint_interval, + 'log_path': log_path, + 'disable_logging': disable_logging, + 'gpus': gpus, + 'continue_model_dir': continue_model_dir + })) + if continue_model_dir: latest_model_path = os.path.join(continue_model_dir, 'model_latest.h5') m, inputs, outputs = load_model(latest_model_path, model_type, return_io=True) @@ -187,7 +209,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, if gpus > 1: m = multi_gpu_model(m, gpus=gpus) - loss = 'categorical_crossentropy' + loss = 'binary_crossentropy' metrics = ['accuracy'] # Make sure the directories we need exist From 614c96ba7d8045ff2799836073e9530f8915dfbb Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Feb 2018 18:03:00 -0500 Subject: [PATCH 093/201] Add new variant model with Kapre decibel computation, and input standardization via batch normalization --- l3embedding/audio_model.py | 108 ++++++++++++++++++++++++++++++++++++ l3embedding/model.py | 24 +++++++- l3embedding/vision_model.py | 96 ++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 1 deletion(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 8783093..3bed39f 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -115,6 
+115,114 @@ def construct_cnn_L3_orig_audio_model(): return m, x_a, y_a +def construct_cnn_L3_kapredbinputbn_audio_model(): + """ + Constructs a model that replicates the audio subnetwork used in Look, + Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + #### + # Audio subnetwork + #### + n_dft = 512 + #n_win = 480 + #n_hop = n_win//2 + n_hop = 242 + asr = 48000 + audio_window_dur = 1 + # INPUT + x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') + + # SPECTROGRAM PREPROCESSING + # 257 x 199 x 1 + y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop, power_spectrogram=1.0, # n_win=n_win, + return_decibel_spectrogram=True, padding='valid')(x_a) + y_a = BatchNormalization()(y_a) + + # CONV BLOCK 1 + n_filter_a_1 = 64 + filt_size_a_1 = (3, 3) + pool_size_a_1 = (2, 2) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) + + # CONV BLOCK 2 + n_filter_a_2 = 128 + filt_size_a_2 = (3, 3) + pool_size_a_2 = (2, 2) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) + + # CONV BLOCK 3 + n_filter_a_3 = 256 + filt_size_a_3 = (3, 3) + pool_size_a_3 = (2, 2) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) + + # CONV BLOCK 4 + n_filter_a_4 = 512 + filt_size_a_4 = (3, 3) + pool_size_a_4 = (32, 24) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, + kernel_initializer='he_normal', + name='audio_embedding_layer', padding='same', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) + + y_a = Flatten()(y_a) + + m = Model(inputs=x_a, outputs=y_a) + m.name = 'audio_model' + + return m, x_a, y_a + + def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): """ Constructs a model that produces the learned 
audio embedding diff --git a/l3embedding/model.py b/l3embedding/model.py index 46e679f..47b06ea 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -131,6 +131,27 @@ def construct_cnn_L3_orig(): m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_orig') return m +def construct_cnn_L3_kapredbinputbn(): + """ + Constructs a model that replicates that used in Look, Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + vision_model, x_i, y_i = construct_cnn_L3_orig_inputbn_vision_model() + audio_model, x_a, y_a = construct_cnn_L3_kapredbinputbn_audio_model() + + m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') + return m + def construct_tiny_L3(): """ Constructs a model that implements a small L3 model for validation purposes @@ -153,5 +174,6 @@ def construct_tiny_L3(): MODELS = { 'cnn_L3_orig': construct_cnn_L3_orig, - 'tiny_L3': construct_tiny_L3 + 'tiny_L3': construct_tiny_L3, + 'cnn_L3_kapredbinputbn': construct_cnn_L3_kapredbinputbn } diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 2928c06..5e3342b 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -99,6 +99,102 @@ def construct_cnn_L3_orig_vision_model(): return m, x_i, y_i +def construct_cnn_L3_orig_inputbn_vision_model(): + """ + Constructs a model that replicates the vision subnetwork used in Look, + Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + #### + # Image subnetwork + #### + # INPUT + x_i = Input(shape=(224, 224, 3), dtype='float32') + x_i = BatchNormalization()(x_i) + + # CONV BLOCK 1 + n_filter_i_1 = 64 + filt_size_i_1 = (3, 3) + pool_size_i_1 = (2, 2) + y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(x_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = Activation('relu')(y_i) + y_i = BatchNormalization()(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i) + + # CONV BLOCK 2 + n_filter_i_2 = 128 + filt_size_i_2 = (3, 3) + pool_size_i_2 = (2, 2) + y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i) + + # CONV BLOCK 3 + n_filter_i_3 = 256 + filt_size_i_3 = (3, 3) + pool_size_i_3 = (2, 2) + y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = 
BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i) + + # CONV BLOCK 4 + n_filter_i_4 = 512 + filt_size_i_4 = (3, 3) + pool_size_i_4 = (28, 28) + y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = Conv2D(n_filter_i_4, filt_size_i_4, + name='vision_embedding_layer', padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_i) + y_i = BatchNormalization()(y_i) + y_i = Activation('relu')(y_i) + y_i = MaxPooling2D(pool_size=pool_size_i_4, padding='same')(y_i) + y_i = Flatten()(y_i) + + m = Model(inputs=x_i, outputs=y_i) + m.name = 'vision_model' + + return m, x_i, y_i + + def construct_cnn_l3_orig_vision_embedding_model(vision_model, x_i): """ Constructs a model that produces the learned vision embedding From 207ea2853d17d403d3f192b7a6ac1b6fc5ab8a10 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 16 Feb 2018 16:12:54 -0500 Subject: [PATCH 094/201] Add Google Sheets hooks for consolidated bookkeeping of experiments --- 03_train_embedding.py | 12 +++ gsheets.py | 140 +++++++++++++++++++++++++ jobs/l3embedding-train-02162018.sbatch | 49 +++++++++ l3embedding/train.py | 93 ++++++++++++++-- 4 files changed, 287 insertions(+), 7 deletions(-) create mode 100644 gsheets.py create mode 100644 jobs/l3embedding-train-02162018.sbatch diff --git a/03_train_embedding.py b/03_train_embedding.py index 6d6025f..62d8659 100644 --- a/03_train_embedding.py +++ b/03_train_embedding.py @@ -93,6 +93,18 @@ def parse_arguments(): default=1, help='Number of gpus used for data parallelism.') + parser.add_argument('-gsid', + '--gsheet-id', + dest='gsheet_id', + type=str, + help='Google Spreadsheet ID for centralized logging of experiments') + + parser.add_argument('-gdan', + '--google-dev-app-name', + dest='google_dev_app_name', + type=str, + help='Google Developer Application Name for using API') + parser.add_argument('-v', '--verbose', dest='verbose', diff --git a/gsheets.py b/gsheets.py new file mode 100644 index 0000000..5ab2b83 --- /dev/null +++ b/gsheets.py @@ -0,0 +1,140 @@ +import httplib2 +import os + +from oauth2client import client +from oauth2client import tools +from oauth2client.file import Storage +from googleapiclient import discovery + +# TODO: Implement initializing spreadsheet + +FIELD_NAMES = [ + 'username', + 'model_id', + 'model_dir', + 'git_commit', + 'train_data_dir', + 'validation_data_dir', + 'continue_model_dir', + 'model_type', + 'num_epochs', + 'train_epoch_size', + 'validation_epoch_size', + 'train_batch_size', + 'validation_batch_size', + 'random_state', + 'learning_rate', + 'gpus', + 'checkpoint_interval', + 'latest_epoch', + 'latest_train_loss', + 'latest_validation_loss', + 'latest_train_acc', + 'latest_validation_acc', + 'best_train_loss', + 'best_validation_loss', + 'best_train_acc', + 'best_validation_acc' +] + + +# If modifying these scopes, delete your previously saved credentials +# at ~/.credentials/sheets.googleapis.com-python-quickstart.json +SCOPES = 'https://www.googleapis.com/auth/spreadsheets' + + +def get_credentials(application_name): + """Gets valid user 
credentials from storage. + + If nothing has been stored, or if the stored credentials are invalid, + the OAuth2 flow is completed to obtain the new credentials. + + Returns: + Credentials, the obtained credential. + """ + home_dir = os.path.expanduser('~') + credential_dir = os.path.join(home_dir, '.credentials') + if not os.path.exists(credential_dir): + os.makedirs(credential_dir) + credential_path = os.path.join(credential_dir, + 'sheets.googleapis.com-python-quickstart.json') + + store = Storage(credential_path) + credentials = store.get() + if not credentials or credentials.invalid: + flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES) + flow.user_agent = application_name + if flags: + credentials = tools.run_flow(flow, store, flags) + else: # Needed only for compatibility with Python 2.6 + credentials = tools.run(flow, store) + print('Storing credentials to ' + credential_path) + return credentials + + +def append_row(service, spreadsheet_id, param_dict): + # The A1 notation of a range to search for a logical table of data. + # Values will be appended after the last row of the table. + range_ = 'A1:A{}'.format(len(FIELD_NAMES)) + # How the input data should be interpreted. + value_input_option = 'USER_ENTERED' + # How the input data should be inserted. + insert_data_option = 'INSERT_ROWS' + + value_range_body = { + "range": range_, + "majorDimension": 'ROWS', + "values": [[str(param_dict[field_name]) for field_name in FIELD_NAMES ]] + } + + request = service.spreadsheets().values().append( + spreadsheetId=spreadsheet_id, + range=range_, + valueInputOption=value_input_option, + insertDataOption=insert_data_option, + body=value_range_body) + response = request.execute() + + +def get_row(service, spreadsheet_id, param_dict): + range_ = 'C:C' + major_dimension = 'COLUMNS' + + request = service.spreadsheets().values().get( + spreadsheetId=spreadsheet_id, + range=range_, + majorDimension=major_dimension) + response = request.execute() + + try: + row_idx = response['values'][0].index(param_dict['model_dir']) + return row_idx + 1 + except ValueError: + return None + + +def update_experiment(service, spreadsheet_id, param_dict, start_col, end_col, values): + row_num = get_row(service, spreadsheet_id, param_dict) + value_input_option = 'USER_ENTERED' + range_ = '{1}{0}:{2}{0}'.format(row_num, start_col, end_col) + value_range_body = { + "range": range_, + "majorDimension": 'ROWS', + "values": [[str(val) for val in values]] + } + + request = service.spreadsheets().values().update( + spreadsheetId=spreadsheet_id, + range=range_, + valueInputOption=value_input_option, + body=value_range_body) + response = request.execute() + +if __name__ == '__main__': + try: + import argparse + flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args() + except ImportError: + flags = None + + get_credentials() diff --git a/jobs/l3embedding-train-02162018.sbatch b/jobs/l3embedding-train-02162018.sbatch new file mode 100644 index 0000000..a30a28d --- /dev/null +++ b/jobs/l3embedding-train-02162018.sbatch @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +#SBATCH --gres=gpu:4 +#SBATCH --job-name=l3embedding-train +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="l3embedding-train-filtered-test-%j.out" +#SBATCH --err="l3embedding-train-filtered-test-%j.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding-cpu + +SRCDIR=$HOME/dev/l3embedding 
+TRAIN_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_train +VAL_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_valid +MODEL_ID='new_train_test' +OUTPUT_DIR=/scratch/$USER/l3_output +GOOGLE_DEV_APP_NAME='l3embeddingexperiments' +GSHEET_ID='' # REPLACE THIS +NUM_GPUS=1 + +module purge +module load cuda/8.0.44 +module load cudnn/8.0v6.0 + +python $SRCDIR/03_train_embedding.py \ + --num-epochs 500 \ + --train-epoch-size 4096 \ + --train-batch-size 64 \ + --model-type cnn_L3_orig \ + --validation-epoch-size 4096 \ + --validation-batch-size 64 \ + --checkpoint-interval 10 \ + --gpus $NUM_GPUS \ + --learning-rate 0.00001 \ + --random-state 20180216 \ + --gsheet-id $GSHEET_ID \ + --google-dev-app-name $GOOGLE_DEV_APP_NAME \ + --verbose \ + $TRAIN_DATA_DIR \ + $VAL_DATA_DIR \ + $MODEL_ID \ + $OUTPUT_DIR diff --git a/l3embedding/train.py b/l3embedding/train.py index de8a87f..04a7ebc 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,3 +1,5 @@ +import getpass +import git import json import math import datetime @@ -19,11 +21,15 @@ import soundfile as sf from tqdm import tqdm +from gsheets import get_credentials, append_row, update_experiment, get_row from .model import MODELS, load_model from .training_utils import multi_gpu_model from .audio import pcm2float from log import * import h5py +import copy + +from googleapiclient import discovery LOGGER = logging.getLogger('l3embedding') LOGGER.setLevel(logging.DEBUG) @@ -55,6 +61,58 @@ def on_epoch_end(self, epoch, logs=None): with open(self.outfile, 'wb') as fp: pickle.dump(loss_dict, fp) +class GSheetLogger(keras.callbacks.Callback): + """ + Keras callback to update Google Sheets Spreadsheet + """ + + def __init__(self, google_dev_app_name, spreadsheet_id, param_dict): + super().__init__() + self.google_dev_app_name = google_dev_app_name + self.spreadsheet_id = spreadsheet_id + self.credentials = get_credentials(google_dev_app_name) + self.service = discovery.build('sheets', 'v4', credentials=self.credentials) + self.param_dict = copy.deepcopy(param_dict) + + row_num = get_row(self.service, self.spreadsheet_id, self.param_dict) + if row_num is None: + append_row(self.service, self.spreadsheet_id, self.param_dict) + + def on_train_begin(self, logs=None): + if logs is None: + logs = {} + self.best_train_loss = float('inf') + self.best_valid_loss = float('inf') + self.best_train_acc = float('-inf') + self.best_valid_acc = float('-inf') + + # def on_batch_end(self, batch, logs={}): + def on_epoch_end(self, epoch, logs=None): + if logs is None: + logs = {} + latest_epoch = epoch + latest_train_loss = logs.get('loss') + latest_valid_loss = logs.get('val_loss') + latest_train_acc = logs.get('acc') + latest_valid_acc = logs.get('val_acc') + + if latest_train_loss < self.best_train_loss: + self.best_train_loss = latest_train_loss + if latest_valid_loss < self.best_valid_loss: + self.best_valid_loss = latest_valid_loss + if latest_train_acc > self.best_train_acc: + self.best_train_acc = latest_train_acc + if latest_valid_acc > self.best_valid_acc: + self.best_valid_acc = latest_valid_acc + + values = [ + latest_epoch, latest_train_loss, latest_valid_loss, + latest_train_acc, latest_valid_acc, self.best_train_loss, + self.best_valid_loss, self.best_train_acc, self.best_valid_acc] + + update_experiment(self.service, self.spreadsheet_id, self.param_dict, + 'R', 'AA', values) + class TimeHistory(keras.callbacks.Callback): """ @@ -170,17 +228,17 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, num_epochs=150, 
train_epoch_size=512, validation_epoch_size=1024, train_batch_size=64, validation_batch_size=64, model_type='cnn_L3_orig', random_state=20180123, - learning_rate=1e-4, verbose=False, - checkpoint_interval=10, log_path=None, - disable_logging=False, gpus=1, continue_model_dir=None): + learning_rate=1e-4, verbose=False, checkpoint_interval=10, + log_path=None, disable_logging=False, gpus=1, continue_model_dir=None, + gsheet_id=None, google_dev_app_name=None): init_console_logger(LOGGER, verbose=verbose) if not disable_logging: init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') - # TODO: Maybe refactor so this logging can be done more robustly - LOGGER.info('Training with the following arguments: {}'.format({ + param_dict = { + 'username': getpass.getuser(), 'train_data_dir': train_data_dir, 'validation_data_dir': validation_data_dir, 'model_id': model_id, @@ -198,8 +256,13 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, 'log_path': log_path, 'disable_logging': disable_logging, 'gpus': gpus, - 'continue_model_dir': continue_model_dir - })) + 'continue_model_dir': continue_model_dir, + 'git_commit': git.Repo(os.path.dirname(os.path.abspath(__file__)), + search_parent_directories=True).head.object.hexsha, + 'gsheet_id': gsheet_id, + 'google_dev_app_name': google_dev_app_name + } + LOGGER.info('Training with the following arguments: {}'.format(param_dict)) if continue_model_dir: latest_model_path = os.path.join(continue_model_dir, 'model_latest.h5') @@ -227,6 +290,19 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, LOGGER.info('Model files can be found in "{}"'.format(model_dir)) + param_dict.update({ + 'latest_epoch': '-', + 'latest_train_loss': '-', + 'latest_validation_loss': '-', + 'latest_train_acc': '-', + 'latest_validation_acc': '-', + 'best_train_loss': '-', + 'best_validation_loss': '-', + 'best_train_acc': '-', + 'best_validation_acc': '-', + 'model_dir': model_dir, + }) + # Save the model model_spec_path = os.path.join(model_dir, 'model_spec.pkl') model_spec = keras.utils.serialize_keras_object(m) @@ -288,6 +364,9 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True, separator=',')) + if gsheet_id: + cb.append(GSheetLogger(google_dev_app_name, gsheet_id, param_dict)) + LOGGER.info('Setting up train data generator...') if continue_model_dir is not None: train_start_batch_idx = train_epoch_size * (last_epoch_idx + 1) From 9aa9cb7e6b563f66354793540a22ea59f0eb820a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 16 Feb 2018 16:28:42 -0500 Subject: [PATCH 095/201] Fix command line behavior for initializing google sheets credentials --- gsheets.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/gsheets.py b/gsheets.py index 5ab2b83..889ac9a 100644 --- a/gsheets.py +++ b/gsheets.py @@ -43,7 +43,7 @@ SCOPES = 'https://www.googleapis.com/auth/spreadsheets' -def get_credentials(application_name): +def get_credentials(application_name, client_secret_file=None, flags=None): """Gets valid user credentials from storage. 
If nothing has been stored, or if the stored credentials are invalid, @@ -62,7 +62,9 @@ def get_credentials(application_name): store = Storage(credential_path) credentials = store.get() if not credentials or credentials.invalid: - flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES) + if not client_secret_file: + raise ValueError('Must provide client secret file if credentials do not exist') + flow = client.flow_from_clientsecrets(client_secret_file, SCOPES) flow.user_agent = application_name if flags: credentials = tools.run_flow(flow, store, flags) @@ -131,10 +133,16 @@ def update_experiment(service, spreadsheet_id, param_dict, start_col, end_col, v response = request.execute() if __name__ == '__main__': - try: - import argparse - flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args() - except ImportError: - flags = None - - get_credentials() + import argparse + parser = argparse.ArgumentParser(parents=[tools.argparser]) + parser.add_argument('application_name', type=str, help='Name of Google Developer Application') + parser.add_argument('client_secret_file', type=str, help='Path to application client secret file') + flags = parser.parse_args() + + # TODO: Fix this hack + application_name = flags.application_name + client_secret_file = flags.client_secret_file + del flags.application_name + del flags.client_secret_file + + get_credentials(application_name, client_secret_file=client_secret_file, flags=flags) From b52caadfe3e6c1b5ebd657870f568a94e5e014f5 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Sun, 18 Feb 2018 10:44:09 -0500 Subject: [PATCH 096/201] fix vision_model input --- l3embedding/vision_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 5e3342b..2a91ab9 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -121,7 +121,7 @@ def construct_cnn_L3_orig_inputbn_vision_model(): #### # INPUT x_i = Input(shape=(224, 224, 3), dtype='float32') - x_i = BatchNormalization()(x_i) + y_i = BatchNormalization()(x_i) # CONV BLOCK 1 n_filter_i_1 = 64 @@ -129,7 +129,7 @@ def construct_cnn_L3_orig_inputbn_vision_model(): pool_size_i_1 = (2, 2) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', kernel_initializer='he_normal', - kernel_regularizer=regularizers.l2(weight_decay))(x_i) + kernel_regularizer=regularizers.l2(weight_decay))(y_i) y_i = BatchNormalization()(y_i) y_i = Activation('relu')(y_i) y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same', From 9afc8961c6cc2b6bacaf5ac60698de8f1825c8cf Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 24 Feb 2018 12:31:03 -0500 Subject: [PATCH 097/201] Add mel spectrogram models --- l3embedding/audio_model.py | 219 ++++++++++++++++++++++++++++++++++++- l3embedding/model.py | 46 +++++++- 2 files changed, 263 insertions(+), 2 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 3bed39f..35e704e 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -1,7 +1,7 @@ from keras.models import Model from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, \ Flatten, Activation, Lambda -from kapre.time_frequency import Spectrogram +from kapre.time_frequency import Spectrogram, Melspectrogram import tensorflow as tf import keras.regularizers as regularizers @@ -222,6 +222,223 @@ def construct_cnn_L3_kapredbinputbn_audio_model(): return m, x_a, y_a +def construct_cnn_L3_melspec1_audio_model(): + """ + Constructs a model that 
replicates the audio subnetwork used in Look, + Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + #### + # Audio subnetwork + #### + n_dft = 2048 + #n_win = 480 + #n_hop = n_win//2 + n_mels = 128 + n_hop = 242 + asr = 48000 + audio_window_dur = 1 + # INPUT + x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') + + # MELSPECTROGRAM PREPROCESSING + # 128 x 199 x 1 + y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels, + power_melgram=1.0, htk=True, # n_win=n_win, + return_decibel_melgram=True, padding='same')(x_a) + + # CONV BLOCK 1 + n_filter_a_1 = 64 + filt_size_a_1 = (3, 3) + pool_size_a_1 = (2, 2) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) + + # CONV BLOCK 2 + n_filter_a_2 = 128 + filt_size_a_2 = (3, 3) + pool_size_a_2 = (2, 2) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) + + # CONV BLOCK 3 + n_filter_a_3 = 256 + filt_size_a_3 = (3, 3) + pool_size_a_3 = (2, 2) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) + + # CONV BLOCK 4 + n_filter_a_4 = 512 + filt_size_a_4 = (3, 3) + pool_size_a_4 = (16, 24) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, + kernel_initializer='he_normal', + name='audio_embedding_layer', padding='same', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) + + y_a = Flatten()(y_a) + + m = Model(inputs=x_a, outputs=y_a) + m.name = 'audio_model' + + return m, x_a, y_a + + +def construct_cnn_L3_melspec2_audio_model(): + """ + Constructs a model that replicates the audio subnetwork used in Look, + Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . 
+ + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + weight_decay = 1e-5 + #### + # Audio subnetwork + #### + n_dft = 2048 + #n_win = 480 + #n_hop = n_win//2 + n_mels = 256 + n_hop = 242 + asr = 48000 + audio_window_dur = 1 + # INPUT + x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32') + + # MELSPECTROGRAM PREPROCESSING + # 128 x 199 x 1 + y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels, + power_melgram=1.0, htk=True, # n_win=n_win, + return_decibel_melgram=True, padding='same')(x_a) + + # CONV BLOCK 1 + n_filter_a_1 = 64 + filt_size_a_1 = (3, 3) + pool_size_a_1 = (2, 2) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a) + + # CONV BLOCK 2 + n_filter_a_2 = 128 + filt_size_a_2 = (3, 3) + pool_size_a_2 = (2, 2) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a) + + # CONV BLOCK 3 + n_filter_a_3 = 256 + filt_size_a_3 = (3, 3) + pool_size_a_3 = (2, 2) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a) + + # CONV BLOCK 4 + n_filter_a_4 = 512 + filt_size_a_4 = (3, 3) + pool_size_a_4 = (32, 24) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same', + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = Conv2D(n_filter_a_4, filt_size_a_4, + kernel_initializer='he_normal', + name='audio_embedding_layer', padding='same', + kernel_regularizer=regularizers.l2(weight_decay))(y_a) + y_a = BatchNormalization()(y_a) + y_a = Activation('relu')(y_a) + y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a) + + y_a = Flatten()(y_a) + + m = Model(inputs=x_a, outputs=y_a) + m.name = 'audio_model' + + return m, x_a, y_a + def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): """ diff --git a/l3embedding/model.py b/l3embedding/model.py index 47b06ea..50ad3c2 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -152,6 +152,48 @@ def construct_cnn_L3_kapredbinputbn(): m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') return m +def construct_cnn_L3_melspec1(): + """ + 
Constructs a model that replicates that used in Look, Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + vision_model, x_i, y_i = construct_cnn_L3_orig_vision_model() + audio_model, x_a, y_a = construct_cnn_L3_melspec1_audio_model() + + m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') + return m + +def construct_cnn_L3_melspec2(): + """ + Constructs a model that replicates that used in Look, Listen and Learn + + Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, . + + Returns + ------- + model: L3 CNN model + (Type: keras.models.Model) + inputs: Model inputs + (Type: list[keras.layers.Input]) + outputs: Model outputs + (Type: keras.layers.Layer) + """ + vision_model, x_i, y_i = construct_cnn_L3_orig_vision_model() + audio_model, x_a, y_a = construct_cnn_L3_melspec2_audio_model() + + m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') + return m + def construct_tiny_L3(): """ Constructs a model that implements a small L3 model for validation purposes @@ -175,5 +217,7 @@ def construct_tiny_L3(): MODELS = { 'cnn_L3_orig': construct_cnn_L3_orig, 'tiny_L3': construct_tiny_L3, - 'cnn_L3_kapredbinputbn': construct_cnn_L3_kapredbinputbn + 'cnn_L3_kapredbinputbn': construct_cnn_L3_kapredbinputbn, + 'cnn_L3_melspec1': construct_cnn_L3_melspec1, + 'cnn_L3_melspec2': construct_cnn_L3_melspec2 } From 7f28370ed79d78377348f3407d579657f4238a15 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 24 Feb 2018 17:58:30 -0500 Subject: [PATCH 098/201] Add support for framewise features for classifiers, reorganize classifier code, add support for adding different datasets --- ..._history.py => 04_plot_training_history.py | 0 train_classifier.py => 05_train_classifier.py | 20 + classifier/features.py | 315 ++++++++++++ classifier/metrics.py | 90 ++++ classifier/train.py | 477 +++--------------- classifier/us8k.py | 200 ++++++++ 6 files changed, 687 insertions(+), 415 deletions(-) rename plot_training_history.py => 04_plot_training_history.py (100%) rename train_classifier.py => 05_train_classifier.py (88%) create mode 100644 classifier/features.py create mode 100644 classifier/metrics.py create mode 100644 classifier/us8k.py diff --git a/plot_training_history.py b/04_plot_training_history.py similarity index 100% rename from plot_training_history.py rename to 04_plot_training_history.py diff --git a/train_classifier.py b/05_train_classifier.py similarity index 88% rename from train_classifier.py rename to 05_train_classifier.py index 7d2b001..8005260 100644 --- a/train_classifier.py +++ b/05_train_classifier.py @@ -121,6 +121,21 @@ def parse_arguments(): default='cnn_L3_orig', help='Type of L3 embedding model') + parser.add_argument('-hs', + '--hop-size', + dest='hop_size', + action='store', + type=float, + default=0.25, + help='Hop size as a fraction of the window') + + parser.add_argument('-nrs', + '--num-random-samples', + dest='num_random_samples', + action='store', + type=int, + help='Number of random samples for randomized sampling methods') + parser.add_argument('-nl', '--no-logging', dest='disable_logging', @@ -141,6 +156,11 @@ def parse_arguments(): type=str, help='Path to UrbanSound8K metadata file') + parser.add_argument('dataset_name', + 
action='store', + type=str, + help='Name of dataset') + parser.add_argument('data_dir', action='store', type=str, diff --git a/classifier/features.py b/classifier/features.py new file mode 100644 index 0000000..8d41189 --- /dev/null +++ b/classifier/features.py @@ -0,0 +1,315 @@ +import logging +import warnings +import librosa +import numpy as np +import scipy as sp + +LOGGER = logging.getLogger('us8k') +LOGGER.setLevel(logging.DEBUG) + + +def one_hot(idx, n_classes=10): + """ + Creates a one hot encoding vector + + Args: + idx: Class index + (Type: int) + + Keyword Args: + n_classes: Number of classes + (Type: int) + + Returns: + one_hot_vector: One hot encoded vector + (Type: np.ndarray) + """ + y = np.zeros((n_classes,)) + y[idx] = 1 + return y + + +def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.25): + """ + Get stacked L3 embedding features, i.e. stack embedding features for each + 1 second (overlapping) window of the given audio + + Computes a _single_ feature for the given audio file + + Args: + audio_path: Path to audio file + (Type: str) + + l3embedding_model: Audio embedding model + (keras.engine.training.Model) + + Keyword Args: + hop_size: Hop size as a fraction of the window + (Type: float) + + Returns: + features: Feature vector + (Type: np.ndarray) + """ + sr = 48000 + audio, _ = librosa.load(audio_path, sr=sr, mono=True) + audio_length = len(audio) + frame_length = sr + hop_length = int(sr * hop_size) + + # Zero pad to 4 seconds + target_len = 48000 * 4 + if audio_length < target_len: + pad_length = target_len - audio_length + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio = np.pad(audio, (left_pad, right_pad), mode='constant') + elif audio_length > target_len: + # Take center of audio (ASSUMES NOT MUCH GREATER THAN TARGET LENGTH) + center_sample = audio_length // 2 + half_len = target_len // 2 + audio = audio[center_sample-half_len:center_sample+half_len] + + + + # Divide into overlapping 1 second frames + x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T + + # Add a channel dimension + x = x.reshape((x.shape[0], 1, x.shape[-1])) + + # Get the L3 embedding for each frame + l3embedding = l3embedding_model.predict(x) + + # Return a flattened vector of the embeddings + return l3embedding.flatten() + + +def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): + """ + Get L3 embedding stats features, i.e. compute statistics for each of the + embedding features across 1 second (overlapping) window of the given audio + + Computes a _single_ feature for the given audio file + + + Args: + audio_path: Path to audio file + (Type: str) + + l3embedding_model: Audio embedding model + (keras.engine.training.Model) + + Keyword Args: + hop_size: Hop size as a fraction of the window + (Type: float) + + Returns: + features: Feature vector + (Type: np.ndarray) + """ + sr = 48000 + audio, _ = librosa.load(audio_path, sr=sr, mono=True) + + hop_length = int(hop_size * sr) + frame_length = 48000 * 1 + + audio_length = len(audio) + if audio_length < (frame_length + 2*hop_length): + # Make sure we can have at least three frames so that we can compute + # all of the stats. 
+ pad_length = frame_length + 2*hop_length - audio_length + else: + # Zero pad so we compute embedding on all samples + pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ + - (audio_length - frame_length) + + if pad_length > 0: + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio = np.pad(audio, (left_pad, right_pad), mode='constant') + + + # Divide into overlapping 1 second frames + x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T + + # Add a channel dimension + x = x.reshape((x.shape[0], 1, x.shape[-1])) + + # Get the L3 embedding for each frame + l3embedding = l3embedding_model.predict(x) + + # Compute statistics on the time series of embeddings + minimum = np.min(l3embedding, axis=0) + maximum = np.max(l3embedding, axis=0) + median = np.median(l3embedding, axis=0) + mean = np.mean(l3embedding, axis=0) + var = np.var(l3embedding, axis=0) + skewness = sp.stats.skew(l3embedding, axis=0) + kurtosis = sp.stats.kurtosis(l3embedding, axis=0) + + # Compute statistics on the first and second derivatives of time series of embeddings + + # Use finite differences to approximate the derivatives + d1 = np.gradient(l3embedding, 1/sr, edge_order=1, axis=0) + d2 = np.gradient(l3embedding, 1/sr, edge_order=2, axis=0) + + d1_mean = np.mean(d1, axis=0) + d1_var = np.var(d1, axis=0) + + d2_mean = np.mean(d2, axis=0) + d2_var = np.var(d2, axis=0) + + return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis, + d1_mean, d1_var, d2_mean, d2_var)) + + +def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25): + """ + Get L3 embedding stats features, i.e. compute statistics for each of the + embedding features across 1 second (overlapping) window of the given audio + + Args: + audio_path: Path to audio file + (Type: str) + + l3embedding_model: Audio embedding model + (keras.engine.training.Model) + + Keyword Args: + hop_size: Hop size as a fraction of the window + (Type: float) + + Returns: + features: List of embedding vectors + (Type: list[np.ndarray]) + """ + sr = 48000 + audio, _ = librosa.load(audio_path, sr=sr, mono=True) + + hop_size = 0.25 # REVISIT + hop_length = int(hop_size * sr) + frame_length = 48000 * 1 + + audio_length = len(audio) + if audio_length < frame_length: + # Make sure we can have at least one frame of audio + pad_length = frame_length - audio_length + else: + # Zero pad so we compute embedding on all samples + pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ + - (audio_length - frame_length) + + if pad_length > 0: + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio = np.pad(audio, (left_pad, right_pad), mode='constant') + + # Divide into overlapping 1 second frames + x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T + + # Add a channel dimension + x = x.reshape((x.shape[0], 1, x.shape[-1])) + + # Get the L3 embedding for each frame + l3embedding = l3embedding_model.predict(x).T + + return list(l3embedding) + +def get_l3_frames_random(audio, l3embedding_model, num_samples): + """ + Get L3 embedding stats features, i.e. 
compute statistics for each of the + embedding features across 1 second (overlapping) window of the given audio + + Args: + audio_path: numpy array or ath to audio file + (Type: np.ndarray or str) + + l3embedding_model: Audio embedding model + (Type: keras.engine.training.Model) + + num_samples: Number of samples + (Type: int) + + Returns: + features: List of embedding vectors + (Type: list[np.ndarray]) + """ + sr = 48000 + + if type(audio) == str: + audio, _ = librosa.load(audio, sr=sr, mono=True) + + frame_length = 48000 * 1 + + audio_length = len(audio) + pad_length = frame_length - audio_length + + if pad_length > 0: + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio = np.pad(audio, (left_pad, right_pad), mode='constant') + + if audio_length != frame_length: + sample_start_idxs = np.random.randint(low=0, + high=audio_length - frame_length, + size=num_samples) + + x = [] + for start_idx in sample_start_idxs: + end_idx = start_idx + frame_length + x.append(audio[start_idx:end_idx]) + + + x = np.array(x) + x = x.reshape((x.shape[0], 1, x.shape[-1])) + l3embedding = l3embedding_model.predict(x).T + + else: + warn_msg = 'Replicating samples' + LOGGER.warning(warn_msg) + warnings.warn(warn_msg) + + x = audio.reshape((1,1,audio.shape[0])) + x = x.reshape((x.shape[0], 1, x.shape[-1])) + frame_l3embedding = l3embedding_model.predict(x).T.flatten() + + l3embedding = np.tile(frame_l3embedding, (num_samples, 1)) + + return list(l3embedding) + + +def flatten_file_frames(X, y): + """ + For data organized by file, flattens the frame features for training data + and duplicates the file labels for each frame + + Args: + X: Framewise feature data, which consists of lists of frame features + for each file + (Type: np.ndarray[list[np.ndarray]]) + y: Label data for each file + (Type: np.ndarray) + + Returns: + X_flattened: Flatten matrix of frame features + (Type: np.ndarray) + y_flattened: Label data for each frame + (Type: np.ndarray) + + """ + num_frames_per_file = [] + X_flatten = [] + for X_file in X: + num_frames_per_file.append(len(X_file)) + X_flatten += X_file + X_flatten = np.array(X_flatten) + + # Repeat the labels for each frame + y_flatten = np.repeat(y, num_frames_per_file) + + return X_flatten, y_flatten diff --git a/classifier/metrics.py b/classifier/metrics.py new file mode 100644 index 0000000..7c1c9c1 --- /dev/null +++ b/classifier/metrics.py @@ -0,0 +1,90 @@ +import numpy as np + +from classifier.train import LOGGER + + +def compute_metrics(y, pred): + """ + Compute perfomance metrics given the predicted labels and the true labels + + Args: + y: True label vector + (Type: np.ndarray) + + pred: Predicted label vector + (Type: np.ndarray) + + Returns: + metrics: Metrics dictionary + (Type: dict[str, *]) + """ + # Convert from one-hot to integer encoding if necessary + if y.ndim == 2: + y = np.argmax(y, axis=1) + if pred.ndim == 2: + pred = np.argmax(pred, axis=1) + + acc = (y == pred).mean() + + sum_class_acc = 0.0 + for class_idx in range(10): + idxs = (y == class_idx) + sum_class_acc += (y[idxs] == pred[idxs]).mean() + + ave_class_acc = sum_class_acc / 10 + + return { + 'accuracy': acc, + 'average_class_accuracy': ave_class_acc + } + + +def aggregate_metrics(fold_metrics): + """ + Aggregate fold metrics using different stats + + Args: + fold_metrics: List of fold metrics dictionaries + (Type: list[dict[str, *]]) + + Returns: + aggr_metrics: Statistics averaged across epochs + (Type: dict[str, dict[str,float]]) + """ + metric_keys 
= list(fold_metrics[0].keys()) + fold_metrics_list = {k: [fold[k] for fold in fold_metrics] + for k in metric_keys} + aggr_metrics = {} + + for metric in metric_keys: + metric_list = fold_metrics_list[metric] + aggr_metrics[metric] = { + 'mean': np.mean(metric_list), + 'var': np.var(metric_list), + 'min': np.min(metric_list), + '25_%ile': np.percentile(metric_list, 25), + '75_%ile': np.percentile(metric_list, 75), + 'median': np.median(metric_list), + 'max': np.max(metric_list) + } + + return aggr_metrics + + +def print_metrics(metrics, subset_name): + """ + Print classifier metrics + + Args: + metrics: Metrics dictionary + (Type: dict[str, *]) + subset_name: Name of subset to print + (Type: str) + """ + LOGGER.info("Results metrics for {}".format(subset_name)) + LOGGER.info("=====================================================") + for metric, metric_stats in metrics.items(): + LOGGER.info("* " + metric) + for stat_name, stat_val in metric_stats.items(): + LOGGER.info("\t- {}: {}".format(stat_name, stat_val)) + LOGGER.info("\n") \ No newline at end of file diff --git a/classifier/train.py b/classifier/train.py index 582e893..2091af7 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -1,334 +1,31 @@ -import os -import scipy as sp -import numpy as np import json -import csv -import logging -import librosa +import os + import keras -from keras.optimizers import Adam import keras.regularizers as regularizers +import numpy as np +import random +from keras.layers import Input, Dense from keras.models import Model -from keras.layers import Input, Dense, Activation +from keras.optimizers import Adam from sklearn.externals import joblib -from sklearn.svm import SVC from sklearn.preprocessing import StandardScaler -from l3embedding.train import sample_one_second, LossHistory +from sklearn.svm import SVC +from scipy.stats import mode + +from classifier.features import flatten_file_frames +from classifier.metrics import compute_metrics, aggregate_metrics, print_metrics +from classifier.us8k import load_us8k_metadata, get_us8k_folds, \ + get_us8k_fold_split from l3embedding.model import load_embedding +from l3embedding.train import LossHistory from log import * LOGGER = logging.getLogger('classifier') LOGGER.setLevel(logging.DEBUG) -def load_us8k_metadata(path): - """ - Load UrbanSound8K metadata - - Args: - path: Path to metadata csv file - (Type: str) - - Returns: - metadata: List of metadata dictionaries - (Type: list[dict[str, *]]) - """ - metadata = [{} for _ in range(10)] - with open(path) as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - fname = row['slice_file_name'] - row['start'] = float(row['start']) - row['end'] = float(row['end']) - row['salience'] = float(row['salience']) - fold_num = row['fold'] = int(row['fold']) - row['classID'] = int(row['classID']) - metadata[fold_num-1][fname] = row - - return metadata - - -def one_hot(idx, n_classes=10): - """ - Creates a one hot encoding vector - - Args: - idx: Class index - (Type: int) - - Keyword Args: - n_classes: Number of classes - (Type: int) - - Returns: - one_hot_vector: One hot encoded vector - (Type: np.ndarray) - """ - y = np.zeros((n_classes,)) - y[idx] = 1 - return y - - -def get_l3_stack_features(audio_path, l3embedding_model): - """ - Get stacked L3 embedding features, i.e. 
stack embedding features for each - 1 second (overlapping) window of the given audio - - - Args: - audio_path: Path to audio file - (Type: str) - - l3embedding_model: Audio embedding model - (keras.engine.training.Model) - - Returns: - features: Feature vector - (Type: np.ndarray) - """ - sr = 48000 - audio, _ = librosa.load(audio_path, sr=sr, mono=True) - audio_length = len(audio) - - # Zero pad to 4 seconds - target_len = 48000 * 4 - if audio_length < target_len: - pad_length = target_len - audio_length - # Use (roughly) symmetric padding - left_pad = pad_length // 2 - right_pad= pad_length - left_pad - audio = np.pad(audio, (left_pad, right_pad), mode='constant') - elif audio_length > target_len: - # Take center of audio (ASSUMES NOT MUCH GREATER THAN TARGET LENGTH) - center_sample = audio_length // 2 - half_len = target_len // 2 - audio = audio[center_sample-half_len:center_sample+half_len] - - - - # Divide into overlapping 1 second frames - x = librosa.util.utils.frame(audio, frame_length=sr * 1, hop_length=sr // 2).T - - # Add a channel dimension - x = x.reshape((x.shape[0], 1, x.shape[-1])) - - # Get the L3 embedding for each frame - l3embedding = l3embedding_model.predict(x) - - # Return a flattened vector of the embeddings - return l3embedding.flatten() - - -def get_l3_stats_features(audio_path, l3embedding_model): - """ - Get L3 embedding stats features, i.e. compute statistics for each of the - embedding features across 1 second (overlapping) window of the given audio - - Args: - audio_path: Path to audio file - (Type: str) - - l3embedding_model: Audio embedding model - (keras.engine.training.Model) - - Returns: - features: Feature vector - (Type: np.ndarray) - """ - sr = 48000 - audio, _ = librosa.load(audio_path, sr=sr, mono=True) - - hop_size = 0.25 # REVISIT - hop_length = int(hop_size * sr) - frame_length = 48000 * 1 - - audio_length = len(audio) - if audio_length < (frame_length + 2*hop_length): - # Make sure we can have at least three frames so that we can compute - # all of the stats. 
- pad_length = frame_length + 2*hop_length - audio_length - else: - # Zero pad so we compute embedding on all samples - pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ - - (audio_length - frame_length) - - if pad_length > 0: - # Use (roughly) symmetric padding - left_pad = pad_length // 2 - right_pad= pad_length - left_pad - audio = np.pad(audio, (left_pad, right_pad), mode='constant') - - - # Divide into overlapping 1 second frames - x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T - - # Add a channel dimension - x = x.reshape((x.shape[0], 1, x.shape[-1])) - - # Get the L3 embedding for each frame - l3embedding = l3embedding_model.predict(x) - - # Compute statistics on the time series of embeddings - minimum = np.min(l3embedding, axis=0) - maximum = np.max(l3embedding, axis=0) - median = np.median(l3embedding, axis=0) - mean = np.mean(l3embedding, axis=0) - var = np.var(l3embedding, axis=0) - skewness = sp.stats.skew(l3embedding, axis=0) - kurtosis = sp.stats.kurtosis(l3embedding, axis=0) - - # Compute statistics on the first and second derivatives of time series of embeddings - - # Use finite differences to approximate the derivatives - d1 = np.gradient(l3embedding, 1/sr, edge_order=1, axis=0) - d2 = np.gradient(l3embedding, 1/sr, edge_order=2, axis=0) - - d1_mean = np.mean(d1, axis=0) - d1_var = np.var(d1, axis=0) - - d2_mean = np.mean(d2, axis=0) - d2_var = np.var(d2, axis=0) - - return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis, - d1_mean, d1_var, d2_mean, d2_var)) - - -def get_us8k_folds(metadata, data_dir, l3embedding_model=None, - features='l3_stack', label_format='int'): - """ - Load all of the data for each fold - - Args: - metadata: List of metadata dictionaries - (Type: dict[str,*]) - - data_dir: Path to data directory - (Type: str) - - Keyword Args: - l3embedding_model: L3 embedding model, used if L3 features are used - (Type: keras.engine.training.Model or None) - - features: Type of features to be computed - (Type: str) - - label_format: Type of format used for encoding outputs - (Type: str) - - Returns: - fold_data: List of data for each fold - (Type: list[tuple[np.ndarray, np.ndarray]]) - - """ - fold_data = [] - for fold_idx in range(10): - LOGGER.info("Loading fold {}...".format(fold_idx+1)) - fold_data.append(get_fold_data(metadata, data_dir, fold_idx, - l3embedding_model=l3embedding_model, - features=features, label_format=label_format)) - - return fold_data - - -def get_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, - features='l3_stack', label_format='int'): - """ - Load all of the data for a specific fold - - Args: - metadata: List of metadata dictionaries - (Type: list[dict[str,*]]) - - data_dir: Path to data directory - (Type: str) - - fold_idx: Index of fold to load - (Type: int) - - Keyword Args: - l3embedding_model: L3 embedding model, used if L3 features are used - (Type: keras.engine.training.Model or None) - - features: Type of features to be computed - (Type: str) - - label_format: Type of format used for encoding outputs - (Type: str) - - Returns: - X: Feature data - (Type: np.ndarray) - y: Label data - (Type: np.ndarray) - - """ - X = [] - y = [] - - for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): - path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) - - if features.startswith('l3') and not l3embedding_model: - raise ValueError('Must provide L3 embedding model to use {} features'.format(features)) 
- - if features == 'l3_stack': - feature_vector = get_l3_stack_features(path, l3embedding_model) - elif features == 'l3_stats': - feature_vector = get_l3_stats_features(path, l3embedding_model) - else: - raise ValueError('Invalid feature type: {}'.format(features)) - - # If we were not able to compute the features, skip this file - if feature_vector is None: - continue - - X.append(feature_vector) - - - class_label = example_metadata['classID'] - if label_format == 'int': - y.append(class_label) - elif label_format == 'one_hot': - y.append(one_hot(class_label)) - else: - raise ValueError('Invalid label format: {}'.format(label_format)) - - return np.array(X), np.array(y) - - -def get_fold_split(fold_data, fold_idx): - """ - Given the fold to use as held out, return the data split between training - and testing - - Args: - fold_data: List of data for each fold - (Type: list[tuple[np.ndarray, np.ndarray]]) - - fold_idx: Fold to use as held out data - (Type: int) - - Returns: - X_train: Training feature data - (Type: np.ndarray) - y_train: Training label data - (Type: np.ndarray) - X_test: Testing feature data - (Type: np.ndarray) - y_test: Testing label data - (Type: np.ndarray) - """ - X_test, y_test = fold_data[fold_idx] - train = fold_data[:fold_idx] + fold_data[fold_idx+1:] - - X_train, y_train = zip(*train) - X_train = np.concatenate(X_train, axis=0) - y_train = np.concatenate(y_train, axis=0) - - return X_train, y_train, X_test, y_test - - -def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs): +def train_svm(X_train, y_train, X_test, y_test, frame_features, C=1e-4, verbose=False, **kwargs): """ Train a Support Vector Machine model on the given data @@ -370,7 +67,18 @@ def train_svm(X_train, y_train, X_test, y_test, C=1e-4, verbose=False, **kwargs) y_train_pred = clf.predict(X_train) X_test = stdizer.transform(X_test) - y_test_pred = clf.predict(X_test) + + if frame_features: + y_test_pred = [] + + for X_test_file in X_test: + output = clf.predict(X_test_file) + class_pred = mode(output.flatten())[0][0] + y_test_pred.append(class_pred) + + y_test_pred = np.array(y_test_pred) + else: + y_test_pred = clf.predict(X_test) return clf, y_train_pred, y_test_pred @@ -405,7 +113,7 @@ def construct_mlp_model(input_shape, weight_decay=1e-5): return m, inp, y -def train_mlp(X_train, y_train, X_test, y_test, model_dir, +def train_mlp(X_train, y_train, X_test, y_test, model_dir, frame_features, batch_size=64, num_epochs=100, train_epoch_size=None, validation_epoch_size=None, validation_split=0.1, learning_rate=1e-4, weight_decay=1e-5, @@ -424,6 +132,11 @@ def train_mlp(X_train, y_train, X_test, y_test, model_dir, (Type: np.ndarray) model_dir: Path to model directory (Type: str) + frame_features: If True, test data will be handled as a tensor, where + the first dimension is the audio file, and the second + dimension is the frame in the audio file. 
Evaluation + will additionally be done at the audio file level + (Type: bool) Keyword Args: verbose: If True, print verbose messages @@ -478,109 +191,36 @@ def train_mlp(X_train, y_train, X_test, y_test, model_dir, verbose=10 if verbose else 1) y_train_pred = m.predict(X_train) - y_test_pred = m.predict(X_test) + if frame_features: + y_test_pred = [] - return m, y_train_pred, y_test_pred - - -def compute_metrics(y, pred): - """ - Compute perfomance metrics given the predicted labels and the true labels - - Args: - y: True label vector - (Type: np.ndarray) - - pred: Predicted label vector - (Type: np.ndarray) - - Returns: - metrics: Metrics dictionary - (Type: dict[str, *]) - """ - # Convert from one-hot to integer encoding if necessary - if y.ndim == 2: - y = np.argmax(y, axis=1) - if pred.ndim == 2: - pred = np.argmax(pred, axis=1) + for X_test_file in X_test: + output = m.predict(X_test_file).flatten() + y_test_pred.append(output.mean(axis=0)) - acc = (y == pred).mean() + y_test_pred = np.array(y_test_pred) - sum_class_acc = 0.0 - for class_idx in range(10): - idxs = (y == class_idx) - sum_class_acc += (y[idxs] == pred[idxs]).mean() - - ave_class_acc = sum_class_acc / 10 - - return { - 'accuracy': acc, - 'average_class_accuracy': ave_class_acc - } - - -def aggregate_metrics(fold_metrics): - """ - Aggregate fold metrics using different stats - - Args: - fold_metrics: List of fold metrics dictionaries - (Type: list[dict[str, *]]) - - Returns: - aggr_metrics: Statistics averaged across epochs - (Type: dict[str, dict[str,float]]) - """ - metric_keys = list(fold_metrics[0].keys()) - fold_metrics_list = {k: [fold[k] for fold in fold_metrics] - for k in metric_keys} - aggr_metrics = {} - - for metric in metric_keys: - metric_list = fold_metrics_list[metric] - aggr_metrics[metric] = { - 'mean': np.mean(metric_list), - 'var': np.var(metric_list), - 'min': np.min(metric_list), - '25_%ile': np.percentile(metric_list, 25), - '75_%ile': np.percentile(metric_list, 75), - 'median': np.median(metric_list), - 'max': np.max(metric_list) - } - - return aggr_metrics - - -def print_metrics(metrics, subset_name): - """ - Print classifier metrics + else: + y_test_pred = m.predict(X_test) - Args: - metrics: Metrics dictionary - (Type: dict[str, *]) - subset_name: Name of subset to print - (Type: str) - """ - LOGGER.info("Results metrics for {}".format(subset_name)) - LOGGER.info("=====================================================") - for metric, metric_stats in metrics.items(): - LOGGER.info("* " + metric) - for stat_name, stat_val in metric_stats.items(): - LOGGER.info("\t- {}: {}".format(stat_name, stat_val)) - LOGGER.info("\n") + return m, y_train_pred, y_test_pred -def train(metadata_path, data_dir, model_id, output_dir, +def train(metadata_path, dataset_name, data_dir, model_id, output_dir, model_type='svm', features='l3_stack', label_format='int', l3embedding_model_path=None, l3embedding_model_type='cnn_L3_orig', random_state=20171021, verbose=False, log_path=None, - disable_logging=False, **model_args): + disable_logging=False, num_random_samples=None, hop_size=0.25, **model_args): init_console_logger(LOGGER, verbose=verbose) if not disable_logging: init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') + # Set random state + np.random.seed(random_state) + random.seed(random_state) + # Make sure the directories we need exist model_dir = os.path.join(output_dir, model_id) if not os.path.isdir(model_dir): @@ -595,12 +235,18 @@ def train(metadata_path, data_dir, model_id, 
output_dir, LOGGER.info('Loading data...') - # Load metadata - metadata = load_us8k_metadata(metadata_path) # Get fold data - fold_data = get_us8k_folds(metadata, data_dir, l3embedding_model=l3embedding_model, - features=features, label_format=label_format) + if dataset_name == 'us8k': + fold_data = get_us8k_folds(metadata_path, data_dir, l3embedding_model=l3embedding_model, + features=features, label_format=label_format, + num_random_samples=num_random_samples, hop_size=hop_size) + else: + err_msg = 'Unsupported dataset "{}"'.format(dataset_name) + LOGGER.error(err_msg) + raise ValueError(err_msg) + + frame_features = 'frames' in features model_output_path = os.path.join(model_dir, "model_fold{}.{}") @@ -611,10 +257,11 @@ def train(metadata_path, data_dir, model_id, output_dir, LOGGER.info('Preparing to fit models...') for fold_idx in range(10): LOGGER.info('\t* Training with fold {} held out'.format(fold_idx+1)) - X_train, y_train, X_test, y_test = get_fold_split(fold_data, fold_idx) + X_train, y_train, X_test, y_test = get_us8k_fold_split(fold_data, fold_idx, + frame_features=frame_features) if model_type == 'svm': model, y_train_pred, y_test_pred = train_svm( - X_train, y_train, X_test, y_test, + X_train, y_train, X_test, y_test, frame_features, verbose=verbose, **model_args) LOGGER.info('Saving model...') @@ -623,7 +270,7 @@ def train(metadata_path, data_dir, model_id, output_dir, elif model_type == 'mlp': model, y_train_pred, y_test_pred = train_mlp( - X_train, y_train, X_test, y_test, model_dir, + X_train, y_train, X_test, y_test, model_dir, frame_features, verbose=verbose, **model_args) else: diff --git a/classifier/us8k.py b/classifier/us8k.py new file mode 100644 index 0000000..16f97d6 --- /dev/null +++ b/classifier/us8k.py @@ -0,0 +1,200 @@ +import csv +import logging +import os +import numpy as np +import classifier.features as cls_features + +LOGGER = logging.getLogger('us8k') +LOGGER.setLevel(logging.DEBUG) + + +def load_us8k_metadata(path): + """ + Load UrbanSound8K metadata + + Args: + path: Path to metadata csv file + (Type: str) + + Returns: + metadata: List of metadata dictionaries + (Type: list[dict[str, *]]) + """ + metadata = [{} for _ in range(10)] + with open(path) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + fname = row['slice_file_name'] + row['start'] = float(row['start']) + row['end'] = float(row['end']) + row['salience'] = float(row['salience']) + fold_num = row['fold'] = int(row['fold']) + row['classID'] = int(row['classID']) + metadata[fold_num-1][fname] = row + + return metadata + + +def get_us8k_folds(metadata_path, data_dir, l3embedding_model=None, + features='l3_stack', label_format='int', **feature_args): + """ + Load all of the data for each fold + + Args: + metadata_path: Path to metadata file + (Type: str) + + data_dir: Path to data directory + (Type: str) + + Keyword Args: + l3embedding_model: L3 embedding model, used if L3 features are used + (Type: keras.engine.training.Model or None) + + features: Type of features to be computed + (Type: str) + + label_format: Type of format used for encoding outputs + (Type: str) + + Returns: + fold_data: List of data for each fold + (Type: list[tuple[np.ndarray, np.ndarray]]) + + """ + metadata = load_us8k_metadata(metadata_path) + + fold_data = [] + for fold_idx in range(10): + LOGGER.info("Loading fold {}...".format(fold_idx+1)) + fold_data.append(get_us8k_fold_data(metadata, data_dir, fold_idx, + l3embedding_model=l3embedding_model, + features=features, label_format=label_format, + 
**feature_args)) + + return fold_data + + +def get_us8k_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, + features='l3_stack', label_format='int', **feature_args): + """ + Load all of the data for a specific fold + + Args: + metadata: List of metadata dictionaries + (Type: list[dict[str,*]]) + + data_dir: Path to data directory + (Type: str) + + fold_idx: Index of fold to load + (Type: int) + + Keyword Args: + l3embedding_model: L3 embedding model, used if L3 features are used + (Type: keras.engine.training.Model or None) + + features: Type of features to be computed + (Type: str) + + label_format: Type of format used for encoding outputs + (Type: str) + + Returns: + X: Feature data + (Type: np.ndarray) + y: Label data + (Type: np.ndarray) + + """ + X = [] + y = [] + + for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): + path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) + + if features.startswith('l3') and not l3embedding_model: + raise ValueError('Must provide L3 embedding model to use {} features'.format(features)) + + if features == 'l3_stack': + hop_size = feature_args.get('hop_size', 0.25) + features = cls_features.get_l3_stack_features(path, l3embedding_model, + hop_size=hop_size) + elif features == 'l3_stats': + hop_size = feature_args.get('hop_size', 0.25) + features = cls_features.get_l3_stats_features(path, l3embedding_model, + hop_size=hop_size) + elif features == 'l3_frames_uniform': + hop_size = feature_args.get('hop_size', 0.25) + features = cls_features.get_l3_frames_uniform(path, l3embedding_model, + hop_size=hop_size) + elif features == 'l3_frames_random': + num_samples = feature_args.get('num_random_samples') + if not num_samples: + raise ValueError('Must specify "num_samples" for "l3_frame_random" features') + features = cls_features.get_l3_frames_random(path, l3embedding_model, + num_samples) + else: + raise ValueError('Invalid feature type: {}'.format(features)) + + # If we were not able to compute the features, skip this file + if features is None: + continue + + X.append(features) + + class_label = example_metadata['classID'] + if label_format == 'int': + y.append(class_label) + elif label_format == 'one_hot': + y.append(cls_features.one_hot(class_label)) + else: + raise ValueError('Invalid label format: {}'.format(label_format)) + + return np.array(X), np.array(y) + + +def get_us8k_fold_split(fold_data, fold_idx, frame_features, shuffle=True): + """ + Given the fold to use as held out, return the data split between training + and testing + + Args: + fold_data: List of data for each fold + (Type: list[tuple[np.ndarray, np.ndarray]]) + + fold_idx: Fold to use as held out data + (Type: int) + + frame_features: If True, training frame features organized by file are + flattened and labels are replicated accordingly + (Type: bool) + + Returns: + X_train: Training feature data + (Type: np.ndarray) + y_train: Training label data + (Type: np.ndarray) + X_test: Testing feature data + (Type: np.ndarray) + y_test: Testing label data + (Type: np.ndarray) + """ + X_test, y_test = fold_data[fold_idx] + train = fold_data[:fold_idx] + fold_data[fold_idx+1:] + + X_train, y_train = zip(*train) + X_train = np.concatenate(X_train, axis=0) + y_train = np.concatenate(y_train, axis=0) + + # If features are computed framewise flatten the audio file dimension + if frame_features: + X_train, y_train = cls_features.flatten_file_frames(X_train, y_train) + + if shuffle: + train_idxs = np.arange(X_train.shape[0]) + 
np.random.shuffle(train_idxs) + X_train = X_train[train_idxs] + y_train = y_train[train_idxs] + + return X_train, y_train, X_test, y_test \ No newline at end of file From bf387f6150b6a3da7992b2c50369402aec387b67 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 24 Feb 2018 18:08:30 -0500 Subject: [PATCH 099/201] Update classifier training script --- jobs/classifier-train.sbatch | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jobs/classifier-train.sbatch b/jobs/classifier-train.sbatch index e2455ee..36b0e5a 100644 --- a/jobs/classifier-train.sbatch +++ b/jobs/classifier-train.sbatch @@ -22,20 +22,23 @@ DATA_DIR='' OUTPUT_DIR='' MODEL_PATH='' MODEL_ID='mycls' +DATASET_NAME='us8k' module purge #module load cuda/8.0.44 #module load cudnn/8.0v6.0 -python $SRCDIR/train_classifier.py \ +python $SRCDIR/05_train_classifier.py \ --random-state 20171021 \ - --features l3_stats \ + --features l3_frames_uniform \ --label-format int \ --model-type svm \ --l3embedding-model-path $MODEL_PATH \ --l3embedding-model-type cnn_L3_orig \ + --hop-size 0.25 \ --verbose \ $METADATA_PATH \ + $DATASET_NAME \ $DATA_DIR \ $MODEL_ID \ $OUTPUT_DIR From 86747975690eef69f2e6bb248b6350bf1e626fe1 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 24 Feb 2018 18:12:27 -0500 Subject: [PATCH 100/201] Fix LOGGER issues in classifier modules --- classifier/features.py | 2 +- classifier/metrics.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/classifier/features.py b/classifier/features.py index 8d41189..9c98ee3 100644 --- a/classifier/features.py +++ b/classifier/features.py @@ -4,7 +4,7 @@ import numpy as np import scipy as sp -LOGGER = logging.getLogger('us8k') +LOGGER = logging.getLogger('classifier-features') LOGGER.setLevel(logging.DEBUG) diff --git a/classifier/metrics.py b/classifier/metrics.py index 7c1c9c1..6526bcf 100644 --- a/classifier/metrics.py +++ b/classifier/metrics.py @@ -1,6 +1,8 @@ +import logging import numpy as np -from classifier.train import LOGGER +LOGGER = logging.getLogger('classifier-metrics') +LOGGER.setLevel(logging.DEBUG) def compute_metrics(y, pred): @@ -87,4 +89,4 @@ def print_metrics(metrics, subset_name): LOGGER.info("* " + metric) for stat_name, stat_val in metric_stats.items(): LOGGER.info("\t- {}: {}".format(stat_name, stat_val)) - LOGGER.info("\n") \ No newline at end of file + LOGGER.info("\n") From 3d5d5a4f16b6d4c37b925840eb2ba30c2412b61e Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 24 Feb 2018 20:45:38 -0500 Subject: [PATCH 101/201] Fix name overriding of "features" --- classifier/us8k.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/classifier/us8k.py b/classifier/us8k.py index 16f97d6..4222e01 100644 --- a/classifier/us8k.py +++ b/classifier/us8k.py @@ -118,30 +118,30 @@ def get_us8k_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, if features == 'l3_stack': hop_size = feature_args.get('hop_size', 0.25) - features = cls_features.get_l3_stack_features(path, l3embedding_model, + file_features = cls_features.get_l3_stack_features(path, l3embedding_model, hop_size=hop_size) elif features == 'l3_stats': hop_size = feature_args.get('hop_size', 0.25) - features = cls_features.get_l3_stats_features(path, l3embedding_model, + file_features = cls_features.get_l3_stats_features(path, l3embedding_model, hop_size=hop_size) elif features == 'l3_frames_uniform': hop_size = feature_args.get('hop_size', 0.25) - features = cls_features.get_l3_frames_uniform(path, l3embedding_model, + 
file_features = cls_features.get_l3_frames_uniform(path, l3embedding_model, hop_size=hop_size) elif features == 'l3_frames_random': num_samples = feature_args.get('num_random_samples') if not num_samples: raise ValueError('Must specify "num_samples" for "l3_frame_random" features') - features = cls_features.get_l3_frames_random(path, l3embedding_model, + file_features = cls_features.get_l3_frames_random(path, l3embedding_model, num_samples) else: raise ValueError('Invalid feature type: {}'.format(features)) # If we were not able to compute the features, skip this file - if features is None: + if file_features is None: continue - X.append(features) + X.append(file_features) class_label = example_metadata['classID'] if label_format == 'int': @@ -197,4 +197,4 @@ def get_us8k_fold_split(fold_data, fold_idx, frame_features, shuffle=True): X_train = X_train[train_idxs] y_train = y_train[train_idxs] - return X_train, y_train, X_test, y_test \ No newline at end of file + return X_train, y_train, X_test, y_test From e47221a3da995bd696dee1218cdd07df189e21a4 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 24 Feb 2018 20:46:54 -0500 Subject: [PATCH 102/201] Add support in model module to convert between GPUs (though this requires enough GPUs to be available to load the saved model) and fix loading embedding layer --- l3embedding/audio_model.py | 3 +- l3embedding/model.py | 82 ++++++++++++++++++++++++++++++++++++- l3embedding/train.py | 6 +-- l3embedding/vision_model.py | 2 +- 4 files changed, 85 insertions(+), 8 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 35e704e..d974730 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -456,7 +456,7 @@ def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): """ pool_size = (8, 8) embed_layer = audio_model.get_layer('audio_embedding_layer') - y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer) + y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer.output) y_a = Flatten()(y_a) m = Model(inputs=x_a, outputs=y_a) @@ -516,7 +516,6 @@ def construct_tiny_L3_audio_model(): return m, x_a, y_a - EMBEDDING_MODELS = { 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding_model } diff --git a/l3embedding/model.py b/l3embedding/model.py index 50ad3c2..cc5a4b9 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -2,6 +2,7 @@ from keras.layers import concatenate, Dense, MaxPooling2D, Flatten from .vision_model import * from .audio_model import * +from .training_utils import multi_gpu_model def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name, layer_size=128): @@ -35,7 +36,54 @@ def L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, model_name return m, [x_i, x_a], y -def load_model(weights_path, model_type, return_io=False): +def convert_num_gpus(model, inputs, outputs, model_type, src_num_gpus, tgt_num_gpus): + """ + Converts a multi-GPU model to a model that uses a different number of GPUs + + If the model is single-GPU/CPU, the given model is returned + + Args: + model: Keras model + (Type: keras.models.Model) + + inputs: Input Tensor. + (Type: keras.layers.Input) + + ouputs: Embedding output Tensor/Layer. 
+ (Type: keras.layers.Layer) + + model_type: Name of model type + (Type: str) + + src_num_gpus: Number of GPUs the source model uses + (Type: int) + + tgt_num_gpus: Number of GPUs the converted model will use + (Type: int) + + Returns: + model_cvt: Embedding model object + (Type: keras.engine.training.Model) + + inputs_cvt: Input Tensor. Not returned if return_io is False. + (Type: keras.layers.Input) + + ouputs_cvt: Embedding output Tensor/Layer. Not returned if return_io is False. + (Type: keras.layers.Layer) + """ + if src_num_gpus <= 1 and tgt_num_gpus <= 1: + return model, inputs, outputs + + m_new, inputs_new, output_new = MODELS[model_type]() + m_new.set_weights(model.layers[-2].get_weights()) + + if tgt_num_gpus > 1: + m_new = multi_gpu_model(m_new, gpus=tgt_num_gpus) + + return m_new, inputs_new, output_new + + +def load_model(weights_path, model_type, num_gpus=0, target_num_gpus=None, return_io=False): """ Loads an audio-visual correspondence model @@ -46,6 +94,12 @@ def load_model(weights_path, model_type, return_io=False): (Type: str) Keyword Args: + num_gpus: Number of GPUs the saved model uses + (Type: int) + + target_num_gpus: Number of GPUs the loaded model will use + (Type: int) + return_io: If True, return input and output tensors (Type: bool) @@ -61,7 +115,14 @@ def load_model(weights_path, model_type, return_io=False): raise ValueError('Invalid model type: "{}"'.format(model_type)) m, inputs, output = MODELS[model_type]() + if num_gpus > 1: + m = multi_gpu_model(m, gpus=num_gpus) m.load_weights(weights_path) + + if target_num_gpus is not None and num_gpus != target_num_gpus: + m, inputs, output = convert_num_gpus(m, inputs, output, model_type, + num_gpus, target_num_gpus) + if return_io: return m, inputs, output else: @@ -110,6 +171,21 @@ def load_embedding(weights_path, model_type, embedding_type, return_io=False): return m_embed +def gpu_wrapper(model_f): + """ + Decorator for creating multi-gpu models + """ + def wrapped(*args, num_gpus=0, **kwargs): + m, inp, out = model_f(*args, **kwargs) + if num_gpus > 1: + m = multi_gpu_model(m, gpus=num_gpus) + + return m, inp, out + + return wrapped + + +@gpu_wrapper def construct_cnn_L3_orig(): """ Constructs a model that replicates that used in Look, Listen and Learn @@ -131,6 +207,7 @@ def construct_cnn_L3_orig(): m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_orig') return m +@gpu_wrapper def construct_cnn_L3_kapredbinputbn(): """ Constructs a model that replicates that used in Look, Listen and Learn @@ -152,6 +229,7 @@ def construct_cnn_L3_kapredbinputbn(): m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') return m +@gpu_wrapper def construct_cnn_L3_melspec1(): """ Constructs a model that replicates that used in Look, Listen and Learn @@ -173,6 +251,7 @@ def construct_cnn_L3_melspec1(): m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') return m +@gpu_wrapper def construct_cnn_L3_melspec2(): """ Constructs a model that replicates that used in Look, Listen and Learn @@ -194,6 +273,7 @@ def construct_cnn_L3_melspec2(): m = L3_merge_audio_vision_models(vision_model, x_i, audio_model, x_a, 'cnn_L3_kapredbinputbn') return m +@gpu_wrapper def construct_tiny_L3(): """ Constructs a model that implements a small L3 model for validation purposes diff --git a/l3embedding/train.py b/l3embedding/train.py index 04a7ebc..211fd21 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -266,11 +266,9 @@ def 
train(train_data_dir, validation_data_dir, model_id, output_dir, if continue_model_dir: latest_model_path = os.path.join(continue_model_dir, 'model_latest.h5') - m, inputs, outputs = load_model(latest_model_path, model_type, return_io=True) + m, inputs, outputs = load_model(latest_model_path, model_type, return_io=True, num_gpus=gpus) else: - m, inputs, outputs = MODELS[model_type]() - if gpus > 1: - m = multi_gpu_model(m, gpus=gpus) + m, inputs, outputs = MODELS[model_type](num_gpus=gpus) loss = 'binary_crossentropy' metrics = ['accuracy'] diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 2a91ab9..755ea13 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -211,7 +211,7 @@ def construct_cnn_l3_orig_vision_embedding_model(vision_model, x_i): """ pool_size = (7, 7) embed_layer = vision_model.get_layer('vision_embedding_layer') - y_i = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer) + y_i = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer.output) y_i = Flatten()(y_i) m = Model(inputs=x_i, outputs=y_i) From 7c0208b825c59c0cc67c07655deae75b500c839d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 26 Feb 2018 16:53:22 -0500 Subject: [PATCH 103/201] Add retry to Google Sheets requests --- gsheets.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/gsheets.py b/gsheets.py index 889ac9a..0b53a98 100644 --- a/gsheets.py +++ b/gsheets.py @@ -95,7 +95,22 @@ def append_row(service, spreadsheet_id, param_dict): valueInputOption=value_input_option, insertDataOption=insert_data_option, body=value_range_body) - response = request.execute() + response = request_with_retry(request) + + +def request_with_retry(request, num_retries=50): + exc = None + for _ in range(num_retries): + try: + response = request.execute() + break + except Exception as e: + exc = e + continue + else: + raise exc + + return response def get_row(service, spreadsheet_id, param_dict): @@ -106,7 +121,7 @@ def get_row(service, spreadsheet_id, param_dict): spreadsheetId=spreadsheet_id, range=range_, majorDimension=major_dimension) - response = request.execute() + response = request_with_retry(request) try: row_idx = response['values'][0].index(param_dict['model_dir']) @@ -130,7 +145,7 @@ def update_experiment(service, spreadsheet_id, param_dict, start_col, end_col, v range=range_, valueInputOption=value_input_option, body=value_range_body) - response = request.execute() + response = request_with_retry(request) if __name__ == '__main__': import argparse From 196337b56cc803066900bbec49c15efea3da3c26 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 26 Feb 2018 16:59:25 -0500 Subject: [PATCH 104/201] Pass GPU parameters from load_embedding to load_model --- l3embedding/model.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/l3embedding/model.py b/l3embedding/model.py index cc5a4b9..eb1b093 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -129,7 +129,8 @@ def load_model(weights_path, model_type, num_gpus=0, target_num_gpus=None, retur return m -def load_embedding(weights_path, model_type, embedding_type, return_io=False): +def load_embedding(weights_path, model_type, embedding_type, + num_gpus=0, target_num_gpus=None, return_io=False): """ Loads an embedding model @@ -142,6 +143,12 @@ def load_embedding(weights_path, model_type, embedding_type, return_io=False): (Type: str) Keyword Args: + num_gpus: Number of GPUs the saved model uses + (Type: int) + + 
target_num_gpus: Number of GPUs the loaded model will use + (Type: int) + return_io: If True, return input and output tensors (Type: bool) @@ -153,7 +160,8 @@ def load_embedding(weights_path, model_type, embedding_type, return_io=False): y_i: Embedding output Tensor/Layer. Not returned if return_io is False. (Type: keras.layers.Layer) """ - m, inputs, output = load_model(weights_path, model_type, return_io=True) + m, inputs, output = load_model(weights_path, model_type, num_gpus=num_gpus, + target_num_gpus=target_num_gpus, return_io=True) x_i, x_a = inputs if embedding_type == 'vision': m_embed_model = m.get_layer('vision_model') From 448ae0fb7fbfd3669569d74de0e9f4ca2d724d17 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 26 Feb 2018 17:31:14 -0500 Subject: [PATCH 105/201] Handle case where number of frames per file is consistent in classifier fold processing --- classifier/features.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/classifier/features.py b/classifier/features.py index 9c98ee3..5b23d75 100644 --- a/classifier/features.py +++ b/classifier/features.py @@ -302,12 +302,22 @@ def flatten_file_frames(X, y): (Type: np.ndarray) """ - num_frames_per_file = [] - X_flatten = [] - for X_file in X: - num_frames_per_file.append(len(X_file)) - X_flatten += X_file - X_flatten = np.array(X_flatten) + if X.ndim == 1: + # In this case the number of frames per file varies, which makes the + # numpy array store lists for each file + num_frames_per_file = [] + X_flatten = [] + for X_file in X: + num_frames_per_file.append(len(X_file)) + X_flatten += X_file + X_flatten = np.array(X_flatten) + else: + # In this case the number of frames per file is the same, which means + # all of the data is in a unified numpy array + X_shape = X.shape + num_files, num_frames_per_file = X_shape[0], X_shape[1] + new_shape = (num_files * num_frames_per_file,) + X_shape[2:] + X_flatten = X.reshape(new_shape) # Repeat the labels for each frame y_flatten = np.repeat(y, num_frames_per_file) From 68d6bb06e513b9f07ceacff9dc8a00a9cda1d03b Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Wed, 28 Feb 2018 19:19:24 -0500 Subject: [PATCH 106/201] notebook for converting multigpu models to single gpu --- notebooks/convert_multigpu_singlegpu.ipynb | 142 +++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 notebooks/convert_multigpu_singlegpu.ipynb diff --git a/notebooks/convert_multigpu_singlegpu.ipynb b/notebooks/convert_multigpu_singlegpu.ipynb new file mode 100644 index 0000000..1cfa04a --- /dev/null +++ b/notebooks/convert_multigpu_singlegpu.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "import keras\n", + "import os\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model_dirs = ['/scratch/jtc440/l3_output/melspec1/20180221105528',\n", + " '/scratch/jtc440/l3_output/melspec2/20180223113902']\n", + "# '/scratch/hhw230/l3_output/new_train_test/20180216082045',\n", + "# '/scratch/hhw230/l3_output/new_train_test/20180218104219']\n", + "\n", + "model_types = ['cnn_L3_melspec1', 'cnn_L3_melspec2']\n", + "\n", + "output_dir = '/scratch/js7561/l3_output'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "/home/js7561/dev/l3embedding\n" + ] + } + ], + "source": [ + "cd /home/js7561/dev/l3embedding/" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from l3embedding.model import load_model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "('Keyword argument not understood:', 'htk')", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Load and convert model back to 1 gpu\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_gpus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_num_gpus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Save converted model back to disk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dev/l3embedding/l3embedding/model.py\u001b[0m in \u001b[0;36mload_model\u001b[0;34m(weights_path, model_type, num_gpus, target_num_gpus, return_io)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Invalid model type: \"{}\"'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMODELS\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmodel_type\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnum_gpus\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0mm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmulti_gpu_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_gpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dev/l3embedding/l3embedding/model.py\u001b[0m in \u001b[0;36mwrapped\u001b[0;34m(num_gpus, *args, **kwargs)\u001b[0m\n\u001b[1;32m 185\u001b[0m \"\"\"\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapped\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_gpus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_f\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnum_gpus\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0mm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmulti_gpu_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpus\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_gpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dev/l3embedding/l3embedding/model.py\u001b[0m in \u001b[0;36mconstruct_cnn_L3_melspec1\u001b[0;34m()\u001b[0m\n\u001b[1;32m 255\u001b[0m \"\"\"\n\u001b[1;32m 256\u001b[0m \u001b[0mvision_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_i\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_i\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstruct_cnn_L3_orig_vision_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0maudio_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_a\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_a\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstruct_cnn_L3_melspec1_audio_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0mm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mL3_merge_audio_vision_models\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvision_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_i\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudio_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_a\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'cnn_L3_kapredbinputbn'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dev/l3embedding/l3embedding/audio_model.py\u001b[0m in \u001b[0;36mconstruct_cnn_L3_melspec1_audio_model\u001b[0;34m()\u001b[0m\n\u001b[1;32m 257\u001b[0m y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels,\n\u001b[1;32m 258\u001b[0m \u001b[0mpower_melgram\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhtk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# n_win=n_win,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 259\u001b[0;31m return_decibel_melgram=True, padding='same')(x_a)\n\u001b[0m\u001b[1;32m 260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;31m# CONV BLOCK 1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dev/l3embedding/src/kapre/kapre/time_frequency.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, sr, n_mels, fmin, fmax, power_melgram, return_decibel_melgram, trainable_fb, **kwargs)\u001b[0m\n\u001b[1;32m 259\u001b[0m trainable_fb=False, **kwargs):\n\u001b[1;32m 260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 261\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mMelspectrogram\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 262\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0msr\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mfmin\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m0.0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dev/l3embedding/src/kapre/kapre/time_frequency.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, n_dft, n_win, n_hop, padding, power_spectrogram, return_decibel_spectrogram, trainable_kernel, image_data_format, **kwargs)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpower_spectrogram\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpower_spectrogram\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturn_decibel_spectrogram\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreturn_decibel_spectrogram\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSpectrogram\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_shape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda3/envs/l3/lib/python3.5/site-packages/keras/engine/topology.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkwarg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mkwarg\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mallowed_kwargs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Keyword argument not understood:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwarg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 281\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: ('Keyword argument not understood:', 'htk')" + ] + } + ], + "source": [ + "for md, mt in zip(model_dirs, model_types):\n", + " dirname = os.path.basename(md)\n", + " out_model_dir = os.path.join(output_dir, dirname)\n", + " if not os.path.isdir(out_model_dir):\n", + " os.mkdir(out_model_dir)\n", + " \n", + " weight_files = glob.glob(os.path.join(md, '*.h5'))\n", + " \n", + " 
for wf in weight_files:\n", + "\n", + " # Load and convert model back to 1 gpu\n", + " model = load_model(wf, mt, num_gpus=4, target_num_gpus=1)\n", + " \n", + " # Save converted model back to disk\n", + " output_weight_file = os.path.join(out_model_dir, os.path.basename(wf))\n", + " model.save_weights(output_weight_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (l3)", + "language": "python", + "name": "l3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 78a10cee64293a800fd8362d7957c6579edfcabf Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 28 Feb 2018 22:46:03 -0500 Subject: [PATCH 107/201] Update requirements file --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4df454a..bdbe75f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,12 @@ numpy==1.13.3 pescador==1.1.0 Pillow-SIMD==4.3.0.post0 PySoundFile==0.9.0.post1 --e git+https://github.com/jtcramer/kapre.git@7057f06a722113341e149516c143a4ecb00ceebd#egg=kapre +-e git+https://github.com/jtcramer/kapre.git@8d6be29a18f0752f60f7b304f5f356d350d37f1b#egg=kapre -e git+https://github.com/scikit-image/scikit-image.git@e2a609415f17230549b845c38511315659f29f1e#egg=scikit_image scikit-learn==0.19.0 scipy==0.19.1 sk-video==1.1.8 tensorflow==1.3.0 tqdm==4.19.4 +GitPython==2.1.8 +google-api-python-client==1.6.5 From f15b1e0af94ff8163ad674aa0e3a300163ff3912 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 28 Feb 2018 22:55:00 -0500 Subject: [PATCH 108/201] Update README to refer to TF and Keras installation, remove TF and Keras from requirements --- README.md | 5 ++++- requirements.txt | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9be2656..86f8e7f 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,12 @@ This is an implementation of the proposed model in Look, Listen and Learn ([Arandjelović, R., Zisserman, A. 2017](https://arxiv.org/pdf/1705.08168.pdf)). This model uses videos to learn vision and audio features in an unsupervised fashion by training the model for the proposed Audio-Visual Correspondence (AVC) task. This task tries to determine whether a piece of audio and an image frame come from the same video and occur simulatneously. Dependencies -* Python 3 +* Python 3 (we use 3.6.3) * [ffmpeg](http://www.ffmpeg.org) * [sox](http://sox.sourceforge.net) +* [TensorFlow](https://www.tensorflow.org/install/) (follow instructions carefully, and install before other Python dependencies) +* [keras](https://keras.io/#installation) (follow instructions carefully!) +* Other Python dependencies can by installed via `pip install -r requirements.txt` The code for the model and training implementation can be found in `l3embedding/`. Note that the metadata format expected is the same used in [AudioSet](https://research.google.com/audioset/download.html) ([Gemmeke, J., Ellis, D., et al. 2017](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45857.pdf)), as training this model on AudioSet was one of the goals for this implementation. 
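
[Editor's note — not part of the patch series.] The notebook earlier in this section converts multi-GPU checkpoints back to single-GPU weights by calling load_model with num_gpus=4 and target_num_gpus=1 and then re-saving the weights. A minimal standalone sketch of that same loop is shown below; it assumes the load_model signature used in the notebook at this point in the history, and the checkpoint directories, model type strings, and output path are placeholders rather than real paths from the repository.

    # Minimal sketch (assumptions noted above): convert 4-GPU checkpoints back to
    # single-GPU weight files, mirroring the conversion loop in the notebook.
    import glob
    import os

    from l3embedding.model import load_model

    model_dirs = ['/path/to/models/melspec1']   # hypothetical checkpoint directories
    model_types = ['cnn_L3_melspec1']           # model type string for each directory
    output_dir = '/path/to/converted_models'    # hypothetical output directory

    for md, mt in zip(model_dirs, model_types):
        out_model_dir = os.path.join(output_dir, os.path.basename(md))
        os.makedirs(out_model_dir, exist_ok=True)

        for wf in glob.glob(os.path.join(md, '*.h5')):
            # Load the multi-GPU checkpoint and rebuild it as a single-GPU model
            model = load_model(wf, mt, num_gpus=4, target_num_gpus=1)
            # Save the converted weights under the same file name
            model.save_weights(os.path.join(out_model_dir, os.path.basename(wf)))

Run once per checkpoint directory, this presumably yields weight files that can later be loaded without the multi-GPU wrapper, which is what the subsequent notebook and embedding-extraction patches rely on.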
diff --git a/requirements.txt b/requirements.txt index bdbe75f..6e93fd6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -Keras==2.1.0 librosa==0.5.1 numpy==1.13.3 pescador==1.1.0 @@ -9,7 +8,6 @@ PySoundFile==0.9.0.post1 scikit-learn==0.19.0 scipy==0.19.1 sk-video==1.1.8 -tensorflow==1.3.0 tqdm==4.19.4 GitPython==2.1.8 google-api-python-client==1.6.5 From 57883224d4f7615859d277cc850ab5d009f04a8a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 28 Feb 2018 22:56:49 -0500 Subject: [PATCH 109/201] Add conda environment file --- l3conda.yml | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 l3conda.yml diff --git a/l3conda.yml b/l3conda.yml new file mode 100644 index 0000000..f239802 --- /dev/null +++ b/l3conda.yml @@ -0,0 +1,128 @@ +name: l3embedding +channels: +- conda-forge +- anaconda +- defaults +dependencies: +- ca-certificates=2017.08.26=h1d4fec5_0 +- certifi=2017.7.27.1=py36h8b7b77e_0 +- cython=0.27.3=py36h1860423_0 +- libedit=3.1=heed3624_0 +- libffi=3.2.1=h4deb6c0_3 +- libgcc-ng=7.2.0=hcbc56d2_1 +- libstdcxx-ng=7.2.0=h24385c6_1 +- ncurses=6.0=h06874d7_1 +- openssl=1.0.2l=h9d1a558_3 +- pip=9.0.1=py36h8ec8b28_3 +- python=3.6.2=hdfe5801_15 +- readline=7.0=hac23ff0_3 +- setuptools=36.5.0=py36he42e2e1_0 +- sqlite=3.20.1=h6d8b0f3_1 +- tk=8.6.7=h5979e9b_1 +- wheel=0.29.0=py36he7f4e38_1 +- xz=5.2.3=h2bcbf08_1 +- zlib=1.2.11=hfbfcf68_1 +- libflac=1.3.1=0 +- libogg=1.3.2=0 +- libpng=1.6.28=2 +- libsndfile=1.0.27=1 +- libvorbis=1.3.5=0 +- pip: + - appnope==0.1.0 + - audioread==2.1.5 + - bleach==1.5.0 + - cffi==1.11.0 + - cycler==0.10.0 + - dask==0.16.0 + - decorator==4.1.2 + - entrypoints==0.2.3 + - enum34==1.1.6 + - future==0.16.0 + - gitdb2==2.0.3 + - gitpython==2.1.8 + - google-api-python-client==1.6.5 + - h5py==2.7.1 + - html5lib==0.9999999 + - httplib2==0.10.3 + - imageio==2.1.2 + - ipykernel==4.6.1 + - ipython==6.1.0 + - ipython-genutils==0.2.0 + - ipywidgets==6.0.0 + - jedi==0.10.2 + - jinja2==2.9.6 + - joblib==0.11 + - jsonschema==2.6.0 + - jupyter==1.0.0 + - jupyter-client==5.1.0 + - jupyter-console==5.2.0 + - jupyter-core==4.3.0 + - kapre==0.1.2.1 + - keras==2.0.9 + - librosa==0.5.1 + - llvmlite==0.20.0 + - markdown==2.6.9 + - markupsafe==1.0 + - matplotlib==2.1.1 + - mistune==0.7.4 + - moviepy==0.2.3.2 + - multiprocessing-logging==0.2.4 + - nbconvert==5.2.1 + - nbformat==4.4.0 + - networkx==2.0 + - notebook==5.0.0 + - numba==0.35.0 + - numpy==1.13.3 + - oauth2client==4.1.2 + - olefile==0.44 + - opencv-python==3.3.0.10 + - pafy==0.5.3.1 + - pandas==0.20.3 + - pandocfilters==1.4.2 + - parso==0.1.0 + - pescador==1.1.0 + - pexpect==4.2.1 + - pickleshare==0.7.4 + - pillow==4.3.0 + - prompt-toolkit==1.0.15 + - protobuf==3.4.0 + - ptyprocess==0.5.2 + - pyasn1==0.4.2 + - pyasn1-modules==0.2.1 + - pycparser==2.18 + - pygame==1.9.3 + - pygments==2.2.0 + - pyparsing==2.2.0 + - python-dateutil==2.6.1 + - pytz==2017.3 + - pywavelets==0.5.2 + - pyyaml==3.12 + - pyzmq==16.0.2 + - qtconsole==4.3.1 + - resampy==0.2.0 + - rsa==3.4.2 + - scikit-image==0.13.1 + - scikit-learn==0.19.1 + - scipy==1.0.0 + - simplegeneric==0.8.1 + - six==1.11.0 + - sk-video==1.1.8 + - smmap2==2.0.3 + - soundfile==0.9.0.post1 + - sox==1.3.0 + - tensorflow-gpu==1.4.0 + - tensorflow-tensorboard==0.4.0rc2 + - terminado==0.6 + - testpath==0.3 + - toolz==0.9.0 + - tornado==4.5.2 + - tqdm==4.11.2 + - traitlets==4.3.2 + - uritemplate==3.0.0 + - wcwidth==0.1.7 + - webencodings==0.5.1 + - werkzeug==0.12.2 + - widgetsnbextension==3.0.2 + - 
youtube-dl==2017.9.24 +prefix: /home/jtc440/miniconda3/envs/l3embedding + From b19a229a9fd608f4d2a252b98aa54b224a37919b Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 1 Mar 2018 10:31:14 -0500 Subject: [PATCH 110/201] Refactor structure a bit --- 01_create_subsets.py | 5 +++-- 02_generate_samples.py | 9 +++++---- 05_train_classifier.py => 06_train_classifier.py | 0 classifier/train.py | 7 +++---- data/avc/__init__.py | 0 data/{ => avc}/sample.py | 10 +--------- data/{ => avc}/subsets.py | 2 +- data/usc/__init__.py | 0 {classifier => data/usc}/features.py | 6 ++---- {classifier => data/usc}/us8k.py | 4 +++- 10 files changed, 18 insertions(+), 25 deletions(-) rename 05_train_classifier.py => 06_train_classifier.py (100%) create mode 100644 data/avc/__init__.py rename data/{ => avc}/sample.py (99%) rename data/{ => avc}/subsets.py (99%) create mode 100644 data/usc/__init__.py rename {classifier => data/usc}/features.py (98%) rename {classifier => data/usc}/us8k.py (99%) diff --git a/01_create_subsets.py b/01_create_subsets.py index 30fad08..bfd89cc 100644 --- a/01_create_subsets.py +++ b/01_create_subsets.py @@ -1,10 +1,11 @@ import argparse import logging import os -from log import init_console_logger -from data.subsets import get_subset_split from csv import DictWriter +from data.avc.subsets import get_subset_split +from log import init_console_logger + LOGGER = logging.getLogger('data') LOGGER.setLevel(logging.DEBUG) diff --git a/02_generate_samples.py b/02_generate_samples.py index 362fe34..a1bf481 100644 --- a/02_generate_samples.py +++ b/02_generate_samples.py @@ -1,12 +1,13 @@ import argparse import logging -import multiprocessing_logging import math from functools import partial -from log import init_console_logger -from data.sample import sample_and_save -from data.utils import map_iterate_in_parallel +import multiprocessing_logging + +from data.avc.sample import sample_and_save +from data.utils import map_iterate_in_parallel +from log import init_console_logger LOGGER = logging.getLogger('sampling') LOGGER.setLevel(logging.DEBUG) diff --git a/05_train_classifier.py b/06_train_classifier.py similarity index 100% rename from 05_train_classifier.py rename to 06_train_classifier.py diff --git a/classifier/train.py b/classifier/train.py index 2091af7..79960f4 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -1,21 +1,20 @@ import json import os +import random import keras import keras.regularizers as regularizers import numpy as np -import random from keras.layers import Input, Dense from keras.models import Model from keras.optimizers import Adam +from scipy.stats import mode from sklearn.externals import joblib from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC -from scipy.stats import mode -from classifier.features import flatten_file_frames from classifier.metrics import compute_metrics, aggregate_metrics, print_metrics -from classifier.us8k import load_us8k_metadata, get_us8k_folds, \ +from data.usc.us8k import get_us8k_folds, \ get_us8k_fold_split from l3embedding.model import load_embedding from l3embedding.train import LossHistory diff --git a/data/avc/__init__.py b/data/avc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/sample.py b/data/avc/sample.py similarity index 99% rename from data/sample.py rename to data/avc/sample.py index 805a3a9..4cfc226 100644 --- a/data/sample.py +++ b/data/avc/sample.py @@ -1,20 +1,12 @@ -import argparse -import csv -import datetime import glob -import json import math import 
logging -from multiprocessing import Pool import os -import pickle import random -import time import warnings import h5py import numpy as np -import pandas as pd import pescador import scipy.misc import skimage @@ -22,7 +14,7 @@ from skvideo.io import FFmpegReader, ffprobe import soundfile as sf from tqdm import tqdm -from .utils import read_csv_as_dicts, flatten_dict +from data.utils import read_csv_as_dicts, flatten_dict from log import LogTimer LOGGER = logging.getLogger('sampling') diff --git a/data/subsets.py b/data/avc/subsets.py similarity index 99% rename from data/subsets.py rename to data/avc/subsets.py index 66be378..f1a6cb2 100644 --- a/data/subsets.py +++ b/data/avc/subsets.py @@ -5,7 +5,7 @@ import random from audioset.ontology import ASOntology from collections import OrderedDict -from .utils import read_csv_as_dicts +from data.utils import read_csv_as_dicts LOGGER = logging.getLogger('data') LOGGER.setLevel(logging.DEBUG) diff --git a/data/usc/__init__.py b/data/usc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/classifier/features.py b/data/usc/features.py similarity index 98% rename from classifier/features.py rename to data/usc/features.py index 5b23d75..9622404 100644 --- a/classifier/features.py +++ b/data/usc/features.py @@ -219,7 +219,7 @@ def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25): return list(l3embedding) -def get_l3_frames_random(audio, l3embedding_model, num_samples): +def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): """ Get L3 embedding stats features, i.e. compute statistics for each of the embedding features across 1 second (overlapping) window of the given audio @@ -238,12 +238,10 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples): features: List of embedding vectors (Type: list[np.ndarray]) """ - sr = 48000 - if type(audio) == str: audio, _ = librosa.load(audio, sr=sr, mono=True) - frame_length = 48000 * 1 + frame_length = sr * 1 audio_length = len(audio) pad_length = frame_length - audio_length diff --git a/classifier/us8k.py b/data/usc/us8k.py similarity index 99% rename from classifier/us8k.py rename to data/usc/us8k.py index 4222e01..fbee340 100644 --- a/classifier/us8k.py +++ b/data/usc/us8k.py @@ -1,8 +1,10 @@ import csv import logging import os + import numpy as np -import classifier.features as cls_features + +import data.usc.features as cls_features LOGGER = logging.getLogger('us8k') LOGGER.setLevel(logging.DEBUG) From 3387a3cd8fa99b07b413f53ccd3890373f7e43a7 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 12:41:02 -0500 Subject: [PATCH 111/201] Re-order args --- l3embedding/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/model.py b/l3embedding/model.py index eb1b093..758220a 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -183,7 +183,7 @@ def gpu_wrapper(model_f): """ Decorator for creating multi-gpu models """ - def wrapped(*args, num_gpus=0, **kwargs): + def wrapped(num_gpus=0, *args, **kwargs): m, inp, out = model_f(*args, **kwargs) if num_gpus > 1: m = multi_gpu_model(m, gpus=num_gpus) From bc474635158f1371050f0aefae1fb946e1c40c6c Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 12:44:06 -0500 Subject: [PATCH 112/201] Ignore pyc --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 398f615..3e85c7f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -.idea/* *.pyc *.err *.out \ No newline at end of file 
+.idea/* *.pyc *.err *.out *.pyc \ No newline at end of file From b9c473baaf209750c2428342003fe4a8b03aa09d Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 12:44:42 -0500 Subject: [PATCH 113/201] Add init to make life easier inside notebooks --- l3embedding/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 l3embedding/__init__.py diff --git a/l3embedding/__init__.py b/l3embedding/__init__.py new file mode 100644 index 0000000..e69de29 From acbc834e1b7423b6da08014e45cc62ff45a96be9 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 13:08:48 -0500 Subject: [PATCH 114/201] Test loading models after removing multigpu --- notebooks/test_load_converted_model.ipynb | 166 ++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 notebooks/test_load_converted_model.ipynb diff --git a/notebooks/test_load_converted_model.ipynb b/notebooks/test_load_converted_model.ipynb new file mode 100644 index 0000000..384afc8 --- /dev/null +++ b/notebooks/test_load_converted_model.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "import keras\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/justin/Documents/dev/l3embedding\n" + ] + } + ], + "source": [ + "cd ~/dev/l3embedding/" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from l3embedding.model import load_embedding, MODELS, load_model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cnn_L3_melspec2',\n", + " 'tiny_L3',\n", + " 'cnn_L3_kapredbinputbn',\n", + " 'cnn_L3_melspec1',\n", + " 'cnn_L3_orig']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MODELS.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "weight_file = '/Users/justin/Downloads/model_checkpoint.150.h5'\n", + "model_type = 'cnn_L3_melspec1'\n", + "emb_type = 'audio'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# LOAD FULL MODEL\n", + "model = load_model(weight_file, model_type, num_gpus=0, target_num_gpus=None, return_io=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "__________________________________________________________________________________________________\n", + "Layer (type) Output Shape Param # Connected to \n", + "==================================================================================================\n", + "input_3 (InputLayer) (None, 224, 224, 3) 0 \n", + "__________________________________________________________________________________________________\n", + "input_4 (InputLayer) (None, 1, 48000) 0 \n", + "__________________________________________________________________________________________________\n", + "vision_model (Model) (None, 512) 4693056 input_3[0][0] \n", + "__________________________________________________________________________________________________\n", + 
"audio_model (Model) (None, 512) 9021504 input_4[0][0] \n", + "__________________________________________________________________________________________________\n", + "concatenate_2 (Concatenate) (None, 1024) 0 vision_model[1][0] \n", + " audio_model[1][0] \n", + "__________________________________________________________________________________________________\n", + "dense_3 (Dense) (None, 128) 131200 concatenate_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "dense_4 (Dense) (None, 2) 258 dense_3[0][0] \n", + "==================================================================================================\n", + "Total params: 13,846,018\n", + "Trainable params: 9,508,738\n", + "Non-trainable params: 4,337,280\n", + "__________________________________________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Jusrt load rthe embedding\n", + "embedding = load_embedding(weight_file, model_type, emb_type, num_gpus=0, target_num_gpus=None, return_io=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (envsoundcnn)", + "language": "python", + "name": "envsoundcnn" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From c8257ac3a2124cee6463d254d30b6b17bd4db628 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 14:24:05 -0500 Subject: [PATCH 115/201] Give audio and vision embedding model dicts unique names --- l3embedding/audio_model.py | 2 +- l3embedding/model.py | 4 ++-- l3embedding/vision_model.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index d974730..4087057 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -516,6 +516,6 @@ def construct_tiny_L3_audio_model(): return m, x_a, y_a -EMBEDDING_MODELS = { +AUDIO_EMBEDDING_MODELS = { 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding_model } diff --git a/l3embedding/model.py b/l3embedding/model.py index 758220a..2a49250 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -165,11 +165,11 @@ def load_embedding(weights_path, model_type, embedding_type, x_i, x_a = inputs if embedding_type == 'vision': m_embed_model = m.get_layer('vision_model') - m_embed, x_embed, y_embed = EMBEDDING_MODELS[model_type](m_embed_model, x_i) + m_embed, x_embed, y_embed = VISION_EMBEDDING_MODELS[model_type](m_embed_model, x_i) elif embedding_type == 'audio': m_embed_model = m.get_layer('audio_model') - m_embed, x_embed, y_embed = EMBEDDING_MODELS[model_type](m_embed_model, x_a) + m_embed, x_embed, y_embed = AUDIO_EMBEDDING_MODELS[model_type](m_embed_model, x_a) else: raise ValueError('Invalid embedding type: "{}"'.format(embedding_type)) diff --git a/l3embedding/vision_model.py b/l3embedding/vision_model.py index 755ea13..7aea703 100644 --- a/l3embedding/vision_model.py +++ b/l3embedding/vision_model.py @@ -265,6 +265,6 @@ def construct_tiny_L3_vision_model(): return m, x_i, y_i -EMBEDDING_MODELS = { +VISION_EMBEDDING_MODELS = { 'cnn_L3_orig': 
construct_cnn_l3_orig_vision_embedding_model } From 39843b5fd0f45e1aa8d7834293ca8f47c93b7824 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 14:48:03 -0500 Subject: [PATCH 116/201] udpate notebook --- notebooks/test_load_converted_model.ipynb | 120 +++++++++++++++++++--- 1 file changed, 104 insertions(+), 16 deletions(-) diff --git a/notebooks/test_load_converted_model.ipynb b/notebooks/test_load_converted_model.ipynb index 384afc8..a801d56 100644 --- a/notebooks/test_load_converted_model.ipynb +++ b/notebooks/test_load_converted_model.ipynb @@ -37,11 +37,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "from l3embedding.model import load_embedding, MODELS, load_model" + "from l3embedding.model import load_embedding, load_model, MODELS" ] }, { @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "collapsed": true }, @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -103,20 +103,20 @@ "__________________________________________________________________________________________________\n", "Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", - "input_3 (InputLayer) (None, 224, 224, 3) 0 \n", + "input_1 (InputLayer) (None, 224, 224, 3) 0 \n", "__________________________________________________________________________________________________\n", - "input_4 (InputLayer) (None, 1, 48000) 0 \n", + "input_2 (InputLayer) (None, 1, 48000) 0 \n", "__________________________________________________________________________________________________\n", - "vision_model (Model) (None, 512) 4693056 input_3[0][0] \n", + "vision_model (Model) (None, 512) 4693056 input_1[0][0] \n", "__________________________________________________________________________________________________\n", - "audio_model (Model) (None, 512) 9021504 input_4[0][0] \n", + "audio_model (Model) (None, 512) 9021504 input_2[0][0] \n", "__________________________________________________________________________________________________\n", - "concatenate_2 (Concatenate) (None, 1024) 0 vision_model[1][0] \n", + "concatenate_1 (Concatenate) (None, 1024) 0 vision_model[1][0] \n", " audio_model[1][0] \n", "__________________________________________________________________________________________________\n", - "dense_3 (Dense) (None, 128) 131200 concatenate_2[0][0] \n", + "dense_1 (Dense) (None, 128) 131200 concatenate_1[0][0] \n", "__________________________________________________________________________________________________\n", - "dense_4 (Dense) (None, 2) 258 dense_3[0][0] \n", + "dense_2 (Dense) (None, 2) 258 dense_1[0][0] \n", "==================================================================================================\n", "Total params: 13,846,018\n", "Trainable params: 9,508,738\n", @@ -129,6 +129,97 @@ "model.summary()" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Just load the embedding\n", + "embedding = load_embedding(weight_file, model_type, emb_type, num_gpus=0, target_num_gpus=None, return_io=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "input_2 (InputLayer) (None, 1, 48000) 0 \n", + "_________________________________________________________________\n", + "melspectrogram_1 (Melspectro (None, 128, 199, 1) 4329600 \n", + "_________________________________________________________________\n", + "conv2d_8 (Conv2D) (None, 128, 199, 64) 640 \n", + "_________________________________________________________________\n", + "batch_normalization_9 (Batch (None, 128, 199, 64) 256 \n", + "_________________________________________________________________\n", + "activation_9 (Activation) (None, 128, 199, 64) 0 \n", + "_________________________________________________________________\n", + "conv2d_9 (Conv2D) (None, 128, 199, 64) 36928 \n", + "_________________________________________________________________\n", + "batch_normalization_10 (Batc (None, 128, 199, 64) 256 \n", + "_________________________________________________________________\n", + "activation_10 (Activation) (None, 128, 199, 64) 0 \n", + "_________________________________________________________________\n", + "max_pooling2d_5 (MaxPooling2 (None, 64, 99, 64) 0 \n", + "_________________________________________________________________\n", + "conv2d_10 (Conv2D) (None, 64, 99, 128) 73856 \n", + "_________________________________________________________________\n", + "batch_normalization_11 (Batc (None, 64, 99, 128) 512 \n", + "_________________________________________________________________\n", + "activation_11 (Activation) (None, 64, 99, 128) 0 \n", + "_________________________________________________________________\n", + "conv2d_11 (Conv2D) (None, 64, 99, 128) 147584 \n", + "_________________________________________________________________\n", + "batch_normalization_12 (Batc (None, 64, 99, 128) 512 \n", + "_________________________________________________________________\n", + "activation_12 (Activation) (None, 64, 99, 128) 0 \n", + "_________________________________________________________________\n", + "max_pooling2d_6 (MaxPooling2 (None, 32, 49, 128) 0 \n", + "_________________________________________________________________\n", + "conv2d_12 (Conv2D) (None, 32, 49, 256) 295168 \n", + "_________________________________________________________________\n", + "batch_normalization_13 (Batc (None, 32, 49, 256) 1024 \n", + "_________________________________________________________________\n", + "activation_13 (Activation) (None, 32, 49, 256) 0 \n", + "_________________________________________________________________\n", + "conv2d_13 (Conv2D) (None, 32, 49, 256) 590080 \n", + "_________________________________________________________________\n", + "batch_normalization_14 (Batc (None, 32, 49, 256) 1024 \n", + "_________________________________________________________________\n", + "activation_14 (Activation) (None, 32, 49, 256) 0 \n", + "_________________________________________________________________\n", + "max_pooling2d_7 (MaxPooling2 (None, 16, 24, 256) 0 \n", + "_________________________________________________________________\n", + "conv2d_14 (Conv2D) (None, 16, 24, 512) 1180160 \n", + "_________________________________________________________________\n", + "batch_normalization_15 (Batc (None, 16, 24, 512) 2048 \n", + "_________________________________________________________________\n", + "activation_15 (Activation) (None, 16, 24, 512) 0 
\n", + "_________________________________________________________________\n", + "audio_embedding_layer (Conv2 (None, 16, 24, 512) 2359808 \n", + "_________________________________________________________________\n", + "max_pooling2d_9 (MaxPooling2 (None, 4, 3, 512) 0 \n", + "_________________________________________________________________\n", + "flatten_3 (Flatten) (None, 6144) 0 \n", + "=================================================================\n", + "Total params: 9,019,456\n", + "Trainable params: 4,687,040\n", + "Non-trainable params: 4,332,416\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "embedding.summary()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -136,10 +227,7 @@ "collapsed": true }, "outputs": [], - "source": [ - "# Jusrt load rthe embedding\n", - "embedding = load_embedding(weight_file, model_type, emb_type, num_gpus=0, target_num_gpus=None, return_io=False)" - ] + "source": [] } ], "metadata": { From 27d2b569238f8d652ab7a8ea919813c28a01b0a4 Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 1 Mar 2018 14:50:51 -0500 Subject: [PATCH 117/201] Implement and use convert_audio_model_to_embedding --- l3embedding/audio_model.py | 38 +++++++++++++++++++++++++++++++++++--- l3embedding/model.py | 3 ++- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 4087057..5282ace 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -446,11 +446,11 @@ def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): Args: audio_model: audio subnetwork - x_a: Image data input Tensor + x_a: audio data input Tensor Returns: m: Model object - x_a: Image data input Tensor + x_a: audio data input Tensor y_a: Embedding output Tensor """ @@ -463,6 +463,37 @@ def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): return m, x_a, y_a +def convert_audio_model_to_embedding(audio_model, x_a, model_type): + """ + Given and audio subnetwork, return a model that produces the learned + embedding + + Args: + audio_model: audio subnetwork + x_a: audio data input Tensor + model_type: the model type string + + Returns: + m: Model object + x_a : audio data input Tensor + y_a: embedding output Tensor + """ + + pooling = {'cnn_L3_orig': (8, 8), + 'cnn_L3_kapredbinputbn': (8, 8), + 'cnn_L3_melspec1': (4, 8), + 'cnn_L3_melspec2': (8, 8)} + + pool_size = pooling[model_type] + + embed_layer = audio_model.get_layer('audio_embedding_layer') + y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer.output) + y_a = Flatten()(y_a) + + m = Model(inputs=x_a, outputs=y_a) + return m, x_a, y_a + + def construct_tiny_L3_audio_model(): """ Constructs a model that implements a small L3 audio subnetwork @@ -517,5 +548,6 @@ def construct_tiny_L3_audio_model(): return m, x_a, y_a AUDIO_EMBEDDING_MODELS = { - 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding_model + 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding_model, + 'cnn_L3_melspec1': construct_cnn_l3_orig_audio_embedding_model } diff --git a/l3embedding/model.py b/l3embedding/model.py index 2a49250..91ca17b 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -169,7 +169,8 @@ def load_embedding(weights_path, model_type, embedding_type, elif embedding_type == 'audio': m_embed_model = m.get_layer('audio_model') - m_embed, x_embed, y_embed = AUDIO_EMBEDDING_MODELS[model_type](m_embed_model, x_a) + # m_embed, x_embed, y_embed = 
AUDIO_EMBEDDING_MODELS[model_type](m_embed_model, x_a) + m_embed, x_embed, y_embed = convert_audio_model_to_embedding(m_embed_model, x_a, model_type) else: raise ValueError('Invalid embedding type: "{}"'.format(embedding_type)) From f3a8800e61bab44fbb1bd4276f7210782abdf202 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 2 Mar 2018 01:20:18 -0500 Subject: [PATCH 118/201] Separate out data ganeration of embeddings for classifier model; add example sbatch script for embedding sample generation --- 05_generate_embedding_samples.py | 178 +++++++++++++++++++++++++ 06_train_classifier.py | 2 +- data/usc/features.py | 45 +++++-- data/usc/us8k.py | 161 ++++++++++++++-------- jobs/generate_embedding_samples.sbatch | 46 +++++++ l3embedding/audio_model.py | 24 +++- l3embedding/model.py | 45 ++++--- l3embedding/train.py | 9 -- 8 files changed, 410 insertions(+), 100 deletions(-) create mode 100644 05_generate_embedding_samples.py create mode 100644 jobs/generate_embedding_samples.sbatch diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py new file mode 100644 index 0000000..6540850 --- /dev/null +++ b/05_generate_embedding_samples.py @@ -0,0 +1,178 @@ +import argparse +import logging +import os +import json +from l3embedding.model import load_embedding +from data.usc.us8k import generate_us8k_folds, generate_us8k_fold_data +from log import init_console_logger + +LOGGER = logging.getLogger('data-generation') +LOGGER.setLevel(logging.DEBUG) + + +def parse_arguments(): + """ + Parse arguments from the command line + + + Returns: + args: Argument dictionary + (Type: dict[str, *]) + """ + parser = argparse.ArgumentParser(description='Train an urban sound classification model') + + parser.add_argument('-r', + '--random-state', + dest='random_state', + action='store', + type=int, + default=20171021, + help='Random seed used to set the RNG state') + + parser.add_argument('-v', + '--verbose', + dest='verbose', + action='store_true', + default=False, + help='If True, print detailed messages') + + parser.add_argument('-f', + '--features', + dest='features', + action='store', + type=str, + default='l3_frames_uniform', + help='Type of features to be used in training') + + parser.add_argument('-lf', + '--label-format', + dest='label_format', + action='store', + type=str, + default='int', + help='Type of format used for encoding outputs') + + parser.add_argument('-lmp', + '--l3embedding-model-path', + dest='l3embedding_model_path', + action='store', + type=str, + help='Path to L3 embedding model weights file') + + parser.add_argument('-lmt', + '--l3embedding-model-type', + dest='l3embedding_model_type', + action='store', + type=str, + default='cnn_L3_orig', + help='Type of L3 embedding model') + + parser.add_argument('-lpt', + '--l3embedding-pooling-type', + dest='l3embedding_pooling_type', + action='store', + type=str, + default='original', + help='Type of pooling used to downsample last conv layer of L3 embedding model') + + parser.add_argument('-hs', + '--hop-size', + dest='hop_size', + action='store', + type=float, + default=0.1, + help='Hop size in seconds') + + parser.add_argument('-nrs', + '--num-random-samples', + dest='num_random_samples', + action='store', + type=int, + help='Number of random samples for randomized sampling methods') + + parser.add_argument('--gpus', + dest='gpus', + type=int, + default=0, + help='Number of gpus used for data parallelism.') + + parser.add_argument('--fold', + dest='fold', + type=int, + help='Fold number to generate. 
If unused, generate all folds') + + parser.add_argument('metadata_path', + action='store', + type=str, + help='Path to UrbanSound8K metadata file') + + parser.add_argument('model_id', + action='store', + type=str, + help='Model ID used to generate features (or experiment ID)') + + parser.add_argument('dataset_name', + action='store', + type=str, + choices=['us8k'], + help='Name of dataset') + + parser.add_argument('data_dir', + action='store', + type=str, + help='Path to directory where training set files are stored') + + parser.add_argument('output_dir', + action='store', + type=str, + help='Path to directory where output data files will be stored') + + return vars(parser.parse_args()) + +if __name__ == '__main__': + args = parse_arguments() + + init_console_logger(LOGGER, verbose=args['verbose']) + LOGGER.debug('Initialized logging.') + + model_type = args['l3embedding_model_type'] + pooling_type = args['l3embedding_pooling_type'] + + l3embedding_model = load_embedding(args['l3embedding_model_path'], + model_type, + 'audio', pooling_type, + tgt_num_gpus=args['gpus']) + + output_dir = args['output_dir'] + + dataset_output_dir = os.path.join(output_dir, args['model_id'], + model_type, pooling_type) + + # Write configurations to a file for reproducibility/posterity + with open(os.path.join(dataset_output_dir, 'config.json')) as f: + json.dump(args, f) + + dataset_name = args['dataset_name'] + fold_num = args['fold'] + if dataset_name == 'us8k': + metadata_path = args['metadata_path'] + data_dir = args['data_dir'] + features = args['features'] + hop_size = args['hop_size'] + random_state = args['random_state'] + num_random_samples = args['num_random_samples'] + label_format = args['label_format'] + + if fold_num is not None: + # Generate a single fold if a fold was specified + generate_us8k_fold_data(metadata_path, data_dir, fold_num-1, dataset_output_dir, + l3embedding_model=l3embedding_model, + features=features, label_format=label_format, random_state=random_state, + hop_size=hop_size, num_random_samples=num_random_samples) + + else: + # Otherwise, generate all the folds + generate_us8k_folds(metadata_path, data_dir, dataset_output_dir, + l3embedding_model=l3embedding_model, + features=features, label_format=label_format, random_state=random_state, + hop_size=hop_size, num_random_samples=num_random_samples) diff --git a/06_train_classifier.py b/06_train_classifier.py index 8005260..fbafb7e 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -11,7 +11,7 @@ def parse_arguments(): args: Argument dictionary (Type: dict[str, *]) """ - parser = argparse.ArgumentParser(description='Train an urban sound classification model') + parser = argparse.ArgumentParser(description='Generate embedding data for training an urban sound classification model') parser.add_argument('-e', '--num-epochs', diff --git a/data/usc/features.py b/data/usc/features.py index 9622404..702f3ae 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -166,7 +166,7 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): d1_mean, d1_var, d2_mean, d2_var)) -def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25): +def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25, sr=48000): """ Get L3 embedding stats features, i.e. 
compute statistics for each of the embedding features across 1 second (overlapping) window of the given audio @@ -186,12 +186,12 @@ def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25): features: List of embedding vectors (Type: list[np.ndarray]) """ - sr = 48000 - audio, _ = librosa.load(audio_path, sr=sr, mono=True) + if type(audio) == str: + audio, _ = librosa.load(audio, sr=sr, mono=True) hop_size = 0.25 # REVISIT hop_length = int(hop_size * sr) - frame_length = 48000 * 1 + frame_length = sr * 1 audio_length = len(audio) if audio_length < frame_length: @@ -217,7 +217,7 @@ def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25): # Get the L3 embedding for each frame l3embedding = l3embedding_model.predict(x).T - return list(l3embedding) + return l3embedding def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): """ @@ -225,8 +225,8 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): embedding features across 1 second (overlapping) window of the given audio Args: - audio_path: numpy array or ath to audio file - (Type: np.ndarray or str) + audio: numpy array or ath to audio file + (Type: np.ndarray or str) l3embedding_model: Audio embedding model (Type: keras.engine.training.Model) @@ -278,7 +278,36 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): l3embedding = np.tile(frame_l3embedding, (num_samples, 1)) - return list(l3embedding) + return l3embedding + + +def compute_file_features(path, feature_type, l3embedding_model=None, **feature_args): + if feature_type.startswith('l3') and not l3embedding_model: + err_msg = 'Must provide L3 embedding model to use {} features' + raise ValueError(err_msg.format(feature_type)) + + if feature_type == 'l3_stack': + hop_size = feature_args.get('hop_size', 0.25) + file_features = get_l3_stack_features(path, l3embedding_model, + hop_size=hop_size) + elif feature_type == 'l3_stats': + hop_size = feature_args.get('hop_size', 0.25) + file_features = get_l3_stats_features(path, l3embedding_model, + hop_size=hop_size) + elif feature_type == 'l3_frames_uniform': + hop_size = feature_args.get('hop_size', 0.25) + file_features = get_l3_frames_uniform(path, l3embedding_model, + hop_size=hop_size) + elif feature_type == 'l3_frames_random': + num_samples = feature_args.get('num_random_samples') + if not num_samples: + raise ValueError('Must specify "num_samples" for "l3_frame_random" features') + file_features = get_l3_frames_random(path, l3embedding_model, + num_samples) + else: + raise ValueError('Invalid feature type: {}'.format(feature_type)) + + return file_features def flatten_file_frames(X, y): diff --git a/data/usc/us8k.py b/data/usc/us8k.py index fbee340..2342c9b 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -1,6 +1,8 @@ import csv import logging import os +import random +import pescador import numpy as np @@ -64,17 +66,7 @@ def get_us8k_folds(metadata_path, data_dir, l3embedding_model=None, (Type: list[tuple[np.ndarray, np.ndarray]]) """ - metadata = load_us8k_metadata(metadata_path) - - fold_data = [] - for fold_idx in range(10): - LOGGER.info("Loading fold {}...".format(fold_idx+1)) - fold_data.append(get_us8k_fold_data(metadata, data_dir, fold_idx, - l3embedding_model=l3embedding_model, - features=features, label_format=label_format, - **feature_args)) - - return fold_data + raise NotImplementedError() def get_us8k_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, @@ -112,48 +104,7 @@ def get_us8k_fold_data(metadata, 
data_dir, fold_idx, l3embedding_model=None, X = [] y = [] - for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): - path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) - - if features.startswith('l3') and not l3embedding_model: - raise ValueError('Must provide L3 embedding model to use {} features'.format(features)) - - if features == 'l3_stack': - hop_size = feature_args.get('hop_size', 0.25) - file_features = cls_features.get_l3_stack_features(path, l3embedding_model, - hop_size=hop_size) - elif features == 'l3_stats': - hop_size = feature_args.get('hop_size', 0.25) - file_features = cls_features.get_l3_stats_features(path, l3embedding_model, - hop_size=hop_size) - elif features == 'l3_frames_uniform': - hop_size = feature_args.get('hop_size', 0.25) - file_features = cls_features.get_l3_frames_uniform(path, l3embedding_model, - hop_size=hop_size) - elif features == 'l3_frames_random': - num_samples = feature_args.get('num_random_samples') - if not num_samples: - raise ValueError('Must specify "num_samples" for "l3_frame_random" features') - file_features = cls_features.get_l3_frames_random(path, l3embedding_model, - num_samples) - else: - raise ValueError('Invalid feature type: {}'.format(features)) - - # If we were not able to compute the features, skip this file - if file_features is None: - continue - - X.append(file_features) - - class_label = example_metadata['classID'] - if label_format == 'int': - y.append(class_label) - elif label_format == 'one_hot': - y.append(cls_features.one_hot(class_label)) - else: - raise ValueError('Invalid label format: {}'.format(label_format)) - - return np.array(X), np.array(y) + raise NotImplementedError() def get_us8k_fold_split(fold_data, fold_idx, frame_features, shuffle=True): @@ -200,3 +151,107 @@ def get_us8k_fold_split(fold_data, fold_idx, frame_features, shuffle=True): y_train = y_train[train_idxs] return X_train, y_train, X_test, y_test + + +def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=None, + features='l3_stack', label_format='int', + random_state=12345678, **feature_args): + """ + Generate all of the data for each fold + + Args: + metadata_path: Path to metadata file + (Type: str) + + data_dir: Path to data directory + (Type: str) + + output_dir: Path to output directory where fold data will be stored + (Type: str) + + Keyword Args: + l3embedding_model: L3 embedding model, used if L3 features are used + (Type: keras.engine.training.Model or None) + + features: Type of features to be computed + (Type: str) + + label_format: Type of format used for encoding outputs + (Type: str) + """ + metadata = load_us8k_metadata(metadata_path) + + for fold_idx in range(10): + LOGGER.info("Loading fold {}...".format(fold_idx+1)) + generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, + l3embedding_model=l3embedding_model, + features=features, label_format=label_format, + random_state=random_state, **feature_args) + + +def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embedding_model=None, + features='l3_stack', label_format='int', + random_state=12345678, **feature_args): + """ + Generate all of the data for a specific fold + + Args: + metadata: List of metadata dictionaries, or a path to a metadata file to be loaded + (Type: list[dict[str,*]] or str) + + data_dir: Path to data directory + (Type: str) + + fold_idx: Index of fold to load + (Type: int) + + output_dir: Path to output directory where fold data will be stored + (Type: str) + + Keyword Args: 
+ l3embedding_model: L3 embedding model, used if L3 features are used + (Type: keras.engine.training.Model or None) + + features: Type of features to be computed + (Type: str) + + label_format: Type of format used for encoding outputs + (Type: str) + """ + if type(metadata) == str: + metadata = load_us8k_metadata(metadata) + + # Set random seed + random_state = random_state + fold_idx + random.seed(random_state) + np.random.seed(random_state) + + # Create fold directory if it does not exist + fold_dir = os.path.join(data_dir, "fold{}".format(fold_idx+1)) + if not os.path.isdir(fold_dir): + os.makedirs(fold_dir) + + for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): + path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) + basename, _ = os.path.splitext(fname) + output_path = os.path.join(fold_dir, fname) + + if os.path.exists(output_path): + LOGGER.debug('Already found output file at {}. Skipping.'.format(output_path)) + continue + + X = cls_features.compute_file_features(path, features, l3embedding_model=l3embedding_model, **feature_args) + + # If we were not able to compute the features, skip this file + if X is None: + continue + + class_label = example_metadata['classID'] + if label_format == 'int': + y = class_label + elif label_format == 'one_hot': + y = cls_features.one_hot(class_label) + else: + raise ValueError('Invalid label format: {}'.format(label_format)) + + np.savez_compressed(output_path, X=X, y=y) \ No newline at end of file diff --git a/jobs/generate_embedding_samples.sbatch b/jobs/generate_embedding_samples.sbatch new file mode 100644 index 0000000..35b0b46 --- /dev/null +++ b/jobs/generate_embedding_samples.sbatch @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-l3embedding-samples +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=120GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-l3embedding-samples-%A-%a.out" +#SBATCH --err="generate-l3embedding-samples-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +L3_MODEL_PATH='' +L3_MODEL_TYPE='cnn_L3_melspec1' +L3_POOLING_TYPE='original' +US8K_PATH=/scratch/jtc440/UrbanSound8K +METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv +DATA_DIR=$US8K_PATH/audio +OUTPUT_DIR=/scratch/jtc440/l3_features +MODEL_ID='melspec1' + +module purge +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/05_generate_embedding_samples.py \ + --random-state 20180302 \ + --verbose \ + --features 'l3_frames_uniform' \ + --label-format 'int' \ + --l3embedding-model-path $L3_MODEL_PATH \ + --l3embedding-model-type $L3_MODEL_TYPE \ + --l3embedding-pooling-type $L3_POOLING_TYPE \ + --hop-size 0.1 \ + --gpus 0 \ + --fold 1 \ + $METADATA_PATH \ + $MODEL_ID \ + us8k \ + $DATA_DIR + $OUTPUT_DIR diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index 5282ace..caaf6a9 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -463,7 +463,7 @@ def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): return m, x_a, y_a -def convert_audio_model_to_embedding(audio_model, x_a, model_type): +def convert_audio_model_to_embedding(audio_model, x_a, model_type, pooling_type='original'): """ Given and audio subnetwork, return a model that produces the learned embedding @@ -479,12 +479,22 @@ def convert_audio_model_to_embedding(audio_model, x_a, model_type): y_a: embedding output Tensor """ - pooling = {'cnn_L3_orig': (8, 8), - 
'cnn_L3_kapredbinputbn': (8, 8), - 'cnn_L3_melspec1': (4, 8), - 'cnn_L3_melspec2': (8, 8)} - - pool_size = pooling[model_type] + pooling = { + 'cnn_L3_orig': { + 'original': (8, 8), + }, + 'cnn_L3_kapredbinputbn': { + 'original': (8, 8), + }, + 'cnn_L3_melspec1': { + 'original': (4, 8), + }, + 'cnn_L3_melspec2': { + 'original': (8, 8) + } + } + + pool_size = pooling[model_type][pooling_type] embed_layer = audio_model.get_layer('audio_embedding_layer') y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer.output) diff --git a/l3embedding/model.py b/l3embedding/model.py index 91ca17b..ae82d23 100644 --- a/l3embedding/model.py +++ b/l3embedding/model.py @@ -1,5 +1,4 @@ -from keras.models import Model -from keras.layers import concatenate, Dense, MaxPooling2D, Flatten +from keras.layers import concatenate, Dense from .vision_model import * from .audio_model import * from .training_utils import multi_gpu_model @@ -49,8 +48,8 @@ def convert_num_gpus(model, inputs, outputs, model_type, src_num_gpus, tgt_num_g inputs: Input Tensor. (Type: keras.layers.Input) - ouputs: Embedding output Tensor/Layer. - (Type: keras.layers.Layer) + outputs: Embedding output Tensor/Layer. + (Type: keras.layers.Layer) model_type: Name of model type (Type: str) @@ -83,7 +82,7 @@ def convert_num_gpus(model, inputs, outputs, model_type, src_num_gpus, tgt_num_g return m_new, inputs_new, output_new -def load_model(weights_path, model_type, num_gpus=0, target_num_gpus=None, return_io=False): +def load_model(weights_path, model_type, src_num_gpus=0, tgt_num_gpus=None, return_io=False): """ Loads an audio-visual correspondence model @@ -94,11 +93,11 @@ def load_model(weights_path, model_type, num_gpus=0, target_num_gpus=None, retur (Type: str) Keyword Args: - num_gpus: Number of GPUs the saved model uses - (Type: int) + src_num_gpus: Number of GPUs the saved model uses + (Type: int) - target_num_gpus: Number of GPUs the loaded model will use - (Type: int) + tgt_num_gpus: Number of GPUs the loaded model will use + (Type: int) return_io: If True, return input and output tensors (Type: bool) @@ -115,13 +114,13 @@ def load_model(weights_path, model_type, num_gpus=0, target_num_gpus=None, retur raise ValueError('Invalid model type: "{}"'.format(model_type)) m, inputs, output = MODELS[model_type]() - if num_gpus > 1: - m = multi_gpu_model(m, gpus=num_gpus) + if src_num_gpus > 1: + m = multi_gpu_model(m, gpus=src_num_gpus) m.load_weights(weights_path) - if target_num_gpus is not None and num_gpus != target_num_gpus: + if tgt_num_gpus is not None and src_num_gpus != tgt_num_gpus: m, inputs, output = convert_num_gpus(m, inputs, output, model_type, - num_gpus, target_num_gpus) + src_num_gpus, tgt_num_gpus) if return_io: return m, inputs, output @@ -129,8 +128,8 @@ def load_model(weights_path, model_type, num_gpus=0, target_num_gpus=None, retur return m -def load_embedding(weights_path, model_type, embedding_type, - num_gpus=0, target_num_gpus=None, return_io=False): +def load_embedding(weights_path, model_type, embedding_type, pooling_type, + src_num_gpus=0, tgt_num_gpus=None, return_io=False): """ Loads an embedding model @@ -141,13 +140,15 @@ def load_embedding(weights_path, model_type, embedding_type, (Type: str) embedding_type: Type of embedding to load ('audio' or 'vision') (Type: str) + pooling_type: Type of pooling applied to final convolutional layer + (Type: str) Keyword Args: - num_gpus: Number of GPUs the saved model uses - (Type: int) + src_num_gpus: Number of GPUs the saved model uses + (Type: int) - 
target_num_gpus: Number of GPUs the loaded model will use - (Type: int) + tgt_num_gpus: Number of GPUs the loaded model will use + (Type: int) return_io: If True, return input and output tensors (Type: bool) @@ -160,8 +161,8 @@ def load_embedding(weights_path, model_type, embedding_type, y_i: Embedding output Tensor/Layer. Not returned if return_io is False. (Type: keras.layers.Layer) """ - m, inputs, output = load_model(weights_path, model_type, num_gpus=num_gpus, - target_num_gpus=target_num_gpus, return_io=True) + m, inputs, output = load_model(weights_path, model_type, src_num_gpus=src_num_gpus, + tgt_num_gpus=tgt_num_gpus, return_io=True) x_i, x_a = inputs if embedding_type == 'vision': m_embed_model = m.get_layer('vision_model') @@ -170,7 +171,7 @@ def load_embedding(weights_path, model_type, embedding_type, elif embedding_type == 'audio': m_embed_model = m.get_layer('audio_model') # m_embed, x_embed, y_embed = AUDIO_EMBEDDING_MODELS[model_type](m_embed_model, x_a) - m_embed, x_embed, y_embed = convert_audio_model_to_embedding(m_embed_model, x_a, model_type) + m_embed, x_embed, y_embed = convert_audio_model_to_embedding(m_embed_model, x_a, model_type, pooling_type) else: raise ValueError('Invalid embedding type: "{}"'.format(embedding_type)) diff --git a/l3embedding/train.py b/l3embedding/train.py index 211fd21..f16246b 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -1,29 +1,20 @@ import getpass import git import json -import math import datetime -import time import os import pickle import random -import warnings -import itertools import csv import numpy as np import keras from keras.optimizers import Adam import pescador -import scipy.misc -from skvideo.io import FFmpegReader, ffprobe from skimage import img_as_float -import soundfile as sf -from tqdm import tqdm from gsheets import get_credentials, append_row, update_experiment, get_row from .model import MODELS, load_model -from .training_utils import multi_gpu_model from .audio import pcm2float from log import * import h5py From 94597f0877ad26ad63ef3045a8178afcd175eb79 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 2 Mar 2018 01:27:11 -0500 Subject: [PATCH 119/201] Create output folder before saving config file --- 05_generate_embedding_samples.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 6540850..444fe10 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -148,6 +148,8 @@ def parse_arguments(): dataset_output_dir = os.path.join(output_dir, args['model_id'], model_type, pooling_type) + if not os.path.isdir(dataset_output_dir): + os.makedirs(dataset_output_dir) # Write configurations to a file for reproducibility/posterity with open(os.path.join(dataset_output_dir, 'config.json')) as f: json.dump(args, f) From bcc572627f41ff7171f504ef8e0a1f5477abb89f Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Fri, 2 Mar 2018 20:36:40 -0500 Subject: [PATCH 120/201] Improve logging, refactor feature extraction, fix various bugs, make audio loading more robust --- 05_generate_embedding_samples.py | 44 +++++++----- data/usc/features.py | 33 ++++++--- data/usc/us8k.py | 72 +++++++++++++------- jobs/generate_embedding_samples.sbatch | 3 +- jobs/generate_embedding_samples_array.sbatch | 48 +++++++++++++ 5 files changed, 150 insertions(+), 50 deletions(-) create mode 100644 jobs/generate_embedding_samples_array.sbatch diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 
444fe10..7110c83 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -6,7 +6,7 @@ from data.usc.us8k import generate_us8k_folds, generate_us8k_fold_data from log import init_console_logger -LOGGER = logging.getLogger('data-generation') +LOGGER = logging.getLogger('cls-data-generation') LOGGER.setLevel(logging.DEBUG) @@ -94,7 +94,7 @@ def parse_arguments(): dest='gpus', type=int, default=0, - help='Number of gpus used for data parallelism.') + help='Number of gpus used for running the embedding model.') parser.add_argument('--fold', dest='fold', @@ -135,35 +135,43 @@ def parse_arguments(): init_console_logger(LOGGER, verbose=args['verbose']) LOGGER.debug('Initialized logging.') + # Unpack CL args model_type = args['l3embedding_model_type'] pooling_type = args['l3embedding_pooling_type'] + metadata_path = args['metadata_path'] + data_dir = args['data_dir'] + features = args['features'] + hop_size = args['hop_size'] + random_state = args['random_state'] + num_random_samples = args['num_random_samples'] + label_format = args['label_format'] + model_path = args['l3embedding_model_path'] + num_gpus = args['gpus'] + output_dir = args['output_dir'] + model_id = args['model_id'] + dataset_name = args['dataset_name'] + fold_num = args['fold'] - l3embedding_model = load_embedding(args['l3embedding_model_path'], + LOGGER.info('Loading embedding model...') + l3embedding_model = load_embedding(model_path, model_type, 'audio', pooling_type, - tgt_num_gpus=args['gpus']) + tgt_num_gpus=num_gpus) - output_dir = args['output_dir'] - dataset_output_dir = os.path.join(output_dir, args['model_id'], + dataset_output_dir = os.path.join(output_dir, model_id, model_type, pooling_type) if not os.path.isdir(dataset_output_dir): os.makedirs(dataset_output_dir) + # Write configurations to a file for reproducibility/posterity - with open(os.path.join(dataset_output_dir, 'config.json')) as f: + config_path = os.path.join(dataset_output_dir, 'config.json') + with open(config_path, 'w') as f: json.dump(args, f) + LOGGER.info('Saved configuration to {}'.format(config_path)) - dataset_name = args['dataset_name'] - fold_num = args['fold'] if dataset_name == 'us8k': - metadata_path = args['metadata_path'] - data_dir = args['data_dir'] - features = args['features'] - hop_size = args['hop_size'] - random_state = args['random_state'] - num_random_samples = args['num_random_samples'] - label_format = args['label_format'] if fold_num is not None: # Generate a single fold if a fold was specified @@ -178,3 +186,7 @@ def parse_arguments(): l3embedding_model=l3embedding_model, features=features, label_format=label_format, random_state=random_state, hop_size=hop_size, num_random_samples=num_random_samples) + else: + LOGGER.error('Invalid dataset name: {}'.format(dataset_name)) + + LOGGER.info('Done!') diff --git a/data/usc/features.py b/data/usc/features.py index 702f3ae..1fcec50 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -3,11 +3,26 @@ import librosa import numpy as np import scipy as sp +import soundfile as sf +import resampy -LOGGER = logging.getLogger('classifier-features') +LOGGER = logging.getLogger('cls-data-generation') LOGGER.setLevel(logging.DEBUG) +def load_audio(path, sr): + """ + Load audio file + """ + data, sr_orig = sf.read(path, dtype='float32', always_2d=True) + data = data.mean(axis=-1) + + if sr_orig != sr: + data = resampy.resample(data, sr_orig, sr) + + return data + + def one_hot(idx, n_classes=10): """ Creates a one hot encoding vector @@ -52,7 +67,7 @@ def 
get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.25): (Type: np.ndarray) """ sr = 48000 - audio, _ = librosa.load(audio_path, sr=sr, mono=True) + audio = load_audio(audio_path, sr) audio_length = len(audio) frame_length = sr hop_length = int(sr * hop_size) @@ -110,7 +125,7 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): (Type: np.ndarray) """ sr = 48000 - audio, _ = librosa.load(audio_path, sr=sr, mono=True) + audio = load_audio(audio_path, sr) hop_length = int(hop_size * sr) frame_length = 48000 * 1 @@ -166,14 +181,14 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): d1_mean, d1_var, d2_mean, d2_var)) -def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25, sr=48000): +def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.25, sr=48000): """ Get L3 embedding stats features, i.e. compute statistics for each of the embedding features across 1 second (overlapping) window of the given audio Args: - audio_path: Path to audio file - (Type: str) + audio: Audio data or path to audio file + (Type: np.ndarray or str) l3embedding_model: Audio embedding model (keras.engine.training.Model) @@ -187,7 +202,7 @@ def get_l3_frames_uniform(audio_path, l3embedding_model, hop_size=0.25, sr=48000 (Type: list[np.ndarray]) """ if type(audio) == str: - audio, _ = librosa.load(audio, sr=sr, mono=True) + audio = load_audio(audio, sr) hop_size = 0.25 # REVISIT hop_length = int(hop_size * sr) @@ -225,7 +240,7 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): embedding features across 1 second (overlapping) window of the given audio Args: - audio: numpy array or ath to audio file + audio: Numpy array or path to audio file (Type: np.ndarray or str) l3embedding_model: Audio embedding model @@ -239,7 +254,7 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): (Type: list[np.ndarray]) """ if type(audio) == str: - audio, _ = librosa.load(audio, sr=sr, mono=True) + audio = load_audio(audio, sr) frame_length = sr * 1 diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 2342c9b..040a6af 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -7,8 +7,9 @@ import numpy as np import data.usc.features as cls_features +from log import LogTimer -LOGGER = logging.getLogger('us8k') +LOGGER = logging.getLogger('cls-data-generation') LOGGER.setLevel(logging.DEBUG) @@ -178,11 +179,12 @@ def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=N label_format: Type of format used for encoding outputs (Type: str) + """ + LOGGER.info('Generating all folds.') metadata = load_us8k_metadata(metadata_path) for fold_idx in range(10): - LOGGER.info("Loading fold {}...".format(fold_idx+1)) generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embedding_model=l3embedding_model, features=features, label_format=label_format, @@ -217,7 +219,9 @@ def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embeddin label_format: Type of format used for encoding outputs (Type: str) + """ + if type(metadata) == str: metadata = load_us8k_metadata(metadata) @@ -226,32 +230,54 @@ def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embeddin random.seed(random_state) np.random.seed(random_state) + audio_fold_dir = os.path.join(data_dir, "fold{}".format(fold_idx+1)) + # Create fold directory if it does not exist - fold_dir = os.path.join(data_dir, "fold{}".format(fold_idx+1)) - if not os.path.isdir(fold_dir): - os.makedirs(fold_dir) + 
output_fold_dir = os.path.join(output_dir, "fold{}".format(fold_idx+1)) + if not os.path.isdir(output_fold_dir): + os.makedirs(output_fold_dir) + + LOGGER.info('Generating fold {} in {}'.format(fold_idx+1, output_fold_dir)) + + num_files = len(metadata[fold_idx]) for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): - path = os.path.join(data_dir, "fold{}".format(fold_idx+1), fname) - basename, _ = os.path.splitext(fname) - output_path = os.path.join(fold_dir, fname) + desc = '({}/{}) Processed {} -'.format(idx+1, num_files, fname) + with LogTimer(LOGGER, desc, log_level=logging.DEBUG): + generate_us8k_file_data(fname, example_metadata, audio_fold_dir, + output_fold_dir, features, label_format, + l3embedding_model, **feature_args) + + +def generate_us8k_file_data(fname, example_metadata, audio_fold_dir, + output_fold_dir, features, label_format, + l3embedding_model, **feature_args): + audio_path = os.path.join(audio_fold_dir, fname) + + basename, _ = os.path.splitext(fname) + output_path = os.path.join(output_fold_dir, basename + '.npz') + + if os.path.exists(output_path): + LOGGER.info('File {} already exists'.format(output_path)) + return + + X = cls_features.compute_file_features(audio_path, features, l3embedding_model=l3embedding_model, **feature_args) + + # If we were not able to compute the features, skip this file + if X is None: + LOGGER.error('Could not generate data for {}'.format(audio_path)) + return - if os.path.exists(output_path): - LOGGER.debug('Already found output file at {}. Skipping.'.format(output_path)) - continue + class_label = example_metadata['classID'] + if label_format == 'int': + y = class_label + elif label_format == 'one_hot': + y = cls_features.one_hot(class_label) + else: + raise ValueError('Invalid label format: {}'.format(label_format)) - X = cls_features.compute_file_features(path, features, l3embedding_model=l3embedding_model, **feature_args) + np.savez_compressed(output_path, X=X, y=y) - # If we were not able to compute the features, skip this file - if X is None: - continue + return output_path, 'success' - class_label = example_metadata['classID'] - if label_format == 'int': - y = class_label - elif label_format == 'one_hot': - y = cls_features.one_hot(class_label) - else: - raise ValueError('Invalid label format: {}'.format(label_format)) - np.savez_compressed(output_path, X=X, y=y) \ No newline at end of file diff --git a/jobs/generate_embedding_samples.sbatch b/jobs/generate_embedding_samples.sbatch index 35b0b46..a31ebd0 100644 --- a/jobs/generate_embedding_samples.sbatch +++ b/jobs/generate_embedding_samples.sbatch @@ -26,7 +26,6 @@ OUTPUT_DIR=/scratch/jtc440/l3_features MODEL_ID='melspec1' module purge -module load ffmpeg/intel/3.2.2 python $SRCDIR/05_generate_embedding_samples.py \ --random-state 20180302 \ @@ -42,5 +41,5 @@ python $SRCDIR/05_generate_embedding_samples.py \ $METADATA_PATH \ $MODEL_ID \ us8k \ - $DATA_DIR + $DATA_DIR \ $OUTPUT_DIR diff --git a/jobs/generate_embedding_samples_array.sbatch b/jobs/generate_embedding_samples_array.sbatch new file mode 100644 index 0000000..b8ffb32 --- /dev/null +++ b/jobs/generate_embedding_samples_array.sbatch @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-l3embedding-samples +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-l3embedding-samples-%A-%a.out" +#SBATCH --err="generate-l3embedding-samples-%A-%a.err" + + +source 
~/.bashrc +cd /home/$USER/dev +source activate l3embedding-cpu + +SRCDIR=$HOME/dev/l3embedding +L3_MODEL_PATH='' +L3_MODEL_TYPE='cnn_L3_melspec1' +L3_POOLING_TYPE='original' +US8K_PATH=/scratch/jtc440/UrbanSound8K +METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv +DATA_DIR=$US8K_PATH/audio +OUTPUT_DIR=/scratch/jtc440/l3_features +MODEL_ID='test' + +module purge +#module load cuda/8.0.44 +#module load cudnn/8.0v6.0 +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/05_generate_embedding_samples.py \ + --random-state 20180302 \ + --verbose \ + --features 'l3_frames_uniform' \ + --label-format 'int' \ + --l3embedding-model-path $L3_MODEL_PATH \ + --l3embedding-model-type $L3_MODEL_TYPE \ + --l3embedding-pooling-type $L3_POOLING_TYPE \ + --hop-size 0.1 \ + --gpus 0 \ + --fold 1 \ + $METADATA_PATH \ + $MODEL_ID \ + us8k \ + $DATA_DIR \ + $OUTPUT_DIR From d1c04df8c3ad79d203341c3c46c13bd8e4a3789d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 5 Mar 2018 21:53:32 -0500 Subject: [PATCH 121/201] Change num_gpu to src_num_gpu when calling load_model --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index f16246b..2a9a8c1 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -257,7 +257,7 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, if continue_model_dir: latest_model_path = os.path.join(continue_model_dir, 'model_latest.h5') - m, inputs, outputs = load_model(latest_model_path, model_type, return_io=True, num_gpus=gpus) + m, inputs, outputs = load_model(latest_model_path, model_type, return_io=True, src_num_gpus=gpus) else: m, inputs, outputs = MODELS[model_type](num_gpus=gpus) From b7057355e5dcbdc0e9ce88c8853286689cfc98ea Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 7 Mar 2018 23:30:11 -0500 Subject: [PATCH 122/201] Add sbatch script for generating urban train data for training embedding --- .../generate_samples_array_urban_train.sbatch | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 jobs/generate_samples_array_urban_train.sbatch diff --git a/jobs/generate_samples_array_urban_train.sbatch b/jobs/generate_samples_array_urban_train.sbatch new file mode 100644 index 0000000..40418e7 --- /dev/null +++ b/jobs/generate_samples_array_urban_train.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-samples-audioset +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-samples-audioset-%A-%a.out" +#SBATCH --err="generate-samples-audioset-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +OUTPUT_DIR=/beegfs/work/AudioSetSamples_environmental/urban_train +SUBSET_PATH=/scratch/jtc440/audioset_subsets/audioset_urban_train.csv +USER_IDX=0 +NUM_WORKERS=4 +NUM_TASKS=12 +BASE_RANDOM_STATE=20180307 + +module purge +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/02_generate_samples.py \ + --batch-size 1024 \ + --num-streamers 20 \ + --mux-rate 20 \ + --augment \ + --precompute \ + --num-workers $NUM_WORKERS \ + --num-distractors 2 \ + --random-state $[$BASE_RANDOM_STATE + $NUM_WORKERS * ($SLURM_ARRAY_TASK_ID - 1 + $NUM_TASKS * $USER_IDX)] \ + --include-metadata \ + $SUBSET_PATH \ + $[30000000 / $NUM_TASKS] \ + $OUTPUT_DIR From ae687274c023b35c3ef22719af8dc49a9cd15e7c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 7 Mar 2018 23:31:54 -0500 
Subject: [PATCH 123/201] Add ability to include children of nodes in AudioSet filtering and add ontology method for obtaining node by name --- audioset/ontology.py | 12 ++++++++++++ data/avc/subsets.py | 14 +++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/audioset/ontology.py b/audioset/ontology.py index d39a002..259f9a2 100644 --- a/audioset/ontology.py +++ b/audioset/ontology.py @@ -57,6 +57,7 @@ def is_parent(self, q_parent): class ASOntology(object): def __init__(self, ontology_path): self._nodes = {} + self._node_name_to_id = {} # Make sure the path to the ontology file exists if not os.path.exists(ontology_path): @@ -91,6 +92,9 @@ def _init_tree(self): for child in node.children: child.parent_id = node.id + # (Also use this loop to create a mapping from node name to ID) + self._node_name_to_id[node.name] = node.id + # Do another pass to find the top level nodes, which have not # been assigned a parent self.top_level_node_ids = [] @@ -124,4 +128,12 @@ def get_node(self, node_id): raise ValueError('No node with ID {}'.format(node_id)) return self._nodes[node_id] + def get_node_by_name(self, node_name): + """ + Get a node with the given name + """ + if node_name not in self._node_name_to_id: + raise ValueError('No node with name {}'.format(node_name)) + return self.get_node(self._node_name_to_id[node_name]) + diff --git a/data/avc/subsets.py b/data/avc/subsets.py index f1a6cb2..8b7a764 100644 --- a/data/avc/subsets.py +++ b/data/avc/subsets.py @@ -134,8 +134,9 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= has_accept_filter = False for _filter in filters: filter_type = _filter['filter_type'] - filter_accept = _filter['accept_reject'] == 'accept' + filter_accept = _filter['accept_reject'].lower() == 'accept' string = _filter['string'] + include_children = _filter['include_children'].lower() == 'true' if filter_accept: has_accept_filter = True @@ -146,6 +147,17 @@ def get_file_list(data_dir, metadata_path=None, filter_path=None, ontology_path= elif filter_type == 'label': match = string.lower() in label_list + if include_children and not match: + # If this filter includes children classes, + # check each label to see if it is a descendant of the + # filter label + filter_node = ontology.get_node_by_name(string) + for label in label_list: + label_node = ontology.get_node_by_name(label) + if filter_node.is_child(label_node): + match = True + break + if filter_accept: # If "accept" has not been set, and there is a match with an accept filter, set "accept" to True # -> If "accept" has already been set (which is True if another accept filter has already approved the From 0431e376d76ac44593c9cc5f29f78a33d1030065 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 7 Mar 2018 23:33:27 -0500 Subject: [PATCH 124/201] Add audioset filter files --- .../audioset_filters/audioset_filter_ots.csv | 111 ++++++++++++++++++ .../audioset_filter_urban.csv | 21 ++++ 2 files changed, 132 insertions(+) create mode 100644 resources/audioset_filters/audioset_filter_ots.csv create mode 100644 resources/audioset_filters/audioset_filter_urban.csv diff --git a/resources/audioset_filters/audioset_filter_ots.csv b/resources/audioset_filters/audioset_filter_ots.csv new file mode 100644 index 0000000..dd1ca8e --- /dev/null +++ b/resources/audioset_filters/audioset_filter_ots.csv @@ -0,0 +1,111 @@ +filter_type,accept_reject,include_children,string +label,accept,false,Accordion +label,accept,false,Acoustic guitar +label,accept,false,Alto saxophone 
+label,accept,false,Bagpipes +label,accept,false,Banjo +label,accept,false,Bass (instrument role) +label,accept,false,Bass drum +label,accept,false,Bass guitar +label,accept,false,Bassoon +label,accept,false,Bell +label,accept,false,Bicycle bell +label,accept,false,Bowed string instrument +label,accept,false,Brass instrument +label,accept,false,Bugle +label,accept,false,Cello +label,accept,false,Change ringing (campanology) +label,accept,false,Chant +label,accept,false,Child singing +label,accept,false,Chime +label,accept,false,Choir +label,accept,false,Church bell +label,accept,false,Clarinet +label,accept,false,Clavinet +label,accept,false,Cornet +label,accept,false,Cowbell +label,accept,false,Crash cymbal +label,accept,false,Cymbal +label,accept,false,"Dental drill, dentist's drill" +label,accept,false,Didgeridoo +label,accept,false,Double bass +label,accept,false,Drill +label,accept,false,Drum +label,accept,false,Drum kit +label,accept,false,Drum machine +label,accept,false,Drum roll +label,accept,false,Electric guitar +label,accept,false,Electric piano +label,accept,false,Electronic organ +label,accept,false,Female singing +label,accept,false,Filing (rasp) +label,accept,false,Flute +label,accept,false,French horn +label,accept,false,Glockenspiel +label,accept,false,Gong +label,accept,false,Guitar +label,accept,false,Hammer +label,accept,false,Hammond organ +label,accept,false,Harmonica +label,accept,false,Harp +label,accept,false,Harpsichord +label,accept,false,Hi-hat +label,accept,false,Jackhammer +label,accept,false,Jingle bell +label,accept,false,Keyboard (musical) +label,accept,false,Male singing +label,accept,false,Mallet percussion +label,accept,false,Mandolin +label,accept,false,Mantra +label,accept,false,Maraca +label,accept,false,"Marimba, xylophone" +label,accept,false,Mellotron +label,accept,false,Musical ensemble +label,accept,false,Musical instrument +label,accept,false,Oboe +label,accept,false,Orchestra +label,accept,false,Organ +label,accept,false,Percussion +label,accept,false,Piano +label,accept,false,Pizzicato +label,accept,false,Plucked string instrument +label,accept,false,Power tool +label,accept,false,Rapping +label,accept,false,Rattle (instrument) +label,accept,false,Rhodes piano +label,accept,false,Rimshot +label,accept,false,Sampler +label,accept,false,Sanding +label,accept,false,Sawing +label,accept,false,Saxophone +label,accept,false,Scratching (performance technique) +label,accept,false,Shofar +label,accept,false,Singing +label,accept,false,Singing bowl +label,accept,false,Sitar +label,accept,false,Snare drum +label,accept,false,Soprano saxophone +label,accept,false,"Steel guitar, slide guitar" +label,accept,false,Steelpan +label,accept,false,String section +label,accept,false,Strum +label,accept,false,Synthesizer +label,accept,false,Synthetic singing +label,accept,false,Tabla +label,accept,false,Tambourine +label,accept,false,Tapping (guitar technique) +label,accept,false,Theremin +label,accept,false,Timpani +label,accept,false,Tools +label,accept,false,Trombone +label,accept,false,Trumpet +label,accept,false,Tubular bells +label,accept,false,Tuning fork +label,accept,false,Ukulele +label,accept,false,Vibraphone +label,accept,false,"Violin, fiddle" +label,accept,false,Wind chime +label,accept,false,"Wind instrument, woodwind instrument" +label,accept,false,Wood block +label,accept,false,Yodeling +label,accept,false,Zither diff --git a/resources/audioset_filters/audioset_filter_urban.csv b/resources/audioset_filters/audioset_filter_urban.csv new file mode 
100644 index 0000000..5fcf3cc --- /dev/null +++ b/resources/audioset_filters/audioset_filter_urban.csv @@ -0,0 +1,21 @@ +filter_type,accept_reject,include_children,string +label,accept,true,"Male speech, man speaking" +label,accept,true,"Female speech, woman speaking" +label,accept,true,"Child speech, kid speaking" +label,accept,true,Conversation +label,accept,true,Babbling +label,accept,true,Battle cry +label,accept,true,Laughter +label,accept,true,"Crying, sobbing" +label,accept,true,Finger snapping +label,accept,true,Clapping +label,accept,true,Human group actions +label,accept,true,Vehicle +label,accept,true,Engine +label,accept,true,Siren +label,accept,true,"Gunshot, gunfire" +label,accept,true,Splinter +label,accept,true,Chop +label,accept,true,Pump (liquid) +label,accept,true,Basketball bounce +label,accept,true,Bark From 661587319e0fe32a832c247716b6e8c30513bd55 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 8 Mar 2018 20:44:28 -0500 Subject: [PATCH 125/201] Save train configuration parameters to model dir --- l3embedding/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index 2a9a8c1..c16bf0d 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -279,6 +279,12 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, LOGGER.info('Model files can be found in "{}"'.format(model_dir)) + param_dict['model_dir'] = model_dir + train_config_path = os.path.join(model_dir, 'config.json') + with open(train_config_path, 'w') as fd: + json.dump(param_dict, fd, indent=2) + + param_dict.update({ 'latest_epoch': '-', 'latest_train_loss': '-', @@ -289,7 +295,6 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, 'best_validation_loss': '-', 'best_train_acc': '-', 'best_validation_acc': '-', - 'model_dir': model_dir, }) # Save the model From cb330d9c5ae50f017d2f0aed466088224fe2d9b5 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 12 Mar 2018 21:43:53 -0400 Subject: [PATCH 126/201] Properly call super constructor in GSheetLogger constructor --- l3embedding/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l3embedding/train.py b/l3embedding/train.py index c16bf0d..b7d1c2d 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -58,7 +58,7 @@ class GSheetLogger(keras.callbacks.Callback): """ def __init__(self, google_dev_app_name, spreadsheet_id, param_dict): - super().__init__() + super(GSheetLogger).__init__() self.google_dev_app_name = google_dev_app_name self.spreadsheet_id = spreadsheet_id self.credentials = get_credentials(google_dev_app_name) From a037f4e01853e14966ba969dad52a94b28c2bb79 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 12 Mar 2018 21:44:35 -0400 Subject: [PATCH 127/201] Update classifier training code to work with new feature format for a single fold, add logging, and save metrics to disk --- 06_train_classifier.py | 109 +++---------- classifier/metrics.py | 25 ++- classifier/train.py | 358 ++++++++++++++++++++++++----------------- data/usc/us8k.py | 174 ++++++++++---------- 4 files changed, 339 insertions(+), 327 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index fbafb7e..3fafaf8 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -29,27 +29,27 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-tes', - '--train-epoch-size', - dest='train_epoch_size', + parser.add_argument('-ns', + '--num-streamers', + 
dest='num_streamers', action='store', type=int, - help='(MLP) Number of training batches per epoch') + help='Number of open pescador streamers to keep open ' \ + 'while sampling features') - parser.add_argument('-ves', - '--validation-epoch-size', - dest='validation_epoch_size', + parser.add_argument('-mr', + '--mux-rate', + dest='mux_rate', action='store', type=int, - help='(MLP) Number of validation batches per epoch') + help='Pescador poisson rate used for sampling') - parser.add_argument('-vs', - '--validation-split', - dest='validation_split', + parser.add_argument('-tes', + '--train-epoch-size', + dest='train_epoch_size', action='store', - type=float, - default=0.1, - help='(MLP) Percentage of training data used for validation') + type=int, + help='(MLP) Number of training batches per epoch') parser.add_argument('-lr', '--learning-rate', @@ -82,22 +82,6 @@ def parse_arguments(): default=False, help='If True, print detailed messages') - parser.add_argument('-f', - '--features', - dest='features', - action='store', - type=str, - default='l3_stack', - help='Type of features to be used in training') - - parser.add_argument('-lf', - '--label-format', - dest='label_format', - action='store', - type=str, - default='int', - help='Type of format used for encoding outputs') - parser.add_argument('-mt', '--model-type', dest='model_type', @@ -106,76 +90,25 @@ def parse_arguments(): default='svm', help='Type of model used for training classifier') - parser.add_argument('-lmp', - '--l3embedding-model-path', - dest='l3embedding_model_path', + parser.add_argument('features_dir', action='store', type=str, - help='Path to L3 embedding model weights file') - - parser.add_argument('-lmt', - '--l3embedding-model-type', - dest='l3embedding_model_type', - action='store', - type=str, - default='cnn_L3_orig', - help='Type of L3 embedding model') - - parser.add_argument('-hs', - '--hop-size', - dest='hop_size', - action='store', - type=float, - default=0.25, - help='Hop size as a fraction of the window') + help='Path to directory where feature files are stored') - parser.add_argument('-nrs', - '--num-random-samples', - dest='num_random_samples', - action='store', - type=int, - help='Number of random samples for randomized sampling methods') - - parser.add_argument('-nl', - '--no-logging', - dest='disable_logging', - action='store_true', - default=False, - help='Disables logging if flag enabled') - - parser.add_argument('-lp', - '--log-path', - dest='log_path', - action='store', - default=None, - help='Path to log file generated by this script. 
' \ - 'By default, the path is "./audiosetdl.log".') - - parser.add_argument('metadata_path', - action='store', - type=str, - help='Path to UrbanSound8K metadata file') - - parser.add_argument('dataset_name', - action='store', - type=str, - help='Name of dataset') - - parser.add_argument('data_dir', + parser.add_argument('output_dir', action='store', type=str, - help='Path to directory where training set files are stored') + help='Path to directory where output files will be stored') parser.add_argument('model_id', action='store', type=str, help='Identifier for this model') - parser.add_argument('output_dir', + parser.add_argument('fold_num', action='store', - type=str, - help='Path to directory where output files will be stored') - + type=int, + help='Fold ordinal to train/test with') return vars(parser.parse_args()) diff --git a/classifier/metrics.py b/classifier/metrics.py index 6526bcf..25852ad 100644 --- a/classifier/metrics.py +++ b/classifier/metrics.py @@ -28,15 +28,16 @@ def compute_metrics(y, pred): acc = (y == pred).mean() - sum_class_acc = 0.0 + class_acc = [] for class_idx in range(10): idxs = (y == class_idx) - sum_class_acc += (y[idxs] == pred[idxs]).mean() + class_acc.append((y[idxs] == pred[idxs]).mean()) - ave_class_acc = sum_class_acc / 10 + ave_class_acc = np.mean(class_acc) return { 'accuracy': acc, + 'class_accuracy': class_acc, 'average_class_accuracy': ave_class_acc } @@ -73,6 +74,24 @@ def aggregate_metrics(fold_metrics): return aggr_metrics +def collapse_metrics(metrics_list): + """ + Collapse a list of metrics into a dict of lists for each metric + + Args: + metrics_list: List of metrics dictionaries + (Type: list[dict[str, *]]) + + Returns: + collapsed_metrics: Collapsed metrics + (Type: dict[str, list]) + """ + metric_keys = list(metrics_list[0].keys()) + collapsed_metrics_list = {k: [step[k] for step in metrics_list] + for k in metric_keys} + return collapsed_metrics_list + + def print_metrics(metrics, subset_name): """ Print classifier metrics diff --git a/classifier/train.py b/classifier/train.py index 79960f4..1b943d4 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -1,5 +1,7 @@ +import datetime import json import os +import pickle as pk import random import keras @@ -11,12 +13,11 @@ from scipy.stats import mode from sklearn.externals import joblib from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC +from sklearn.linear_model import SGDClassifier +import pescador -from classifier.metrics import compute_metrics, aggregate_metrics, print_metrics -from data.usc.us8k import get_us8k_folds, \ - get_us8k_fold_split -from l3embedding.model import load_embedding +from classifier.metrics import compute_metrics, collapse_metrics +from data.usc.us8k import get_us8k_batch_generator, get_us8k_batch, load_test_fold from l3embedding.train import LossHistory from log import * @@ -24,7 +25,49 @@ LOGGER.setLevel(logging.DEBUG) -def train_svm(X_train, y_train, X_test, y_test, frame_features, C=1e-4, verbose=False, **kwargs): +class MetricCallback(keras.callbacks.Callback): + + def __init__(self, valid_data, verbose=False): + super(MetricCallback).__init__() + self.valid_data = valid_data + self.verbose = verbose + + def on_train_begin(self, logs=None): + if logs is None: + logs = {} + self.train_loss = [] + self.valid_loss = [] + self.train_acc = [] + self.valid_acc = [] + self.valid_class_acc = [] + self.valid_avg_class_acc = [] + + # def on_batch_end(self, batch, logs={}): + def on_epoch_end(self, epoch, logs=None): + if logs is None: 
+ logs = {} + self.train_loss.append(logs.get('loss')) + self.valid_loss.append(logs.get('val_loss')) + self.train_acc.append(logs.get('acc')) + self.valid_acc.append(logs.get('val_acc')) + + valid_pred = self.model.predict(self.valid_data['features']) + valid_metrics = compute_metrics(self.valid_data['label'], valid_pred) + self.valid_class_acc.append(valid_metrics['class_accuracy']) + self.valid_avg_class_acc.append(valid_metrics['average_class_accuracy']) + + if self.verbose: + train_msg = 'Train - loss: {}, acc: {}' + valid_msg = 'Valid - loss: {}, acc: {}' + LOGGER.info('Epoch {}'.format(epoch)) + LOGGER.info(train_msg.format(self.train_loss[-1], + self.train_acc[-1])) + LOGGER.info(valid_msg.format(self.valid_loss[-1], + self.valid_acc[-1])) + + +def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty='l2', + num_workers=1, tol=1e-3, max_iterations=1000000, verbose=False, **kwargs): """ Train a Support Vector Machine model on the given data @@ -55,31 +98,79 @@ def train_svm(X_train, y_train, X_test, y_test, frame_features, C=1e-4, verbose= y_test_pred: Predicted test output of classifier (Type: np.ndarray) """ - # Standardize - LOGGER.debug('Standardizing data...') + # Set up standardizer stdizer = StandardScaler() - X_train = stdizer.fit_transform(X_train) - clf = SVC(C=C, verbose=verbose) - LOGGER.debug('Fitting model to data...') - clf.fit(X_train, y_train) - y_train_pred = clf.predict(X_train) - - X_test = stdizer.transform(X_test) + X_valid = valid_data['features'] + y_valid = valid_data['labels'] - if frame_features: - y_test_pred = [] + train_loss_history = [] + valid_loss_history = [] + train_metric_history = [] + valid_metric_history = [] - for X_test_file in X_test: - output = clf.predict(X_test_file) - class_pred = mode(output.flatten())[0][0] - y_test_pred.append(class_pred) + model_output_path = os.path.join(model_dir, "model.pkl") - y_test_pred = np.array(y_test_pred) - else: - y_test_pred = clf.predict(X_test) + # Create classifier + clf = SGDClassifier(alpha=C, penalty=reg_penalty, n_jobs=num_workers, verbose=verbose) - return clf, y_train_pred, y_test_pred + LOGGER.debug('Fitting model to data...') + for iter_idx, train_data in enumerate(train_gen): + X_train = train_data['features'] + y_train = train_data['labels'] + stdizer.partial_fit(X_train) + + # Fit data and get output for train and valid batches + clf.partial_fit(X_train, y_train) + y_train_pred = clf.predict(stdizer.transform(X_train)) + y_valid_pred = clf.predict(stdizer.transform(X_valid)) + + # Compute new metrics + valid_loss_history.append(list(y_valid_pred)) + train_loss_history.append(clf.loss_function_(y_train, y_train_pred)) + valid_loss_history.append(clf.loss_function_(y_valid, y_valid_pred)) + train_metric_history.append(compute_metrics(y_train, y_train_pred)) + valid_metric_history.append(compute_metrics(y_valid, y_valid_pred)) + + # Save the model for this iteration + LOGGER.info('Saving model...') + joblib.dump(clf, model_output_path) + + if verbose: + train_msg = 'Train - loss: {}, acc: {}' + valid_msg = 'Valid - loss: {}, acc: {}' + LOGGER.info('Epoch {}'.format(iter_idx + 1)) + LOGGER.info(train_msg.format(train_loss_history[-1], + train_metric_history[-1]['accuracy'])) + LOGGER.info(valid_msg.format(valid_loss_history[-1], + valid_metric_history[-1]['accuracy'])) + + # Finish training if the loss doesn't change much + if len(train_loss_history) > 1 and abs(train_loss_history[-2] - train_loss_history[-1]) < tol: + break + + # Break if we reach the maximum number of 
iterations + if iter_idx >= max_iterations: + break + + # Post process metrics + train_metrics = collapse_metrics(train_metric_history) + valid_metrics = collapse_metrics(valid_metric_history) + train_metrics['loss'] = train_loss_history + valid_metrics['loss'] = valid_loss_history + + # Evaluate model on test data + X_test = stdizer.transform(test_data['features']) + y_test_pred_frame = clf.predict(X_test) + y_test_pred = [] + for start_idx, end_idx in test_data['file_idxs']: + class_pred = mode(y_test_pred_frame[start_idx:end_idx])[0][0] + y_test_pred.append(class_pred) + + y_test_pred = np.array(y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred) + + return clf, train_metrics, valid_metrics, test_metrics def construct_mlp_model(input_shape, weight_decay=1e-5): @@ -100,7 +191,6 @@ def construct_mlp_model(input_shape, weight_decay=1e-5): output:Model output (Type: keras.layers.Layer) """ - weight_decay = 1e-5 l2_weight_decay = regularizers.l2(weight_decay) inp = Input(shape=input_shape, dtype='float32') y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(inp) @@ -112,9 +202,8 @@ def construct_mlp_model(input_shape, weight_decay=1e-5): return m, inp, y -def train_mlp(X_train, y_train, X_test, y_test, model_dir, frame_features, +def train_mlp(train_gen, valid_data, test_data, model_dir, batch_size=64, num_epochs=100, train_epoch_size=None, - validation_epoch_size=None, validation_split=0.1, learning_rate=1e-4, weight_decay=1e-5, verbose=False, **kwargs): """ @@ -146,10 +235,6 @@ def train_mlp(X_train, y_train, X_test, y_test, model_dir, frame_features, (Type: int) train_epoch_size: Number of training batches per training epoch (Type: int) - validation_epoch_size: Number of validation batches per validation epoch - (Type: int) - validation_split: Percentage of training data used for validation - (Type: float) learning_rate: Learning rate value (Type: float) weight_decay: L2 regularization factor @@ -159,61 +244,68 @@ def train_mlp(X_train, y_train, X_test, y_test, model_dir, frame_features, metrics = ['accuracy'] monitor = 'val_loss' - m, inp, out = construct_mlp_model(X_train.shape[1:], weight_decay=weight_decay) - weight_path = os.path.join(model_dir, 'model.h5') + # Set up data inputs + train_gen = pescador.maps.keras_tuples(train_gen, 'features', 'label') + valid_data_keras = (valid_data['features'], valid_data['label']) + train_iter = iter(train_gen) + train_batch = next(train_iter) + + # Set up model + m, inp, out = construct_mlp_model(train_batch['features'].shape[1:], weight_decay=weight_decay) + # Set up callbacks cb = [] + weight_path = os.path.join(model_dir, 'model.h5') cb.append(keras.callbacks.ModelCheckpoint(weight_path, save_weights_only=True, save_best_only=True, monitor=monitor)) - history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') cb.append(LossHistory(history_checkpoint)) - history_csvlog = os.path.join(model_dir, 'history_csvlog.csv') cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True, separator=',')) + metric_cb = MetricCallback(valid_data, verbose=verbose) + cb.append(metric_cb) + # Fit model LOGGER.debug('Compiling model...') - m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) - LOGGER.debug('Fitting model to data...') + m.fit_generator(train_gen, batch_size=batch_size, + epochs=num_epochs, steps_per_epoch=train_epoch_size, + validation_data=valid_data_keras, callbacks=cb) + + # Set up train and validation metrics + train_metrics = { + 'loss': metric_cb.train_loss, + 'acc': 
metric_cb.train_acc + } - history = m.fit(x=X_train, y=y_train, batch_size=batch_size, - epochs=num_epochs, - steps_per_epoch=train_epoch_size, - validation_split=validation_split, - validation_steps=validation_epoch_size, - callbacks=cb, - verbose=10 if verbose else 1) - - y_train_pred = m.predict(X_train) - if frame_features: - y_test_pred = [] - - for X_test_file in X_test: - output = m.predict(X_test_file).flatten() - y_test_pred.append(output.mean(axis=0)) - - y_test_pred = np.array(y_test_pred) - - else: - y_test_pred = m.predict(X_test) + valid_metrics = { + 'loss': metric_cb.valid_loss, + 'acc': metric_cb.valid_acc, + 'class_acc': metric_cb.valid_class_acc, + 'avg_class_acc': metric_cb.valid_avg_class_acc + } - return m, y_train_pred, y_test_pred + # Evaluate model on test data + X_test = test_data['features'] + y_test_pred_frame = m.predict(X_test) + y_test_pred = [] + for start_idx, end_idx in test_data['file_idxs']: + class_pred = mode(y_test_pred_frame[start_idx:end_idx])[0][0] + y_test_pred.append(class_pred) + y_test_pred = np.array(y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred) + return m, train_metrics, valid_metrics, test_metrics -def train(metadata_path, dataset_name, data_dir, model_id, output_dir, - model_type='svm', features='l3_stack', label_format='int', - l3embedding_model_path=None, l3embedding_model_type='cnn_L3_orig', - random_state=20171021, verbose=False, log_path=None, - disable_logging=False, num_random_samples=None, hop_size=0.25, **model_args): +def train(features_dir, output_dir, model_id, fold_num, model_type='svm', + num_streamers=None, batch_size=64, mux_rate=None, random_state=20171021, + verbose=False, **model_args): init_console_logger(LOGGER, verbose=verbose) - if not disable_logging: - init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') # Set random state @@ -221,107 +313,73 @@ def train(metadata_path, dataset_name, data_dir, model_id, output_dir, random.seed(random_state) # Make sure the directories we need exist - model_dir = os.path.join(output_dir, model_id) + model_dir = os.path.join(output_dir, model_id, + 'fold{}'.format(fold_num), + datetime.datetime.now().strftime("%Y%m%d%H%M%S")) + + # Make sure model directory exists if not os.path.isdir(model_dir): os.makedirs(model_dir) - if features.startswith('l3'): - LOGGER.info('Loading embedding model...') - l3embedding_model = load_embedding(l3embedding_model_path, - l3embedding_model_type, 'audio') - else: - l3embedding_model = None - + # Save configs + with open(os.path.join(model_dir, 'config.json'), 'w') as fp: + config = { + 'features_dir': features_dir, + 'output_dir': output_dir, + 'model_id': model_id, + 'fold_num': fold_num, + 'model_type': model_type, + 'num_streamers': num_streamers, + 'batch_size': batch_size, + 'mux_rate': mux_rate, + 'random_state': random_state, + 'verbose': verbose + } + config.update(model_args) + + json.dump(config, fp) LOGGER.info('Loading data...') - # Get fold data - if dataset_name == 'us8k': - fold_data = get_us8k_folds(metadata_path, data_dir, l3embedding_model=l3embedding_model, - features=features, label_format=label_format, - num_random_samples=num_random_samples, hop_size=hop_size) - else: - err_msg = 'Unsupported dataset "{}"'.format(dataset_name) - LOGGER.error(err_msg) - raise ValueError(err_msg) + fold_idx = fold_num - 1 + LOGGER.info('Preparing data for fold {}'.format(fold_num)) + train_gen = get_us8k_batch_generator(features_dir, fold_idx, + valid=False, num_streamers=num_streamers, + 
batch_size=batch_size, random_state=random_state, + rate=mux_rate) + valid_data = get_us8k_batch(features_dir, fold_idx, + valid=True, num_streamers=num_streamers, + batch_size=batch_size, random_state=random_state, + rate=mux_rate) + test_data = load_test_fold(features_dir, fold_idx) + + LOGGER.info('Training {} with fold {} held out'.format(model_type, fold_num)) + # Fit the model + if model_type == 'svm': + model, train_metrics, valid_metrics, test_metrics \ + = train_svm(train_gen, valid_data, test_data, model_dir, + verbose=verbose, **model_args) - frame_features = 'frames' in features + elif model_type == 'mlp': + model, train_metrics, valid_metrics, test_metrics \ + = train_mlp(train_gen, valid_data, test_data, model_dir, + batch_size=batch_size, verbose=verbose, **model_args) - model_output_path = os.path.join(model_dir, "model_fold{}.{}") + else: + raise ValueError('Invalid model type: {}'.format(model_type)) + # Assemble metrics for this training run results = { - 'folds': [] - } - # Fit the model - LOGGER.info('Preparing to fit models...') - for fold_idx in range(10): - LOGGER.info('\t* Training with fold {} held out'.format(fold_idx+1)) - X_train, y_train, X_test, y_test = get_us8k_fold_split(fold_data, fold_idx, - frame_features=frame_features) - if model_type == 'svm': - model, y_train_pred, y_test_pred = train_svm( - X_train, y_train, X_test, y_test, frame_features, - verbose=verbose, **model_args) - - LOGGER.info('Saving model...') - # Save the model for this fold - joblib.dump(model, model_output_path.format(fold_idx+1, 'pkl')) - - elif model_type == 'mlp': - model, y_train_pred, y_test_pred = train_mlp( - X_train, y_train, X_test, y_test, model_dir, frame_features, - verbose=verbose, **model_args) - - else: - raise ValueError('Invalid model type: {}'.format(model_type)) - - # Compute metrics for this fold - train_metrics = compute_metrics(y_train, y_train_pred) - test_metrics = compute_metrics(y_test, y_test_pred) - - results['folds'].append({ - 'train': { - 'metrics': train_metrics, - 'target': y_train.tolist(), - 'prediction': y_train_pred.tolist() - }, - 'test': { - 'metrics': test_metrics, - 'target': y_test.tolist(), - 'prediction': y_test_pred.tolist() - } - }) - - train_metrics = aggregate_metrics([fold['train']['metrics'] - for fold in results['folds']]) - print_metrics(train_metrics, 'train') - - test_metrics = aggregate_metrics([fold['test']['metrics'] - for fold in results['folds']]) - print_metrics(test_metrics, 'test') - - results['summary'] = { 'train': train_metrics, + 'valid': valid_metrics, 'test': test_metrics } LOGGER.info('Done training. 
Saving results to disk...') - # Evaluate model - # print('Evaluate model...') - # Load best params - # m.load_weights(weight_path) - # with open(os.path.join(output_dir, 'index_test.json'), 'r') as fp: - # test_idx = json.load(fp)['id'] - - # Compute eval scores - # results = score_model(output_dir, pump, model, test_idx, working, - # strong_label_file, duration, modelid, - # use_orig_duration=True) - # Save results to disk - results_file = os.path.join(model_dir, 'results.json') + results_file = os.path.join(model_dir, 'results.pkl') with open(results_file, 'w') as fp: - json.dump(results, fp, indent=2) + pk.dump(results, fp, protocol=pk.HIGHEST_PROTOCOL) LOGGER.info('Done!') diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 040a6af..0258066 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -16,11 +16,9 @@ def load_us8k_metadata(path): """ Load UrbanSound8K metadata - Args: path: Path to metadata csv file (Type: str) - Returns: metadata: List of metadata dictionaries (Type: list[dict[str, *]]) @@ -40,118 +38,122 @@ def load_us8k_metadata(path): return metadata -def get_us8k_folds(metadata_path, data_dir, l3embedding_model=None, - features='l3_stack', label_format='int', **feature_args): - """ - Load all of the data for each fold - Args: - metadata_path: Path to metadata file - (Type: str) +def load_feature_file(feature_filepath): + data = np.load(feature_filepath) + X, y = data['X'], data['y'] + return X, y - data_dir: Path to data directory - (Type: str) - Keyword Args: - l3embedding_model: L3 embedding model, used if L3 features are used - (Type: keras.engine.training.Model or None) +def us8k_file_sampler(feature_filepath, shuffle=True): + X, y = load_feature_file(feature_filepath) - features: Type of features to be computed - (Type: str) + num_frames = X.shape[0] - label_format: Type of format used for encoding outputs - (Type: str) + if shuffle: + frame_idxs = np.random.permutation(num_frames) + else: + frame_idxs = range(num_frames) - Returns: - fold_data: List of data for each fold - (Type: list[tuple[np.ndarray, np.ndarray]]) + for idx in frame_idxs: + yield { + 'features': X[idx], + 'label': y[idx], + 'filepath': feature_filepath, + 'frame_idx': idx + } - """ - raise NotImplementedError() -def get_us8k_fold_data(metadata, data_dir, fold_idx, l3embedding_model=None, - features='l3_stack', label_format='int', **feature_args): - """ - Load all of the data for a specific fold +def get_us8k_batch_generator(features_dir, test_fold_idx, valid=False, num_streamers=None, + batch_size=64, random_state=20171021, + rate=None, cycle=True): - Args: - metadata: List of metadata dictionaries - (Type: list[dict[str,*]]) + random.seed(random_state) + np.random.seed(random_state) - data_dir: Path to data directory - (Type: str) - fold_idx: Index of fold to load - (Type: int) + LOGGER.info("Loading subset list") - Keyword Args: - l3embedding_model: L3 embedding model, used if L3 features are used - (Type: keras.engine.training.Model or None) + LOGGER.info("Creating streamers...") + seeds = [] - features: Type of features to be computed - (Type: str) + valid_fold_idx = (test_fold_idx - 1) % 10 - label_format: Type of format used for encoding outputs - (Type: str) + for fold_dirname in os.listdir(features_dir): + fold_idx = int(fold_dirname.replace('fold', '')) - 1 + if fold_idx == test_fold_idx: + continue - Returns: - X: Feature data - (Type: np.ndarray) - y: Label data - (Type: np.ndarray) + if valid and fold_idx != valid_fold_idx: + continue - """ - X = [] - y = [] + fold_dir = 
os.path.join(features_dir, fold_dirname) + for feature_filename in os.listdir(fold_dir): + feature_filepath = os.path.join(fold_dir, feature_filename) + streamer = pescador.Streamer(us8k_file_sampler, feature_filepath) + seeds.append(streamer) - raise NotImplementedError() + if valid: + break + # Randomly shuffle the seeds + random.shuffle(seeds) -def get_us8k_fold_split(fold_data, fold_idx, frame_features, shuffle=True): - """ - Given the fold to use as held out, return the data split between training - and testing + if num_streamers is None: + num_streamers = len(seeds) - Args: - fold_data: List of data for each fold - (Type: list[tuple[np.ndarray, np.ndarray]]) + mux = pescador.StochasticMux(seeds, num_streamers, rate=rate, random_state=random_state) + if cycle: + mux = mux.cycle() - fold_idx: Fold to use as held out data - (Type: int) + if batch_size == 1: + return mux + else: + return pescador.maps.buffer_stream(mux, batch_size) - frame_features: If True, training frame features organized by file are - flattened and labels are replicated accordingly - (Type: bool) - Returns: - X_train: Training feature data - (Type: np.ndarray) - y_train: Training label data - (Type: np.ndarray) - X_test: Testing feature data - (Type: np.ndarray) - y_test: Testing label data - (Type: np.ndarray) - """ - X_test, y_test = fold_data[fold_idx] - train = fold_data[:fold_idx] + fold_data[fold_idx+1:] +def get_us8k_batch(features_dir, test_fold_idx, valid=False, num_streamers=None, + batch_size=64, random_state=20171021, + rate=None, cycle=True): + gen = iter(get_us8k_batch_generator(features_dir, test_fold_idx, valid=valid, + num_streamers=num_streamers, batch_size=batch_size, + random_state=random_state, rate=rate, cycle=cycle)) + return next(gen) - X_train, y_train = zip(*train) - X_train = np.concatenate(X_train, axis=0) - y_train = np.concatenate(y_train, axis=0) - # If features are computed framewise flatten the audio file dimension - if frame_features: - X_train, y_train = cls_features.flatten_file_frames(X_train, y_train) +def load_test_fold(feature_dir, fold_idx): + X = [] + y = [] + file_idxs = [] + fold_dir = os.path.join(feature_dir, 'fold{}'.format(fold_idx + 1)) - if shuffle: - train_idxs = np.arange(X_train.shape[0]) - np.random.shuffle(train_idxs) - X_train = X_train[train_idxs] - y_train = y_train[train_idxs] + filenames = os.listdir(fold_dir) + + start_idx = 0 + for feature_filename in filenames: + feature_filepath = os.path.join(fold_dir, feature_filename) + file_X, file_y = load_feature_file(feature_filepath) + + if file_X.ndim > 1: + end_idx = start_idx + file_X.shape[0] + else: + end_idx = start_idx + 1 + + X.append(file_X) + y.append(file_y) + file_idxs.append((start_idx, end_idx)) + + start_idx = end_idx + + X = np.vstack(X) + + if y[0].ndim == 0: + y = np.array(y) + else: + y = np.concatenate(y) - return X_train, y_train, X_test, y_test + return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=None, From 0e4deaa478e2bb4a2bcd6227ed3984546727de66 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Mon, 12 Mar 2018 22:03:59 -0400 Subject: [PATCH 128/201] Set SGDClassifier to use all available CPUs --- classifier/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 1b943d4..c66f998 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -67,7 +67,7 @@ def 
train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty='l2', - num_workers=1, tol=1e-3, max_iterations=1000000, verbose=False, **kwargs): + tol=1e-3, max_iterations=1000000, verbose=False, **kwargs): """ Train a Support Vector Machine model on the given data @@ -112,7 +112,7 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' model_output_path = os.path.join(model_dir, "model.pkl") # Create classifier - clf = SGDClassifier(alpha=C, penalty=reg_penalty, n_jobs=num_workers, verbose=verbose) + clf = SGDClassifier(alpha=C, penalty=reg_penalty, n_jobs=-1, verbose=verbose) LOGGER.debug('Fitting model to data...') for iter_idx, train_data in enumerate(train_gen): From c402e261bdc07ec1225ed05a41c004cbec43e415 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 11:33:32 -0400 Subject: [PATCH 129/201] Ignore config file when loading folds, use older version of pescador Mux until we update to pescador 2.0, properly load and yield labels from a feature file --- data/usc/us8k.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 0258066..60e0631 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -42,6 +42,8 @@ def load_us8k_metadata(path): def load_feature_file(feature_filepath): data = np.load(feature_filepath) X, y = data['X'], data['y'] + if type(y) == np.ndarray and y.ndim == 0: + y = int(y) return X, y @@ -58,7 +60,7 @@ def us8k_file_sampler(feature_filepath, shuffle=True): for idx in frame_idxs: yield { 'features': X[idx], - 'label': y[idx], + 'label': y, 'filepath': feature_filepath, 'frame_idx': idx } @@ -81,6 +83,11 @@ def get_us8k_batch_generator(features_dir, test_fold_idx, valid=False, num_strea valid_fold_idx = (test_fold_idx - 1) % 10 for fold_dirname in os.listdir(features_dir): + fold_dir = os.path.join(features_dir, fold_dirname) + + if not os.path.isdir(fold_dir): + continue + fold_idx = int(fold_dirname.replace('fold', '')) - 1 if fold_idx == test_fold_idx: continue @@ -103,7 +110,7 @@ def get_us8k_batch_generator(features_dir, test_fold_idx, valid=False, num_strea if num_streamers is None: num_streamers = len(seeds) - mux = pescador.StochasticMux(seeds, num_streamers, rate=rate, random_state=random_state) + mux = pescador.Mux(seeds, num_streamers, rate=rate, random_state=random_state) if cycle: mux = mux.cycle() From b5cfda244cb98af3ef4264b86d84f5f61aae34c2 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Tue, 13 Mar 2018 15:16:36 -0400 Subject: [PATCH 130/201] close reader after use --- data/avc/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/avc/sample.py b/data/avc/sample.py index 4cfc226..fba150e 100644 --- a/data/avc/sample.py +++ b/data/avc/sample.py @@ -302,7 +302,7 @@ def read_video(video_path): frames = [] for frame in reader.nextFrame(): frames.append(frame) - + reader.close() return frames From 121fa1e2faba36d3732a3d50c5a04fd4339b41cc Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 16:25:54 -0400 Subject: [PATCH 131/201] Separate pescador parameters for train and validation sets --- 06_train_classifier.py | 45 +++++++++++++++++++++++++++++++----------- classifier/train.py | 36 ++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 25 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 3fafaf8..85fa507 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -21,28 +21,51 @@ def parse_arguments(): default=150, help='(MLP) Maximum 
number of training epochs') - parser.add_argument('-bs', - '--batch-size', - dest='batch_size', + parser.add_argument('-tbs', + '--train-batch-size', + dest='train_batch_size', action='store', type=int, default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-ns', - '--num-streamers', - dest='num_streamers', + parser.add_argument('-tns', + '--train-num-streamers', + dest='train_num_streamers', action='store', type=int, help='Number of open pescador streamers to keep open ' \ - 'while sampling features') + 'while sampling features for training data') - parser.add_argument('-mr', - '--mux-rate', - dest='mux_rate', + parser.add_argument('-tmr', + '--train-mux-rate', + dest='train_mux_rate', action='store', type=int, - help='Pescador poisson rate used for sampling') + help='Pescador poisson rate used for sampling training data') + + parser.add_argument('-vbs', + '--valid-batch-size', + dest='valid_batch_size', + action='store', + type=int, + default=64, + help='(MLP) Number of validation examples per batch') + + parser.add_argument('-vns', + '--valid-num-streamers', + dest='valid_num_streamers', + action='store', + type=int, + help='Number of open pescador streamers to keep open ' \ + 'while sampling features for validation data') + + parser.add_argument('-vmr', + '--valid-mux-rate', + dest='valid_mux_rate', + action='store', + type=int, + help='Pescador poisson rate used for sampling validation data') parser.add_argument('-tes', '--train-epoch-size', diff --git a/classifier/train.py b/classifier/train.py index c66f998..fa71798 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -110,6 +110,7 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' valid_metric_history = [] model_output_path = os.path.join(model_dir, "model.pkl") + stdizer_output_path = os.path.join(model_dir, "stdizer.pkl") # Create classifier clf = SGDClassifier(alpha=C, penalty=reg_penalty, n_jobs=-1, verbose=verbose) @@ -135,6 +136,7 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' # Save the model for this iteration LOGGER.info('Saving model...') joblib.dump(clf, model_output_path) + joblib.dump(stdizer, stdizer_output_path) if verbose: train_msg = 'Train - loss: {}, acc: {}' @@ -170,7 +172,7 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' y_test_pred = np.array(y_test_pred) test_metrics = compute_metrics(test_data['labels'], y_test_pred) - return clf, train_metrics, valid_metrics, test_metrics + return (stdizer, clf), train_metrics, valid_metrics, test_metrics def construct_mlp_model(input_shape, weight_decay=1e-5): @@ -303,8 +305,9 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, def train(features_dir, output_dir, model_id, fold_num, model_type='svm', - num_streamers=None, batch_size=64, mux_rate=None, random_state=20171021, - verbose=False, **model_args): + train_num_streamers=None, train_batch_size=64, train_mux_rate=None, + valid_num_streamers=None, valid_batch_size=64, valid_mux_rate=None, + random_state=20171021, verbose=False, **model_args): init_console_logger(LOGGER, verbose=verbose) LOGGER.debug('Initialized logging.') @@ -329,9 +332,12 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', 'model_id': model_id, 'fold_num': fold_num, 'model_type': model_type, - 'num_streamers': num_streamers, - 'batch_size': batch_size, - 'mux_rate': mux_rate, + 'train_num_streamers': train_num_streamers, + 'train_batch_size': train_batch_size, + 
'train_mux_rate': train_mux_rate, + 'valid_num_streamers': valid_num_streamers, + 'valid_batch_size': valid_batch_size, + 'valid_mux_rate': valid_mux_rate, 'random_state': random_state, 'verbose': verbose } @@ -342,15 +348,17 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', LOGGER.info('Loading data...') fold_idx = fold_num - 1 - LOGGER.info('Preparing data for fold {}'.format(fold_num)) + LOGGER.info('Preparing training data for fold {}'.format(fold_num)) train_gen = get_us8k_batch_generator(features_dir, fold_idx, - valid=False, num_streamers=num_streamers, - batch_size=batch_size, random_state=random_state, - rate=mux_rate) + valid=False, num_streamers=train_num_streamers, + batch_size=train_batch_size, random_state=random_state, + rate=train_mux_rate) + LOGGER.info('Preparing validation data for fold {}'.format(fold_num)) valid_data = get_us8k_batch(features_dir, fold_idx, - valid=True, num_streamers=num_streamers, - batch_size=batch_size, random_state=random_state, - rate=mux_rate) + valid=True, num_streamers=valid_num_streamers, + batch_size=valid_batch_size, random_state=random_state, + rate=valid_mux_rate) + LOGGER.info('Preparing test data for fold {}'.format(fold_num)) test_data = load_test_fold(features_dir, fold_idx) LOGGER.info('Training {} with fold {} held out'.format(model_type, fold_num)) @@ -363,7 +371,7 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', elif model_type == 'mlp': model, train_metrics, valid_metrics, test_metrics \ = train_mlp(train_gen, valid_data, test_data, model_dir, - batch_size=batch_size, verbose=verbose, **model_args) + batch_size=train_batch_size, verbose=verbose, **model_args) else: raise ValueError('Invalid model type: {}'.format(model_type)) From 33c25af39a2245d477ff212843db2aea8b07646d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 16:32:14 -0400 Subject: [PATCH 132/201] Implement VGGish feature extraction --- 05_generate_embedding_samples.py | 15 +- README.md | 7 + data/usc/features.py | 106 +++++++-- data/usc/vggish/README.md | 250 ++++++++++++++++++++++ data/usc/vggish/__init__.py | 0 data/usc/vggish/mel_features.py | 218 +++++++++++++++++++ data/usc/vggish/vggish_input.py | 90 ++++++++ data/usc/vggish/vggish_params.py | 53 +++++ data/usc/vggish/vggish_postprocess.py | 90 ++++++++ data/usc/vggish/vggish_slim.py | 128 +++++++++++ data/usc/vggish/vggish_smoke_test.py | 97 +++++++++ jobs/generate_vggish_samples_array.sbatch | 42 ++++ resources/vggish/.gitkeep | 0 13 files changed, 1073 insertions(+), 23 deletions(-) create mode 100644 data/usc/vggish/README.md create mode 100644 data/usc/vggish/__init__.py create mode 100644 data/usc/vggish/mel_features.py create mode 100644 data/usc/vggish/vggish_input.py create mode 100644 data/usc/vggish/vggish_params.py create mode 100644 data/usc/vggish/vggish_postprocess.py create mode 100644 data/usc/vggish/vggish_slim.py create mode 100644 data/usc/vggish/vggish_smoke_test.py create mode 100644 jobs/generate_vggish_samples_array.sbatch create mode 100644 resources/vggish/.gitkeep diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 7110c83..5ae43ac 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -152,15 +152,18 @@ def parse_arguments(): dataset_name = args['dataset_name'] fold_num = args['fold'] - LOGGER.info('Loading embedding model...') - l3embedding_model = load_embedding(model_path, - model_type, - 'audio', pooling_type, - tgt_num_gpus=num_gpus) + if 
model_path: + LOGGER.info('Loading embedding model...') + l3embedding_model = load_embedding(model_path, + model_type, + 'audio', pooling_type, + tgt_num_gpus=num_gpus) + else: + l3embedding_model = None dataset_output_dir = os.path.join(output_dir, model_id, - model_type, pooling_type) + model_type, pooling_type, features) if not os.path.isdir(dataset_output_dir): os.makedirs(dataset_output_dir) diff --git a/README.md b/README.md index 86f8e7f..0139a51 100644 --- a/README.md +++ b/README.md @@ -18,4 +18,11 @@ There is also a module `classifier/` which contains code to train a classifier u You can train an urban sound classification model using `train_classifier.py`. Run `python train_classifier.py -h` to read the help message regarding how to use the script. +## Download VGGish models: +* `cd ./resources/vggish` +* `curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt` +* `curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz` +* `cd ../..` + + If you use a SLURM environment, `sbatch` scripts are available in `jobs/`. diff --git a/data/usc/features.py b/data/usc/features.py index 1fcec50..c885892 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -1,10 +1,15 @@ import logging +import os import warnings import librosa import numpy as np import scipy as sp import soundfile as sf import resampy +import tensorflow as tf +import .vggish.vggish_input as vggish_input +import .vggish.vggish_postprocess as vggish_postprocess +import .vggish.vggish_slim as vggish_slim LOGGER = logging.getLogger('cls-data-generation') LOGGER.setLevel(logging.DEBUG) @@ -44,7 +49,73 @@ def one_hot(idx, n_classes=10): return y -def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.25): +def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', + output_op_name='vggish/embedding', + resources_dir=None, **params): + fs = params.get('target_sample_rate', 16000) + audio_data = load_audio(audio_path, fs) + + if not resources_dir: + resources_dir = os.path.join(os.path.dirname(__file__), '../../resources/vggish') + + pca_params_path = os.path.join(resources_dir, 'vggish_pca_params.npz') + model_path = os.path.join(resources_dir, 'vggish_model.ckpt') + + + examples_batch = vggish_input.waveform_to_examples(audio_data, fs, **params) + + # Prepare a postprocessor to munge the model embeddings. + pproc = vggish_postprocess.Postprocessor(pca_params_path) + + # If needed, prepare a record writer to store the postprocessed embeddings. + #writer = tf.python_io.TFRecordWriter( + # FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None + + input_tensor_name = input_op_name + ':0' + output_tensor_name = output_op_name + ':0' + + with tf.Graph().as_default(), tf.Session() as sess: + # Define the model in inference mode, load the checkpoint, and + # locate input and output tensors. + vggish_slim.define_vggish_slim(training=False) + vggish_slim.load_vggish_slim_checkpoint(sess, model_path) + features_tensor = sess.graph.get_tensor_by_name(input_tensor_name) + embedding_tensor = sess.graph.get_tensor_by_name(output_tensor_name) + + # Run inference and postprocessing. + [embedding_batch] = sess.run([embedding_tensor], + feed_dict={features_tensor: examples_batch}) + postprocessed_batch = pproc.postprocess(embedding_batch) + + # Write the postprocessed embeddings as a SequenceExample, in a similar + # format as the features released in AudioSet. 
Each row of the batch of + # embeddings corresponds to roughly a second of audio (96 10ms frames), and + # the rows are written as a sequence of bytes-valued features, where each + # feature value contains the 128 bytes of the whitened quantized embedding. + + return postprocessed_batch + + +def get_vggish_frames_uniform(audio_path, hop_size=0.1): + """ + Get vggish embedding features for each frame in the given audio file + + Args: + audio: Audio data or path to audio file + (Type: np.ndarray or str) + + Keyword Args: + hop_size: Hop size in seconds + (Type: float) + + Returns: + features: Array of embedding vectors + (Type: np.ndarray) + """ + return extract_vggish_embedding(audio_path, frame_hop_sec=hop_size) + + +def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.1): """ Get stacked L3 embedding features, i.e. stack embedding features for each 1 second (overlapping) window of the given audio @@ -59,7 +130,7 @@ def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.25): (keras.engine.training.Model) Keyword Args: - hop_size: Hop size as a fraction of the window + hop_size: Hop size in seconds (Type: float) Returns: @@ -101,7 +172,7 @@ def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.25): return l3embedding.flatten() -def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): +def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): """ Get L3 embedding stats features, i.e. compute statistics for each of the embedding features across 1 second (overlapping) window of the given audio @@ -117,7 +188,7 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): (keras.engine.training.Model) Keyword Args: - hop_size: Hop size as a fraction of the window + hop_size: Hop size in seconds (Type: float) Returns: @@ -181,10 +252,9 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.25): d1_mean, d1_var, d2_mean, d2_var)) -def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.25, sr=48000): +def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): """ - Get L3 embedding stats features, i.e. compute statistics for each of the - embedding features across 1 second (overlapping) window of the given audio + Get L3 embedding for each frame in the given audio file Args: audio: Audio data or path to audio file @@ -194,12 +264,12 @@ def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.25, sr=48000): (keras.engine.training.Model) Keyword Args: - hop_size: Hop size as a fraction of the window + hop_size: Hop size in seconds (Type: float) Returns: - features: List of embedding vectors - (Type: list[np.ndarray]) + features: Array of embedding vectors + (Type: np.ndarray) """ if type(audio) == str: audio = load_audio(audio, sr) @@ -236,8 +306,7 @@ def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.25, sr=48000): def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): """ - Get L3 embedding stats features, i.e. 
compute statistics for each of the - embedding features across 1 second (overlapping) window of the given audio + Get L3 embedding for random frames in the given audio file Args: audio: Numpy array or path to audio file @@ -250,8 +319,8 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): (Type: int) Returns: - features: List of embedding vectors - (Type: list[np.ndarray]) + features: Array of embedding vectors + (Type: np.ndarray) """ if type(audio) == str: audio = load_audio(audio, sr) @@ -302,15 +371,15 @@ def compute_file_features(path, feature_type, l3embedding_model=None, **feature_ raise ValueError(err_msg.format(feature_type)) if feature_type == 'l3_stack': - hop_size = feature_args.get('hop_size', 0.25) + hop_size = feature_args.get('hop_size', 0.1) file_features = get_l3_stack_features(path, l3embedding_model, hop_size=hop_size) elif feature_type == 'l3_stats': - hop_size = feature_args.get('hop_size', 0.25) + hop_size = feature_args.get('hop_size', 0.1) file_features = get_l3_stats_features(path, l3embedding_model, hop_size=hop_size) elif feature_type == 'l3_frames_uniform': - hop_size = feature_args.get('hop_size', 0.25) + hop_size = feature_args.get('hop_size', 0.1) file_features = get_l3_frames_uniform(path, l3embedding_model, hop_size=hop_size) elif feature_type == 'l3_frames_random': @@ -319,6 +388,9 @@ def compute_file_features(path, feature_type, l3embedding_model=None, **feature_ raise ValueError('Must specify "num_samples" for "l3_frame_random" features') file_features = get_l3_frames_random(path, l3embedding_model, num_samples) + elif feature_type == 'vggish_frames_uniform': + hop_size = feature_args.get('hop_size', 0.1) + file_features = get_vggish_frames_uniform(path, hop_size=hop_size) else: raise ValueError('Invalid feature type: {}'.format(feature_type)) diff --git a/data/usc/vggish/README.md b/data/usc/vggish/README.md new file mode 100644 index 0000000..6f7f0ac --- /dev/null +++ b/data/usc/vggish/README.md @@ -0,0 +1,250 @@ +# Models for AudioSet: A Large Scale Dataset of Audio Events + +This repository provides models and supporting code associated with +[AudioSet](http://g.co/audioset), a dataset of over 2 million human-labeled +10-second YouTube video soundtracks, with labels taken from an ontology of +more than 600 audio event classes. + +AudioSet was +[released](https://research.googleblog.com/2017/03/announcing-audioset-dataset-for-audio.html) +in March 2017 by Google's Sound Understanding team to provide a common +large-scale evaluation task for audio event detection as well as a starting +point for a comprehensive vocabulary of sound events. + +For more details about AudioSet and the various models we have trained, please +visit the [AudioSet website](http://g.co/audioset) and read our papers: + +* Gemmeke, J. et. al., + [AudioSet: An ontology and human-labelled dataset for audio events](https://research.google.com/pubs/pub45857.html), + ICASSP 2017 + +* Hershey, S. et. al., + [CNN Architectures for Large-Scale Audio Classification](https://research.google.com/pubs/pub45611.html), + ICASSP 2017 + +If you use the pre-trained VGGish model in your published research, we ask that +you cite [CNN Architectures for Large-Scale Audio Classification](https://research.google.com/pubs/pub45611.html). +If you use the AudioSet dataset or the released 128-D embeddings of AudioSet +segments, please cite +[AudioSet: An ontology and human-labelled dataset for audio events](https://research.google.com/pubs/pub45857.html). 
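The feature-extraction dispatch added to `data/usc/features.py` above can be exercised along these lines; this is a minimal sketch, with the audio path and the loaded L3 embedding model as placeholders, and keyword names taken from the signatures in this patch:

```python
# Minimal sketch of the compute_file_features() dispatch added in this patch.
# The audio path below is a placeholder, not a real asset.
from data.usc.features import compute_file_features

audio_path = '/path/to/UrbanSound8K/audio/fold1/clip.wav'  # placeholder

# VGGish frame features need no L3 embedding model; hop_size is in seconds.
vggish_features = compute_file_features(audio_path, 'vggish_frames_uniform',
                                         hop_size=0.1)

# L3 frame features additionally require a loaded embedding model, e.g. as
# loaded via load_embedding() in 05_generate_embedding_samples.py:
# l3_features = compute_file_features(audio_path, 'l3_frames_uniform',
#                                     l3embedding_model=l3embedding_model,
#                                     hop_size=0.1)
```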
+ +## VGGish + +The initial AudioSet release included 128-dimensional embeddings of each +AudioSet segment produced from a VGG-like audio classification model that was +trained on a large YouTube dataset (a preliminary version of what later became +[YouTube-8M](https://research.google.com/youtube8m)). + +We provide a TensorFlow definition of this model, which we call __*VGGish*__, as +well as supporting code to extract input features for the model from audio +waveforms and to post-process the model embedding output into the same format as +the released embedding features. + +### Installation + +VGGish depends on the following Python packages: + +* [`numpy`](http://www.numpy.org/) +* [`scipy`](http://www.scipy.org/) +* [`resampy`](http://resampy.readthedocs.io/en/latest/) +* [`tensorflow`](http://www.tensorflow.org/) +* [`six`](https://pythonhosted.org/six/) + +These are all easily installable via, e.g., `pip install numpy` (as in the +example command sequence below). + +Any reasonably recent version of these packages should work. TensorFlow should +be at least version 1.0. We have tested with Python 2.7.6 and 3.4.3 on an +Ubuntu-like system with NumPy v1.13.1, SciPy v0.19.1, resampy v0.1.5, TensorFlow +v1.2.1, and Six v1.10.0. + +VGGish also requires downloading two data files: + +* [VGGish model checkpoint](https://storage.googleapis.com/audioset/vggish_model.ckpt), + in TensorFlow checkpoint format. +* [Embedding PCA parameters](https://storage.googleapis.com/audioset/vggish_pca_params.npz), + in NumPy compressed archive format. + +After downloading these files into the same directory as this README, the +installation can be tested by running `python vggish_smoke_test.py` which +runs a known signal through the model and checks the output. + +Here's a sample installation and test session: + +```shell +# You can optionally install and test VGGish within a Python virtualenv, which +# is useful for isolating changes from the rest of your system. For example, you +# may have an existing version of some packages that you do not want to upgrade, +# or you want to try Python 3 instead of Python 2. If you decide to use a +# virtualenv, you can create one by running +# $ virtualenv vggish # For Python 2 +# or +# $ python3 -m venv vggish # For Python 3 +# and then enter the virtual environment by running +# $ source vggish/bin/activate # Assuming you use bash +# Leave the virtual environment at the end of the session by running +# $ deactivate +# Within the virtual environment, do not use 'sudo'. + +# Upgrade pip first. +$ sudo python -m pip install --upgrade pip + +# Install dependences. Resampy needs to be installed after NumPy and SciPy +# are already installed. +$ sudo pip install numpy scipy +$ sudo pip install resampy tensorflow six + +# Clone TensorFlow models repo into a 'models' directory. +$ git clone https://github.com/tensorflow/models.git +$ cd models/research/audioset +# Download data files into same directory as code. +$ curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt +$ curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz + +# Installation ready, let's test it. +$ python vggish_smoke_test.py +# If we see "Looks Good To Me", then we're all set. +``` + +### Usage + +VGGish can be used in two ways: + +* *As a feature extractor*: VGGish converts audio input features into a + semantically meaningful, high-level 128-D embedding which can be fed as input + to a downstream classification model. 
The downstream model can be shallower + than usual because the VGGish embedding is more semantically compact than raw + audio features. + + So, for example, you could train a classifier for 10 of the AudioSet classes + by using the released embeddings as features. Then, you could use that + trained classifier with any arbitrary audio input by running the audio through + the audio feature extractor and VGGish model provided here, passing the + resulting embedding features as input to your trained model. + `vggish_inference_demo.py` shows how to produce VGGish embeddings from + arbitrary audio. + +* *As part of a larger model*: Here, we treat VGGish as a "warm start" for the + lower layers of a model that takes audio features as input and adds more + layers on top of the VGGish embedding. This can be used to fine-tune VGGish + (or parts thereof) if you have large datasets that might be very different + from the typical YouTube video clip. `vggish_train_demo.py` shows how to add + layers on top of VGGish and train the whole model. + +### About the Model + +The VGGish code layout is as follows: + +* `vggish_slim.py`: Model definition in TensorFlow Slim notation. +* `vggish_params.py`: Hyperparameters. +* `vggish_input.py`: Converter from audio waveform into input examples. +* `mel_features.py`: Audio feature extraction helpers. +* `vggish_postprocess.py`: Embedding postprocessing. +* `vggish_inference_demo.py`: Demo of VGGish in inference mode. +* `vggish_train_demo.py`: Demo of VGGish in training mode. +* `vggish_smoke_test.py`: Simple test of a VGGish installation + +#### Architecture + +See `vggish_slim.py` and `vggish_params.py`. + +VGGish is a variant of the [VGG](https://arxiv.org/abs/1409.1556) model, in +particular Configuration A with 11 weight layers. Specifically, here are the +changes we made: + +* The input size was changed to 96x64 for log mel spectrogram audio inputs. + +* We drop the last group of convolutional and maxpool layers, so we now have + only four groups of convolution/maxpool layers instead of five. + +* Instead of a 1000-wide fully connected layer at the end, we use a 128-wide + fully connected layer. This acts as a compact embedding layer. + +The model definition provided here defines layers up to and including the +128-wide embedding layer. + +#### Input: Audio Features + +See `vggish_input.py` and `mel_features.py`. + +VGGish was trained with audio features computed as follows: + +* All audio is resampled to 16 kHz mono. +* A spectrogram is computed using magnitudes of the Short-Time Fourier Transform + with a window size of 25 ms, a window hop of 10 ms, and a periodic Hann + window. +* A mel spectrogram is computed by mapping the spectrogram to 64 mel bins + covering the range 125-7500 Hz. +* A stabilized log mel spectrogram is computed by applying + log(mel-spectrum + 0.01) where the offset is used to avoid taking a logarithm + of zero. +* These features are then framed into non-overlapping examples of 0.96 seconds, + where each example covers 64 mel bands and 96 frames of 10 ms each. + +We provide our own NumPy implementation that produces features that are very +similar to those produced by our internal production code. This results in +embedding outputs that are closely match the embeddings that we have already +released. 
Note that these embeddings will *not* be bit-for-bit identical to the +released embeddings due to small differences between the feature computation +code paths, and even between two different installations of VGGish with +different underlying libraries and hardware. However, we expect that the +embeddings will be equivalent in the context of a downstream classification +task. + +#### Output: Embeddings + +See `vggish_postprocess.py`. + +The released AudioSet embeddings were postprocessed before release by applying a +PCA transformation (which performs both PCA and whitening) as well as +quantization to 8 bits per embedding element. This was done to be compatible +with the [YouTube-8M](https://research.google.com/youtube8m) project which has +released visual and audio embeddings for millions of YouTube videos in the same +PCA/whitened/quantized format. + +We provide a Python implementation of the postprocessing which can be applied to +batches of embeddings produced by VGGish. `vggish_inference_demo.py` shows how +the postprocessor can be run after inference. + +If you don't need to use the released embeddings or YouTube-8M, then you could +skip postprocessing and use raw embeddings. + +### Future Work + +Below are some of the things we would like to add to this repository. We +welcome pull requests for these or other enhancements, but please consider +sending an email to the mailing list (see the Contact section) describing what +you plan to do before you invest a lot of time, to get feedback from us and the +rest of the community. + +* An AudioSet classifier trained on top of the VGGish embeddings to predict all + the AudioSet labels. This can act as a baseline for audio research using + AudioSet. +* Feature extraction implemented within TensorFlow using the upcoming + [tf.contrib.signal](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/api_guides/python/contrib.signal.md) + ops. +* A Keras version of the VGGish model definition and checkpoint. +* Jupyter notebook demonstrating audio feature extraction and model performance. + +## Contact + +For general questions about AudioSet and VGGish, please use the +[audioset-users@googlegroups.com](https://groups.google.com/forum/#!forum/audioset-users) +mailing list. + +For technical problems with the released model and code, please open an issue on +the [tensorflow/models issue tracker](https://github.com/tensorflow/models/issues) +and __*assign to @plakal and @dpwe*__. Please note that because the issue tracker +is shared across all models released by Google, we won't be notified about an +issue unless you explicitly @-mention us (@plakal and @dpwe) or assign the issue +to us. + +## Credits + +Original authors and reviewers of the code in this package include (in +alphabetical order): + +* DAn Ellis +* Shawn Hershey +* Aren Jansen +* Manoj Plakal diff --git a/data/usc/vggish/__init__.py b/data/usc/vggish/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/usc/vggish/mel_features.py b/data/usc/vggish/mel_features.py new file mode 100644 index 0000000..13b14f7 --- /dev/null +++ b/data/usc/vggish/mel_features.py @@ -0,0 +1,218 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Defines routines to compute mel spectrogram features from audio waveform.""" + +import numpy as np + + +def frame(data, window_length, hop_length): + """Convert array into a sequence of successive possibly overlapping frames. + + An n-dimensional array of shape (num_samples, ...) is converted into an + (n+1)-D array of shape (num_frames, window_length, ...), where each frame + starts hop_length points after the preceding one. + + This is accomplished using stride_tricks, so the original data is not + copied. However, there is no zero-padding, so any incomplete frames at the + end are not included. + + Args: + data: np.array of dimension N >= 1. + window_length: Number of samples in each frame. + hop_length: Advance (in samples) between each window. + + Returns: + (N+1)-D np.array with as many rows as there are complete frames that can be + extracted. + """ + num_samples = data.shape[0] + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + shape = (num_frames, window_length) + data.shape[1:] + strides = (data.strides[0] * hop_length,) + data.strides + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + + +def periodic_hann(window_length): + """Calculate a "periodic" Hann window. + + The classic Hann window is defined as a raised cosine that starts and + ends on zero, and where every value appears twice, except the middle + point for an odd-length window. Matlab calls this a "symmetric" window + and np.hanning() returns it. However, for Fourier analysis, this + actually represents just over one cycle of a period N-1 cosine, and + thus is not compactly expressed on a length-N Fourier basis. Instead, + it's better to use a raised cosine that ends just before the final + zero value - i.e. a complete cycle of a period-N cosine. Matlab + calls this a "periodic" window. This routine calculates it. + + Args: + window_length: The number of points in the returned window. + + Returns: + A 1D np.array containing the periodic hann window. + """ + return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * + np.arange(window_length))) + + +def stft_magnitude(signal, fft_length, + hop_length=None, + window_length=None): + """Calculate the short-time Fourier transform magnitude. + + Args: + signal: 1D np.array of the input time-domain signal. + fft_length: Size of the FFT to apply. + hop_length: Advance (in samples) between each frame passed to FFT. + window_length: Length of each block of samples to pass to FFT. + + Returns: + 2D np.array where each row contains the magnitudes of the fft_length/2+1 + unique values of the FFT for the corresponding frame of input samples. + """ + frames = frame(signal, window_length, hop_length) + # Apply frame window to each frame. We use a periodic Hann (cosine of period + # window_length) instead of the symmetric Hann of np.hanning (period + # window_length-1). 
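+  # (Equivalently, np.hanning(window_length + 1)[:-1] yields the same
+  # periodic window.)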
+ window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + + +# Mel spectrum constants and functions. +_MEL_BREAK_FREQUENCY_HERTZ = 700.0 +_MEL_HIGH_FREQUENCY_Q = 1127.0 + + +def hertz_to_mel(frequencies_hertz): + """Convert frequencies to mel scale using HTK formula. + + Args: + frequencies_hertz: Scalar or np.array of frequencies in hertz. + + Returns: + Object of same size as frequencies_hertz containing corresponding values + on the mel scale. + """ + return _MEL_HIGH_FREQUENCY_Q * np.log( + 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) + + +def spectrogram_to_mel_matrix(num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0): + """Return a matrix that can post-multiply spectrogram rows to make mel. + + Returns a np.array matrix A that can be used to post-multiply a matrix S of + spectrogram values (STFT magnitudes) arranged as frames x bins to generate a + "mel spectrogram" M of frames x num_mel_bins. M = S A. + + The classic HTK algorithm exploits the complementarity of adjacent mel bands + to multiply each FFT bin by only one mel weight, then add it, with positive + and negative signs, to the two adjacent mel bands to which that bin + contributes. Here, by expressing this operation as a matrix multiply, we go + from num_fft multiplies per frame (plus around 2*num_fft adds) to around + num_fft^2 multiplies and adds. However, because these are all presumably + accomplished in a single call to np.dot(), it's not clear which approach is + faster in Python. The matrix multiplication has the attraction of being more + general and flexible, and much easier to read. + + Args: + num_mel_bins: How many bands in the resulting mel spectrum. This is + the number of columns in the output matrix. + num_spectrogram_bins: How many bins there are in the source spectrogram + data, which is understood to be fft_size/2 + 1, i.e. the spectrogram + only contains the nonredundant FFT bins. + audio_sample_rate: Samples per second of the audio at the input to the + spectrogram. We need this to figure out the actual frequencies for + each spectrogram bin, which dictates how they are mapped into mel. + lower_edge_hertz: Lower bound on the frequencies to be included in the mel + spectrum. This corresponds to the lower edge of the lowest triangular + band. + upper_edge_hertz: The desired top edge of the highest frequency band. + + Returns: + An np.array with shape (num_spectrogram_bins, num_mel_bins). + + Raises: + ValueError: if frequency edges are incorrectly ordered. + """ + nyquist_hertz = audio_sample_rate / 2. + if lower_edge_hertz >= upper_edge_hertz: + raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % + (lower_edge_hertz, upper_edge_hertz)) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + # The i'th mel band (starting from i=1) has center frequency + # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge + # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in + # the band_edges_mel arrays. + band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), + hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) + # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins + # of spectrogram values. 
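+  # (For example, a 512-point FFT gives num_spectrogram_bins = 257, so a
+  # (257, 64) matrix for 64 mel bins.)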
+ mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] + # Calculate lower and upper slopes for every spectrogram bin. + # Line segments are linear in the *mel* domain, not hertz. + lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / + (center_mel - lower_edge_mel)) + upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / + (upper_edge_mel - center_mel)) + # .. then intersect them with each other and zero. + mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, + upper_slope)) + # HTK excludes the spectrogram DC bin; make sure it always gets a zero + # coefficient. + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram(data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs): + """Convert waveform to a log magnitude mel-frequency spectrogram. + + Args: + data: 1D np.array of waveform data. + audio_sample_rate: The sampling rate of data. + log_offset: Add this to values when taking log to avoid -Infs. + window_length_secs: Duration of each window to analyze. + hop_length_secs: Advance between successive analysis windows. + **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix. + + Returns: + 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank + magnitudes for successive frames. + """ + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + spectrogram = stft_magnitude( + data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples) + mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( + num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, **kwargs)) + return np.log(mel_spectrogram + log_offset) diff --git a/data/usc/vggish/vggish_input.py b/data/usc/vggish/vggish_input.py new file mode 100644 index 0000000..7e63768 --- /dev/null +++ b/data/usc/vggish/vggish_input.py @@ -0,0 +1,90 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Compute input examples for VGGish from audio waveform.""" + +import numpy as np +import resampy +from scipy.io import wavfile + +import .mel_features + + +def waveform_to_examples(data, sample_rate, target_sample_rate=16000, + log_offset=0.1, stft_win_len_sec=0.025, + stft_hop_len_sec=0.010, num_mel_bins=64, + mel_min_hz=125, mel_max_hz=7500, frame_win_sec=0.96, + frame_hop_sec=0.96, **params): + """Converts audio waveform into an array of examples for VGGish. + + Args: + data: np.array of either one dimension (mono) or two dimensions + (multi-channel, with the outer dimension representing channels). 
+ Each sample is generally expected to lie in the range [-1.0, +1.0], + although this is not required. + sample_rate: Sample rate of data. + + Returns: + 3-D np.array of shape [num_examples, num_frames, num_bands] which represents + a sequence of examples, each of which contains a patch of log mel + spectrogram, covering num_frames frames of audio and num_bands mel frequency + bands, where the frame length is stft_hop_len_sec. + """ + + # Convert to mono. + if len(data.shape) > 1: + data = np.mean(data, axis=1) + # Resample to the rate assumed by VGGish. + if sample_rate != target_sample_rate: + data = resampy.resample(data, sample_rate, target_sample_rate) + + # Compute log mel spectrogram features. + log_mel = mel_features.log_mel_spectrogram( + data, + audio_sample_rate=target_sample_rate, + log_offset=log_offset, + window_length_secs=stft_win_len_sec, + hop_length_secs=stft_hop_len_sec, + num_mel_bins=num_mel_bins, + lower_edge_hertz=mel_min_hz, + upper_edge_hertz=mel_max_hz) + + # Frame features into examples. + features_sample_rate = 1.0 / stft_hop_len_sec + example_window_length = int(round( + frame_win_sec * features_sample_rate)) + example_hop_length = int(round( + frame_hop_sec * features_sample_rate)) + log_mel_examples = mel_features.frame( + log_mel, + window_length=example_window_length, + hop_length=example_hop_length) + return log_mel_examples + + +def wavfile_to_examples(wav_file): + """Convenience wrapper around waveform_to_examples() for a common WAV format. + + Args: + wav_file: String path to a file, or a file-like object. The file + is assumed to contain WAV audio data with signed 16-bit PCM samples. + + Returns: + See waveform_to_examples. + """ + sr, wav_data = wavfile.read(wav_file) + assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype + samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] + return waveform_to_examples(samples, sr) diff --git a/data/usc/vggish/vggish_params.py b/data/usc/vggish/vggish_params.py new file mode 100644 index 0000000..a38ce26 --- /dev/null +++ b/data/usc/vggish/vggish_params.py @@ -0,0 +1,53 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Global parameters for the VGGish model. + +See vggish_slim.py for more information. +""" + +# Architectural constants. +NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. +NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. +EMBEDDING_SIZE = 128 # Size of embedding layer. + +# Hyperparameters used in feature and example generation. +SAMPLE_RATE = 16000 +STFT_WINDOW_LENGTH_SECONDS = 0.025 +STFT_HOP_LENGTH_SECONDS = 0.010 +NUM_MEL_BINS = NUM_BANDS +MEL_MIN_HZ = 125 +MEL_MAX_HZ = 7500 +LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. +EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames +EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 
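+# Note: EXAMPLE_WINDOW_SECONDS / STFT_HOP_LENGTH_SECONDS = 0.96 / 0.010 gives
+# the 96 frames per example expected by NUM_FRAMES above.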
+ +# Parameters used for embedding postprocessing. +PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' +PCA_MEANS_NAME = 'pca_means' +QUANTIZE_MIN_VAL = -2.0 +QUANTIZE_MAX_VAL = +2.0 + +# Hyperparameters used in training. +INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. +LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. +ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. + +# Names of ops, tensors, and features. +INPUT_OP_NAME = 'vggish/input_features' +INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' +OUTPUT_OP_NAME = 'vggish/embedding' +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' +AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' diff --git a/data/usc/vggish/vggish_postprocess.py b/data/usc/vggish/vggish_postprocess.py new file mode 100644 index 0000000..5e601a7 --- /dev/null +++ b/data/usc/vggish/vggish_postprocess.py @@ -0,0 +1,90 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Post-process embeddings from VGGish.""" + +import numpy as np + + +class Postprocessor(object): + """Post-processes VGGish embeddings. + + The initial release of AudioSet included 128-D VGGish embeddings for each + segment of AudioSet. These released embeddings were produced by applying + a PCA transformation (technically, a whitening transform is included as well) + and 8-bit quantization to the raw embedding output from VGGish, in order to + stay compatible with the YouTube-8M project which provides visual embeddings + in the same format for a large set of YouTube videos. This class implements + the same PCA (with whitening) and quantization transformations. + """ + + def __init__(self, pca_params_npz_path, pca_eigen_vectors_name='pca_eigen_vectors', + pca_means_name='pca_means', embedding_size=128, + quantize_min_val=-2.0, quantize_max_val=+2.0): + """Constructs a postprocessor. + + Args: + pca_params_npz_path: Path to a NumPy-format .npz file that + contains the PCA parameters used in postprocessing. + """ + params = np.load(pca_params_npz_path) + self._pca_matrix = params[pca_eigen_vectors_name] + # Load means into a column vector for easier broadcasting later. + self._pca_means = params[pca_means_name].reshape(-1, 1) + assert self._pca_matrix.shape == ( + embedding_size, embedding_size), ( + 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) + assert self._pca_means.shape == (embedding_size, 1), ( + 'Bad PCA means shape: %r' % (self._pca_means.shape,)) + + def postprocess(self, embeddings_batch): + """Applies postprocessing to a batch of embeddings. + + Args: + embeddings_batch: An nparray of shape [batch_size, embedding_size] + containing output from the embedding layer of VGGish. + + Returns: + An nparray of the same shape as the input but of type uint8, + containing the PCA-transformed and quantized version of the input. 
+ """ + assert len(embeddings_batch.shape) == 2, ( + 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) + assert embeddings_batch.shape[1] == embedding_size, ( + 'Bad batch shape: %r' % (embeddings_batch.shape,)) + + # Apply PCA. + # - Embeddings come in as [batch_size, embedding_size]. + # - Transpose to [embedding_size, batch_size]. + # - Subtract pca_means column vector from each column. + # - Premultiply by PCA matrix of shape [output_dims, input_dims] + # where both are are equal to embedding_size in our case. + # - Transpose result back to [batch_size, embedding_size]. + pca_applied = np.dot(self._pca_matrix, + (embeddings_batch.T - self._pca_means)).T + + # Quantize by: + # - clipping to [min, max] range + clipped_embeddings = np.clip( + pca_applied, quantize_min_val, + quantize_max_val) + # - convert to 8-bit in range [0.0, 255.0] + quantized_embeddings = ( + (clipped_embeddings - quantize_min_val) * + (255.0 / (quantize_max_val - quantize_min_val))) + # - cast 8-bit float to uint8 + quantized_embeddings = quantized_embeddings.astype(np.uint8) + + return quantized_embeddings diff --git a/data/usc/vggish/vggish_slim.py b/data/usc/vggish/vggish_slim.py new file mode 100644 index 0000000..ce72aa6 --- /dev/null +++ b/data/usc/vggish/vggish_slim.py @@ -0,0 +1,128 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Defines the 'VGGish' model used to generate AudioSet embedding features. + +The public AudioSet release (https://research.google.com/audioset/download.html) +includes 128-D features extracted from the embedding layer of a VGG-like model +that was trained on a large Google-internal YouTube dataset. Here we provide +a TF-Slim definition of the same model, without any dependences on libraries +internal to Google. We call it 'VGGish'. + +Note that we only define the model up to the embedding layer, which is the +penultimate layer before the final classifier layer. We also provide various +hyperparameter values (in vggish_params.py) that were used to train this model +internally. + +For comparison, here is TF-Slim's VGG definition: +https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py +""" + +import tensorflow as tf + +slim = tf.contrib.slim + + +def define_vggish_slim(training=False, init_stddev=0.01, num_frames=96, + num_bands=64, embedding_size=128): + """Defines the VGGish TensorFlow model. + + All ops are created in the current default graph, under the scope 'vggish/'. + + The input is a placeholder named 'vggish/input_features' of type float32 and + shape [batch_size, num_frames, num_bands] where batch_size is variable and + num_frames and num_bands are constants, and [num_frames, num_bands] represents + a log-mel-scale spectrogram patch covering num_bands frequency bands and + num_frames time frames (where each frame step is usually 10ms). 
This is + produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). + The output is an op named 'vggish/embedding' which produces the activations of + a 128-D embedding layer, which is usually the penultimate layer when used as + part of a full model with a final classifier layer. + + Args: + training: If true, all parameters are marked trainable. + + Returns: + The op 'vggish/embeddings'. + """ + # Defaults: + # - All weights are initialized to N(0, INIT_STDDEV). + # - All biases are initialized to 0. + # - All activations are ReLU. + # - All convolutions are 3x3 with stride 1 and SAME padding. + # - All max-pools are 2x2 with stride 2 and SAME padding. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_initializer=tf.truncated_normal_initializer( + stddev=init_stddev), + biases_initializer=tf.zeros_initializer(), + activation_fn=tf.nn.relu, + trainable=training), \ + slim.arg_scope([slim.conv2d], + kernel_size=[3, 3], stride=1, padding='SAME'), \ + slim.arg_scope([slim.max_pool2d], + kernel_size=[2, 2], stride=2, padding='SAME'), \ + tf.variable_scope('vggish'): + # Input: a batch of 2-D log-mel-spectrogram patches. + features = tf.placeholder( + tf.float32, shape=(None, num_frames, num_bands), + name='input_features') + # Reshape to 4-D so that we can convolve a batch with conv2d(). + net = tf.reshape(features, [-1, num_frames, num_bands, 1]) + + # The VGG stack of alternating convolutions and max-pools. + net = slim.conv2d(net, 64, scope='conv1') + net = slim.max_pool2d(net, scope='pool1') + net = slim.conv2d(net, 128, scope='conv2') + net = slim.max_pool2d(net, scope='pool2') + net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') + net = slim.max_pool2d(net, scope='pool3') + net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') + net = slim.max_pool2d(net, scope='pool4') + + # Flatten before entering fully-connected layers + net = slim.flatten(net) + net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') + # The embedding layer. + net = slim.fully_connected(net, embedding_size, scope='fc2') + return tf.identity(net, name='embedding') + + +def load_vggish_slim_checkpoint(session, checkpoint_path): + """Loads a pre-trained VGGish-compatible checkpoint. + + This function can be used as an initialization function (referred to as + init_fn in TensorFlow documentation) which is called in a Session after + initializating all variables. When used as an init_fn, this will load + a pre-trained checkpoint that is compatible with the VGGish model + definition. Only variables defined by VGGish will be loaded. + + Args: + session: an active TensorFlow session. + checkpoint_path: path to a file containing a checkpoint that is + compatible with the VGGish model definition. + """ + # Get the list of names of all VGGish variables that exist in + # the checkpoint (i.e., all inference-mode VGGish variables). + with tf.Graph().as_default(): + define_vggish_slim(training=False) + vggish_var_names = [v.name for v in tf.global_variables()] + + # Get the list of all currently existing variables that match + # the list of variable names we just computed. + vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] + + # Use a Saver to restore just the variables selected above. 
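+  # (Any variables in the enclosing graph that are not part of VGGish, e.g.
+  # classifier layers added on top of the embedding, are left untouched by
+  # the restore.)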
+ saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained') + saver.restore(session, checkpoint_path) diff --git a/data/usc/vggish/vggish_smoke_test.py b/data/usc/vggish/vggish_smoke_test.py new file mode 100644 index 0000000..d7ff3fc --- /dev/null +++ b/data/usc/vggish/vggish_smoke_test.py @@ -0,0 +1,97 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A smoke test for VGGish. + +This is a simple smoke test of a local install of VGGish and its associated +downloaded files. We create a synthetic sound, extract log mel spectrogram +features, run them through VGGish, post-process the embedding ouputs, and +check some simple statistics of the results, allowing for variations that +might occur due to platform/version differences in the libraries we use. + +Usage: +- Download the VGGish checkpoint and PCA parameters into the same directory as + the VGGish source code. If you keep them elsewhere, update the checkpoint_path + and pca_params_path variables below. +- Run: + $ python vggish_smoke_test.py +""" + +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from . import vggish_input +from . import vggish_params +from . import vggish_postprocess +from . import vggish_slim + +print('\nTesting your install of VGGish\n') + +# Paths to downloaded VGGish files. +checkpoint_path = 'vggish_model.ckpt' +pca_params_path = 'vggish_pca_params.npz' + +# Relative tolerance of errors in mean and standard deviation of embeddings. +rel_error = 0.1 # Up to 10% + +# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate +# to test resampling to 16 kHz during feature extraction). +num_secs = 3 +freq = 1000 +sr = 44100 +t = np.linspace(0, num_secs, int(num_secs * sr)) +x = np.sin(2 * np.pi * freq * t) + +# Produce a batch of log mel spectrogram examples. +input_batch = vggish_input.waveform_to_examples(x, sr) +print('Log Mel Spectrogram example: ', input_batch[0]) +np.testing.assert_equal( + input_batch.shape, + [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS]) + +# Define VGGish, load the checkpoint, and run the batch through the model to +# produce embeddings. 
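+# (With the three-second tone above this yields three 0.96 s examples, so
+# embedding_batch below has shape (3, 128).)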
+with tf.Graph().as_default(), tf.Session() as sess: + vggish_slim.define_vggish_slim() + vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) + + features_tensor = sess.graph.get_tensor_by_name( + vggish_params.INPUT_TENSOR_NAME) + embedding_tensor = sess.graph.get_tensor_by_name( + vggish_params.OUTPUT_TENSOR_NAME) + [embedding_batch] = sess.run([embedding_tensor], + feed_dict={features_tensor: input_batch}) + print('VGGish embedding: ', embedding_batch[0]) + expected_embedding_mean = 0.131 + expected_embedding_std = 0.238 + np.testing.assert_allclose( + [np.mean(embedding_batch), np.std(embedding_batch)], + [expected_embedding_mean, expected_embedding_std], + rtol=rel_error) + +# Postprocess the results to produce whitened quantized embeddings. +pproc = vggish_postprocess.Postprocessor(pca_params_path) +postprocessed_batch = pproc.postprocess(embedding_batch) +print('Postprocessed VGGish embedding: ', postprocessed_batch[0]) +expected_postprocessed_mean = 123.0 +expected_postprocessed_std = 75.0 +np.testing.assert_allclose( + [np.mean(postprocessed_batch), np.std(postprocessed_batch)], + [expected_postprocessed_mean, expected_postprocessed_std], + rtol=rel_error) + +print('\nLooks Good To Me!\n') diff --git a/jobs/generate_vggish_samples_array.sbatch b/jobs/generate_vggish_samples_array.sbatch new file mode 100644 index 0000000..d982408 --- /dev/null +++ b/jobs/generate_vggish_samples_array.sbatch @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-vggish-samples +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=62GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-vggish-samples-%A-%a.out" +#SBATCH --err="generate-vggish-samples-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding-cpu + +SRCDIR=$HOME/dev/l3embedding +US8K_PATH=/scratch/jtc440/UrbanSound8K +METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv +DATA_DIR=$US8K_PATH/audio +OUTPUT_DIR=/scratch/jtc440/l3_features +MODEL_ID='vggish' + +module purge +#module load cuda/8.0.44 +#module load cudnn/8.0v6.0 +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/05_generate_embedding_samples.py \ + --random-state 20180302 \ + --verbose \ + --features 'vggish_frames_uniform' \ + --label-format 'int' \ + --hop-size 0.1 \ + --gpus 0 \ + --fold $SLURM_ARRAY_TASK_ID \ + $METADATA_PATH \ + $MODEL_ID \ + us8k \ + $DATA_DIR \ + $OUTPUT_DIR diff --git a/resources/vggish/.gitkeep b/resources/vggish/.gitkeep new file mode 100644 index 0000000..e69de29 From e5e07eb295d5ec2901192394ca45d52ff824541e Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 16:45:20 -0400 Subject: [PATCH 133/201] Improve output directory convention for classifier data generation --- 05_generate_embedding_samples.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 5ae43ac..58ade32 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -152,7 +152,10 @@ def parse_arguments(): dataset_name = args['dataset_name'] fold_num = args['fold'] - if model_path: + is_l3_feature = features.startswith('l3') + + if is_l3_feature and model_path: + # Load L3 embedding model if using L3 features LOGGER.info('Loading embedding model...') l3embedding_model = load_embedding(model_path, model_type, @@ -161,10 +164,15 @@ def parse_arguments(): else: l3embedding_model = None + if is_l3_feature: + # If using an L3 model, 
make model arch. type and pooling type to path + dataset_output_dir = os.path.join(output_dir, dataset_name, features, + model_id, model_type, pooling_type) + else: + dataset_output_dir = os.path.join(output_dir, dataset_name, features, + model_id) - dataset_output_dir = os.path.join(output_dir, model_id, - model_type, pooling_type, features) - + # Make sure output directory exists if not os.path.isdir(dataset_output_dir): os.makedirs(dataset_output_dir) From 598f10f41764f72bb045411a838151686f7438d8 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 16:45:58 -0400 Subject: [PATCH 134/201] Fix relative imports for vggish things --- data/usc/features.py | 6 +++--- data/usc/vggish/vggish_input.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/usc/features.py b/data/usc/features.py index c885892..b0b1af9 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -7,9 +7,9 @@ import soundfile as sf import resampy import tensorflow as tf -import .vggish.vggish_input as vggish_input -import .vggish.vggish_postprocess as vggish_postprocess -import .vggish.vggish_slim as vggish_slim +from .vggish import vggish_input +from .vggish import vggish_postprocess +from .vggish import vggish_slim LOGGER = logging.getLogger('cls-data-generation') LOGGER.setLevel(logging.DEBUG) diff --git a/data/usc/vggish/vggish_input.py b/data/usc/vggish/vggish_input.py index 7e63768..9f54497 100644 --- a/data/usc/vggish/vggish_input.py +++ b/data/usc/vggish/vggish_input.py @@ -19,7 +19,7 @@ import resampy from scipy.io import wavfile -import .mel_features +from . import mel_features def waveform_to_examples(data, sample_rate, target_sample_rate=16000, From 91ddb6ec3a4ecc47cd774e84985f1e48347411d0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 17:00:39 -0400 Subject: [PATCH 135/201] Adding missing keyword arguments to vggish stuff --- data/usc/features.py | 8 ++++---- data/usc/vggish/vggish_input.py | 4 ++-- data/usc/vggish/vggish_postprocess.py | 6 +++--- data/usc/vggish/vggish_slim.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/data/usc/features.py b/data/usc/features.py index b0b1af9..e881cd4 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -65,7 +65,7 @@ def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', examples_batch = vggish_input.waveform_to_examples(audio_data, fs, **params) # Prepare a postprocessor to munge the model embeddings. - pproc = vggish_postprocess.Postprocessor(pca_params_path) + pproc = vggish_postprocess.Postprocessor(pca_params_path, **params) # If needed, prepare a record writer to store the postprocessed embeddings. #writer = tf.python_io.TFRecordWriter( @@ -77,15 +77,15 @@ def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', with tf.Graph().as_default(), tf.Session() as sess: # Define the model in inference mode, load the checkpoint, and # locate input and output tensors. - vggish_slim.define_vggish_slim(training=False) - vggish_slim.load_vggish_slim_checkpoint(sess, model_path) + vggish_slim.define_vggish_slim(training=False, **params) + vggish_slim.load_vggish_slim_checkpoint(sess, model_path, **params) features_tensor = sess.graph.get_tensor_by_name(input_tensor_name) embedding_tensor = sess.graph.get_tensor_by_name(output_tensor_name) # Run inference and postprocessing. 
[embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: examples_batch}) - postprocessed_batch = pproc.postprocess(embedding_batch) + postprocessed_batch = pproc.postprocess(embedding_batch, **params) # Write the postprocessed embeddings as a SequenceExample, in a similar # format as the features released in AudioSet. Each row of the batch of diff --git a/data/usc/vggish/vggish_input.py b/data/usc/vggish/vggish_input.py index 9f54497..381d992 100644 --- a/data/usc/vggish/vggish_input.py +++ b/data/usc/vggish/vggish_input.py @@ -74,7 +74,7 @@ def waveform_to_examples(data, sample_rate, target_sample_rate=16000, return log_mel_examples -def wavfile_to_examples(wav_file): +def wavfile_to_examples(wav_file, **params): """Convenience wrapper around waveform_to_examples() for a common WAV format. Args: @@ -87,4 +87,4 @@ def wavfile_to_examples(wav_file): sr, wav_data = wavfile.read(wav_file) assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] - return waveform_to_examples(samples, sr) + return waveform_to_examples(samples, sr, **params) diff --git a/data/usc/vggish/vggish_postprocess.py b/data/usc/vggish/vggish_postprocess.py index 5e601a7..2955238 100644 --- a/data/usc/vggish/vggish_postprocess.py +++ b/data/usc/vggish/vggish_postprocess.py @@ -31,8 +31,7 @@ class Postprocessor(object): """ def __init__(self, pca_params_npz_path, pca_eigen_vectors_name='pca_eigen_vectors', - pca_means_name='pca_means', embedding_size=128, - quantize_min_val=-2.0, quantize_max_val=+2.0): + pca_means_name='pca_means', embedding_size=128, **params): """Constructs a postprocessor. Args: @@ -49,7 +48,8 @@ def __init__(self, pca_params_npz_path, pca_eigen_vectors_name='pca_eigen_vector assert self._pca_means.shape == (embedding_size, 1), ( 'Bad PCA means shape: %r' % (self._pca_means.shape,)) - def postprocess(self, embeddings_batch): + def postprocess(self, embeddings_batch, embedding_size=128, + quantize_min_val=-2.0, quantize_max_val=+2.0, **params): """Applies postprocessing to a batch of embeddings. Args: diff --git a/data/usc/vggish/vggish_slim.py b/data/usc/vggish/vggish_slim.py index ce72aa6..96c4e8b 100644 --- a/data/usc/vggish/vggish_slim.py +++ b/data/usc/vggish/vggish_slim.py @@ -36,7 +36,7 @@ def define_vggish_slim(training=False, init_stddev=0.01, num_frames=96, - num_bands=64, embedding_size=128): + num_bands=64, embedding_size=128, **params): """Defines the VGGish TensorFlow model. All ops are created in the current default graph, under the scope 'vggish/'. @@ -99,7 +99,7 @@ def define_vggish_slim(training=False, init_stddev=0.01, num_frames=96, return tf.identity(net, name='embedding') -def load_vggish_slim_checkpoint(session, checkpoint_path): +def load_vggish_slim_checkpoint(session, checkpoint_path, **params): """Loads a pre-trained VGGish-compatible checkpoint. This function can be used as an initialization function (referred to as @@ -116,7 +116,7 @@ def load_vggish_slim_checkpoint(session, checkpoint_path): # Get the list of names of all VGGish variables that exist in # the checkpoint (i.e., all inference-mode VGGish variables). 
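  # The block below builds a throwaway graph purely to enumerate the variable
  # names a VGGish checkpoint is expected to contain; the matching variables of
  # the calling graph are then restored from the checkpoint further down in this
  # function. A minimal usage sketch, assuming a locally downloaded checkpoint
  # file named 'vggish_model.ckpt' (that filename is an assumption, not part of
  # this repository):
  #
  #   with tf.Graph().as_default(), tf.Session() as sess:
  #     define_vggish_slim(training=False)
  #     load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')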
with tf.Graph().as_default(): - define_vggish_slim(training=False) + define_vggish_slim(training=False, **params) vggish_var_names = [v.name for v in tf.global_variables()] # Get the list of all currently existing variables that match From a0f3e878cebf2d4c1b5b3ab83e51b076e4053f4a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 17:10:15 -0400 Subject: [PATCH 136/201] Write each embedding generation config for each fold to a different file --- 05_generate_embedding_samples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 58ade32..9f8438b 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -177,7 +177,7 @@ def parse_arguments(): os.makedirs(dataset_output_dir) # Write configurations to a file for reproducibility/posterity - config_path = os.path.join(dataset_output_dir, 'config.json') + config_path = os.path.join(dataset_output_dir, 'config_{}.json'.format(fold_num)) with open(config_path, 'w') as f: json.dump(args, f) LOGGER.info('Saved configuration to {}'.format(config_path)) From 9b90d6aa4f582e6e94eb3c3701ace5e95abbc18c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 13 Mar 2018 20:59:36 -0400 Subject: [PATCH 137/201] Fix transpose issues with L3 feature generation, fix classifier test set loading, fix default log offset for VGGish features --- data/usc/features.py | 5 +++-- data/usc/us8k.py | 2 +- data/usc/vggish/vggish_input.py | 19 ++++++++++++++----- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/data/usc/features.py b/data/usc/features.py index e881cd4..264233f 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -52,6 +52,7 @@ def one_hot(idx, n_classes=10): def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', output_op_name='vggish/embedding', resources_dir=None, **params): + # TODO: Make more efficient so we're not loading model every time we extract features fs = params.get('target_sample_rate', 16000) audio_data = load_audio(audio_path, fs) @@ -300,7 +301,7 @@ def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): x = x.reshape((x.shape[0], 1, x.shape[-1])) # Get the L3 embedding for each frame - l3embedding = l3embedding_model.predict(x).T + l3embedding = l3embedding_model.predict(x) return l3embedding @@ -358,7 +359,7 @@ def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): x = audio.reshape((1,1,audio.shape[0])) x = x.reshape((x.shape[0], 1, x.shape[-1])) - frame_l3embedding = l3embedding_model.predict(x).T.flatten() + frame_l3embedding = l3embedding_model.predict(x).flatten() l3embedding = np.tile(frame_l3embedding, (num_samples, 1)) diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 60e0631..8c6f57d 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -147,7 +147,7 @@ def load_test_fold(feature_dir, fold_idx): else: end_idx = start_idx + 1 - X.append(X) + X.append(file_X) y.append(file_y) file_idxs.append((start_idx, end_idx)) diff --git a/data/usc/vggish/vggish_input.py b/data/usc/vggish/vggish_input.py index 381d992..5b968ab 100644 --- a/data/usc/vggish/vggish_input.py +++ b/data/usc/vggish/vggish_input.py @@ -23,7 +23,7 @@ def waveform_to_examples(data, sample_rate, target_sample_rate=16000, - log_offset=0.1, stft_win_len_sec=0.025, + log_offset=0.01, stft_win_len_sec=0.025, stft_hop_len_sec=0.010, num_mel_bins=64, mel_min_hz=125, mel_max_hz=7500, frame_win_sec=0.96, frame_hop_sec=0.96, **params): @@ -67,10 +67,19 
@@ def waveform_to_examples(data, sample_rate, target_sample_rate=16000, frame_win_sec * features_sample_rate)) example_hop_length = int(round( frame_hop_sec * features_sample_rate)) - log_mel_examples = mel_features.frame( - log_mel, - window_length=example_window_length, - hop_length=example_hop_length) + try: + log_mel_examples = mel_features.frame( + log_mel, + window_length=example_window_length, + hop_length=example_hop_length) + except ValueError: + msg = "data shape: {}; mel_shape: {}; target_sample_rate: {}; log_offset: {}; stft_win_len_sec: {}; stft_hop_len_sec: {}; num_mel_bins: {}, mel_min_hz: {}; mel_max_hz: {}; frame_win_sec: {}; frame_hop_sec: {}" + msg = msg.format(data.shape, log_mel.shape, target_sample_rate, + log_offset, stft_win_len_sec, + stft_hop_len_sec, num_mel_bins, + mel_min_hz, mel_max_hz, frame_win_sec, + frame_hop_sec) + raise ValueError(msg) return log_mel_examples From e1c4a946feea5a6d489133b3dfd518ac92560331 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 15 Mar 2018 02:46:46 -0400 Subject: [PATCH 138/201] Fix VGGish feature extraction, fix SVM bugs, fix MLP bugs, and add SVM CLI arguments --- 06_train_classifier.py | 33 ++++++++++++++++++ classifier/train.py | 50 ++++++++++++++++++--------- data/usc/features.py | 12 ++++++- data/usc/us8k.py | 2 +- data/usc/vggish/vggish_input.py | 18 +++------- data/usc/vggish/vggish_postprocess.py | 20 ++++++----- jobs/classifier-train-array.sbatch | 48 +++++++++++++++++++++++++ jobs/classifier-train.sbatch | 44 ----------------------- 8 files changed, 143 insertions(+), 84 deletions(-) create mode 100644 jobs/classifier-train-array.sbatch delete mode 100644 jobs/classifier-train.sbatch diff --git a/06_train_classifier.py b/06_train_classifier.py index 85fa507..18dc149 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -90,6 +90,39 @@ def parse_arguments(): default=1e-5, help='(MLP) L2 regularization penalty factor') + parser.add_argument('-npf', + '--norm-penalty-factor', + dest='C', + action='store', + type=float, + default=1e-4, + help='(SVM) norm penalization factor') + + parser.add_argument('-sct', + '--svm-conv-tolerance', + dest='tol', + action='store', + type=float, + default=1e-3, + help='(SVM) convergence tolerance threshold') + + parser.add_argument('-smi', + '--svm-max-iterations', + dest='max_iterations', + action='store', + type=int, + default=1000000, + help='(SVM) maximum iterations') + + parser.add_argument('-srpt', + '--svm-reg-penalty-type', + dest='reg_penalty', + action='store', + type=str, + default='l2', + choices=['l1', 'l2', 'elasticnet', 'none'], + help='(SVM) maximum iterations') + parser.add_argument('-r', '--random-state', dest='random_state', diff --git a/classifier/train.py b/classifier/train.py index fa71798..30a7af7 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -11,8 +11,9 @@ from keras.models import Model from keras.optimizers import Adam from scipy.stats import mode +from sklearn.metrics import hinge_loss from sklearn.externals import joblib -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.linear_model import SGDClassifier import pescador @@ -67,7 +68,7 @@ def on_epoch_end(self, epoch, logs=None): def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty='l2', - tol=1e-3, max_iterations=1000000, verbose=False, **kwargs): + num_classes=10, tol=1e-3, max_iterations=1000000, verbose=False, **kwargs): """ Train a Support Vector Machine model on the given 
data @@ -102,7 +103,7 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' stdizer = StandardScaler() X_valid = valid_data['features'] - y_valid = valid_data['labels'] + y_valid = valid_data['label'] train_loss_history = [] valid_loss_history = [] @@ -115,21 +116,27 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' # Create classifier clf = SGDClassifier(alpha=C, penalty=reg_penalty, n_jobs=-1, verbose=verbose) + classes = np.arange(num_classes) + LOGGER.debug('Fitting model to data...') for iter_idx, train_data in enumerate(train_gen): X_train = train_data['features'] - y_train = train_data['labels'] + y_train = train_data['label'] stdizer.partial_fit(X_train) # Fit data and get output for train and valid batches - clf.partial_fit(X_train, y_train) - y_train_pred = clf.predict(stdizer.transform(X_train)) - y_valid_pred = clf.predict(stdizer.transform(X_valid)) + clf.partial_fit(X_train, y_train, classes=classes) + X_train_std = stdizer.transform(X_train) + X_valid_std = stdizer.transform(X_valid) + y_train_pred = clf.predict(X_train_std) + y_valid_pred = clf.predict(X_valid_std) # Compute new metrics valid_loss_history.append(list(y_valid_pred)) - train_loss_history.append(clf.loss_function_(y_train, y_train_pred)) - valid_loss_history.append(clf.loss_function_(y_valid, y_valid_pred)) + train_loss_history.append(hinge_loss(y_train, clf.decision_function(X_train_std), + labels=classes)) + valid_loss_history.append(hinge_loss(y_valid, clf.decision_function(X_valid_std), + labels=classes)) train_metric_history.append(compute_metrics(y_train, y_train_pred)) valid_metric_history.append(compute_metrics(y_valid, y_valid_pred)) @@ -175,7 +182,7 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' return (stdizer, clf), train_metrics, valid_metrics, test_metrics -def construct_mlp_model(input_shape, weight_decay=1e-5): +def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): """ Constructs a multi-layer perceptron model @@ -194,10 +201,11 @@ def construct_mlp_model(input_shape, weight_decay=1e-5): (Type: keras.layers.Layer) """ l2_weight_decay = regularizers.l2(weight_decay) + LOGGER.info(str(input_shape)) inp = Input(shape=input_shape, dtype='float32') y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(inp) y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) - y = Dense(10, activation='softmax', kernel_regularizer=l2_weight_decay)(y) + y = Dense(num_classes, activation='softmax', kernel_regularizer=l2_weight_decay)(y) m = Model(inputs=inp, outputs=y) m.name = 'urban_sound_classifier' @@ -206,7 +214,7 @@ def construct_mlp_model(input_shape, weight_decay=1e-5): def train_mlp(train_gen, valid_data, test_data, model_dir, batch_size=64, num_epochs=100, train_epoch_size=None, - learning_rate=1e-4, weight_decay=1e-5, + learning_rate=1e-4, weight_decay=1e-5, num_classes=10, verbose=False, **kwargs): """ Train a Multi-layer perceptron model on the given data @@ -248,12 +256,19 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, # Set up data inputs train_gen = pescador.maps.keras_tuples(train_gen, 'features', 'label') - valid_data_keras = (valid_data['features'], valid_data['label']) + enc = OneHotEncoder(n_values=num_classes, sparse=False) + # Transform int targets to produce one hot targets + train_gen = ((X, enc.fit_transform(y.reshape(-1, 1))) for X, y in train_gen) + valid_data_keras = (valid_data['features'], + 
enc.fit_transform(valid_data['label'].reshape(-1,1))) + train_iter = iter(train_gen) train_batch = next(train_iter) # Set up model - m, inp, out = construct_mlp_model(train_batch['features'].shape[1:], weight_decay=weight_decay) + m, inp, out = construct_mlp_model(train_batch[0].shape[1:], + weight_decay=weight_decay, + num_classes=num_classes) # Set up callbacks cb = [] @@ -274,7 +289,7 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, LOGGER.debug('Compiling model...') m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) LOGGER.debug('Fitting model to data...') - m.fit_generator(train_gen, batch_size=batch_size, + m.fit_generator(train_gen, epochs=num_epochs, steps_per_epoch=train_epoch_size, validation_data=valid_data_keras, callbacks=cb) @@ -299,7 +314,8 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, class_pred = mode(y_test_pred_frame[start_idx:end_idx])[0][0] y_test_pred.append(class_pred) y_test_pred = np.array(y_test_pred) - test_metrics = compute_metrics(test_data['labels'], y_test_pred) + test_metrics = compute_metrics(enc.fit_transform(test_data['labels'].reshape(-1,1)), + y_test_pred) return m, train_metrics, valid_metrics, test_metrics @@ -387,7 +403,7 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', # Save results to disk results_file = os.path.join(model_dir, 'results.pkl') - with open(results_file, 'w') as fp: + with open(results_file, 'wb') as fp: pk.dump(results, fp, protocol=pk.HIGHEST_PROTOCOL) LOGGER.info('Done!') diff --git a/data/usc/features.py b/data/usc/features.py index 264233f..87e1dd0 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -54,8 +54,18 @@ def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', resources_dir=None, **params): # TODO: Make more efficient so we're not loading model every time we extract features fs = params.get('target_sample_rate', 16000) + frame_win_sec = params.get('frame_win_sec', 0.96) audio_data = load_audio(audio_path, fs) + # For some reason 0.96 doesn't work, padding to 0.975 empirically works + frame_samples = int(np.ceil(fs * max(frame_win_sec, 0.975))) + if audio_data.shape[0] < frame_samples: + pad_length = frame_samples - audio_data.shape[0] + # Use (roughly) symmetric padding + left_pad = pad_length // 2 + right_pad= pad_length - left_pad + audio_data = np.pad(audio_data, (left_pad, right_pad), mode='constant') + if not resources_dir: resources_dir = os.path.join(os.path.dirname(__file__), '../../resources/vggish') @@ -94,7 +104,7 @@ def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', # the rows are written as a sequence of bytes-valued features, where each # feature value contains the 128 bytes of the whitened quantized embedding. 
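    # The cast to float32 below keeps the PCA-whitened (and possibly quantized)
    # embeddings in a dtype that the downstream scikit-learn and Keras
    # classifiers accept without further conversion.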
- return postprocessed_batch + return postprocessed_batch.astype(np.float32) def get_vggish_frames_uniform(audio_path, hop_size=0.1): diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 8c6f57d..e7d586c 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -155,7 +155,7 @@ def load_test_fold(feature_dir, fold_idx): X = np.vstack(X) - if y[0].ndim == 0: + if type(y[0]) == int or y[0].ndim == 0: y = np.array(y) else: y = np.concatenate(y) diff --git a/data/usc/vggish/vggish_input.py b/data/usc/vggish/vggish_input.py index 5b968ab..7e24733 100644 --- a/data/usc/vggish/vggish_input.py +++ b/data/usc/vggish/vggish_input.py @@ -67,19 +67,11 @@ def waveform_to_examples(data, sample_rate, target_sample_rate=16000, frame_win_sec * features_sample_rate)) example_hop_length = int(round( frame_hop_sec * features_sample_rate)) - try: - log_mel_examples = mel_features.frame( - log_mel, - window_length=example_window_length, - hop_length=example_hop_length) - except ValueError: - msg = "data shape: {}; mel_shape: {}; target_sample_rate: {}; log_offset: {}; stft_win_len_sec: {}; stft_hop_len_sec: {}; num_mel_bins: {}, mel_min_hz: {}; mel_max_hz: {}; frame_win_sec: {}; frame_hop_sec: {}" - msg = msg.format(data.shape, log_mel.shape, target_sample_rate, - log_offset, stft_win_len_sec, - stft_hop_len_sec, num_mel_bins, - mel_min_hz, mel_max_hz, frame_win_sec, - frame_hop_sec) - raise ValueError(msg) + log_mel_examples = mel_features.frame( + log_mel, + window_length=example_window_length, + hop_length=example_hop_length) + return log_mel_examples diff --git a/data/usc/vggish/vggish_postprocess.py b/data/usc/vggish/vggish_postprocess.py index 2955238..8722f1d 100644 --- a/data/usc/vggish/vggish_postprocess.py +++ b/data/usc/vggish/vggish_postprocess.py @@ -48,7 +48,7 @@ def __init__(self, pca_params_npz_path, pca_eigen_vectors_name='pca_eigen_vector assert self._pca_means.shape == (embedding_size, 1), ( 'Bad PCA means shape: %r' % (self._pca_means.shape,)) - def postprocess(self, embeddings_batch, embedding_size=128, + def postprocess(self, embeddings_batch, embedding_size=128, quantize=False, quantize_min_val=-2.0, quantize_max_val=+2.0, **params): """Applies postprocessing to a batch of embeddings. 
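    When `quantize` is True, the clipped, PCA-transformed embeddings are scaled
    to [0, 255] and cast to uint8 (AudioSet-style); otherwise the clipped float
    embeddings are returned unquantized.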
@@ -80,11 +80,15 @@ def postprocess(self, embeddings_batch, embedding_size=128, clipped_embeddings = np.clip( pca_applied, quantize_min_val, quantize_max_val) - # - convert to 8-bit in range [0.0, 255.0] - quantized_embeddings = ( - (clipped_embeddings - quantize_min_val) * - (255.0 / (quantize_max_val - quantize_min_val))) - # - cast 8-bit float to uint8 - quantized_embeddings = quantized_embeddings.astype(np.uint8) - return quantized_embeddings + if quantize: + # - convert to 8-bit in range [0.0, 255.0] + quantized_embeddings = ( + (clipped_embeddings - quantize_min_val) * + (255.0 / (quantize_max_val - quantize_min_val))) + # - cast 8-bit float to uint8 + quantized_embeddings = quantized_embeddings.astype(np.uint8) + + return quantized_embeddings + else: + return clipped_embeddings diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch new file mode 100644 index 0000000..30ff20f --- /dev/null +++ b/jobs/classifier-train-array.sbatch @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=us8k-classifier-train +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.org +#SBATCH --output="classifier-train-%A-%a.out" +#SBATCH --err="classifier-train-%A-%a.err" + + +source ~/.bashrc +source activate l3embedding-cpu + +SRCDIR=$HOME/dev/l3embedding +FEATURES_DIR=/scratch/jtc440/l3_features/us8k/l3_frames_uniform/l3embedding/cnn_L3_orig/original +OUTPUT_DIR=/scratch/jtc440/cls_output +MODEL_ID='mycls' +MODEL_TYPE='svm' +FOLD_NUM=$SLURM_ARRAY_TASK_ID + +module purge + +python $SRCDIR/06_train_classifier.py \ + --random-state 20171021 \ + --model-type $MODEL_TYPE \ + --num-epochs 10 \ + --train-num-streamers 1024 \ + --train-mux-rate 16 \ + --train-batch-size 128 \ + --train-epoch-size 16 \ + --valid-num-streamers 1024 \ + --valid-mux-rate 1 \ + --valid-batch-size 512 \ + --norm-penalty-factor 0.0001 \ + --svm-conv-tolerance 0.001 \ + --svm-max-iterations 1000000 \ + --svm-reg-penalty-type l2 \ + --verbose \ + $FEATURES_DIR \ + $OUTPUT_DIR \ + $MODEL_ID \ + $FOLD_NUM + +# --learning-rate 0.00001 \ +# --weight-decay 0.00001 \ diff --git a/jobs/classifier-train.sbatch b/jobs/classifier-train.sbatch deleted file mode 100644 index 36b0e5a..0000000 --- a/jobs/classifier-train.sbatch +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash - -##SBATCH --gres=gpu:1 -#SBATCH --job-name=us8k-classifier-train -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=2 -#SBATCH --mem=64GB -#SBATCH --time=7-0 -#SBATCH --mail-type=ALL -#SBATCH --mail-user=name@email.org -#SBATCH --output="classifier-train-%j.out" -#SBATCH --err="classifier-train-%j.err" - - -source ~/.bashrc -source activate l3embedding-cpu - -SRCDIR='' -US8K_PATH='' -METADATA_PATH='' -DATA_DIR='' -OUTPUT_DIR='' -MODEL_PATH='' -MODEL_ID='mycls' -DATASET_NAME='us8k' - -module purge -#module load cuda/8.0.44 -#module load cudnn/8.0v6.0 - -python $SRCDIR/05_train_classifier.py \ - --random-state 20171021 \ - --features l3_frames_uniform \ - --label-format int \ - --model-type svm \ - --l3embedding-model-path $MODEL_PATH \ - --l3embedding-model-type cnn_L3_orig \ - --hop-size 0.25 \ - --verbose \ - $METADATA_PATH \ - $DATASET_NAME \ - $DATA_DIR \ - $MODEL_ID \ - $OUTPUT_DIR From 15e33a04b7f5bf68de6081902d14bd5fc77b0d3e Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 15 Mar 2018 14:54:12 -0400 Subject: [PATCH 139/201] Fix obtaining classes from test predictions with MLP classifier and add GSheets functionality to classifier training 
experiments --- 06_train_classifier.py | 12 ++++ classifier/train.py | 100 +++++++++++++++++++------- gsheets.py | 110 ++++++++++++++++++++--------- jobs/classifier-train-array.sbatch | 4 ++ l3embedding/train.py | 6 +- 5 files changed, 169 insertions(+), 63 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 18dc149..4f98c9b 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -123,6 +123,18 @@ def parse_arguments(): choices=['l1', 'l2', 'elasticnet', 'none'], help='(SVM) maximum iterations') + parser.add_argument('-gsid', + '--gsheet-id', + dest='gsheet_id', + type=str, + help='Google Spreadsheet ID for centralized logging of experiments') + + parser.add_argument('-gdan', + '--google-dev-app-name', + dest='google_dev_app_name', + type=str, + help='Google Developer Application Name for using API') + parser.add_argument('-r', '--random-state', dest='random_state', diff --git a/classifier/train.py b/classifier/train.py index 30a7af7..4202511 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -1,8 +1,10 @@ import datetime +import getpass import json import os import pickle as pk import random +import git import keras import keras.regularizers as regularizers @@ -22,6 +24,9 @@ from l3embedding.train import LossHistory from log import * +from gsheets import get_credentials, append_row, update_experiment +from googleapiclient import discovery + LOGGER = logging.getLogger('classifier') LOGGER.setLevel(logging.DEBUG) @@ -296,14 +301,14 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, # Set up train and validation metrics train_metrics = { 'loss': metric_cb.train_loss, - 'acc': metric_cb.train_acc + 'accuracy': metric_cb.train_acc } valid_metrics = { 'loss': metric_cb.valid_loss, - 'acc': metric_cb.valid_acc, - 'class_acc': metric_cb.valid_class_acc, - 'avg_class_acc': metric_cb.valid_avg_class_acc + 'accuracy': metric_cb.valid_acc, + 'class_accuracy': metric_cb.valid_class_acc, + 'average_class_accuracy': metric_cb.valid_avg_class_acc } # Evaluate model on test data @@ -311,11 +316,10 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, y_test_pred_frame = m.predict(X_test) y_test_pred = [] for start_idx, end_idx in test_data['file_idxs']: - class_pred = mode(y_test_pred_frame[start_idx:end_idx])[0][0] + class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() y_test_pred.append(class_pred) y_test_pred = np.array(y_test_pred) - test_metrics = compute_metrics(enc.fit_transform(test_data['labels'].reshape(-1,1)), - y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred) return m, train_metrics, valid_metrics, test_metrics @@ -323,7 +327,8 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, def train(features_dir, output_dir, model_id, fold_num, model_type='svm', train_num_streamers=None, train_batch_size=64, train_mux_rate=None, valid_num_streamers=None, valid_batch_size=64, valid_mux_rate=None, - random_state=20171021, verbose=False, **model_args): + random_state=20171021, gsheet_id=None, google_dev_app_name=None, + verbose=False, **model_args): init_console_logger(LOGGER, verbose=verbose) LOGGER.debug('Initialized logging.') @@ -340,27 +345,54 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', if not os.path.isdir(model_dir): os.makedirs(model_dir) + config = { + 'username': getpass.getuser(), + 'features_dir': features_dir, + 'output_dir': output_dir, + 'model_dir': model_dir, + 'model_id': model_id, + 'fold_num': fold_num, + 'model_type': 
model_type, + 'train_num_streamers': train_num_streamers, + 'train_batch_size': train_batch_size, + 'train_mux_rate': train_mux_rate, + 'valid_num_streamers': valid_num_streamers, + 'valid_batch_size': valid_batch_size, + 'valid_mux_rate': valid_mux_rate, + 'random_state': random_state, + 'verbose': verbose, + 'git_commit': git.Repo(os.path.dirname(os.path.abspath(__file__)), + search_parent_directories=True).head.object.hexsha, + 'gsheet_id': gsheet_id, + 'google_dev_app_name': google_dev_app_name + } + config.update(model_args) + # Save configs with open(os.path.join(model_dir, 'config.json'), 'w') as fp: - config = { - 'features_dir': features_dir, - 'output_dir': output_dir, - 'model_id': model_id, - 'fold_num': fold_num, - 'model_type': model_type, - 'train_num_streamers': train_num_streamers, - 'train_batch_size': train_batch_size, - 'train_mux_rate': train_mux_rate, - 'valid_num_streamers': valid_num_streamers, - 'valid_batch_size': valid_batch_size, - 'valid_mux_rate': valid_mux_rate, - 'random_state': random_state, - 'verbose': verbose - } - config.update(model_args) - json.dump(config, fp) + + if gsheet_id: + # Add a new entry in the Google Sheets spreadsheet + LOGGER.info('Creating new spreadsheet entry...') + config.update({ + 'train_loss': '-', + 'valid_loss': '-', + 'train_acc': '-', + 'valid_acc': '-', + 'valid_avg_class_acc': '-', + 'valid_class_acc': '-', + 'test_acc': '-', + 'test_avg_class_acc': '-', + 'test_class_acc': '-' + }) + credentials = get_credentials(google_dev_app_name) + service = discovery.build('sheets', 'v4', credentials=credentials) + append_row(service, gsheet_id, config, 'classifier') + else: + LOGGER.error(gsheet_id) + LOGGER.info('Loading data...') fold_idx = fold_num - 1 @@ -406,4 +438,22 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', with open(results_file, 'wb') as fp: pk.dump(results, fp, protocol=pk.HIGHEST_PROTOCOL) + + if gsheet_id: + # Update spreadsheet with results + LOGGER.info('Updating spreadsheet...') + update_values = [ + train_metrics['loss'][-1], + valid_metrics['loss'][-1], + train_metrics['accuracy'][-1], + valid_metrics['accuracy'][-1], + valid_metrics['average_class_accuracy'][-1], + ', '.join(map(str, valid_metrics['class_accuracy'][-1])), + test_metrics['accuracy'], + test_metrics['average_class_accuracy'], + ', '.join(map(str, test_metrics['class_accuracy'])) + ] + update_experiment(service, gsheet_id, config, 'V', 'AE', + update_values, 'classifier') + LOGGER.info('Done!') diff --git a/gsheets.py b/gsheets.py index 0b53a98..acc8be8 100644 --- a/gsheets.py +++ b/gsheets.py @@ -8,33 +8,66 @@ # TODO: Implement initializing spreadsheet -FIELD_NAMES = [ - 'username', - 'model_id', - 'model_dir', - 'git_commit', - 'train_data_dir', - 'validation_data_dir', - 'continue_model_dir', - 'model_type', - 'num_epochs', - 'train_epoch_size', - 'validation_epoch_size', - 'train_batch_size', - 'validation_batch_size', - 'random_state', - 'learning_rate', - 'gpus', - 'checkpoint_interval', - 'latest_epoch', - 'latest_train_loss', - 'latest_validation_loss', - 'latest_train_acc', - 'latest_validation_acc', - 'best_train_loss', - 'best_validation_loss', - 'best_train_acc', - 'best_validation_acc' +EMBEDDING_FIELD_NAMES = [ + 'username', + 'model_id', + 'model_dir', + 'git_commit', + 'train_data_dir', + 'validation_data_dir', + 'continue_model_dir', + 'model_type', + 'num_epochs', + 'train_epoch_size', + 'validation_epoch_size', + 'train_batch_size', + 'validation_batch_size', + 'random_state', + 'learning_rate', + 
'gpus', + 'checkpoint_interval', + 'latest_epoch', + 'latest_train_loss', + 'latest_validation_loss', + 'latest_train_acc', + 'latest_validation_acc', + 'best_train_loss', + 'best_validation_loss', + 'best_train_acc', + 'best_validation_acc' +] + +CLASSIFIER_FIELD_NAMES = [ + 'username', + 'model_id', + 'model_dir', + 'git_commit', + 'features_dir', + 'fold_num', + 'model_type', + 'train_num_streamers', + 'train_batch_size', + 'train_mux_rate', + 'valid_num_streamers', + 'valid_batch_size', + 'valid_mux_rate', + 'random_state', + 'num_epochs', + 'learning_rate', + 'weight_decay', + 'reg_penalty', + 'C', + 'tol', + 'max_iterations', + 'train_loss', + 'valid_loss', + 'train_acc', + 'valid_acc', + 'valid_avg_class_acc', + 'valid_class_acc', + 'test_acc', + 'test_avg_class_acc', + 'test_class_acc', ] @@ -74,10 +107,17 @@ def get_credentials(application_name, client_secret_file=None, flags=None): return credentials -def append_row(service, spreadsheet_id, param_dict): +def append_row(service, spreadsheet_id, param_dict, sheet_name): + if sheet_name == 'embedding': + field_names = EMBEDDING_FIELD_NAMES + elif sheet_name == 'classifier': + field_names = CLASSIFIER_FIELD_NAMES + else: + raise ValueError('Unknown spreadsheet sheet name: {}'.format(sheet_name)) + # The A1 notation of a range to search for a logical table of data. # Values will be appended after the last row of the table. - range_ = 'A1:A{}'.format(len(FIELD_NAMES)) + range_ = '{}!A1:A{}'.format(sheet_name, len(field_names)) # How the input data should be interpreted. value_input_option = 'USER_ENTERED' # How the input data should be inserted. @@ -86,7 +126,7 @@ def append_row(service, spreadsheet_id, param_dict): value_range_body = { "range": range_, "majorDimension": 'ROWS', - "values": [[str(param_dict[field_name]) for field_name in FIELD_NAMES ]] + "values": [[str(param_dict[field_name]) for field_name in field_names ]] } request = service.spreadsheets().values().append( @@ -113,8 +153,8 @@ def request_with_retry(request, num_retries=50): return response -def get_row(service, spreadsheet_id, param_dict): - range_ = 'C:C' +def get_row(service, spreadsheet_id, param_dict, sheet_name): + range_ = '{}!C:C'.format(sheet_name) major_dimension = 'COLUMNS' request = service.spreadsheets().values().get( @@ -130,10 +170,10 @@ def get_row(service, spreadsheet_id, param_dict): return None -def update_experiment(service, spreadsheet_id, param_dict, start_col, end_col, values): - row_num = get_row(service, spreadsheet_id, param_dict) +def update_experiment(service, spreadsheet_id, param_dict, start_col, end_col, values, sheet_name): + row_num = get_row(service, spreadsheet_id, param_dict, sheet_name) value_input_option = 'USER_ENTERED' - range_ = '{1}{0}:{2}{0}'.format(row_num, start_col, end_col) + range_ = '{0}!{2}{1}:{3}{1}'.format(sheet_name, row_num, start_col, end_col) value_range_body = { "range": range_, "majorDimension": 'ROWS', diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 30ff20f..43ec9df 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -19,6 +19,8 @@ FEATURES_DIR=/scratch/jtc440/l3_features/us8k/l3_frames_uniform/l3embedding/cnn_ OUTPUT_DIR=/scratch/jtc440/cls_output MODEL_ID='mycls' MODEL_TYPE='svm' +GOOGLE_DEV_APP_NAME='' +GSHEET_ID='' FOLD_NUM=$SLURM_ARRAY_TASK_ID module purge @@ -38,6 +40,8 @@ python $SRCDIR/06_train_classifier.py \ --svm-conv-tolerance 0.001 \ --svm-max-iterations 1000000 \ --svm-reg-penalty-type l2 \ + --gsheet-id 
$GSHEET_ID \ + --google-dev-app-name $GOOGLE_DEV_APP_NAME \ --verbose \ $FEATURES_DIR \ $OUTPUT_DIR \ diff --git a/l3embedding/train.py b/l3embedding/train.py index b7d1c2d..ef1629e 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -65,9 +65,9 @@ def __init__(self, google_dev_app_name, spreadsheet_id, param_dict): self.service = discovery.build('sheets', 'v4', credentials=self.credentials) self.param_dict = copy.deepcopy(param_dict) - row_num = get_row(self.service, self.spreadsheet_id, self.param_dict) + row_num = get_row(self.service, self.spreadsheet_id, self.param_dict, 'embedding') if row_num is None: - append_row(self.service, self.spreadsheet_id, self.param_dict) + append_row(self.service, self.spreadsheet_id, self.param_dict, 'embedding') def on_train_begin(self, logs=None): if logs is None: @@ -102,7 +102,7 @@ def on_epoch_end(self, epoch, logs=None): self.best_valid_loss, self.best_train_acc, self.best_valid_acc] update_experiment(self.service, self.spreadsheet_id, self.param_dict, - 'R', 'AA', values) + 'R', 'AA', values, 'embedding') class TimeHistory(keras.callbacks.Callback): From bacbd17592cf04f4512d161f393ab97f13007852 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 20 Mar 2018 05:15:39 -0400 Subject: [PATCH 140/201] Make VGGish features quantized by default, remove some statistics from stats features, add VGGish stats features, add short embedding pooling option, add batch norm to MLP classifier --- classifier/train.py | 7 ++- data/usc/features.py | 77 +++++++++++++++++++-------- data/usc/vggish/vggish_postprocess.py | 2 +- l3embedding/audio_model.py | 34 ++---------- 4 files changed, 65 insertions(+), 55 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 4202511..083458e 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -9,7 +9,7 @@ import keras import keras.regularizers as regularizers import numpy as np -from keras.layers import Input, Dense +from keras.layers import Input, Dense, BatchNormalization from keras.models import Model from keras.optimizers import Adam from scipy.stats import mode @@ -208,8 +208,11 @@ def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): l2_weight_decay = regularizers.l2(weight_decay) LOGGER.info(str(input_shape)) inp = Input(shape=input_shape, dtype='float32') - y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(inp) + y = BatchNormalization()(y) + y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(y) + y = BatchNormalization()(y) y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) + y = BatchNormalization()(y) y = Dense(num_classes, activation='softmax', kernel_regularizer=l2_weight_decay)(y) m = Model(inputs=inp, outputs=y) m.name = 'urban_sound_classifier' diff --git a/data/usc/features.py b/data/usc/features.py index 87e1dd0..9a75e27 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -126,6 +126,27 @@ def get_vggish_frames_uniform(audio_path, hop_size=0.1): return extract_vggish_embedding(audio_path, frame_hop_sec=hop_size) +def get_vggish_stats(audio_path, hop_size=0.1): + """ + Get vggish embedding features summary statistics for all frames in the + audio file + + Args: + audio: Audio data or path to audio file + (Type: np.ndarray or str) + + Keyword Args: + hop_size: Hop size in seconds + (Type: float) + + Returns: + features: Array of embedding vectors + (Type: np.ndarray) + """ + vggish_embedding = extract_vggish_embedding(audio_path, frame_hop_sec=hop_size) + return 
compute_stats_features(vggish_embedding, hop_size) + + def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.1): """ Get stacked L3 embedding features, i.e. stack embedding features for each @@ -183,6 +204,35 @@ def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.1): return l3embedding.flatten() +def compute_stats_features(embeddings, hop_size): + # Compute statistics on the time series of embeddings + minimum = np.min(embeddings, axis=0) + maximum = np.max(embeddings, axis=0) + #median = np.median(embeddings, axis=0) + mean = np.mean(embeddings, axis=0) + var = np.var(embeddings, axis=0) + #skewness = sp.stats.skew(embeddings, axis=0) + #kurtosis = sp.stats.kurtosis(embeddings, axis=0) + + # Compute statistics on the first and second derivatives of time series of embeddings + + # Use finite differences to approximate the derivatives + d1 = np.gradient(embeddings, 1/hop_size, edge_order=1, axis=0) + #d2 = np.gradient(embeddings, 1/hop_size, edge_order=2, axis=0) + + d1_mean = np.mean(d1, axis=0) + d1_var = np.var(d1, axis=0) + + #d2_mean = np.mean(d2, axis=0) + #d2_var = np.var(d2, axis=0) + + """ + return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis, + d1_mean, d1_var, d2_mean, d2_var)) + """ + return np.concatenate((minimum, maximum, mean, var, d1_mean, d1_var)) + + def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): """ Get L3 embedding stats features, i.e. compute statistics for each of the @@ -238,29 +288,7 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): # Get the L3 embedding for each frame l3embedding = l3embedding_model.predict(x) - # Compute statistics on the time series of embeddings - minimum = np.min(l3embedding, axis=0) - maximum = np.max(l3embedding, axis=0) - median = np.median(l3embedding, axis=0) - mean = np.mean(l3embedding, axis=0) - var = np.var(l3embedding, axis=0) - skewness = sp.stats.skew(l3embedding, axis=0) - kurtosis = sp.stats.kurtosis(l3embedding, axis=0) - - # Compute statistics on the first and second derivatives of time series of embeddings - - # Use finite differences to approximate the derivatives - d1 = np.gradient(l3embedding, 1/sr, edge_order=1, axis=0) - d2 = np.gradient(l3embedding, 1/sr, edge_order=2, axis=0) - - d1_mean = np.mean(d1, axis=0) - d1_var = np.var(d1, axis=0) - - d2_mean = np.mean(d2, axis=0) - d2_var = np.var(d2, axis=0) - - return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis, - d1_mean, d1_var, d2_mean, d2_var)) + return compute_stats_features(l3embedding, hop_size) def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): @@ -399,6 +427,9 @@ def compute_file_features(path, feature_type, l3embedding_model=None, **feature_ raise ValueError('Must specify "num_samples" for "l3_frame_random" features') file_features = get_l3_frames_random(path, l3embedding_model, num_samples) + elif feature_type == 'vggish_stats': + hop_size = feature_args.get('hop_size', 0.1) + file_features = get_vggish_stats(path, hop_size=hop_size) elif feature_type == 'vggish_frames_uniform': hop_size = feature_args.get('hop_size', 0.1) file_features = get_vggish_frames_uniform(path, hop_size=hop_size) diff --git a/data/usc/vggish/vggish_postprocess.py b/data/usc/vggish/vggish_postprocess.py index 8722f1d..13035ae 100644 --- a/data/usc/vggish/vggish_postprocess.py +++ b/data/usc/vggish/vggish_postprocess.py @@ -48,7 +48,7 @@ def __init__(self, pca_params_npz_path, pca_eigen_vectors_name='pca_eigen_vector assert 
self._pca_means.shape == (embedding_size, 1), ( 'Bad PCA means shape: %r' % (self._pca_means.shape,)) - def postprocess(self, embeddings_batch, embedding_size=128, quantize=False, + def postprocess(self, embeddings_batch, embedding_size=128, quantize=True, quantize_min_val=-2.0, quantize_max_val=+2.0, **params): """Applies postprocessing to a batch of embeddings. diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index caaf6a9..fa6097e 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -440,29 +440,6 @@ def construct_cnn_L3_melspec2_audio_model(): return m, x_a, y_a -def construct_cnn_l3_orig_audio_embedding_model(audio_model, x_a): - """ - Constructs a model that produces the learned audio embedding - - Args: - audio_model: audio subnetwork - x_a: audio data input Tensor - - Returns: - m: Model object - x_a: audio data input Tensor - y_a: Embedding output Tensor - - """ - pool_size = (8, 8) - embed_layer = audio_model.get_layer('audio_embedding_layer') - y_a = MaxPooling2D(pool_size=pool_size, padding='same')(embed_layer.output) - y_a = Flatten()(y_a) - - m = Model(inputs=x_a, outputs=y_a) - return m, x_a, y_a - - def convert_audio_model_to_embedding(audio_model, x_a, model_type, pooling_type='original'): """ Given and audio subnetwork, return a model that produces the learned @@ -482,15 +459,19 @@ def convert_audio_model_to_embedding(audio_model, x_a, model_type, pooling_type= pooling = { 'cnn_L3_orig': { 'original': (8, 8), + 'short': (32, 24), }, 'cnn_L3_kapredbinputbn': { 'original': (8, 8), + 'short': (32, 24), }, 'cnn_L3_melspec1': { 'original': (4, 8), + 'short': (16, 24), }, 'cnn_L3_melspec2': { - 'original': (8, 8) + 'original': (8, 8), + 'short': (32, 24), } } @@ -556,8 +537,3 @@ def construct_tiny_L3_audio_model(): m.name = 'audio_model' return m, x_a, y_a - -AUDIO_EMBEDDING_MODELS = { - 'cnn_L3_orig': construct_cnn_l3_orig_audio_embedding_model, - 'cnn_L3_melspec1': construct_cnn_l3_orig_audio_embedding_model -} From cb0e2d3c7a157a8e85ebfc090321c74dcd8191d6 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 20 Mar 2018 13:49:42 -0400 Subject: [PATCH 141/201] Print configuration in feature generation script --- 05_generate_embedding_samples.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 9f8438b..0c8edb9 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -152,6 +152,8 @@ def parse_arguments(): dataset_name = args['dataset_name'] fold_num = args['fold'] + LOGGER.info('Configuration: {}'.format(str(args))) + is_l3_feature = features.startswith('l3') if is_l3_feature and model_path: From 2e6e55f7edafdc9ddcc419f5de7d587c6248327c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 20 Mar 2018 14:42:07 -0400 Subject: [PATCH 142/201] Add melspec2 env dataset training job --- ...l3embedding-train-melspec2-03202018.sbatch | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 jobs/l3embedding-train-melspec2-03202018.sbatch diff --git a/jobs/l3embedding-train-melspec2-03202018.sbatch b/jobs/l3embedding-train-melspec2-03202018.sbatch new file mode 100644 index 0000000..4b6d2a9 --- /dev/null +++ b/jobs/l3embedding-train-melspec2-03202018.sbatch @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +#SBATCH --gres=gpu:4 +#SBATCH --job-name=l3embedding-train-melspec2 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@mail.com +#SBATCH 
--output="l3embedding-train-melspec2-test-%j.out" +#SBATCH --err="l3embedding-train-melspec2-test-%j.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding + +SRCDIR=$HOME/dev/l3embedding +TRAIN_DATA_DIR=/beegfs/work/AudioSetSamples_environmental/urban_train +VAL_DATA_DIR=/beegfs/work/AudioSetSamples_environmental/urban_valid +MODEL_ID='melspec2' +OUTPUT_DIR=/scratch/jtc440/l3_output +GOOGLE_DEV_APP_NAME='l3embeddingexperiments' +GSHEET_ID='' # REPLACE THIS +NUM_GPUS=4 + +module purge +module load cuda/8.0.44 +module load cudnn/8.0v6.0 + +python $SRCDIR/03_train_embedding.py \ + --num-epochs 300 \ + --train-epoch-size 4096 \ + --train-batch-size 64 \ + --model-type cnn_L3_melspec2 \ + --validation-epoch-size 1024 \ + --validation-batch-size 64 \ + --checkpoint-interval 10 \ + --gpus $NUM_GPUS \ + --learning-rate 0.00001 \ + --random-state 20180216 \ + --gsheet-id $GSHEET_ID \ + --google-dev-app-name $GOOGLE_DEV_APP_NAME \ + --verbose \ + $TRAIN_DATA_DIR \ + $VAL_DATA_DIR \ + $MODEL_ID \ + $OUTPUT_DIR From b6f104df2b2c99b0a79fb3bf027db96480c274af Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 20 Mar 2018 16:09:43 -0400 Subject: [PATCH 143/201] Update embedding sampling script --- jobs/generate_embedding_samples_array.sbatch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jobs/generate_embedding_samples_array.sbatch b/jobs/generate_embedding_samples_array.sbatch index b8ffb32..d9c91c3 100644 --- a/jobs/generate_embedding_samples_array.sbatch +++ b/jobs/generate_embedding_samples_array.sbatch @@ -19,11 +19,11 @@ SRCDIR=$HOME/dev/l3embedding L3_MODEL_PATH='' L3_MODEL_TYPE='cnn_L3_melspec1' L3_POOLING_TYPE='original' -US8K_PATH=/scratch/jtc440/UrbanSound8K +US8K_PATH=/beegfs/jtc440/UrbanSound8K METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv DATA_DIR=$US8K_PATH/audio OUTPUT_DIR=/scratch/jtc440/l3_features -MODEL_ID='test' +MODEL_ID='l3embedding' module purge #module load cuda/8.0.44 @@ -40,7 +40,7 @@ python $SRCDIR/05_generate_embedding_samples.py \ --l3embedding-pooling-type $L3_POOLING_TYPE \ --hop-size 0.1 \ --gpus 0 \ - --fold 1 \ + --fold $SLURM_ARRAY_TASK_ID \ $METADATA_PATH \ $MODEL_ID \ us8k \ From 1cda5bbdd85e25f171ad63b81817e1cf5e8b548d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 20 Mar 2018 17:04:41 -0400 Subject: [PATCH 144/201] Update feature summary statistics computation to be consistent with other experiments --- data/usc/features.py | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/data/usc/features.py b/data/usc/features.py index 9a75e27..942c66b 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -144,7 +144,7 @@ def get_vggish_stats(audio_path, hop_size=0.1): (Type: np.ndarray) """ vggish_embedding = extract_vggish_embedding(audio_path, frame_hop_sec=hop_size) - return compute_stats_features(vggish_embedding, hop_size) + return compute_stats_features(vggish_embedding) def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.1): @@ -204,33 +204,17 @@ def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.1): return l3embedding.flatten() -def compute_stats_features(embeddings, hop_size): +def compute_stats_features(embeddings): # Compute statistics on the time series of embeddings minimum = np.min(embeddings, axis=0) maximum = np.max(embeddings, axis=0) - #median = np.median(embeddings, axis=0) + median = np.median(embeddings, axis=0) mean = np.mean(embeddings, axis=0) var = np.var(embeddings, axis=0) - #skewness = 
sp.stats.skew(embeddings, axis=0) - #kurtosis = sp.stats.kurtosis(embeddings, axis=0) + skewness = sp.stats.skew(embeddings, axis=0) + kurtosis = sp.stats.kurtosis(embeddings, axis=0) - # Compute statistics on the first and second derivatives of time series of embeddings - - # Use finite differences to approximate the derivatives - d1 = np.gradient(embeddings, 1/hop_size, edge_order=1, axis=0) - #d2 = np.gradient(embeddings, 1/hop_size, edge_order=2, axis=0) - - d1_mean = np.mean(d1, axis=0) - d1_var = np.var(d1, axis=0) - - #d2_mean = np.mean(d2, axis=0) - #d2_var = np.var(d2, axis=0) - - """ - return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis, - d1_mean, d1_var, d2_mean, d2_var)) - """ - return np.concatenate((minimum, maximum, mean, var, d1_mean, d1_var)) + return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis)) def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): @@ -263,10 +247,10 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): frame_length = 48000 * 1 audio_length = len(audio) - if audio_length < (frame_length + 2*hop_length): + if audio_length < frame_length: # Make sure we can have at least three frames so that we can compute # all of the stats. - pad_length = frame_length + 2*hop_length - audio_length + pad_length = frame_length - audio_length else: # Zero pad so we compute embedding on all samples pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ @@ -288,7 +272,7 @@ def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): # Get the L3 embedding for each frame l3embedding = l3embedding_model.predict(x) - return compute_stats_features(l3embedding, hop_size) + return compute_stats_features(l3embedding) def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): From 58946d03b13df8b6bcee4e175a66ca2c03d775c0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 11:39:24 -0400 Subject: [PATCH 145/201] Rename fold loading function, make feature file idxs a numpy array, and add necessary index check to not load validation data in training batches --- classifier/train.py | 2 +- data/usc/us8k.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 083458e..531c70a 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -410,7 +410,7 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', batch_size=valid_batch_size, random_state=random_state, rate=valid_mux_rate) LOGGER.info('Preparing test data for fold {}'.format(fold_num)) - test_data = load_test_fold(features_dir, fold_idx) + test_data = get_us8k_fold(features_dir, fold_idx) LOGGER.info('Training {} with fold {} held out'.format(model_type, fold_num)) # Fit the model diff --git a/data/usc/us8k.py b/data/usc/us8k.py index e7d586c..f295f5d 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -95,6 +95,9 @@ def get_us8k_batch_generator(features_dir, test_fold_idx, valid=False, num_strea if valid and fold_idx != valid_fold_idx: continue + if not valid and fold_idx == valid_fold_idx: + continue + fold_dir = os.path.join(features_dir, fold_dirname) for feature_filename in os.listdir(fold_dir): feature_filepath = os.path.join(fold_dir, feature_filename) @@ -129,7 +132,7 @@ def get_us8k_batch(features_dir, test_fold_idx, valid=False, num_streamers=None, return next(gen) -def load_test_fold(feature_dir, fold_idx): +def get_us8k_fold(feature_dir, fold_idx): X = [] y = [] 
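    # Per-file accumulators: feature arrays, labels, and [start, end) row
    # ranges into the vertically stacked feature matrix for this fold.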
file_idxs = [] @@ -149,7 +152,7 @@ def load_test_fold(feature_dir, fold_idx): X.append(file_X) y.append(file_y) - file_idxs.append((start_idx, end_idx)) + file_idxs.append([start_idx, end_idx]) start_idx = end_idx @@ -160,6 +163,8 @@ def load_test_fold(feature_dir, fold_idx): else: y = np.concatenate(y) + file_idxs = np.array(file_idxs) + return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} From ebfbf108b8deace6d6607891a6fe37965e99563c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 16:08:42 -0400 Subject: [PATCH 146/201] Load full dataset into memory for classifier training, do offline normalization and standardization on classification trianing data, use SVC instead of SGDClassifier, update Google sheets rows --- 06_train_classifier.py | 28 ++-- classifier/train.py | 217 +++++++++++++---------------- data/usc/features.py | 67 +++++++++ data/usc/us8k.py | 45 ++++++ gsheets.py | 3 + jobs/classifier-train-array.sbatch | 4 +- 6 files changed, 232 insertions(+), 132 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 4f98c9b..87b2e1f 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -95,7 +95,7 @@ def parse_arguments(): dest='C', action='store', type=float, - default=1e-4, + default=1.0, help='(SVM) norm penalization factor') parser.add_argument('-sct', @@ -103,7 +103,7 @@ def parse_arguments(): dest='tol', action='store', type=float, - default=1e-3, + default=0.001, help='(SVM) convergence tolerance threshold') parser.add_argument('-smi', @@ -111,17 +111,17 @@ def parse_arguments(): dest='max_iterations', action='store', type=int, - default=1000000, + default=-1, help='(SVM) maximum iterations') - parser.add_argument('-srpt', - '--svm-reg-penalty-type', - dest='reg_penalty', + parser.add_argument('-skt', + '--svm-kernel-type', + dest='kernel', action='store', type=str, - default='l2', - choices=['l1', 'l2', 'elasticnet', 'none'], - help='(SVM) maximum iterations') + default='rbf', + choices=['rbf', 'sigmoid', 'linear', 'poly'], + help='(SVM) kernel type') parser.add_argument('-gsid', '--gsheet-id', @@ -150,12 +150,22 @@ def parse_arguments(): default=False, help='If True, print detailed messages') + parser.add_argument('-fm', + '--feature-mode', + dest='feature_mode', + action='store', + type=str, + default='framewise', + choices=['framewise', 'stats'], + help='Type of inputs used for model') + parser.add_argument('-mt', '--model-type', dest='model_type', action='store', type=str, default='svm', + choices=['svm', 'mlp'], help='Type of model used for training classifier') parser.add_argument('features_dir', diff --git a/classifier/train.py b/classifier/train.py index 531c70a..c28c2f5 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -8,6 +8,7 @@ import keras import keras.regularizers as regularizers +from tensorflow import set_random_seed import numpy as np from keras.layers import Input, Dense, BatchNormalization from keras.models import Model @@ -16,11 +17,12 @@ from sklearn.metrics import hinge_loss from sklearn.externals import joblib from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.linear_model import SGDClassifier +from sklearn.svm import SVC import pescador from classifier.metrics import compute_metrics, collapse_metrics -from data.usc.us8k import get_us8k_batch_generator, get_us8k_batch, load_test_fold +from data.usc.features import preprocess_split_data +from data.usc.us8k import get_us8k_split from l3embedding.train import LossHistory from log 
import * @@ -45,8 +47,6 @@ def on_train_begin(self, logs=None): self.valid_loss = [] self.train_acc = [] self.valid_acc = [] - self.valid_class_acc = [] - self.valid_avg_class_acc = [] # def on_batch_end(self, batch, logs={}): def on_epoch_end(self, epoch, logs=None): @@ -57,11 +57,6 @@ def on_epoch_end(self, epoch, logs=None): self.train_acc.append(logs.get('acc')) self.valid_acc.append(logs.get('val_acc')) - valid_pred = self.model.predict(self.valid_data['features']) - valid_metrics = compute_metrics(self.valid_data['label'], valid_pred) - self.valid_class_acc.append(valid_metrics['class_accuracy']) - self.valid_avg_class_acc.append(valid_metrics['average_class_accuracy']) - if self.verbose: train_msg = 'Train - loss: {}, acc: {}' valid_msg = 'Valid - loss: {}, acc: {}' @@ -72,8 +67,9 @@ def on_epoch_end(self, epoch, logs=None): self.valid_acc[-1])) -def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty='l2', - num_classes=10, tol=1e-3, max_iterations=1000000, verbose=False, **kwargs): +def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', + num_classes=10, tol=0.001, max_iterations=-1, verbose=False, + random_state=12345678, **kwargs): """ Train a Support Vector Machine model on the given data @@ -104,87 +100,54 @@ def train_svm(train_gen, valid_data, test_data, model_dir, C=1e-4, reg_penalty=' y_test_pred: Predicted test output of classifier (Type: np.ndarray) """ - # Set up standardizer - stdizer = StandardScaler() - + X_train = train_data['features'] + y_train = train_data['label'] X_valid = valid_data['features'] y_valid = valid_data['label'] - train_loss_history = [] - valid_loss_history = [] - train_metric_history = [] - valid_metric_history = [] - model_output_path = os.path.join(model_dir, "model.pkl") - stdizer_output_path = os.path.join(model_dir, "stdizer.pkl") # Create classifier - clf = SGDClassifier(alpha=C, penalty=reg_penalty, n_jobs=-1, verbose=verbose) + clf = SVC(C=C, probability=True, kernel=kernel, max_iter=max_iterations, + tol=tol, random_state=random_state, verbose=verbose) - classes = np.arange(num_classes) + # Save the model for this iteration + LOGGER.info('Saving model...') + joblib.dump(clf, model_output_path) + # Fit data and get output for train and valid batches LOGGER.debug('Fitting model to data...') - for iter_idx, train_data in enumerate(train_gen): - X_train = train_data['features'] - y_train = train_data['label'] - stdizer.partial_fit(X_train) - - # Fit data and get output for train and valid batches - clf.partial_fit(X_train, y_train, classes=classes) - X_train_std = stdizer.transform(X_train) - X_valid_std = stdizer.transform(X_valid) - y_train_pred = clf.predict(X_train_std) - y_valid_pred = clf.predict(X_valid_std) - - # Compute new metrics - valid_loss_history.append(list(y_valid_pred)) - train_loss_history.append(hinge_loss(y_train, clf.decision_function(X_train_std), - labels=classes)) - valid_loss_history.append(hinge_loss(y_valid, clf.decision_function(X_valid_std), - labels=classes)) - train_metric_history.append(compute_metrics(y_train, y_train_pred)) - valid_metric_history.append(compute_metrics(y_valid, y_valid_pred)) - - # Save the model for this iteration - LOGGER.info('Saving model...') - joblib.dump(clf, model_output_path) - joblib.dump(stdizer, stdizer_output_path) - - if verbose: - train_msg = 'Train - loss: {}, acc: {}' - valid_msg = 'Valid - loss: {}, acc: {}' - LOGGER.info('Epoch {}'.format(iter_idx + 1)) - LOGGER.info(train_msg.format(train_loss_history[-1], - 
train_metric_history[-1]['accuracy'])) - LOGGER.info(valid_msg.format(valid_loss_history[-1], - valid_metric_history[-1]['accuracy'])) - - # Finish training if the loss doesn't change much - if len(train_loss_history) > 1 and abs(train_loss_history[-2] - train_loss_history[-1]) < tol: - break - - # Break if we reach the maximum number of iterations - if iter_idx >= max_iterations: - break - - # Post process metrics - train_metrics = collapse_metrics(train_metric_history) - valid_metrics = collapse_metrics(valid_metric_history) - train_metrics['loss'] = train_loss_history - valid_metrics['loss'] = valid_loss_history + clf.fit(X_train, y_train) + y_train_pred = clf.predict(X_train) + y_valid_pred = clf.predict(X_valid) + + # Compute new metrics + classes = np.arange(num_classes) + train_loss = hinge_loss(y_train, clf.decision_function(X_train), labels=classes) + valid_loss = hinge_loss(y_valid, clf.decision_function(X_valid), labels=classes) + train_metrics = compute_metrics(y_train, y_train_pred) + valid_metrics = compute_metrics(y_valid, y_valid_pred) + train_metrics['loss'] = train_loss + valid_metrics['loss'] = valid_loss + + train_msg = 'Train - hinge loss: {}, acc: {}' + valid_msg = 'Valid - hinge loss: {}, acc: {}' + LOGGER.info(train_msg.format(train_loss, train_metrics['accuracy'])) + LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy'])) + # Evaluate model on test data - X_test = stdizer.transform(test_data['features']) - y_test_pred_frame = clf.predict(X_test) + X_test = test_data['features'] + y_test_pred_frame = clf.predict_proba(X_test) y_test_pred = [] for start_idx, end_idx in test_data['file_idxs']: - class_pred = mode(y_test_pred_frame[start_idx:end_idx])[0][0] + class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() y_test_pred.append(class_pred) y_test_pred = np.array(y_test_pred) test_metrics = compute_metrics(test_data['labels'], y_test_pred) - return (stdizer, clf), train_metrics, valid_metrics, test_metrics + return clf, train_metrics, valid_metrics, test_metrics def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): @@ -208,11 +171,8 @@ def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): l2_weight_decay = regularizers.l2(weight_decay) LOGGER.info(str(input_shape)) inp = Input(shape=input_shape, dtype='float32') - y = BatchNormalization()(y) - y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(y) - y = BatchNormalization()(y) + y = Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(inp) y = Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y) - y = BatchNormalization()(y) y = Dense(num_classes, activation='softmax', kernel_regularizer=l2_weight_decay)(y) m = Model(inputs=inp, outputs=y) m.name = 'urban_sound_classifier' @@ -220,10 +180,10 @@ def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): return m, inp, y -def train_mlp(train_gen, valid_data, test_data, model_dir, +def train_mlp(train_data, valid_data, test_data, model_dir, batch_size=64, num_epochs=100, train_epoch_size=None, learning_rate=1e-4, weight_decay=1e-5, num_classes=10, - verbose=False, **kwargs): + random_state=12345678, verbose=False, **kwargs): """ Train a Multi-layer perceptron model on the given data @@ -261,20 +221,19 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, loss = 'categorical_crossentropy' metrics = ['accuracy'] monitor = 'val_loss' + set_random_seed(random_state) # Set up data inputs - train_gen = pescador.maps.keras_tuples(train_gen, 
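# --- Illustrative sketch (not part of the patch): the frame-to-file
# aggregation used for the test predictions above. Per-frame class
# probabilities for one file are averaged and the highest-scoring class is
# taken as the file-level prediction. The example array is hypothetical.
import numpy as np

def aggregate_file_prediction(frame_probs):
    # frame_probs: (n_frames, n_classes) probabilities for a single file
    return frame_probs.mean(axis=0).argmax()

frame_probs = np.array([[0.6, 0.4],
                        [0.3, 0.7],
                        [0.2, 0.8]])
assert aggregate_file_prediction(frame_probs) == 1   # class 1 wins on average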
'features', 'label') enc = OneHotEncoder(n_values=num_classes, sparse=False) - # Transform int targets to produce one hot targets - train_gen = ((X, enc.fit_transform(y.reshape(-1, 1))) for X, y in train_gen) - valid_data_keras = (valid_data['features'], - enc.fit_transform(valid_data['label'].reshape(-1,1))) - train_iter = iter(train_gen) - train_batch = next(train_iter) + X_train = train_data['features'] + y_train = enc.fit_transform(train_data['labels'].reshape(-1, 1)) + + X_valid = valid_data['features'] + y_valid = enc.fit_transform(valid_data['labels'].reshape(-1, 1)) # Set up model - m, inp, out = construct_mlp_model(train_batch[0].shape[1:], + m, inp, out = construct_mlp_model(X_train.shape[1:], weight_decay=weight_decay, num_classes=num_classes) @@ -297,21 +256,29 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, LOGGER.debug('Compiling model...') m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) LOGGER.debug('Fitting model to data...') - m.fit_generator(train_gen, + m.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=num_epochs, steps_per_epoch=train_epoch_size, - validation_data=valid_data_keras, callbacks=cb) + validation_data=(X_valid, y_valid), callbacks=cb) + + # Compute metrics for train and valid + train_pred = m.predict(X_train) + valid_pred = m.predict(X_valid) + train_metrics = compute_metrics(y_train, train_pred) + valid_metrics = compute_metrics(y_valid, valid_pred) # Set up train and validation metrics train_metrics = { - 'loss': metric_cb.train_loss, - 'accuracy': metric_cb.train_acc + 'loss': metric_cb.train_loss[-1], + 'accuracy': metric_cb.train_acc[-1], + 'class_accuracy': train_metrics['class_accuracy'], + 'average_class_accuracy': train_metrics['average_class_accuracy'] } valid_metrics = { - 'loss': metric_cb.valid_loss, - 'accuracy': metric_cb.valid_acc, - 'class_accuracy': metric_cb.valid_class_acc, - 'average_class_accuracy': metric_cb.valid_avg_class_acc + 'loss': metric_cb.valid_loss[-1], + 'accuracy': metric_cb.valid_acc[-1], + 'class_accuracy': valid_metrics['class_accuracy'], + 'average_class_accuracy': valid_metrics['average_class_accuracy'] } # Evaluate model on test data @@ -327,7 +294,8 @@ def train_mlp(train_gen, valid_data, test_data, model_dir, return m, train_metrics, valid_metrics, test_metrics -def train(features_dir, output_dir, model_id, fold_num, model_type='svm', +def train(features_dir, output_dir, model_id, fold_num, + model_type='svm', feature_mode='framewise', train_num_streamers=None, train_batch_size=64, train_mux_rate=None, valid_num_streamers=None, valid_batch_size=64, valid_mux_rate=None, random_state=20171021, gsheet_id=None, google_dev_app_name=None, @@ -340,7 +308,7 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', random.seed(random_state) # Make sure the directories we need exist - model_dir = os.path.join(output_dir, model_id, + model_dir = os.path.join(output_dir, model_id, model_type, feature_mode, 'fold{}'.format(fold_num), datetime.datetime.now().strftime("%Y%m%d%H%M%S")) @@ -356,6 +324,7 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', 'model_id': model_id, 'fold_num': fold_num, 'model_type': model_type, + 'feature_mode': feature_mode, 'train_num_streamers': train_num_streamers, 'train_batch_size': train_batch_size, 'train_mux_rate': train_mux_rate, @@ -375,7 +344,6 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', with open(os.path.join(model_dir, 'config.json'), 'w') as fp: json.dump(config, fp) - if 
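# --- Illustrative sketch (not part of the patch): minimal end-to-end use of
# construct_mlp_model from classifier/train.py above, assuming hypothetical
# 512-dimensional embeddings and the Keras 2.x API used in this repo.
import numpy as np
from keras.optimizers import Adam
from classifier.train import construct_mlp_model

X_toy = np.random.rand(64, 512).astype('float32')
y_toy = np.eye(10)[np.arange(64) % 10]          # one-hot targets, 10 classes

model, _, _ = construct_mlp_model(input_shape=(512,), weight_decay=1e-5,
                                  num_classes=10)
model.compile(Adam(lr=1e-4), loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_toy, y_toy, batch_size=16, epochs=1, verbose=0)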
gsheet_id: # Add a new entry in the Google Sheets spreadsheet LOGGER.info('Creating new spreadsheet entry...') @@ -383,6 +351,8 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', 'train_loss': '-', 'valid_loss': '-', 'train_acc': '-', + 'train_avg_class_acc': '-', + 'train_class_acc': '-', 'valid_acc': '-', 'valid_avg_class_acc': '-', 'valid_class_acc': '-', @@ -399,30 +369,31 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', LOGGER.info('Loading data...') fold_idx = fold_num - 1 - LOGGER.info('Preparing training data for fold {}'.format(fold_num)) - train_gen = get_us8k_batch_generator(features_dir, fold_idx, - valid=False, num_streamers=train_num_streamers, - batch_size=train_batch_size, random_state=random_state, - rate=train_mux_rate) - LOGGER.info('Preparing validation data for fold {}'.format(fold_num)) - valid_data = get_us8k_batch(features_dir, fold_idx, - valid=True, num_streamers=valid_num_streamers, - batch_size=valid_batch_size, random_state=random_state, - rate=valid_mux_rate) - LOGGER.info('Preparing test data for fold {}'.format(fold_num)) - test_data = get_us8k_fold(features_dir, fold_idx) + + LOGGER.info('Loading data for configuration with test fold {}...'.format(fold_num)) + train_data, valid_data, test_data = get_us8k_split(features_dir, fold_idx) + + LOGGER.info('Preprocessing data...') + min_max_scaler, stdizer = preprocess_split_data(train_data, valid_data, test_data, + feature_mode=feature_mode) + + min_max_scaler_output_path = os.path.join(model_dir, "min_max_scaler.pkl") + joblib.dump(min_max_scaler, min_max_scaler_output_path) + stdizer_output_path = os.path.join(model_dir, "stdizer.pkl") + joblib.dump(stdizer, stdizer_output_path) LOGGER.info('Training {} with fold {} held out'.format(model_type, fold_num)) # Fit the model if model_type == 'svm': model, train_metrics, valid_metrics, test_metrics \ - = train_svm(train_gen, valid_data, test_data, model_dir, - verbose=verbose, **model_args) + = train_svm(train_data, test_data, model_dir, + random_state=random_state, verbose=verbose, **model_args) elif model_type == 'mlp': model, train_metrics, valid_metrics, test_metrics \ - = train_mlp(train_gen, valid_data, test_data, model_dir, - batch_size=train_batch_size, verbose=verbose, **model_args) + = train_mlp(train_data, valid_data, test_data, model_dir, + batch_size=train_batch_size, random_state=random_state, + verbose=verbose, **model_args) else: raise ValueError('Invalid model type: {}'.format(model_type)) @@ -446,17 +417,19 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', # Update spreadsheet with results LOGGER.info('Updating spreadsheet...') update_values = [ - train_metrics['loss'][-1], - valid_metrics['loss'][-1], - train_metrics['accuracy'][-1], - valid_metrics['accuracy'][-1], - valid_metrics['average_class_accuracy'][-1], - ', '.join(map(str, valid_metrics['class_accuracy'][-1])), + train_metrics['loss'], + valid_metrics['loss'], + train_metrics['accuracy'], + valid_metrics['accuracy'], + train_metrics['average_class_accuracy'], + valid_metrics['average_class_accuracy'], + ', '.join(map(str, train_metrics['class_accuracy'])), + ', '.join(map(str, valid_metrics['class_accuracy'])), test_metrics['accuracy'], test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'V', 'AE', + update_experiment(service, gsheet_id, config, 'V', 'AG', update_values, 'classifier') LOGGER.info('Done!') diff --git 
a/data/usc/features.py b/data/usc/features.py index 942c66b..e6e8878 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -7,6 +7,7 @@ import soundfile as sf import resampy import tensorflow as tf +from sklearn.preprocessing import StandardScaler, MinMaxScaler from .vggish import vggish_input from .vggish import vggish_postprocess from .vggish import vggish_slim @@ -49,6 +50,72 @@ def one_hot(idx, n_classes=10): return y +def framewise_to_stats(data): + X = [] + for start_idx, end_idx in data['file_idxs']: + X.append(compute_stats_features(data['features'][start_idx:end_idx])) + + data['features'] = np.vstack(X) + + idxs = np.arange(data['features'].shape[0]) + data['file_idx'] = np.column_stack((idxs, idxs + 1)) + + +def expand_framewise_labels(data): + labels = [] + for y, (start_idx, end_idx) in zip(data['labels'], data['file_idxs']): + num_frames = end_idx - start_idx + labels.append(np.tile(y, num_frames)) + + data['labels'] = np.concatenate(labels) + + +def preprocess_split_data(train_data, valid_data, test_data, + feature_mode='framewise'): + # NOTE: This function mutates data so there aren't extra copies + + # Apply min max scaling to data + min_max_scaler = MinMaxScaler() + train_data['features'] = min_max_scaler.fit_transform( + train_data['features']) + valid_data['features'] = min_max_scaler.transform(valid_data['features']) + test_data['features'] = min_max_scaler.transform(test_data['features']) + + if feature_mode == 'framewise': + # Expand training and validation labels to apply to each frame + expand_framewise_labels(train_data) + expand_framewise_labels(valid_data) + elif feature_mode == 'stats': + # Summarize frames in each file using summary statistics + framewise_to_stats(train_data) + framewise_to_stats(valid_data) + framewise_to_stats(test_data) + else: + raise ValueError('Invalid feature mode: {}'.format(feature_mode)) + + # Standardize features + stdizer = StandardScaler() + train_data['features'] = stdizer.fit_transform(train_data['features']) + valid_data['features'] = stdizer.transform(valid_data['features']) + test_data['features'] = stdizer.transform(test_data['features']) + + return min_max_scaler, stdizer + + +def preprocess_features(data, min_max_scaler, stdizer, + feature_mode='framewise'): + data['features'] = min_max_scaler.fit_transform(data['features']) + if feature_mode == 'framewise': + # Expand training and validation labels to apply to each frame + expand_framewise_labels(data) + elif feature_mode == 'stats': + # Summarize frames in each file using summary statistics + framewise_to_stats(data) + else: + raise ValueError('Invalid feature mode: {}'.format(feature_mode)) + data['features'] = stdizer.transform(data['features']) + + def extract_vggish_embedding(audio_path, input_op_name='vggish/input_features', output_op_name='vggish/embedding', resources_dir=None, **params): diff --git a/data/usc/us8k.py b/data/usc/us8k.py index f295f5d..77cf157 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -168,6 +168,51 @@ def get_us8k_fold(feature_dir, fold_idx): return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} +def get_us8k_split(feature_dir, test_fold_idx): + train_data = get_us8k_train_folds(feature_dir, test_fold_idx) + valid_data = get_us8k_fold(feature_dir, get_valid_fold_idx(test_fold_idx)) + test_data = get_us8k_fold(feature_dir, test_fold_idx) + + return train_data, valid_data, test_data + + +def get_valid_fold_idx(test_fold_idx): + return (test_fold_idx - 1) % 10 + + +def 
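# --- Illustrative sketch (not part of the patch): the fold assignment implied
# by get_us8k_split / get_valid_fold_idx above for the 10 UrbanSound8K folds.
test_fold_idx = 0
valid_fold_idx = (test_fold_idx - 1) % 10      # wraps around to fold 9
train_fold_idxs = [i for i in range(10)
                   if i not in (valid_fold_idx, test_fold_idx)]
assert valid_fold_idx == 9
assert train_fold_idxs == [1, 2, 3, 4, 5, 6, 7, 8]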
get_us8k_train_folds(feature_dir, test_fold_idx): + X = [] + y = [] + file_idxs = [] + filenames = [] + + valid_fold_idx = get_valid_fold_idx(test_fold_idx) + + for fold_idx in range(10): + if fold_idx in (valid_fold_idx, test_fold_idx): + continue + + fold_data = get_us8k_fold(feature_dir, fold_idx) + + X.append(fold_data['features']) + y.append(fold_data['labels']) + idxs = fold_data['file_idxs'] + if len(file_idxs) > 0: + # Since we're appending all of the file indices together, increment + # the current fold indices by the current global index + idxs = idxs + file_idxs[-1][-1, -1] + file_idxs.append(idxs) + + filenames += fold_data['filenames'] + + X = np.vstack(X) + y = np.concatenate(y) + file_idxs = np.vstack(file_idxs) + + return {'features': X, 'labels': y, 'file_idxs': file_idxs, + 'filenames': filenames} + + def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=None, features='l3_stack', label_format='int', random_state=12345678, **feature_args): diff --git a/gsheets.py b/gsheets.py index acc8be8..59ccd39 100644 --- a/gsheets.py +++ b/gsheets.py @@ -45,6 +45,7 @@ 'features_dir', 'fold_num', 'model_type', + 'feature_mode', 'train_num_streamers', 'train_batch_size', 'train_mux_rate', @@ -63,7 +64,9 @@ 'valid_loss', 'train_acc', 'valid_acc', + 'train_avg_class_acc', 'valid_avg_class_acc', + 'train_class_acc', 'valid_class_acc', 'test_acc', 'test_avg_class_acc', diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 43ec9df..fd33b45 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -19,6 +19,7 @@ FEATURES_DIR=/scratch/jtc440/l3_features/us8k/l3_frames_uniform/l3embedding/cnn_ OUTPUT_DIR=/scratch/jtc440/cls_output MODEL_ID='mycls' MODEL_TYPE='svm' +FEATURE_MODE='framewise' GOOGLE_DEV_APP_NAME='' GSHEET_ID='' FOLD_NUM=$SLURM_ARRAY_TASK_ID @@ -28,6 +29,7 @@ module purge python $SRCDIR/06_train_classifier.py \ --random-state 20171021 \ --model-type $MODEL_TYPE \ + --feature-mode $FEATURE_MODE \ --num-epochs 10 \ --train-num-streamers 1024 \ --train-mux-rate 16 \ @@ -39,7 +41,7 @@ python $SRCDIR/06_train_classifier.py \ --norm-penalty-factor 0.0001 \ --svm-conv-tolerance 0.001 \ --svm-max-iterations 1000000 \ - --svm-reg-penalty-type l2 \ + --svm-kernel-type rbf \ --gsheet-id $GSHEET_ID \ --google-dev-app-name $GOOGLE_DEV_APP_NAME \ --verbose \ From f46ce3cae07f68563eebbf64083e6cdec51190f2 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 16:10:55 -0400 Subject: [PATCH 147/201] Remove label format for classification data and instead assume integer labels --- 05_generate_embedding_samples.py | 13 ++----------- data/usc/us8k.py | 27 +++++++-------------------- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 0c8edb9..eb0f31b 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -44,14 +44,6 @@ def parse_arguments(): default='l3_frames_uniform', help='Type of features to be used in training') - parser.add_argument('-lf', - '--label-format', - dest='label_format', - action='store', - type=str, - default='int', - help='Type of format used for encoding outputs') - parser.add_argument('-lmp', '--l3embedding-model-path', dest='l3embedding_model_path', @@ -144,7 +136,6 @@ def parse_arguments(): hop_size = args['hop_size'] random_state = args['random_state'] num_random_samples = args['num_random_samples'] - label_format = args['label_format'] model_path = 
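# --- Illustrative sketch (not part of the patch): how get_us8k_train_folds
# above shifts each fold's [start, end] file index pairs by the running frame
# count so they index into the concatenated feature matrix. Arrays are
# hypothetical.
import numpy as np

fold_a_idxs = np.array([[0, 40], [40, 90]])    # two files, 90 frames total
fold_b_idxs = np.array([[0, 30], [30, 75]])    # two files, 75 frames total

offset = fold_a_idxs[-1, -1]                   # last end index so far (90)
combined = np.vstack([fold_a_idxs, fold_b_idxs + offset])
# combined == [[0, 40], [40, 90], [90, 120], [120, 165]]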
args['l3embedding_model_path'] num_gpus = args['gpus'] output_dir = args['output_dir'] @@ -190,14 +181,14 @@ def parse_arguments(): # Generate a single fold if a fold was specified generate_us8k_fold_data(metadata_path, data_dir, fold_num-1, dataset_output_dir, l3embedding_model=l3embedding_model, - features=features, label_format=label_format, random_state=random_state, + features=features, random_state=random_state, hop_size=hop_size, num_random_samples=num_random_samples) else: # Otherwise, generate all the folds generate_us8k_folds(metadata_path, data_dir, dataset_output_dir, l3embedding_model=l3embedding_model, - features=features, label_format=label_format, random_state=random_state, + features=features, random_state=random_state, hop_size=hop_size, num_random_samples=num_random_samples) else: LOGGER.error('Invalid dataset name: {}'.format(dataset_name)) diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 77cf157..83f8ab3 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -214,8 +214,7 @@ def get_us8k_train_folds(feature_dir, test_fold_idx): def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=None, - features='l3_stack', label_format='int', - random_state=12345678, **feature_args): + features='l3_stack', random_state=12345678, **feature_args): """ Generate all of the data for each fold @@ -236,9 +235,6 @@ def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=N features: Type of features to be computed (Type: str) - label_format: Type of format used for encoding outputs - (Type: str) - """ LOGGER.info('Generating all folds.') metadata = load_us8k_metadata(metadata_path) @@ -246,13 +242,12 @@ def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=N for fold_idx in range(10): generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embedding_model=l3embedding_model, - features=features, label_format=label_format, - random_state=random_state, **feature_args) + features=features, random_state=random_state, + **feature_args) def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embedding_model=None, - features='l3_stack', label_format='int', - random_state=12345678, **feature_args): + features='l3_stack', random_state=12345678, **feature_args): """ Generate all of the data for a specific fold @@ -276,9 +271,6 @@ def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embeddin features: Type of features to be computed (Type: str) - label_format: Type of format used for encoding outputs - (Type: str) - """ if type(metadata) == str: @@ -304,12 +296,12 @@ def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embeddin desc = '({}/{}) Processed {} -'.format(idx+1, num_files, fname) with LogTimer(LOGGER, desc, log_level=logging.DEBUG): generate_us8k_file_data(fname, example_metadata, audio_fold_dir, - output_fold_dir, features, label_format, + output_fold_dir, features, l3embedding_model, **feature_args) def generate_us8k_file_data(fname, example_metadata, audio_fold_dir, - output_fold_dir, features, label_format, + output_fold_dir, features, l3embedding_model, **feature_args): audio_path = os.path.join(audio_fold_dir, fname) @@ -328,12 +320,7 @@ def generate_us8k_file_data(fname, example_metadata, audio_fold_dir, return class_label = example_metadata['classID'] - if label_format == 'int': - y = class_label - elif label_format == 'one_hot': - y = cls_features.one_hot(class_label) - else: - raise ValueError('Invalid label format: 
{}'.format(label_format)) + y = class_label np.savez_compressed(output_path, X=X, y=y) From fba70e0661a90a07cb1a31d19402bfffc0ff6a1f Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 16:18:51 -0400 Subject: [PATCH 148/201] Change embedding sampling to only output framewise features and update sbatch scripts --- 05_generate_embedding_samples.py | 4 +- data/usc/features.py | 225 +------------------ data/usc/us8k.py | 4 +- jobs/generate_embedding_samples.sbatch | 3 +- jobs/generate_embedding_samples_array.sbatch | 3 +- 5 files changed, 11 insertions(+), 228 deletions(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index eb0f31b..61629a3 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -41,7 +41,7 @@ def parse_arguments(): dest='features', action='store', type=str, - default='l3_frames_uniform', + default='l3', help='Type of features to be used in training') parser.add_argument('-lmp', @@ -145,7 +145,7 @@ def parse_arguments(): LOGGER.info('Configuration: {}'.format(str(args))) - is_l3_feature = features.startswith('l3') + is_l3_feature = features == 'l3' if is_l3_feature and model_path: # Load L3 embedding model if using L3 features diff --git a/data/usc/features.py b/data/usc/features.py index e6e8878..f9fa899 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -1,6 +1,5 @@ import logging import os -import warnings import librosa import numpy as np import scipy as sp @@ -193,84 +192,6 @@ def get_vggish_frames_uniform(audio_path, hop_size=0.1): return extract_vggish_embedding(audio_path, frame_hop_sec=hop_size) -def get_vggish_stats(audio_path, hop_size=0.1): - """ - Get vggish embedding features summary statistics for all frames in the - audio file - - Args: - audio: Audio data or path to audio file - (Type: np.ndarray or str) - - Keyword Args: - hop_size: Hop size in seconds - (Type: float) - - Returns: - features: Array of embedding vectors - (Type: np.ndarray) - """ - vggish_embedding = extract_vggish_embedding(audio_path, frame_hop_sec=hop_size) - return compute_stats_features(vggish_embedding) - - -def get_l3_stack_features(audio_path, l3embedding_model, hop_size=0.1): - """ - Get stacked L3 embedding features, i.e. 
stack embedding features for each - 1 second (overlapping) window of the given audio - - Computes a _single_ feature for the given audio file - - Args: - audio_path: Path to audio file - (Type: str) - - l3embedding_model: Audio embedding model - (keras.engine.training.Model) - - Keyword Args: - hop_size: Hop size in seconds - (Type: float) - - Returns: - features: Feature vector - (Type: np.ndarray) - """ - sr = 48000 - audio = load_audio(audio_path, sr) - audio_length = len(audio) - frame_length = sr - hop_length = int(sr * hop_size) - - # Zero pad to 4 seconds - target_len = 48000 * 4 - if audio_length < target_len: - pad_length = target_len - audio_length - # Use (roughly) symmetric padding - left_pad = pad_length // 2 - right_pad= pad_length - left_pad - audio = np.pad(audio, (left_pad, right_pad), mode='constant') - elif audio_length > target_len: - # Take center of audio (ASSUMES NOT MUCH GREATER THAN TARGET LENGTH) - center_sample = audio_length // 2 - half_len = target_len // 2 - audio = audio[center_sample-half_len:center_sample+half_len] - - - - # Divide into overlapping 1 second frames - x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T - - # Add a channel dimension - x = x.reshape((x.shape[0], 1, x.shape[-1])) - - # Get the L3 embedding for each frame - l3embedding = l3embedding_model.predict(x) - - # Return a flattened vector of the embeddings - return l3embedding.flatten() - - def compute_stats_features(embeddings): # Compute statistics on the time series of embeddings minimum = np.min(embeddings, axis=0) @@ -284,64 +205,6 @@ def compute_stats_features(embeddings): return np.concatenate((minimum, maximum, median, mean, var, skewness, kurtosis)) -def get_l3_stats_features(audio_path, l3embedding_model, hop_size=0.1): - """ - Get L3 embedding stats features, i.e. compute statistics for each of the - embedding features across 1 second (overlapping) window of the given audio - - Computes a _single_ feature for the given audio file - - - Args: - audio_path: Path to audio file - (Type: str) - - l3embedding_model: Audio embedding model - (keras.engine.training.Model) - - Keyword Args: - hop_size: Hop size in seconds - (Type: float) - - Returns: - features: Feature vector - (Type: np.ndarray) - """ - sr = 48000 - audio = load_audio(audio_path, sr) - - hop_length = int(hop_size * sr) - frame_length = 48000 * 1 - - audio_length = len(audio) - if audio_length < frame_length: - # Make sure we can have at least three frames so that we can compute - # all of the stats. 
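# --- Illustrative sketch (not part of the patch): the output layout of
# compute_stats_features above. Seven summary statistics per embedding
# dimension are concatenated into one vector, so a (n_frames, 512) embedding
# matrix becomes a single (7 * 512,) feature vector. Data is hypothetical.
import numpy as np
from scipy.stats import skew, kurtosis

emb = np.random.rand(40, 512)                  # hypothetical framewise embeddings
stats = np.concatenate((emb.min(axis=0), emb.max(axis=0),
                        np.median(emb, axis=0), emb.mean(axis=0),
                        emb.var(axis=0), skew(emb, axis=0),
                        kurtosis(emb, axis=0)))
assert stats.shape == (7 * 512,)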
- pad_length = frame_length - audio_length - else: - # Zero pad so we compute embedding on all samples - pad_length = int(np.ceil(audio_length - frame_length)/hop_length) * hop_length \ - - (audio_length - frame_length) - - if pad_length > 0: - # Use (roughly) symmetric padding - left_pad = pad_length // 2 - right_pad= pad_length - left_pad - audio = np.pad(audio, (left_pad, right_pad), mode='constant') - - - # Divide into overlapping 1 second frames - x = librosa.util.utils.frame(audio, frame_length=frame_length, hop_length=hop_length).T - - # Add a channel dimension - x = x.reshape((x.shape[0], 1, x.shape[-1])) - - # Get the L3 embedding for each frame - l3embedding = l3embedding_model.predict(x) - - return compute_stats_features(l3embedding) - - def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): """ Get L3 embedding for each frame in the given audio file @@ -394,94 +257,16 @@ def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): return l3embedding -def get_l3_frames_random(audio, l3embedding_model, num_samples, sr=48000): - """ - Get L3 embedding for random frames in the given audio file - - Args: - audio: Numpy array or path to audio file - (Type: np.ndarray or str) - - l3embedding_model: Audio embedding model - (Type: keras.engine.training.Model) - - num_samples: Number of samples - (Type: int) - - Returns: - features: Array of embedding vectors - (Type: np.ndarray) - """ - if type(audio) == str: - audio = load_audio(audio, sr) - - frame_length = sr * 1 - - audio_length = len(audio) - pad_length = frame_length - audio_length - - if pad_length > 0: - # Use (roughly) symmetric padding - left_pad = pad_length // 2 - right_pad= pad_length - left_pad - audio = np.pad(audio, (left_pad, right_pad), mode='constant') - - if audio_length != frame_length: - sample_start_idxs = np.random.randint(low=0, - high=audio_length - frame_length, - size=num_samples) - - x = [] - for start_idx in sample_start_idxs: - end_idx = start_idx + frame_length - x.append(audio[start_idx:end_idx]) - - - x = np.array(x) - x = x.reshape((x.shape[0], 1, x.shape[-1])) - l3embedding = l3embedding_model.predict(x).T - - else: - warn_msg = 'Replicating samples' - LOGGER.warning(warn_msg) - warnings.warn(warn_msg) - - x = audio.reshape((1,1,audio.shape[0])) - x = x.reshape((x.shape[0], 1, x.shape[-1])) - frame_l3embedding = l3embedding_model.predict(x).flatten() - - l3embedding = np.tile(frame_l3embedding, (num_samples, 1)) - - return l3embedding - def compute_file_features(path, feature_type, l3embedding_model=None, **feature_args): - if feature_type.startswith('l3') and not l3embedding_model: - err_msg = 'Must provide L3 embedding model to use {} features' - raise ValueError(err_msg.format(feature_type)) - - if feature_type == 'l3_stack': - hop_size = feature_args.get('hop_size', 0.1) - file_features = get_l3_stack_features(path, l3embedding_model, - hop_size=hop_size) - elif feature_type == 'l3_stats': - hop_size = feature_args.get('hop_size', 0.1) - file_features = get_l3_stats_features(path, l3embedding_model, - hop_size=hop_size) - elif feature_type == 'l3_frames_uniform': + if feature_type == 'l3': + if not l3embedding_model: + err_msg = 'Must provide L3 embedding model to use {} features' + raise ValueError(err_msg.format(feature_type)) hop_size = feature_args.get('hop_size', 0.1) file_features = get_l3_frames_uniform(path, l3embedding_model, hop_size=hop_size) - elif feature_type == 'l3_frames_random': - num_samples = feature_args.get('num_random_samples') - if not 
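# --- Illustrative sketch (not part of the patch): the frame layout used by
# get_l3_frames_uniform above. A 4 s UrbanSound8K clip at 48 kHz, split into
# overlapping 1 s windows with a 0.1 s hop, yields 31 embedding frames.
sr = 48000
clip_samples = 4 * sr                # 192000 samples
frame_length = sr                    # 1 second windows
hop_length = int(0.1 * sr)           # 4800 samples between window starts

n_frames = 1 + (clip_samples - frame_length) // hop_length
assert n_frames == 31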
num_samples: - raise ValueError('Must specify "num_samples" for "l3_frame_random" features') - file_features = get_l3_frames_random(path, l3embedding_model, - num_samples) - elif feature_type == 'vggish_stats': - hop_size = feature_args.get('hop_size', 0.1) - file_features = get_vggish_stats(path, hop_size=hop_size) - elif feature_type == 'vggish_frames_uniform': + elif feature_type == 'vggish': hop_size = feature_args.get('hop_size', 0.1) file_features = get_vggish_frames_uniform(path, hop_size=hop_size) else: diff --git a/data/usc/us8k.py b/data/usc/us8k.py index 83f8ab3..b082348 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -214,7 +214,7 @@ def get_us8k_train_folds(feature_dir, test_fold_idx): def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=None, - features='l3_stack', random_state=12345678, **feature_args): + features='l3', random_state=12345678, **feature_args): """ Generate all of the data for each fold @@ -247,7 +247,7 @@ def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=N def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embedding_model=None, - features='l3_stack', random_state=12345678, **feature_args): + features='l3', random_state=12345678, **feature_args): """ Generate all of the data for a specific fold diff --git a/jobs/generate_embedding_samples.sbatch b/jobs/generate_embedding_samples.sbatch index a31ebd0..9563b3e 100644 --- a/jobs/generate_embedding_samples.sbatch +++ b/jobs/generate_embedding_samples.sbatch @@ -30,8 +30,7 @@ module purge python $SRCDIR/05_generate_embedding_samples.py \ --random-state 20180302 \ --verbose \ - --features 'l3_frames_uniform' \ - --label-format 'int' \ + --features 'l3' \ --l3embedding-model-path $L3_MODEL_PATH \ --l3embedding-model-type $L3_MODEL_TYPE \ --l3embedding-pooling-type $L3_POOLING_TYPE \ diff --git a/jobs/generate_embedding_samples_array.sbatch b/jobs/generate_embedding_samples_array.sbatch index d9c91c3..712d301 100644 --- a/jobs/generate_embedding_samples_array.sbatch +++ b/jobs/generate_embedding_samples_array.sbatch @@ -33,8 +33,7 @@ module load ffmpeg/intel/3.2.2 python $SRCDIR/05_generate_embedding_samples.py \ --random-state 20180302 \ --verbose \ - --features 'l3_frames_uniform' \ - --label-format 'int' \ + --features 'l3' \ --l3embedding-model-path $L3_MODEL_PATH \ --l3embedding-model-type $L3_MODEL_TYPE \ --l3embedding-pooling-type $L3_POOLING_TYPE \ From 92e3f7bc04ecd9089b1b0761b26a7ab5eb5043f8 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 16:19:24 -0400 Subject: [PATCH 149/201] Fix bug where hop size is not changed for computing framewise L3 features --- data/usc/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/usc/features.py b/data/usc/features.py index f9fa899..28afd50 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -227,7 +227,7 @@ def get_l3_frames_uniform(audio, l3embedding_model, hop_size=0.1, sr=48000): if type(audio) == str: audio = load_audio(audio, sr) - hop_size = 0.25 # REVISIT + hop_size = hop_size hop_length = int(hop_size * sr) frame_length = sr * 1 From 9d2666595e16b91b13ed7f19a39ace2cb2765b15 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 16:20:22 -0400 Subject: [PATCH 150/201] Remove unnecessary imports from classifier training code --- classifier/train.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index c28c2f5..d49056b 100644 --- 
a/classifier/train.py +++ b/classifier/train.py @@ -10,17 +10,15 @@ import keras.regularizers as regularizers from tensorflow import set_random_seed import numpy as np -from keras.layers import Input, Dense, BatchNormalization +from keras.layers import Input, Dense from keras.models import Model from keras.optimizers import Adam -from scipy.stats import mode from sklearn.metrics import hinge_loss from sklearn.externals import joblib -from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC -import pescador -from classifier.metrics import compute_metrics, collapse_metrics +from classifier.metrics import compute_metrics from data.usc.features import preprocess_split_data from data.usc.us8k import get_us8k_split from l3embedding.train import LossHistory From 87b94618de8dd22857b61dd8686193af03eb5dbd Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 16:40:11 -0400 Subject: [PATCH 151/201] Remove reg_penalty from gsheets classifier update --- classifier/train.py | 2 +- gsheets.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index d49056b..fd9560a 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -427,7 +427,7 @@ def train(features_dir, output_dir, model_id, fold_num, test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'V', 'AG', + update_experiment(service, gsheet_id, config, 'V', 'AF', update_values, 'classifier') LOGGER.info('Done!') diff --git a/gsheets.py b/gsheets.py index 59ccd39..08c8b42 100644 --- a/gsheets.py +++ b/gsheets.py @@ -56,7 +56,6 @@ 'num_epochs', 'learning_rate', 'weight_decay', - 'reg_penalty', 'C', 'tol', 'max_iterations', From 458bf7d927b8a0dbac4838890ca8fdee54e3a97d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 21 Mar 2018 17:21:14 -0400 Subject: [PATCH 152/201] Fix classifier training typos and add logging for where classifier training config is saved --- classifier/train.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index fd9560a..80c231c 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -99,9 +99,9 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', (Type: np.ndarray) """ X_train = train_data['features'] - y_train = train_data['label'] + y_train = train_data['labels'] X_valid = valid_data['features'] - y_valid = valid_data['label'] + y_valid = valid_data['labels'] model_output_path = os.path.join(model_dir, "model.pkl") @@ -109,13 +109,13 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', clf = SVC(C=C, probability=True, kernel=kernel, max_iter=max_iterations, tol=tol, random_state=random_state, verbose=verbose) - # Save the model for this iteration - LOGGER.info('Saving model...') - joblib.dump(clf, model_output_path) - # Fit data and get output for train and valid batches LOGGER.debug('Fitting model to data...') clf.fit(X_train, y_train) + + LOGGER.info('Saving model...') + joblib.dump(clf, model_output_path) + y_train_pred = clf.predict(X_train) y_valid_pred = clf.predict(X_valid) @@ -339,8 +339,10 @@ def train(features_dir, output_dir, model_id, fold_num, config.update(model_args) # Save configs - with open(os.path.join(model_dir, 'config.json'), 'w') as fp: + config_path = os.path.join(model_dir, 'config.json') + with 
open(config_path, 'w') as fp: json.dump(config, fp) + LOGGER.info('Saved configurations to {}'.format(config_path)) if gsheet_id: # Add a new entry in the Google Sheets spreadsheet @@ -384,7 +386,7 @@ def train(features_dir, output_dir, model_id, fold_num, # Fit the model if model_type == 'svm': model, train_metrics, valid_metrics, test_metrics \ - = train_svm(train_data, test_data, model_dir, + = train_svm(train_data, valid_data, test_data, model_dir, random_state=random_state, verbose=verbose, **model_args) elif model_type == 'mlp': From 894e54e4b928a1f1e36152a65993277493783e20 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 22 Mar 2018 09:52:05 -0400 Subject: [PATCH 153/201] Remove now unnecessary streamer and mux rate arguments --- 06_train_classifier.py | 32 +------------------------------- classifier/train.py | 11 ++--------- gsheets.py | 4 ---- 3 files changed, 3 insertions(+), 44 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 87b2e1f..1f61475 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -29,21 +29,6 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-tns', - '--train-num-streamers', - dest='train_num_streamers', - action='store', - type=int, - help='Number of open pescador streamers to keep open ' \ - 'while sampling features for training data') - - parser.add_argument('-tmr', - '--train-mux-rate', - dest='train_mux_rate', - action='store', - type=int, - help='Pescador poisson rate used for sampling training data') - parser.add_argument('-vbs', '--valid-batch-size', dest='valid_batch_size', @@ -52,21 +37,6 @@ def parse_arguments(): default=64, help='(MLP) Number of validation examples per batch') - parser.add_argument('-vns', - '--valid-num-streamers', - dest='valid_num_streamers', - action='store', - type=int, - help='Number of open pescador streamers to keep open ' \ - 'while sampling features for validation data') - - parser.add_argument('-vmr', - '--valid-mux-rate', - dest='valid_mux_rate', - action='store', - type=int, - help='Pescador poisson rate used for sampling validation data') - parser.add_argument('-tes', '--train-epoch-size', dest='train_epoch_size', @@ -103,7 +73,7 @@ def parse_arguments(): dest='tol', action='store', type=float, - default=0.001, + default=0.00001, help='(SVM) convergence tolerance threshold') parser.add_argument('-smi', diff --git a/classifier/train.py b/classifier/train.py index 80c231c..d0fd057 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -294,8 +294,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, def train(features_dir, output_dir, model_id, fold_num, model_type='svm', feature_mode='framewise', - train_num_streamers=None, train_batch_size=64, train_mux_rate=None, - valid_num_streamers=None, valid_batch_size=64, valid_mux_rate=None, + train_batch_size=64, valid_batch_size=64, random_state=20171021, gsheet_id=None, google_dev_app_name=None, verbose=False, **model_args): init_console_logger(LOGGER, verbose=verbose) @@ -323,12 +322,8 @@ def train(features_dir, output_dir, model_id, fold_num, 'fold_num': fold_num, 'model_type': model_type, 'feature_mode': feature_mode, - 'train_num_streamers': train_num_streamers, 'train_batch_size': train_batch_size, - 'train_mux_rate': train_mux_rate, - 'valid_num_streamers': valid_num_streamers, 'valid_batch_size': valid_batch_size, - 'valid_mux_rate': valid_mux_rate, 'random_state': random_state, 'verbose': verbose, 'git_commit': 
git.Repo(os.path.dirname(os.path.abspath(__file__)), @@ -366,8 +361,6 @@ def train(features_dir, output_dir, model_id, fold_num, else: LOGGER.error(gsheet_id) - LOGGER.info('Loading data...') - fold_idx = fold_num - 1 LOGGER.info('Loading data for configuration with test fold {}...'.format(fold_num)) @@ -429,7 +422,7 @@ def train(features_dir, output_dir, model_id, fold_num, test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'V', 'AF', + update_experiment(service, gsheet_id, config, 'R', 'AB', update_values, 'classifier') LOGGER.info('Done!') diff --git a/gsheets.py b/gsheets.py index 08c8b42..6fb4fee 100644 --- a/gsheets.py +++ b/gsheets.py @@ -46,12 +46,8 @@ 'fold_num', 'model_type', 'feature_mode', - 'train_num_streamers', 'train_batch_size', - 'train_mux_rate', - 'valid_num_streamers', 'valid_batch_size', - 'valid_mux_rate', 'random_state', 'num_epochs', 'learning_rate', From 4d0a1e764e83d15d2fc4a81a56bc2f158ab63aff Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 22 Mar 2018 09:52:40 -0400 Subject: [PATCH 154/201] Fix typo not properly updating file_idxs when computing summary stats features for classification --- data/usc/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/usc/features.py b/data/usc/features.py index 28afd50..93463ae 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -57,7 +57,7 @@ def framewise_to_stats(data): data['features'] = np.vstack(X) idxs = np.arange(data['features'].shape[0]) - data['file_idx'] = np.column_stack((idxs, idxs + 1)) + data['file_idxs'] = np.column_stack((idxs, idxs + 1)) def expand_framewise_labels(data): From 26b1109d828f63c5f8d7709d7998486704b1efe0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 22 Mar 2018 10:07:18 -0400 Subject: [PATCH 155/201] Remove num streamers and mux rate from classifier training sbatch script --- jobs/classifier-train-array.sbatch | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index fd33b45..1da6335 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -31,14 +31,10 @@ python $SRCDIR/06_train_classifier.py \ --model-type $MODEL_TYPE \ --feature-mode $FEATURE_MODE \ --num-epochs 10 \ - --train-num-streamers 1024 \ - --train-mux-rate 16 \ --train-batch-size 128 \ --train-epoch-size 16 \ - --valid-num-streamers 1024 \ - --valid-mux-rate 1 \ --valid-batch-size 512 \ - --norm-penalty-factor 0.0001 \ + --norm-penalty-factor 1.0 \ --svm-conv-tolerance 0.001 \ --svm-max-iterations 1000000 \ --svm-kernel-type rbf \ From 63aded41d31ed2096a3a75ede27332f0ce495872 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Tue, 27 Mar 2018 12:32:36 -0400 Subject: [PATCH 156/201] Add parameters for use_min_max and non_overlap --- 06_train_classifier.py | 12 ++++++++++++ classifier/train.py | 6 ++++-- data/usc/features.py | 33 +++++++++++++++++++++++---------- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 1f61475..cb1f364 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -138,6 +138,18 @@ def parse_arguments(): choices=['svm', 'mlp'], help='Type of model used for training classifier') + parser.add_argument('-no', + '--non-overlap', + dest='non_overlap', + action='store_true', + default=False) + + parser.add_argument('-umm', + '--use-min-max', + dest='use_min_max', + 
action='store_true', + default=False) + parser.add_argument('features_dir', action='store', type=str, diff --git a/classifier/train.py b/classifier/train.py index d0fd057..215fab2 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -296,7 +296,7 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', feature_mode='framewise', train_batch_size=64, valid_batch_size=64, random_state=20171021, gsheet_id=None, google_dev_app_name=None, - verbose=False, **model_args): + verbose=False, non_overlap=False, use_min_max=False, **model_args): init_console_logger(LOGGER, verbose=verbose) LOGGER.debug('Initialized logging.') @@ -368,7 +368,9 @@ def train(features_dir, output_dir, model_id, fold_num, LOGGER.info('Preprocessing data...') min_max_scaler, stdizer = preprocess_split_data(train_data, valid_data, test_data, - feature_mode=feature_mode) + feature_mode=feature_mode, + non_overlap=non_overlap, + use_min_max=use_min_max) min_max_scaler_output_path = os.path.join(model_dir, "min_max_scaler.pkl") joblib.dump(min_max_scaler, min_max_scaler_output_path) diff --git a/data/usc/features.py b/data/usc/features.py index 93463ae..cd980d8 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -49,10 +49,21 @@ def one_hot(idx, n_classes=10): return y -def framewise_to_stats(data): +def sample_non_overlap(X, chunk_size=10): + def _chunks(l, n): + """Yield successive n-sized chunks from l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + return np.array([chunk[0] for chunk in _chunks(X, chunk_size)]) + + +def framewise_to_stats(data, non_overlap=False): X = [] for start_idx, end_idx in data['file_idxs']: - X.append(compute_stats_features(data['features'][start_idx:end_idx])) + features = data['features'][start_idx:end_idx] + if non_overlap: + features = sample_non_overlap(features) + X.append(compute_stats_features(features)) data['features'] = np.vstack(X) @@ -70,15 +81,17 @@ def expand_framewise_labels(data): def preprocess_split_data(train_data, valid_data, test_data, - feature_mode='framewise'): + feature_mode='framewise', non_overlap=False, + use_min_max=False): # NOTE: This function mutates data so there aren't extra copies # Apply min max scaling to data min_max_scaler = MinMaxScaler() - train_data['features'] = min_max_scaler.fit_transform( - train_data['features']) - valid_data['features'] = min_max_scaler.transform(valid_data['features']) - test_data['features'] = min_max_scaler.transform(test_data['features']) + if use_min_max: + train_data['features'] = min_max_scaler.fit_transform( + train_data['features']) + valid_data['features'] = min_max_scaler.transform(valid_data['features']) + test_data['features'] = min_max_scaler.transform(test_data['features']) if feature_mode == 'framewise': # Expand training and validation labels to apply to each frame @@ -86,9 +99,9 @@ def preprocess_split_data(train_data, valid_data, test_data, expand_framewise_labels(valid_data) elif feature_mode == 'stats': # Summarize frames in each file using summary statistics - framewise_to_stats(train_data) - framewise_to_stats(valid_data) - framewise_to_stats(test_data) + framewise_to_stats(train_data, non_overlap) + framewise_to_stats(valid_data, non_overlap) + framewise_to_stats(test_data, non_overlap) else: raise ValueError('Invalid feature mode: {}'.format(feature_mode)) From 7f77e4fcb9fb3ce382cf1592ddeb225905585df3 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 27 Mar 2018 13:18:30 -0400 Subject: [PATCH 157/201] Move non-overlap sampling for classifier training 
outside of stats so it can be used for framewise as well, and allow chunk size to be set --- 06_train_classifier.py | 6 ++++++ classifier/train.py | 10 ++++++---- data/usc/features.py | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index cb1f364..d7b4660 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -144,6 +144,12 @@ def parse_arguments(): action='store_true', default=False) + parser.add_argument('-nocs', + '--non-overlap-chunk-size', + dest='non_overlap_chunk_size', + action='store', + default=10) + parser.add_argument('-umm', '--use-min-max', dest='use_min_max', diff --git a/classifier/train.py b/classifier/train.py index 215fab2..45be50a 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -296,7 +296,8 @@ def train(features_dir, output_dir, model_id, fold_num, model_type='svm', feature_mode='framewise', train_batch_size=64, valid_batch_size=64, random_state=20171021, gsheet_id=None, google_dev_app_name=None, - verbose=False, non_overlap=False, use_min_max=False, **model_args): + verbose=False, non_overlap=False, non_overlap_chunk_size=10, + use_min_max=False, **model_args): init_console_logger(LOGGER, verbose=verbose) LOGGER.debug('Initialized logging.') @@ -368,9 +369,10 @@ def train(features_dir, output_dir, model_id, fold_num, LOGGER.info('Preprocessing data...') min_max_scaler, stdizer = preprocess_split_data(train_data, valid_data, test_data, - feature_mode=feature_mode, - non_overlap=non_overlap, - use_min_max=use_min_max) + feature_mode=feature_mode, + non_overlap=non_overlap, + non_overlap_chunk_size=non_overlap_chunk_size, + use_min_max=use_min_max) min_max_scaler_output_path = os.path.join(model_dir, "min_max_scaler.pkl") joblib.dump(min_max_scaler, min_max_scaler_output_path) diff --git a/data/usc/features.py b/data/usc/features.py index cd980d8..fe2c4d6 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -49,7 +49,7 @@ def one_hot(idx, n_classes=10): return y -def sample_non_overlap(X, chunk_size=10): +def sample_non_overlap_file(X, chunk_size=10): def _chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): @@ -57,12 +57,27 @@ def _chunks(l, n): return np.array([chunk[0] for chunk in _chunks(X, chunk_size)]) -def framewise_to_stats(data, non_overlap=False): +def remove_data_overlap(data, chunk_size=10): + X = [] + file_idxs = [] + new_start_idx = 0 + for start_idx, end_idx in data['file_idxs']: + features = data['features'][start_idx:end_idx] + features = sample_non_overlap_file(features, chunk_size=chunk_size) + X.append(features) + + new_end_idx = new_start_idx + features.shape[0] + file_idxs.append([new_start_idx, new_end_idx]) + new_start_idx = new_end_idx + + data['features'] = np.vstack(X) + data['file_idxs'] = np.array(file_idxs) + + +def framewise_to_stats(data): X = [] for start_idx, end_idx in data['file_idxs']: features = data['features'][start_idx:end_idx] - if non_overlap: - features = sample_non_overlap(features) X.append(compute_stats_features(features)) data['features'] = np.vstack(X) @@ -82,9 +97,15 @@ def expand_framewise_labels(data): def preprocess_split_data(train_data, valid_data, test_data, feature_mode='framewise', non_overlap=False, - use_min_max=False): + non_overlap_chunk_size=10, use_min_max=False): # NOTE: This function mutates data so there aren't extra copies + # Remove overlapping frames if no overlap + if non_overlap: + remove_data_overlap(train_data, 
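# --- Illustrative sketch (not part of the patch): what sample_non_overlap_file
# above keeps. With 1 s windows hopped every 0.1 s, retaining the first frame
# of every chunk of 10 leaves mutually non-overlapping windows. The toy
# feature rows below are hypothetical.
import numpy as np

frames = np.arange(31).reshape(-1, 1)          # 31 framewise feature rows
kept = np.array([frames[i:i + 10][0]           # first row of each chunk of 10
                 for i in range(0, len(frames), 10)])
# kept corresponds to frames 0, 10, 20 and 30 -> no temporal overlap
assert kept.flatten().tolist() == [0, 10, 20, 30]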
chunk_size=non_overlap_chunk_size) + remove_data_overlap(valid_data, chunk_size=non_overlap_chunk_size) + remove_data_overlap(test_data, chunk_size=non_overlap_chunk_size) + # Apply min max scaling to data min_max_scaler = MinMaxScaler() if use_min_max: @@ -99,9 +120,9 @@ def preprocess_split_data(train_data, valid_data, test_data, expand_framewise_labels(valid_data) elif feature_mode == 'stats': # Summarize frames in each file using summary statistics - framewise_to_stats(train_data, non_overlap) - framewise_to_stats(valid_data, non_overlap) - framewise_to_stats(test_data, non_overlap) + framewise_to_stats(train_data) + framewise_to_stats(valid_data) + framewise_to_stats(test_data) else: raise ValueError('Invalid feature mode: {}'.format(feature_mode)) From 83973eb9389ff8c579ea07360fd87ad5f5980d81 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 27 Mar 2018 14:32:29 -0400 Subject: [PATCH 158/201] Remove valid batch size from classifier code --- 06_train_classifier.py | 8 -------- classifier/train.py | 5 ++--- gsheets.py | 1 - jobs/classifier-train-array.sbatch | 1 - 4 files changed, 2 insertions(+), 13 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index d7b4660..2d26c42 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -29,14 +29,6 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-vbs', - '--valid-batch-size', - dest='valid_batch_size', - action='store', - type=int, - default=64, - help='(MLP) Number of validation examples per batch') - parser.add_argument('-tes', '--train-epoch-size', dest='train_epoch_size', diff --git a/classifier/train.py b/classifier/train.py index 45be50a..f0463bc 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -294,7 +294,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, def train(features_dir, output_dir, model_id, fold_num, model_type='svm', feature_mode='framewise', - train_batch_size=64, valid_batch_size=64, + train_batch_size=64, random_state=20171021, gsheet_id=None, google_dev_app_name=None, verbose=False, non_overlap=False, non_overlap_chunk_size=10, use_min_max=False, **model_args): @@ -324,7 +324,6 @@ def train(features_dir, output_dir, model_id, fold_num, 'model_type': model_type, 'feature_mode': feature_mode, 'train_batch_size': train_batch_size, - 'valid_batch_size': valid_batch_size, 'random_state': random_state, 'verbose': verbose, 'git_commit': git.Repo(os.path.dirname(os.path.abspath(__file__)), @@ -426,7 +425,7 @@ def train(features_dir, output_dir, model_id, fold_num, test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'R', 'AB', + update_experiment(service, gsheet_id, config, 'Q', 'AA', update_values, 'classifier') LOGGER.info('Done!') diff --git a/gsheets.py b/gsheets.py index 6fb4fee..310877b 100644 --- a/gsheets.py +++ b/gsheets.py @@ -47,7 +47,6 @@ 'model_type', 'feature_mode', 'train_batch_size', - 'valid_batch_size', 'random_state', 'num_epochs', 'learning_rate', diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 1da6335..8ab93cc 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -33,7 +33,6 @@ python $SRCDIR/06_train_classifier.py \ --num-epochs 10 \ --train-batch-size 128 \ --train-epoch-size 16 \ - --valid-batch-size 512 \ --norm-penalty-factor 1.0 \ --svm-conv-tolerance 0.001 \ --svm-max-iterations 1000000 \ From 
11f8ddb8a652c7f11f1ab34befccb8e1db7314a8 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 27 Mar 2018 15:22:11 -0400 Subject: [PATCH 159/201] Add non-overlap to google sheets for classification results --- classifier/train.py | 4 +++- gsheets.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/classifier/train.py b/classifier/train.py index f0463bc..5036ee5 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -324,6 +324,8 @@ def train(features_dir, output_dir, model_id, fold_num, 'model_type': model_type, 'feature_mode': feature_mode, 'train_batch_size': train_batch_size, + 'non_overlap': non_overlap, + 'non_overlap_chunk_size': non_overlap_chunk_size, 'random_state': random_state, 'verbose': verbose, 'git_commit': git.Repo(os.path.dirname(os.path.abspath(__file__)), @@ -425,7 +427,7 @@ def train(features_dir, output_dir, model_id, fold_num, test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'Q', 'AA', + update_experiment(service, gsheet_id, config, 'R', 'AB', update_values, 'classifier') LOGGER.info('Done!') diff --git a/gsheets.py b/gsheets.py index 310877b..121a60f 100644 --- a/gsheets.py +++ b/gsheets.py @@ -47,6 +47,7 @@ 'model_type', 'feature_mode', 'train_batch_size', + 'non_overlap', 'random_state', 'num_epochs', 'learning_rate', From 5de2eacf9508a8c58e2d057c5ab89a17120c4af4 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 3 Apr 2018 17:55:23 -0400 Subject: [PATCH 160/201] For classifier, remove train epoch size, change model id just be model id prefix, and make the model id generated from the parameters and more descriptive --- 06_train_classifier.py | 11 ++--------- classifier/train.py | 28 +++++++++++++++++++++------- jobs/classifier-train-array.sbatch | 15 +++------------ 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 2d26c42..3cbff76 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -29,13 +29,6 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-tes', - '--train-epoch-size', - dest='train_epoch_size', - action='store', - type=int, - help='(MLP) Number of training batches per epoch') - parser.add_argument('-lr', '--learning-rate', dest='learning_rate', @@ -158,10 +151,10 @@ def parse_arguments(): type=str, help='Path to directory where output files will be stored') - parser.add_argument('model_id', + parser.add_argument('model_id_suffix', action='store', type=str, - help='Identifier for this model') + help='Identifier suffix for this model') parser.add_argument('fold_num', action='store', diff --git a/classifier/train.py b/classifier/train.py index 5036ee5..614503e 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -179,7 +179,7 @@ def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): def train_mlp(train_data, valid_data, test_data, model_dir, - batch_size=64, num_epochs=100, train_epoch_size=None, + batch_size=64, num_epochs=100, learning_rate=1e-4, weight_decay=1e-5, num_classes=10, random_state=12345678, verbose=False, **kwargs): """ @@ -209,8 +209,6 @@ def train_mlp(train_data, valid_data, test_data, model_dir, (Type: int) num_epochs: Number of training epochs (Type: int) - train_epoch_size: Number of training batches per training epoch - (Type: int) learning_rate: Learning rate value (Type: float) weight_decay: L2 regularization factor @@ -255,8 +253,7 @@ def 
train_mlp(train_data, valid_data, test_data, model_dir, m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) LOGGER.debug('Fitting model to data...') m.fit(x=X_train, y=y_train, batch_size=batch_size, - epochs=num_epochs, steps_per_epoch=train_epoch_size, - validation_data=(X_valid, y_valid), callbacks=cb) + epochs=num_epochs, validation_data=(X_valid, y_valid), callbacks=cb) # Compute metrics for train and valid train_pred = m.predict(X_train) @@ -292,7 +289,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, return m, train_metrics, valid_metrics, test_metrics -def train(features_dir, output_dir, model_id, fold_num, +def train(features_dir, output_dir, model_id_suffix, fold_num, model_type='svm', feature_mode='framewise', train_batch_size=64, random_state=20171021, gsheet_id=None, google_dev_app_name=None, @@ -305,8 +302,25 @@ def train(features_dir, output_dir, model_id, fold_num, np.random.seed(random_state) random.seed(random_state) + datasets = ['us8k', 'esc50', 'dcase2013'] + for ds in datasets: + if ds in features_dir: + dataset_name = ds + break + else: + err_msg = 'Feature directory must contain name of dataset ({})' + raise ValueError(err_msg.format(str(datasets))) + + features_desc_str = features_dir[features_dir.index(dataset_name):] + + model_id = os.path.join(features_desc_str, + model_type, feature_mode, + "non-overlap" if non_overlap else "overlap", + "min-max" if use_min_max else "no-min-max", + random_state, model_id_suffix) + # Make sure the directories we need exist - model_dir = os.path.join(output_dir, model_id, model_type, feature_mode, + model_dir = os.path.join(output_dir, model_id, 'fold{}'.format(fold_num), datetime.datetime.now().strftime("%Y%m%d%H%M%S")) diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 8ab93cc..5bdac3e 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -17,7 +17,6 @@ source activate l3embedding-cpu SRCDIR=$HOME/dev/l3embedding FEATURES_DIR=/scratch/jtc440/l3_features/us8k/l3_frames_uniform/l3embedding/cnn_L3_orig/original OUTPUT_DIR=/scratch/jtc440/cls_output -MODEL_ID='mycls' MODEL_TYPE='svm' FEATURE_MODE='framewise' GOOGLE_DEV_APP_NAME='' @@ -30,20 +29,12 @@ python $SRCDIR/06_train_classifier.py \ --random-state 20171021 \ --model-type $MODEL_TYPE \ --feature-mode $FEATURE_MODE \ - --num-epochs 10 \ - --train-batch-size 128 \ - --train-epoch-size 16 \ - --norm-penalty-factor 1.0 \ - --svm-conv-tolerance 0.001 \ - --svm-max-iterations 1000000 \ - --svm-kernel-type rbf \ + --num-epochs 150 \ + --train-batch-size 32 \ --gsheet-id $GSHEET_ID \ --google-dev-app-name $GOOGLE_DEV_APP_NAME \ --verbose \ $FEATURES_DIR \ $OUTPUT_DIR \ - $MODEL_ID \ + $SLURM_JOB_ID \ $FOLD_NUM - -# --learning-rate 0.00001 \ -# --weight-decay 0.00001 \ From 8797329d54feccb37e17c6a8bf89e013fe2d790a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 3 Apr 2018 20:40:03 -0400 Subject: [PATCH 161/201] Change expected input feature directory convention for classifier and change classifier output directory convention to inherit more directly from the input feature directory --- classifier/train.py | 10 ++++------ jobs/classifier-train-array.sbatch | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 614503e..111f601 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -311,16 +311,14 @@ def train(features_dir, output_dir, model_id_suffix, fold_num, err_msg = 'Feature directory must contain name of 
dataset ({})' raise ValueError(err_msg.format(str(datasets))) - features_desc_str = features_dir[features_dir.index(dataset_name):] + features_desc_str = features_dir[features_dir.rindex('features'):] - model_id = os.path.join(features_desc_str, - model_type, feature_mode, + model_id = os.path.join(features_desc_str, model_type, feature_mode, "non-overlap" if non_overlap else "overlap", - "min-max" if use_min_max else "no-min-max", - random_state, model_id_suffix) + "min-max" if use_min_max else "no-min-max") # Make sure the directories we need exist - model_dir = os.path.join(output_dir, model_id, + model_dir = os.path.join(output_dir, 'classifier', model_id, 'fold{}'.format(fold_num), datetime.datetime.now().strftime("%Y%m%d%H%M%S")) diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 5bdac3e..4be8a81 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -36,5 +36,4 @@ python $SRCDIR/06_train_classifier.py \ --verbose \ $FEATURES_DIR \ $OUTPUT_DIR \ - $SLURM_JOB_ID \ $FOLD_NUM From 533f3d81114af6cad3113e9702743bb5380f077e Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 3 Apr 2018 21:34:36 -0400 Subject: [PATCH 162/201] Change classifier model_type to be last part of model id --- classifier/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 111f601..c6a53ad 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -313,9 +313,10 @@ def train(features_dir, output_dir, model_id_suffix, fold_num, features_desc_str = features_dir[features_dir.rindex('features'):] - model_id = os.path.join(features_desc_str, model_type, feature_mode, + model_id = os.path.join(features_desc_str, feature_mode, "non-overlap" if non_overlap else "overlap", - "min-max" if use_min_max else "no-min-max") + "min-max" if use_min_max else "no-min-max", + model_type) # Make sure the directories we need exist model_dir = os.path.join(output_dir, 'classifier', model_id, From 39b566aed30b3652adcf5ebfdd26e389d4fba388 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 3 Apr 2018 22:07:02 -0400 Subject: [PATCH 163/201] Remove lingering references to model_id_suffix --- 06_train_classifier.py | 5 ----- classifier/train.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 3cbff76..7d8af2e 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -151,11 +151,6 @@ def parse_arguments(): type=str, help='Path to directory where output files will be stored') - parser.add_argument('model_id_suffix', - action='store', - type=str, - help='Identifier suffix for this model') - parser.add_argument('fold_num', action='store', type=int, diff --git a/classifier/train.py b/classifier/train.py index c6a53ad..8e3bd48 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -289,7 +289,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, return m, train_metrics, valid_metrics, test_metrics -def train(features_dir, output_dir, model_id_suffix, fold_num, +def train(features_dir, output_dir, fold_num, model_type='svm', feature_mode='framewise', train_batch_size=64, random_state=20171021, gsheet_id=None, google_dev_app_name=None, From d7a462a144f642f7e7d4cedd9f6e7eeea5a35686 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 3 Apr 2018 22:07:43 -0400 Subject: [PATCH 164/201] Replace example features and output directories in classifier training sbatch scripts --- 
jobs/classifier-train-array.sbatch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 4be8a81..9a116ae 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -15,8 +15,8 @@ source ~/.bashrc source activate l3embedding-cpu SRCDIR=$HOME/dev/l3embedding -FEATURES_DIR=/scratch/jtc440/l3_features/us8k/l3_frames_uniform/l3embedding/cnn_L3_orig/original -OUTPUT_DIR=/scratch/jtc440/cls_output +FEATURES_DIR=/scratch/jtc440/sonyc-usc/features/us8k/l3/original/music/cnn_L3_orig +OUTPUT_DIR=/scratch/jtc440/sonyc-usc MODEL_TYPE='svm' FEATURE_MODE='framewise' GOOGLE_DEV_APP_NAME='' From 0b778d4890754ae09abe81f3583f4bcc2a291fb3 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 3 Apr 2018 22:31:16 -0400 Subject: [PATCH 165/201] Remove "features" from classifier model id --- classifier/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifier/train.py b/classifier/train.py index 8e3bd48..7d14b7e 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -311,7 +311,7 @@ def train(features_dir, output_dir, fold_num, err_msg = 'Feature directory must contain name of dataset ({})' raise ValueError(err_msg.format(str(datasets))) - features_desc_str = features_dir[features_dir.rindex('features'):] + features_desc_str = features_dir[features_dir.rindex('features')+9:] model_id = os.path.join(features_desc_str, feature_mode, "non-overlap" if non_overlap else "overlap", From bb96353b5c40445c0c0c9c0fcd9998e383b2960a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 4 Apr 2018 20:59:19 -0400 Subject: [PATCH 166/201] Change embedding generation to fit new output directory convention, and infer embedding model type from the path by convention --- 05_generate_embedding_samples.py | 33 +++++++++----------- jobs/generate_embedding_samples.sbatch | 2 -- jobs/generate_embedding_samples_array.sbatch | 2 -- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 61629a3..9a61ead 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -51,14 +51,6 @@ def parse_arguments(): type=str, help='Path to L3 embedding model weights file') - parser.add_argument('-lmt', - '--l3embedding-model-type', - dest='l3embedding_model_type', - action='store', - type=str, - default='cnn_L3_orig', - help='Type of L3 embedding model') - parser.add_argument('-lpt', '--l3embedding-pooling-type', dest='l3embedding_pooling_type', @@ -128,7 +120,6 @@ def parse_arguments(): LOGGER.debug('Initialized logging.') # Unpack CL args - model_type = args['l3embedding_model_type'] pooling_type = args['l3embedding_pooling_type'] metadata_path = args['metadata_path'] data_dir = args['data_dir'] @@ -146,29 +137,35 @@ def parse_arguments(): LOGGER.info('Configuration: {}'.format(str(args))) is_l3_feature = features == 'l3' + if is_l3_feature and not model_path: + raise ValueError('Must provide model path is L3 embedding features are used') + + if is_l3_feature: + # Get output dir + model_desc_start_idx = model_path.rindex('embedding')+10 + model_desc_end_idx = os.path.dirname(model_path).rindex('/') + embedding_desc_str = model_path[model_desc_start_idx:model_desc_end_idx] + # If using an L3 model, make model arch. 
type and pooling type to path + dataset_output_dir = os.path.join(output_dir, dataset_name, features, + pooling_type, embedding_desc_str) - if is_l3_feature and model_path: # Load L3 embedding model if using L3 features LOGGER.info('Loading embedding model...') + model_type = embedding_desc_str.split('/')[-1] l3embedding_model = load_embedding(model_path, model_type, 'audio', pooling_type, tgt_num_gpus=num_gpus) else: + # Get output dir + dataset_output_dir = os.path.join(output_dir, dataset_name, features) l3embedding_model = None - if is_l3_feature: - # If using an L3 model, make model arch. type and pooling type to path - dataset_output_dir = os.path.join(output_dir, dataset_name, features, - model_id, model_type, pooling_type) - else: - dataset_output_dir = os.path.join(output_dir, dataset_name, features, - model_id) - # Make sure output directory exists if not os.path.isdir(dataset_output_dir): os.makedirs(dataset_output_dir) + args['features_dir'] = dataset_output_dir # Write configurations to a file for reproducibility/posterity config_path = os.path.join(dataset_output_dir, 'config_{}.json'.format(fold_num)) with open(config_path, 'w') as f: diff --git a/jobs/generate_embedding_samples.sbatch b/jobs/generate_embedding_samples.sbatch index 9563b3e..d578ed1 100644 --- a/jobs/generate_embedding_samples.sbatch +++ b/jobs/generate_embedding_samples.sbatch @@ -17,7 +17,6 @@ source activate l3embedding SRCDIR=$HOME/dev/l3embedding L3_MODEL_PATH='' -L3_MODEL_TYPE='cnn_L3_melspec1' L3_POOLING_TYPE='original' US8K_PATH=/scratch/jtc440/UrbanSound8K METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv @@ -32,7 +31,6 @@ python $SRCDIR/05_generate_embedding_samples.py \ --verbose \ --features 'l3' \ --l3embedding-model-path $L3_MODEL_PATH \ - --l3embedding-model-type $L3_MODEL_TYPE \ --l3embedding-pooling-type $L3_POOLING_TYPE \ --hop-size 0.1 \ --gpus 0 \ diff --git a/jobs/generate_embedding_samples_array.sbatch b/jobs/generate_embedding_samples_array.sbatch index 712d301..a258af7 100644 --- a/jobs/generate_embedding_samples_array.sbatch +++ b/jobs/generate_embedding_samples_array.sbatch @@ -17,7 +17,6 @@ source activate l3embedding-cpu SRCDIR=$HOME/dev/l3embedding L3_MODEL_PATH='' -L3_MODEL_TYPE='cnn_L3_melspec1' L3_POOLING_TYPE='original' US8K_PATH=/beegfs/jtc440/UrbanSound8K METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv @@ -35,7 +34,6 @@ python $SRCDIR/05_generate_embedding_samples.py \ --verbose \ --features 'l3' \ --l3embedding-model-path $L3_MODEL_PATH \ - --l3embedding-model-type $L3_MODEL_TYPE \ --l3embedding-pooling-type $L3_POOLING_TYPE \ --hop-size 0.1 \ --gpus 0 \ From 7d90d7ef57bf762c806726cf7c4dc7aed81b925a Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 4 Apr 2018 21:10:56 -0400 Subject: [PATCH 167/201] Remove embedding model ID from embedding training code and derive from data subset name and model type --- 03_train_embedding.py | 5 ----- jobs/l3embedding-train-02052018.sbatch | 2 -- jobs/l3embedding-train-02162018.sbatch | 2 -- jobs/l3embedding-train-melspec2-03202018.sbatch | 2 -- jobs/l3embedding-train.sbatch | 2 -- l3embedding/train.py | 7 ++++++- 6 files changed, 6 insertions(+), 14 deletions(-) diff --git a/03_train_embedding.py b/03_train_embedding.py index 62d8659..cc513f8 100644 --- a/03_train_embedding.py +++ b/03_train_embedding.py @@ -144,11 +144,6 @@ def parse_arguments(): type=str, help='Path to directory where validation set files are stored') - parser.add_argument('model_id', - action='store', - type=str, - help='Identifier for this model') - 
parser.add_argument('output_dir', action='store', type=str, diff --git a/jobs/l3embedding-train-02052018.sbatch b/jobs/l3embedding-train-02052018.sbatch index 15707a7..cecbc57 100644 --- a/jobs/l3embedding-train-02052018.sbatch +++ b/jobs/l3embedding-train-02052018.sbatch @@ -19,7 +19,6 @@ source activate l3embedding SRCDIR=$HOME/dev/l3embedding TRAIN_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_train VAL_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_valid -MODEL_ID='new_train_test' OUTPUT_DIR=/scratch/myname/l3_output module purge @@ -40,5 +39,4 @@ python $SRCDIR/03_train_embedding.py \ --verbose \ $TRAIN_DATA_DIR \ $VAL_DATA_DIR \ - $MODEL_ID \ $OUTPUT_DIR diff --git a/jobs/l3embedding-train-02162018.sbatch b/jobs/l3embedding-train-02162018.sbatch index a30a28d..48c089a 100644 --- a/jobs/l3embedding-train-02162018.sbatch +++ b/jobs/l3embedding-train-02162018.sbatch @@ -19,7 +19,6 @@ source activate l3embedding-cpu SRCDIR=$HOME/dev/l3embedding TRAIN_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_train VAL_DATA_DIR=/beegfs/work/AudioSetSamples/filtered_valid -MODEL_ID='new_train_test' OUTPUT_DIR=/scratch/$USER/l3_output GOOGLE_DEV_APP_NAME='l3embeddingexperiments' GSHEET_ID='' # REPLACE THIS @@ -45,5 +44,4 @@ python $SRCDIR/03_train_embedding.py \ --verbose \ $TRAIN_DATA_DIR \ $VAL_DATA_DIR \ - $MODEL_ID \ $OUTPUT_DIR diff --git a/jobs/l3embedding-train-melspec2-03202018.sbatch b/jobs/l3embedding-train-melspec2-03202018.sbatch index 4b6d2a9..223a415 100644 --- a/jobs/l3embedding-train-melspec2-03202018.sbatch +++ b/jobs/l3embedding-train-melspec2-03202018.sbatch @@ -19,7 +19,6 @@ source activate l3embedding SRCDIR=$HOME/dev/l3embedding TRAIN_DATA_DIR=/beegfs/work/AudioSetSamples_environmental/urban_train VAL_DATA_DIR=/beegfs/work/AudioSetSamples_environmental/urban_valid -MODEL_ID='melspec2' OUTPUT_DIR=/scratch/jtc440/l3_output GOOGLE_DEV_APP_NAME='l3embeddingexperiments' GSHEET_ID='' # REPLACE THIS @@ -45,5 +44,4 @@ python $SRCDIR/03_train_embedding.py \ --verbose \ $TRAIN_DATA_DIR \ $VAL_DATA_DIR \ - $MODEL_ID \ $OUTPUT_DIR diff --git a/jobs/l3embedding-train.sbatch b/jobs/l3embedding-train.sbatch index 8bee837..2064439 100644 --- a/jobs/l3embedding-train.sbatch +++ b/jobs/l3embedding-train.sbatch @@ -18,7 +18,6 @@ source activate l3embedding SRCDIR='' TRAIN_DATA_DIR='' VAL_DATA_DIR='' -MODEL_ID='mymodel' OUTPUT_DIR='' module purge @@ -51,5 +50,4 @@ python $SRCDIR/train.py \ --verbose \ $TRAIN_DATA_DIR \ $VAL_DATA_DIR \ - $MODEL_ID \ $OUTPUT_DIR diff --git a/l3embedding/train.py b/l3embedding/train.py index ef1629e..ab8ae0c 100644 --- a/l3embedding/train.py +++ b/l3embedding/train.py @@ -215,7 +215,7 @@ def get_restart_info(history_path): return int(last['epoch']), float(last['val_acc']), float(last['val_loss']) -def train(train_data_dir, validation_data_dir, model_id, output_dir, +def train(train_data_dir, validation_data_dir, output_dir, num_epochs=150, train_epoch_size=512, validation_epoch_size=1024, train_batch_size=64, validation_batch_size=64, model_type='cnn_L3_orig', random_state=20180123, @@ -228,6 +228,11 @@ def train(train_data_dir, validation_data_dir, model_id, output_dir, init_file_logger(LOGGER, log_path=log_path) LOGGER.debug('Initialized logging.') + # Form model ID + data_subset_name = os.path.basename(train_data_dir) + data_subset_name = data_subset_name[:data_subset_name.rindex('_')] + model_id = os.path.join(data_subset_name, model_type) + param_dict = { 'username': getpass.getuser(), 'train_data_dir': train_data_dir, From 04fa4d75090c868502db8a42551a1ea82e1329d3 
Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Thu, 5 Apr 2018 10:10:54 -0400 Subject: [PATCH 168/201] Add code to process dcase2013 and esc50 --- 05_generate_embedding_samples.py | 29 ++++++++++- data/usc/dcase2013.py | 88 ++++++++++++++++++++++++++++++++ data/usc/esc50.py | 74 +++++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 data/usc/dcase2013.py create mode 100644 data/usc/esc50.py diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 9a61ead..9a3ac84 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -3,6 +3,8 @@ import os import json from l3embedding.model import load_embedding +from data.usc.dcase2013 import generate_dcase2013_folds, generate_dcase2013_fold_data +from data.usc.esc50 import generate_esc50_folds, generate_esc50_fold_data from data.usc.us8k import generate_us8k_folds, generate_us8k_fold_data from log import init_console_logger @@ -98,7 +100,7 @@ def parse_arguments(): parser.add_argument('dataset_name', action='store', type=str, - choices=['us8k'], + choices=['us8k', 'esc50', 'dcase2013'], help='Name of dataset') parser.add_argument('data_dir', @@ -187,6 +189,31 @@ def parse_arguments(): l3embedding_model=l3embedding_model, features=features, random_state=random_state, hop_size=hop_size, num_random_samples=num_random_samples) + + elif dataset_name == 'esc50': + if fold_num is not None: + generate_esc50_fold_data(data_dir, fold_num-1, dataset_output_dir, + l3embedding_model=l3embedding_model, + features=features, random_state=random_state, + hop_size=hop_size, num_random_samples=num_random_samples) + else: + generate_esc50_folds(data_dir, dataset_output_dir, + l3embedding_model=l3embedding_model, + features=features, random_state=random_state, + hop_size=hop_size, num_random_samples=num_random_samples) + + elif dataset_name == 'dcase2013': + if fold_num is not None: + generate_dcase2013_fold_data(data_dir, fold_num-1, dataset_output_dir, + l3embedding_model=l3embedding_model, + features=features, random_state=random_state, + hop_size=hop_size, num_random_samples=num_random_samples) + else: + generate_dcase2013_folds(data_dir, dataset_output_dir, + l3embedding_model=l3embedding_model, + features=features, random_state=random_state, + hop_size=hop_size, num_random_samples=num_random_samples) + else: LOGGER.error('Invalid dataset name: {}'.format(dataset_name)) diff --git a/data/usc/dcase2013.py b/data/usc/dcase2013.py new file mode 100644 index 0000000..4a37efb --- /dev/null +++ b/data/usc/dcase2013.py @@ -0,0 +1,88 @@ +import glob +import logging +import os +import random + +import numpy as np + +from data.usc.features as compute_file_features +from log import LogTimer + +LOGGER = logging.getLogger('cls-data-generation') +LOGGER.setLevel(logging.DEBUG) + + +CLASS_TO_INT = { + 'bus': 0, + 'busystreet': 1, + 'office': 2, + 'openairmarket': 3, + 'park': 4, + 'quietstreet': 5, + 'restaurant': 6, + 'supermarket': 7, + 'tube': 8, + 'tubestation': 9 +} + + +def generate_dcase2013_folds(data_dir, output_dir, l3embedding_model=None, + features='l3', random_state=12345678, **feature_args): + for fold_idx in range(2): + generate_dcase2013_fold_data(data_dir, fold_idx, output_dir, + l3embedding_model=l3embedding_model, + features=features, random_state=random_state, + **feature_args) + + +def generate_dcase2013_fold_data(data_dir, fold_idx, output_dir, l3embedding_model=None, + features='l3', random_state=12345678, **feature_args): + # Set random seed + 
random_state = random_state + fold_idx + random.seed(random_state) + np.random.seed(random_state) + + audio_fold_dir = os.path.join(data_dir, "fold{}".format(fold_idx+1)) + + # Create fold directory if it does not exist + output_fold_dir = os.path.join(output_dir, "fold{}".format(fold_idx+1)) + if not os.path.isdir(output_fold_dir): + os.makedirs(output_fold_dir) + + LOGGER.info('Generating fold {} in {}'.format(fold_idx+1, output_fold_dir)) + + files = glob.glob(audio_fold_dir + '/*') + num_files = len(files) + + for idx, f in enumerate(files): + fname = f.split('/')[-1] + desc = '({}/{}) Processed {} -'.format(idx+1, num_files, fname) + with LogTimer(LOGGER, desc, log_level=logging.DEBUG): + generate_esc50_file_data(fname, audio_fold_dir, output_fold_dir, + features, l3embedding_model, **feature_args) + + +def generate_dcase2013_file_data(fname, audio_fold_dir, output_fold_dir, + features, l3embedding_model, **feature_args): + audio_path = os.path.join(audio_fold_dir, fname) + + basename, _ = os.path.splitext(fname) + output_path = os.path.join(output_fold_dir, basename + '.npz') + + if os.path.exists(output_path): + LOGGER.info('File {} already exists'.format(output_path)) + return + + X = compute_file_features(audio_path, features, l3embedding_model=l3embedding_model, **feature_args) + + # If we were not able to compute the features, skip this file + if X is None: + LOGGER.error('Could not generate data for {}'.format(audio_path)) + return + + class_label = CLASS_TO_INT[basename[:-2]] + y = class_label + + np.savez_compressed(output_path, X=X, y=y) + + return output_path, 'success' diff --git a/data/usc/esc50.py b/data/usc/esc50.py new file mode 100644 index 0000000..ea33d99 --- /dev/null +++ b/data/usc/esc50.py @@ -0,0 +1,74 @@ +import glob +import logging +import os +import random + +import numpy as np + +from data.usc.features as compute_file_features +from log import LogTimer + +LOGGER = logging.getLogger('cls-data-generation') +LOGGER.setLevel(logging.DEBUG) + + +def generate_esc50_folds(data_dir, output_dir, l3embedding_model=None, + features='l3', random_state=12345678, **feature_args): + for fold_idx in range(5): + generate_esc50_fold_data(data_dir, fold_idx, output_dir, + l3embedding_model=l3embedding_model, + features=features, random_state=random_state, + **feature_args) + + +def generate_esc50_fold_data(data_dir, fold_idx, output_dir, l3embedding_model=None, + features='l3', random_state=12345678, **feature_args): + # Set random seed + random_state = random_state + fold_idx + random.seed(random_state) + np.random.seed(random_state) + + audio_fold_dir = os.path.join(data_dir, "fold{}".format(fold_idx+1)) + + # Create fold directory if it does not exist + output_fold_dir = os.path.join(output_dir, "fold{}".format(fold_idx+1)) + if not os.path.isdir(output_fold_dir): + os.makedirs(output_fold_dir) + + LOGGER.info('Generating fold {} in {}'.format(fold_idx+1, output_fold_dir)) + + files = glob.glob(audio_fold_dir + '/*') + num_files = len(files) + + for idx, f in enumerate(files): + fname = f.split('/')[-1] + desc = '({}/{}) Processed {} -'.format(idx+1, num_files, fname) + with LogTimer(LOGGER, desc, log_level=logging.DEBUG): + generate_esc50_file_data(fname, audio_fold_dir, output_fold_dir, + features, l3embedding_model, **feature_args) + + +def generate_esc50_file_data(fname, audio_fold_dir, output_fold_dir, + features, l3embedding_model, **feature_args): + audio_path = os.path.join(audio_fold_dir, fname) + + basename, _ = os.path.splitext(fname) + output_path = 
os.path.join(output_fold_dir, basename + '.npz') + + if os.path.exists(output_path): + LOGGER.info('File {} already exists'.format(output_path)) + return + + X = compute_file_features(audio_path, features, l3embedding_model=l3embedding_model, **feature_args) + + # If we were not able to compute the features, skip this file + if X is None: + LOGGER.error('Could not generate data for {}'.format(audio_path)) + return + + class_label = int(basename.split('-')[-1]) + y = class_label + + np.savez_compressed(output_path, X=X, y=y) + + return output_path, 'success' From f9b0f6d26a5c2aed611bd918547668407dd18cc0 Mon Sep 17 00:00:00 2001 From: Ho-Hsiang Wu Date: Thu, 5 Apr 2018 10:25:34 -0400 Subject: [PATCH 169/201] wrong function name --- data/usc/dcase2013.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/usc/dcase2013.py b/data/usc/dcase2013.py index 4a37efb..8e764b9 100644 --- a/data/usc/dcase2013.py +++ b/data/usc/dcase2013.py @@ -58,7 +58,7 @@ def generate_dcase2013_fold_data(data_dir, fold_idx, output_dir, l3embedding_mod fname = f.split('/')[-1] desc = '({}/{}) Processed {} -'.format(idx+1, num_files, fname) with LogTimer(LOGGER, desc, log_level=logging.DEBUG): - generate_esc50_file_data(fname, audio_fold_dir, output_fold_dir, + generate_dcase2013_file_data(fname, audio_fold_dir, output_fold_dir, features, l3embedding_model, **feature_args) From 9f0ae6aa272e566bb5c5bc73530db94993d82673 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 5 Apr 2018 21:58:24 -0400 Subject: [PATCH 170/201] Remove unnecessary model ID argument from feature generation code --- 05_generate_embedding_samples.py | 6 ------ jobs/generate_embedding_samples_array.sbatch | 5 ++--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 9a3ac84..4c181ed 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -92,11 +92,6 @@ def parse_arguments(): type=str, help='Path to UrbanSound8K metadata file') - parser.add_argument('model_id', - action='store', - type=str, - help='Model ID used to generate features (or experiment ID)') - parser.add_argument('dataset_name', action='store', type=str, @@ -132,7 +127,6 @@ def parse_arguments(): model_path = args['l3embedding_model_path'] num_gpus = args['gpus'] output_dir = args['output_dir'] - model_id = args['model_id'] dataset_name = args['dataset_name'] fold_num = args['fold'] diff --git a/jobs/generate_embedding_samples_array.sbatch b/jobs/generate_embedding_samples_array.sbatch index a258af7..cdf437d 100644 --- a/jobs/generate_embedding_samples_array.sbatch +++ b/jobs/generate_embedding_samples_array.sbatch @@ -22,7 +22,7 @@ US8K_PATH=/beegfs/jtc440/UrbanSound8K METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv DATA_DIR=$US8K_PATH/audio OUTPUT_DIR=/scratch/jtc440/l3_features -MODEL_ID='l3embedding' +DATASET='us8k' module purge #module load cuda/8.0.44 @@ -39,7 +39,6 @@ python $SRCDIR/05_generate_embedding_samples.py \ --gpus 0 \ --fold $SLURM_ARRAY_TASK_ID \ $METADATA_PATH \ - $MODEL_ID \ - us8k \ + $DATASET \ $DATA_DIR \ $OUTPUT_DIR From 07fdc4dd517c08cca01ed09db2243e69cc180c6b Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 5 Apr 2018 22:07:36 -0400 Subject: [PATCH 171/201] Make US8k metadata path an optional argument and update sbatch script --- 05_generate_embedding_samples.py | 13 +++++++++---- ...=> generate_embedding_samples_array_us8k.sbatch} | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) rename 
jobs/{generate_embedding_samples_array.sbatch => generate_embedding_samples_array_us8k.sbatch} (92%) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 4c181ed..5e15280 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -76,7 +76,8 @@ def parse_arguments(): type=int, help='Number of random samples for randomized sampling methods') - parser.add_argument('--gpus', + parser.add_argument('-g', + '--gpus', dest='gpus', type=int, default=0, @@ -87,9 +88,11 @@ def parse_arguments(): type=int, help='Fold number to generate. If unused, generate all folds') - parser.add_argument('metadata_path', - action='store', + parser.add_argument('-ump', + '--us8k-metadata-path', + dest='us8k_metadata_path', type=str, + action='store', help='Path to UrbanSound8K metadata file') parser.add_argument('dataset_name', @@ -118,7 +121,7 @@ def parse_arguments(): # Unpack CL args pooling_type = args['l3embedding_pooling_type'] - metadata_path = args['metadata_path'] + metadata_path = args['us8k_metadata_path'] data_dir = args['data_dir'] features = args['features'] hop_size = args['hop_size'] @@ -169,6 +172,8 @@ def parse_arguments(): LOGGER.info('Saved configuration to {}'.format(config_path)) if dataset_name == 'us8k': + if not metadata_path: + raise ValueError('Must provide metadata file for UrbanSound8k') if fold_num is not None: # Generate a single fold if a fold was specified diff --git a/jobs/generate_embedding_samples_array.sbatch b/jobs/generate_embedding_samples_array_us8k.sbatch similarity index 92% rename from jobs/generate_embedding_samples_array.sbatch rename to jobs/generate_embedding_samples_array_us8k.sbatch index cdf437d..ca95012 100644 --- a/jobs/generate_embedding_samples_array.sbatch +++ b/jobs/generate_embedding_samples_array_us8k.sbatch @@ -21,7 +21,7 @@ L3_POOLING_TYPE='original' US8K_PATH=/beegfs/jtc440/UrbanSound8K METADATA_PATH=$US8K_PATH/metadata/UrbanSound8K.csv DATA_DIR=$US8K_PATH/audio -OUTPUT_DIR=/scratch/jtc440/l3_features +OUTPUT_DIR=/scratch/jtc440/sonyc-usc DATASET='us8k' module purge @@ -38,7 +38,7 @@ python $SRCDIR/05_generate_embedding_samples.py \ --hop-size 0.1 \ --gpus 0 \ --fold $SLURM_ARRAY_TASK_ID \ - $METADATA_PATH \ + --us8k-metadata-path $METADATA_PATH \ $DATASET \ $DATA_DIR \ $OUTPUT_DIR From 35c86d85943d205a53956c9b681dd038f1915fcc Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 5 Apr 2018 22:09:21 -0400 Subject: [PATCH 172/201] Add "features" to embedding generation output path --- 05_generate_embedding_samples.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/05_generate_embedding_samples.py b/05_generate_embedding_samples.py index 5e15280..b4e666a 100644 --- a/05_generate_embedding_samples.py +++ b/05_generate_embedding_samples.py @@ -145,8 +145,8 @@ def parse_arguments(): model_desc_end_idx = os.path.dirname(model_path).rindex('/') embedding_desc_str = model_path[model_desc_start_idx:model_desc_end_idx] # If using an L3 model, make model arch. 
type and pooling type to path - dataset_output_dir = os.path.join(output_dir, dataset_name, features, - pooling_type, embedding_desc_str) + dataset_output_dir = os.path.join(output_dir, 'features', dataset_name, + features, pooling_type, embedding_desc_str) # Load L3 embedding model if using L3 features LOGGER.info('Loading embedding model...') @@ -157,7 +157,7 @@ def parse_arguments(): tgt_num_gpus=num_gpus) else: # Get output dir - dataset_output_dir = os.path.join(output_dir, dataset_name, features) + dataset_output_dir = os.path.join(output_dir, 'features', dataset_name, features) l3embedding_model = None # Make sure output directory exists From db4ffaf8909b492bf78a5083728ab08c119bde99 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 5 Apr 2018 22:12:12 -0400 Subject: [PATCH 173/201] Add sbatch scripts for ESC-50 and DCASE2013 embedding generation --- ...e_embedding_samples_array_dcase2013.sbatch | 41 +++++++++++++++++++ ...erate_embedding_samples_array_esc50.sbatch | 41 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 jobs/generate_embedding_samples_array_dcase2013.sbatch create mode 100644 jobs/generate_embedding_samples_array_esc50.sbatch diff --git a/jobs/generate_embedding_samples_array_dcase2013.sbatch b/jobs/generate_embedding_samples_array_dcase2013.sbatch new file mode 100644 index 0000000..c8c1859 --- /dev/null +++ b/jobs/generate_embedding_samples_array_dcase2013.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-l3embedding-samples +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-l3embedding-samples-%A-%a.out" +#SBATCH --err="generate-l3embedding-samples-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding-cpu + +SRCDIR=$HOME/dev/l3embedding +L3_MODEL_PATH='' +L3_POOLING_TYPE='original' +DATA_DIR=/scratch/hhw230/dcase2013/audio +OUTPUT_DIR=/scratch/jtc440/sonyc-usc +DATASET='dcase2013' + +module purge +#module load cuda/8.0.44 +#module load cudnn/8.0v6.0 +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/05_generate_embedding_samples.py \ + --random-state 20180302 \ + --verbose \ + --features 'l3' \ + --l3embedding-model-path $L3_MODEL_PATH \ + --l3embedding-pooling-type $L3_POOLING_TYPE \ + --hop-size 0.1 \ + --gpus 0 \ + --fold $SLURM_ARRAY_TASK_ID \ + $DATASET \ + $DATA_DIR \ + $OUTPUT_DIR diff --git a/jobs/generate_embedding_samples_array_esc50.sbatch b/jobs/generate_embedding_samples_array_esc50.sbatch new file mode 100644 index 0000000..c2f6d6c --- /dev/null +++ b/jobs/generate_embedding_samples_array_esc50.sbatch @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +#SBATCH --job-name=generate-l3embedding-samples +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64GB +#SBATCH --time=7-0 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=name@email.com +#SBATCH --output="generate-l3embedding-samples-%A-%a.out" +#SBATCH --err="generate-l3embedding-samples-%A-%a.err" + + +source ~/.bashrc +cd /home/$USER/dev +source activate l3embedding-cpu + +SRCDIR=$HOME/dev/l3embedding +L3_MODEL_PATH='' +L3_POOLING_TYPE='original' +DATA_DIR=/scratch/hhw230/esc50/audio +OUTPUT_DIR=/scratch/jtc440/sonyc-usc +DATASET='esc50' + +module purge +#module load cuda/8.0.44 +#module load cudnn/8.0v6.0 +module load ffmpeg/intel/3.2.2 + +python $SRCDIR/05_generate_embedding_samples.py \ + --random-state 20180302 \ + --verbose \ + --features 'l3' \ + --l3embedding-model-path $L3_MODEL_PATH \ + 
--l3embedding-pooling-type $L3_POOLING_TYPE \ + --hop-size 0.1 \ + --gpus 0 \ + --fold $SLURM_ARRAY_TASK_ID \ + $DATASET \ + $DATA_DIR \ + $OUTPUT_DIR From 29407dea641d1257a3a07aee74b4a5e385511b30 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 7 Apr 2018 04:22:14 -0400 Subject: [PATCH 174/201] Fix typo --- data/usc/dcase2013.py | 2 +- data/usc/esc50.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/usc/dcase2013.py b/data/usc/dcase2013.py index 8e764b9..6244ce1 100644 --- a/data/usc/dcase2013.py +++ b/data/usc/dcase2013.py @@ -5,7 +5,7 @@ import numpy as np -from data.usc.features as compute_file_features +from data.usc.features import compute_file_features from log import LogTimer LOGGER = logging.getLogger('cls-data-generation') diff --git a/data/usc/esc50.py b/data/usc/esc50.py index ea33d99..b9255b8 100644 --- a/data/usc/esc50.py +++ b/data/usc/esc50.py @@ -5,7 +5,7 @@ import numpy as np -from data.usc.features as compute_file_features +from data.usc.features import compute_file_features from log import LogTimer LOGGER = logging.getLogger('cls-data-generation') From a3b5ebaa9830c363e906ae30e56b384db70c991b Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 10 Apr 2018 04:13:36 -0400 Subject: [PATCH 175/201] Support training classifier on all datasets --- classifier/train.py | 14 +++- data/usc/dcase2013.py | 4 +- data/usc/esc50.py | 4 +- data/usc/folds.py | 105 ++++++++++++++++++++++++ data/usc/us8k.py | 182 +----------------------------------------- 5 files changed, 127 insertions(+), 182 deletions(-) create mode 100644 data/usc/folds.py diff --git a/classifier/train.py b/classifier/train.py index 7d14b7e..c5518f7 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -20,7 +20,7 @@ from classifier.metrics import compute_metrics from data.usc.features import preprocess_split_data -from data.usc.us8k import get_us8k_split +from data.usc.folds import get_split from l3embedding.train import LossHistory from log import * @@ -31,6 +31,13 @@ LOGGER.setLevel(logging.DEBUG) +DATASET_NUM_CLASSES = { + 'us8k': 10, + 'esc50': 50, + 'dcase2013': 10, +} + + class MetricCallback(keras.callbacks.Callback): def __init__(self, valid_data, verbose=False): @@ -312,6 +319,7 @@ def train(features_dir, output_dir, fold_num, raise ValueError(err_msg.format(str(datasets))) features_desc_str = features_dir[features_dir.rindex('features')+9:] + dataset_name = features_desc_str.split('/')[0] model_id = os.path.join(features_desc_str, feature_mode, "non-overlap" if non_overlap else "overlap", @@ -379,7 +387,7 @@ def train(features_dir, output_dir, fold_num, fold_idx = fold_num - 1 LOGGER.info('Loading data for configuration with test fold {}...'.format(fold_num)) - train_data, valid_data, test_data = get_us8k_split(features_dir, fold_idx) + train_data, valid_data, test_data = get_split(features_dir, fold_idx, dataset_name) LOGGER.info('Preprocessing data...') min_max_scaler, stdizer = preprocess_split_data(train_data, valid_data, test_data, @@ -398,12 +406,14 @@ def train(features_dir, output_dir, fold_num, if model_type == 'svm': model, train_metrics, valid_metrics, test_metrics \ = train_svm(train_data, valid_data, test_data, model_dir, + num_classes=DATASET_NUM_CLASSES[dataset_name], random_state=random_state, verbose=verbose, **model_args) elif model_type == 'mlp': model, train_metrics, valid_metrics, test_metrics \ = train_mlp(train_data, valid_data, test_data, model_dir, batch_size=train_batch_size, random_state=random_state, + 
num_classes=DATASET_NUM_CLASSES[dataset_name], verbose=verbose, **model_args) else: diff --git a/data/usc/dcase2013.py b/data/usc/dcase2013.py index 6244ce1..0194ff9 100644 --- a/data/usc/dcase2013.py +++ b/data/usc/dcase2013.py @@ -12,6 +12,8 @@ LOGGER.setLevel(logging.DEBUG) +NUM_FOLDS = 2 + CLASS_TO_INT = { 'bus': 0, 'busystreet': 1, @@ -28,7 +30,7 @@ def generate_dcase2013_folds(data_dir, output_dir, l3embedding_model=None, features='l3', random_state=12345678, **feature_args): - for fold_idx in range(2): + for fold_idx in range(NUM_FOLDS): generate_dcase2013_fold_data(data_dir, fold_idx, output_dir, l3embedding_model=l3embedding_model, features=features, random_state=random_state, diff --git a/data/usc/esc50.py b/data/usc/esc50.py index b9255b8..efad024 100644 --- a/data/usc/esc50.py +++ b/data/usc/esc50.py @@ -11,10 +11,12 @@ LOGGER = logging.getLogger('cls-data-generation') LOGGER.setLevel(logging.DEBUG) +NUM_FOLDS = 5 + def generate_esc50_folds(data_dir, output_dir, l3embedding_model=None, features='l3', random_state=12345678, **feature_args): - for fold_idx in range(5): + for fold_idx in range(NUM_FOLDS): generate_esc50_fold_data(data_dir, fold_idx, output_dir, l3embedding_model=l3embedding_model, features=features, random_state=random_state, diff --git a/data/usc/folds.py b/data/usc/folds.py new file mode 100644 index 0000000..0f71cc6 --- /dev/null +++ b/data/usc/folds.py @@ -0,0 +1,105 @@ +import os +import numpy as np + +from .us8k import NUM_FOLDS as NUM_FOLDS_US8K +from .esc50 import NUM_FOLDS as NUM_FOLDS_ESC50 +from .dcase2013 import NUM_FOLDS as NUM_FOLDS_DCASE2013 + + +DATASET_NUM_FOLDS = { + 'us8k': NUM_FOLDS_US8K, + 'esc50': NUM_FOLDS_ESC50, + 'dcase2013': NUM_FOLDS_DCASE2013 +} + + +def load_feature_file(feature_filepath): + data = np.load(feature_filepath) + X, y = data['X'], data['y'] + if type(y) == np.ndarray and y.ndim == 0: + y = int(y) + return X, y + + +def get_fold(feature_dir, fold_idx): + X = [] + y = [] + file_idxs = [] + fold_dir = os.path.join(feature_dir, 'fold{}'.format(fold_idx + 1)) + + filenames = os.listdir(fold_dir) + + start_idx = 0 + for feature_filename in filenames: + feature_filepath = os.path.join(fold_dir, feature_filename) + file_X, file_y = load_feature_file(feature_filepath) + + if file_X.ndim > 1: + end_idx = start_idx + file_X.shape[0] + else: + end_idx = start_idx + 1 + + X.append(file_X) + y.append(file_y) + file_idxs.append([start_idx, end_idx]) + + start_idx = end_idx + + X = np.vstack(X) + + if type(y[0]) == int or y[0].ndim == 0: + y = np.array(y) + else: + y = np.concatenate(y) + + file_idxs = np.array(file_idxs) + + return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} + + +def get_split(feature_dir, test_fold_idx, dataset_name): + if dataset_name not in DATASET_NUM_FOLDS: + raise ValueError('Invalid dataset: {}'.format(dataset_name)) + num_folds = DATASET_NUM_FOLDS[dataset_name] + train_data = get_train_folds(feature_dir, test_fold_idx, num_folds) + valid_data = get_fold(feature_dir, get_valid_fold_idx(test_fold_idx, num_folds)) + test_data = get_fold(feature_dir, test_fold_idx) + + return train_data, valid_data, test_data + + +def get_valid_fold_idx(test_fold_idx, num_folds): + return (test_fold_idx - 1) % num_folds + + +def get_train_folds(feature_dir, test_fold_idx, num_folds): + X = [] + y = [] + file_idxs = [] + filenames = [] + + valid_fold_idx = get_valid_fold_idx(test_fold_idx, num_folds) + + for fold_idx in range(num_folds): + if fold_idx in (valid_fold_idx, test_fold_idx): + continue + + 
fold_data = get_fold(feature_dir, fold_idx) + + X.append(fold_data['features']) + y.append(fold_data['labels']) + idxs = fold_data['file_idxs'] + if len(file_idxs) > 0: + # Since we're appending all of the file indices together, increment + # the current fold indices by the current global index + idxs = idxs + file_idxs[-1][-1, -1] + file_idxs.append(idxs) + + filenames += fold_data['filenames'] + + X = np.vstack(X) + y = np.concatenate(y) + file_idxs = np.vstack(file_idxs) + + return {'features': X, 'labels': y, 'file_idxs': file_idxs, + 'filenames': filenames} diff --git a/data/usc/us8k.py b/data/usc/us8k.py index b082348..e72cbd3 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -2,8 +2,6 @@ import logging import os import random -import pescador - import numpy as np import data.usc.features as cls_features @@ -13,6 +11,8 @@ LOGGER.setLevel(logging.DEBUG) +NUM_FOLDS = 10 + def load_us8k_metadata(path): """ Load UrbanSound8K metadata @@ -23,7 +23,7 @@ def load_us8k_metadata(path): metadata: List of metadata dictionaries (Type: list[dict[str, *]]) """ - metadata = [{} for _ in range(10)] + metadata = [{} for _ in range(NUM_FOLDS)] with open(path) as csvfile: reader = csv.DictReader(csvfile) for row in reader: @@ -39,180 +39,6 @@ def load_us8k_metadata(path): -def load_feature_file(feature_filepath): - data = np.load(feature_filepath) - X, y = data['X'], data['y'] - if type(y) == np.ndarray and y.ndim == 0: - y = int(y) - return X, y - - -def us8k_file_sampler(feature_filepath, shuffle=True): - X, y = load_feature_file(feature_filepath) - - num_frames = X.shape[0] - - if shuffle: - frame_idxs = np.random.permutation(num_frames) - else: - frame_idxs = range(num_frames) - - for idx in frame_idxs: - yield { - 'features': X[idx], - 'label': y, - 'filepath': feature_filepath, - 'frame_idx': idx - } - - - -def get_us8k_batch_generator(features_dir, test_fold_idx, valid=False, num_streamers=None, - batch_size=64, random_state=20171021, - rate=None, cycle=True): - - random.seed(random_state) - np.random.seed(random_state) - - - LOGGER.info("Loading subset list") - - LOGGER.info("Creating streamers...") - seeds = [] - - valid_fold_idx = (test_fold_idx - 1) % 10 - - for fold_dirname in os.listdir(features_dir): - fold_dir = os.path.join(features_dir, fold_dirname) - - if not os.path.isdir(fold_dir): - continue - - fold_idx = int(fold_dirname.replace('fold', '')) - 1 - if fold_idx == test_fold_idx: - continue - - if valid and fold_idx != valid_fold_idx: - continue - - if not valid and fold_idx == valid_fold_idx: - continue - - fold_dir = os.path.join(features_dir, fold_dirname) - for feature_filename in os.listdir(fold_dir): - feature_filepath = os.path.join(fold_dir, feature_filename) - streamer = pescador.Streamer(us8k_file_sampler, feature_filepath) - seeds.append(streamer) - - if valid: - break - - # Randomly shuffle the seeds - random.shuffle(seeds) - - if num_streamers is None: - num_streamers = len(seeds) - - mux = pescador.Mux(seeds, num_streamers, rate=rate, random_state=random_state) - if cycle: - mux = mux.cycle() - - if batch_size == 1: - return mux - else: - return pescador.maps.buffer_stream(mux, batch_size) - - -def get_us8k_batch(features_dir, test_fold_idx, valid=False, num_streamers=None, - batch_size=64, random_state=20171021, - rate=None, cycle=True): - gen = iter(get_us8k_batch_generator(features_dir, test_fold_idx, valid=valid, - num_streamers=num_streamers, batch_size=batch_size, - random_state=random_state, rate=rate, cycle=cycle)) - return next(gen) - - -def 
get_us8k_fold(feature_dir, fold_idx): - X = [] - y = [] - file_idxs = [] - fold_dir = os.path.join(feature_dir, 'fold{}'.format(fold_idx + 1)) - - filenames = os.listdir(fold_dir) - - start_idx = 0 - for feature_filename in filenames: - feature_filepath = os.path.join(fold_dir, feature_filename) - file_X, file_y = load_feature_file(feature_filepath) - - if file_X.ndim > 1: - end_idx = start_idx + file_X.shape[0] - else: - end_idx = start_idx + 1 - - X.append(file_X) - y.append(file_y) - file_idxs.append([start_idx, end_idx]) - - start_idx = end_idx - - X = np.vstack(X) - - if type(y[0]) == int or y[0].ndim == 0: - y = np.array(y) - else: - y = np.concatenate(y) - - file_idxs = np.array(file_idxs) - - return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} - - -def get_us8k_split(feature_dir, test_fold_idx): - train_data = get_us8k_train_folds(feature_dir, test_fold_idx) - valid_data = get_us8k_fold(feature_dir, get_valid_fold_idx(test_fold_idx)) - test_data = get_us8k_fold(feature_dir, test_fold_idx) - - return train_data, valid_data, test_data - - -def get_valid_fold_idx(test_fold_idx): - return (test_fold_idx - 1) % 10 - - -def get_us8k_train_folds(feature_dir, test_fold_idx): - X = [] - y = [] - file_idxs = [] - filenames = [] - - valid_fold_idx = get_valid_fold_idx(test_fold_idx) - - for fold_idx in range(10): - if fold_idx in (valid_fold_idx, test_fold_idx): - continue - - fold_data = get_us8k_fold(feature_dir, fold_idx) - - X.append(fold_data['features']) - y.append(fold_data['labels']) - idxs = fold_data['file_idxs'] - if len(file_idxs) > 0: - # Since we're appending all of the file indices together, increment - # the current fold indices by the current global index - idxs = idxs + file_idxs[-1][-1, -1] - file_idxs.append(idxs) - - filenames += fold_data['filenames'] - - X = np.vstack(X) - y = np.concatenate(y) - file_idxs = np.vstack(file_idxs) - - return {'features': X, 'labels': y, 'file_idxs': file_idxs, - 'filenames': filenames} - - def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=None, features='l3', random_state=12345678, **feature_args): """ @@ -239,7 +65,7 @@ def generate_us8k_folds(metadata_path, data_dir, output_dir, l3embedding_model=N LOGGER.info('Generating all folds.') metadata = load_us8k_metadata(metadata_path) - for fold_idx in range(10): + for fold_idx in range(NUM_FOLDS): generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embedding_model=l3embedding_model, features=features, random_state=random_state, From 5515ecc59cda8a0e44a28e80f54dd42ad60cd9d0 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sat, 14 Apr 2018 18:32:32 -0400 Subject: [PATCH 176/201] Add cross validation, sans gsheets update --- 06_train_classifier.py | 15 +++ classifier/train.py | 222 ++++++++++++++++++++++++++++++----------- 2 files changed, 180 insertions(+), 57 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 7d8af2e..5e952b1 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -29,6 +29,21 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') + parser.add_argument('-cv', + '--cross-validation', + dest='search_param', + action='store', + type=str, + help='Name of parameter to cross-validate over. 
If not provided, cross validation will not be done') + + parser.add_argument('-cvss', + '--cross-validation-search_space', + dest='search_space', + action='store', + nargs="*", + type=float, + help='List of numeric possible parameter values used in cross validation') + parser.add_argument('-lr', '--learning-rate', dest='learning_rate', diff --git a/classifier/train.py b/classifier/train.py index c5518f7..a3a9d96 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -16,9 +16,10 @@ from sklearn.metrics import hinge_loss from sklearn.externals import joblib from sklearn.preprocessing import OneHotEncoder +from sklearn.model_selection import StratifiedKFold from sklearn.svm import SVC -from classifier.metrics import compute_metrics +from classifier.metrics import compute_metrics, aggregate_metrics from data.usc.features import preprocess_split_data from data.usc.folds import get_split from l3embedding.train import LossHistory @@ -105,10 +106,11 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', y_test_pred: Predicted test output of classifier (Type: np.ndarray) """ + np.random.seed(random_state) + random.seed(random_state) + X_train = train_data['features'] y_train = train_data['labels'] - X_valid = valid_data['features'] - y_valid = valid_data['labels'] model_output_path = os.path.join(model_dir, "model.pkl") @@ -124,33 +126,39 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', joblib.dump(clf, model_output_path) y_train_pred = clf.predict(X_train) - y_valid_pred = clf.predict(X_valid) - # Compute new metrics classes = np.arange(num_classes) train_loss = hinge_loss(y_train, clf.decision_function(X_train), labels=classes) - valid_loss = hinge_loss(y_valid, clf.decision_function(X_valid), labels=classes) train_metrics = compute_metrics(y_train, y_train_pred) - valid_metrics = compute_metrics(y_valid, y_valid_pred) train_metrics['loss'] = train_loss - valid_metrics['loss'] = valid_loss - train_msg = 'Train - hinge loss: {}, acc: {}' - valid_msg = 'Valid - hinge loss: {}, acc: {}' LOGGER.info(train_msg.format(train_loss, train_metrics['accuracy'])) - LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy'])) + if valid_data: + X_valid = valid_data['features'] + y_valid = valid_data['labels'] + y_valid_pred = clf.predict(X_valid) + valid_loss = hinge_loss(y_valid, clf.decision_function(X_valid), labels=classes) + valid_metrics = compute_metrics(y_valid, y_valid_pred) + valid_metrics['loss'] = valid_loss + valid_msg = 'Valid - hinge loss: {}, acc: {}' + LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy'])) + else: + valid_metrics = {} # Evaluate model on test data - X_test = test_data['features'] - y_test_pred_frame = clf.predict_proba(X_test) - y_test_pred = [] - for start_idx, end_idx in test_data['file_idxs']: - class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() - y_test_pred.append(class_pred) - - y_test_pred = np.array(y_test_pred) - test_metrics = compute_metrics(test_data['labels'], y_test_pred) + if test_data: + X_test = test_data['features'] + y_test_pred_frame = clf.predict_proba(X_test) + y_test_pred = [] + for start_idx, end_idx in test_data['file_idxs']: + class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() + y_test_pred.append(class_pred) + + y_test_pred = np.array(y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred) + else: + test_metrics = {} return clf, train_metrics, valid_metrics, test_metrics @@ -186,7 +194,7 @@ 
def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10): def train_mlp(train_data, valid_data, test_data, model_dir, - batch_size=64, num_epochs=100, + batch_size=64, num_epochs=100, valid_split=0.15, patience=20, learning_rate=1e-4, weight_decay=1e-5, num_classes=10, random_state=12345678, verbose=False, **kwargs): """ @@ -232,8 +240,12 @@ def train_mlp(train_data, valid_data, test_data, model_dir, X_train = train_data['features'] y_train = enc.fit_transform(train_data['labels'].reshape(-1, 1)) - X_valid = valid_data['features'] - y_valid = enc.fit_transform(valid_data['labels'].reshape(-1, 1)) + if valid_data: + validation_data = (valid_data['features'], + enc.fit_transform(valid_data['labels'].reshape(-1, 1))) + valid_split = 0.0 + else: + validation_data = None # Set up model m, inp, out = construct_mlp_model(X_train.shape[1:], @@ -247,6 +259,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, save_weights_only=True, save_best_only=True, monitor=monitor)) + cb.append(keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)) history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl') cb.append(LossHistory(history_checkpoint)) history_csvlog = os.path.join(model_dir, 'history_csvlog.csv') @@ -259,15 +272,12 @@ def train_mlp(train_data, valid_data, test_data, model_dir, LOGGER.debug('Compiling model...') m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) LOGGER.debug('Fitting model to data...') - m.fit(x=X_train, y=y_train, batch_size=batch_size, - epochs=num_epochs, validation_data=(X_valid, y_valid), callbacks=cb) + m.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=num_epochs, + validation_data=validation_data, validation_split=valid_split, callbacks=cb) # Compute metrics for train and valid train_pred = m.predict(X_train) - valid_pred = m.predict(X_valid) train_metrics = compute_metrics(y_train, train_pred) - valid_metrics = compute_metrics(y_valid, valid_pred) - # Set up train and validation metrics train_metrics = { 'loss': metric_cb.train_loss[-1], @@ -279,27 +289,110 @@ def train_mlp(train_data, valid_data, test_data, model_dir, valid_metrics = { 'loss': metric_cb.valid_loss[-1], 'accuracy': metric_cb.valid_acc[-1], - 'class_accuracy': valid_metrics['class_accuracy'], - 'average_class_accuracy': valid_metrics['average_class_accuracy'] } - # Evaluate model on test data - X_test = test_data['features'] - y_test_pred_frame = m.predict(X_test) - y_test_pred = [] - for start_idx, end_idx in test_data['file_idxs']: - class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() - y_test_pred.append(class_pred) - y_test_pred = np.array(y_test_pred) - test_metrics = compute_metrics(test_data['labels'], y_test_pred) + if valid_data: + valid_pred = m.predict(validation_data[0]) + valid_metrics = compute_metrics(validation_data[1], valid_pred) + valid_metrics.update({ + 'class_accuracy': valid_metrics['class_accuracy'], + 'average_class_accuracy': valid_metrics['average_class_accuracy'] + }) + + if test_data: + # Evaluate model on test data + X_test = test_data['features'] + y_test_pred_frame = m.predict(X_test) + y_test_pred = [] + for start_idx, end_idx in test_data['file_idxs']: + class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() + y_test_pred.append(class_pred) + y_test_pred = np.array(y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred) + else: + test_metrics = {} return m, train_metrics, valid_metrics, test_metrics +def train_cvsearch(train_data, valid_data, 
test_data, model_dir, train_func, + search_param, search_space=None, num_splits=7, **kwargs): + + cv_train_metrics = {} + cv_valid_metrics = {} + + if not search_space: + if search_param == 'learning_rate': + search_space = [1e-5, 1e-4, 1e-3, 1e-2] + elif search_param == 'C': + search_space = [0.1, 1, 10, 100, 1000] + else: + raise ValueError('Defaults for parameter {} not implemented'.format(search_param)) + + + LOGGER.info('Starting hyperparameter search on {}.'.format(search_param)) + + skf = StratifiedKFold(n_splits=num_splits) + + best_valid_loss = float('inf') + best_param = None + + for param in search_space: + LOGGER.info('Evaluating {} = {}'.format(search_param, param)) + skf_train_metrics = [] + skf_valid_metrics = [] + for train_idxs, valid_idxs in skf.split(train_data['features'], train_data['labels']): + train_data_skf = { + 'features': train_data['features'][train_idxs], + 'labels': train_data['labels'][train_idxs] + } + valid_data_skf = { + 'features': train_data['features'][valid_idxs], + 'labels': train_data['labels'][valid_idxs] + } + + kwargs[search_param] = param + + LOGGER.info('') + _, train_metrics, valid_metrics, _ \ + = train_func(train_data_skf, valid_data_skf, None, model_dir, **kwargs) + + skf_train_metrics.append(train_metrics) + skf_valid_metrics.append(valid_metrics) + + ave_train_metrics = aggregate_metrics(skf_train_metrics) + ave_valid_metrics = aggregate_metrics(skf_valid_metrics) + + if ave_valid_metrics['loss'] < best_valid_loss: + best_valid_loss = ave_valid_metrics['loss'] + best_param = param + + cv_train_metrics[param] = { + 'average': ave_train_metrics, + 'minifolds': skf_train_metrics + } + + cv_valid_metrics[param] = { + 'average': ave_valid_metrics, + 'minifolds': skf_valid_metrics + } + + LOGGER.info('Best {} = {}, ave valid loss = {}'.format(search_param, best_param, + best_valid_loss)) + + kwargs[search_param] = best_param + clf, train_metrics, _, test_metrics, \ + = train_func(train_data, None, test_data, model_dir, **kwargs) + + train_metrics['cross_validation'] = cv_train_metrics + + return clf, train_metrics, cv_valid_metrics, test_metrics + + def train(features_dir, output_dir, fold_num, model_type='svm', feature_mode='framewise', - train_batch_size=64, - random_state=20171021, gsheet_id=None, google_dev_app_name=None, + train_batch_size=64, random_state=20171021, search_param=None, + search_space=None, gsheet_id=None, google_dev_app_name=None, verbose=False, non_overlap=False, non_overlap_chunk_size=10, use_min_max=False, **model_args): init_console_logger(LOGGER, verbose=verbose) @@ -310,17 +403,14 @@ def train(features_dir, output_dir, fold_num, random.seed(random_state) datasets = ['us8k', 'esc50', 'dcase2013'] - for ds in datasets: - if ds in features_dir: - dataset_name = ds - break - else: - err_msg = 'Feature directory must contain name of dataset ({})' - raise ValueError(err_msg.format(str(datasets))) features_desc_str = features_dir[features_dir.rindex('features')+9:] dataset_name = features_desc_str.split('/')[0] + if dataset_name not in datasets: + err_msg = 'Feature directory must contain name of dataset ({})' + raise ValueError(err_msg.format(str(datasets))) + model_id = os.path.join(features_desc_str, feature_mode, "non-overlap" if non_overlap else "overlap", "min-max" if use_min_max else "no-min-max", @@ -404,17 +494,34 @@ def train(features_dir, output_dir, fold_num, LOGGER.info('Training {} with fold {} held out'.format(model_type, fold_num)) # Fit the model if model_type == 'svm': - model, train_metrics, valid_metrics, 
test_metrics \ - = train_svm(train_data, valid_data, test_data, model_dir, - num_classes=DATASET_NUM_CLASSES[dataset_name], - random_state=random_state, verbose=verbose, **model_args) + if search_param: + model, train_metrics, valid_metrics, test_metrics \ + = train_cvsearch(train_data, valid_data, test_data, model_dir, + train_func=train_svm, search_param=search_param, + search_space=search_space, + num_classes=DATASET_NUM_CLASSES[dataset_name], + random_state=random_state, verbose=verbose, **model_args) + else: + model, train_metrics, valid_metrics, test_metrics \ + = train_svm(train_data, valid_data, test_data, model_dir, + num_classes=DATASET_NUM_CLASSES[dataset_name], + random_state=random_state, verbose=verbose, **model_args) elif model_type == 'mlp': - model, train_metrics, valid_metrics, test_metrics \ - = train_mlp(train_data, valid_data, test_data, model_dir, - batch_size=train_batch_size, random_state=random_state, - num_classes=DATASET_NUM_CLASSES[dataset_name], - verbose=verbose, **model_args) + if search_param: + model, train_metrics, valid_metrics, test_metrics \ + = train_cvsearch(train_data, valid_data, test_data, model_dir, + train_func=train_mlp, search_param=search_param, + search_space=search_space, + batch_size=train_batch_size, random_state=random_state, + num_classes=DATASET_NUM_CLASSES[dataset_name], + verbose=verbose, **model_args) + else: + model, train_metrics, valid_metrics, test_metrics \ + = train_mlp(train_data, valid_data, test_data, model_dir, + batch_size=train_batch_size, random_state=random_state, + num_classes=DATASET_NUM_CLASSES[dataset_name], + verbose=verbose, **model_args) else: raise ValueError('Invalid model type: {}'.format(model_type)) @@ -434,6 +541,7 @@ def train(features_dir, output_dir, fold_num, pk.dump(results, fp, protocol=pk.HIGHEST_PROTOCOL) + # TODO: Update spreadsheet for cross validation! 
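For reference, the pattern implemented by train_cvsearch above is a standard cross-validated grid search: split the training set with StratifiedKFold, train the wrapped model once per fold for every candidate value of the searched parameter, average the fold metrics, and keep the value with the best average before retraining on the full training split. The following is a minimal standalone sketch of that pattern; the digits dataset, LinearSVC, and selection by mean validation accuracy are illustrative stand-ins (the repository code wraps train_svm/train_mlp and selects by mean validation loss).

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

# Candidate values for the searched parameter (here C, matching the SVM defaults above)
search_space = [0.1, 1, 10, 100, 1000]

X, y = load_digits(return_X_y=True)
skf = StratifiedKFold(n_splits=5)

best_score, best_C = float('-inf'), None
for C in search_space:
    fold_scores = []
    for train_idxs, valid_idxs in skf.split(X, y):
        clf = LinearSVC(C=C, max_iter=5000)
        clf.fit(X[train_idxs], y[train_idxs])
        fold_scores.append(clf.score(X[valid_idxs], y[valid_idxs]))
    mean_score = np.mean(fold_scores)     # aggregate metrics across folds
    if mean_score > best_score:           # keep the best-performing candidate
        best_score, best_C = mean_score, C

print('best C = {}, mean validation accuracy = {:.3f}'.format(best_C, best_score))

After the search, the repository version additionally retrains the chosen setting on the full training data (valid fold held out) and reports metrics on the held-out test fold, which is why train_cvsearch calls train_func one final time with the best parameter.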
if gsheet_id: # Update spreadsheet with results LOGGER.info('Updating spreadsheet...') From 2556a7c15ed7712cbbbae2b6ec38e9da1fbabc7d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 15 Apr 2018 18:14:23 -0400 Subject: [PATCH 177/201] Remove full kfold validation for parameter search, instead use a single split --- classifier/train.py | 86 +++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index a3a9d96..0ac8dbe 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -16,10 +16,10 @@ from sklearn.metrics import hinge_loss from sklearn.externals import joblib from sklearn.preprocessing import OneHotEncoder -from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedShuffleSplit from sklearn.svm import SVC -from classifier.metrics import compute_metrics, aggregate_metrics +from classifier.metrics import compute_metrics from data.usc.features import preprocess_split_data from data.usc.folds import get_split from l3embedding.train import LossHistory @@ -315,11 +315,11 @@ def train_mlp(train_data, valid_data, test_data, model_dir, return m, train_metrics, valid_metrics, test_metrics -def train_cvsearch(train_data, valid_data, test_data, model_dir, train_func, - search_param, search_space=None, num_splits=7, **kwargs): +def train_param_search(train_data, valid_data, test_data, model_dir, train_func, + search_param, search_space=None, valid_ratio=0.15, **kwargs): - cv_train_metrics = {} - cv_valid_metrics = {} + search_train_metrics = {} + search_valid_metrics = {} if not search_space: if search_param == 'learning_rate': @@ -332,51 +332,36 @@ def train_cvsearch(train_data, valid_data, test_data, model_dir, train_func, LOGGER.info('Starting hyperparameter search on {}.'.format(search_param)) - skf = StratifiedKFold(n_splits=num_splits) + splitter = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio) + train_idxs, valid_idxs = splitter.split(train_data['features'], train_data['labels']) best_valid_loss = float('inf') best_param = None for param in search_space: LOGGER.info('Evaluating {} = {}'.format(search_param, param)) - skf_train_metrics = [] - skf_valid_metrics = [] - for train_idxs, valid_idxs in skf.split(train_data['features'], train_data['labels']): - train_data_skf = { - 'features': train_data['features'][train_idxs], - 'labels': train_data['labels'][train_idxs] - } - valid_data_skf = { - 'features': train_data['features'][valid_idxs], - 'labels': train_data['labels'][valid_idxs] - } - - kwargs[search_param] = param - - LOGGER.info('') - _, train_metrics, valid_metrics, _ \ - = train_func(train_data_skf, valid_data_skf, None, model_dir, **kwargs) - - skf_train_metrics.append(train_metrics) - skf_valid_metrics.append(valid_metrics) - - ave_train_metrics = aggregate_metrics(skf_train_metrics) - ave_valid_metrics = aggregate_metrics(skf_valid_metrics) - - if ave_valid_metrics['loss'] < best_valid_loss: - best_valid_loss = ave_valid_metrics['loss'] - best_param = param - - cv_train_metrics[param] = { - 'average': ave_train_metrics, - 'minifolds': skf_train_metrics + train_data_skf = { + 'features': train_data['features'][train_idxs], + 'labels': train_data['labels'][train_idxs] } - - cv_valid_metrics[param] = { - 'average': ave_valid_metrics, - 'minifolds': skf_valid_metrics + valid_data_skf = { + 'features': train_data['features'][valid_idxs], + 'labels': train_data['labels'][valid_idxs] } + kwargs[search_param] = param + + 
LOGGER.info('') + _, train_metrics, valid_metrics, _ \ + = train_func(train_data_skf, valid_data_skf, None, model_dir, **kwargs) + + if valid_metrics['accuracy'] < best_valid_loss: + best_valid_loss = valid_metrics['accuracy'] + best_param = param + + search_train_metrics[param] = train_metrics + search_valid_metrics[param] = valid_metrics + LOGGER.info('Best {} = {}, ave valid loss = {}'.format(search_param, best_param, best_valid_loss)) @@ -384,9 +369,18 @@ def train_cvsearch(train_data, valid_data, test_data, model_dir, train_func, clf, train_metrics, _, test_metrics, \ = train_func(train_data, None, test_data, model_dir, **kwargs) - train_metrics['cross_validation'] = cv_train_metrics + train_metrics['search'] = search_train_metrics + train_metrics['search_param'] = search_param + train_metrics['search_param_best_value'] = best_param + + valid_metrics = { + 'search': search_valid_metrics, + 'search_param': search_param, + 'search_param_best_value': best_param + } + valid_metrics.update(search_valid_metrics[best_param]) - return clf, train_metrics, cv_valid_metrics, test_metrics + return clf, train_metrics, valid_metrics, test_metrics def train(features_dir, output_dir, fold_num, @@ -496,7 +490,7 @@ def train(features_dir, output_dir, fold_num, if model_type == 'svm': if search_param: model, train_metrics, valid_metrics, test_metrics \ - = train_cvsearch(train_data, valid_data, test_data, model_dir, + = train_param_search(train_data, valid_data, test_data, model_dir, train_func=train_svm, search_param=search_param, search_space=search_space, num_classes=DATASET_NUM_CLASSES[dataset_name], @@ -510,7 +504,7 @@ def train(features_dir, output_dir, fold_num, elif model_type == 'mlp': if search_param: model, train_metrics, valid_metrics, test_metrics \ - = train_cvsearch(train_data, valid_data, test_data, model_dir, + = train_param_search(train_data, valid_data, test_data, model_dir, train_func=train_mlp, search_param=search_param, search_space=search_space, batch_size=train_batch_size, random_state=random_state, From c7d66d18643837bab121657ddfeefcc54858f30d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 15 Apr 2018 18:25:05 -0400 Subject: [PATCH 178/201] Update search parameter in spreadsheet --- classifier/train.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 0ac8dbe..4d340b2 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -25,7 +25,7 @@ from l3embedding.train import LossHistory from log import * -from gsheets import get_credentials, append_row, update_experiment +from gsheets import get_credentials, append_row, update_experiment, CLASSIFIER_FIELD_NAMES from googleapiclient import discovery LOGGER = logging.getLogger('classifier') @@ -545,9 +545,10 @@ def train(features_dir, output_dir, fold_num, train_metrics['accuracy'], valid_metrics['accuracy'], train_metrics['average_class_accuracy'], - valid_metrics['average_class_accuracy'], + valid_metrics.get('average_class_accuracy', -1), ', '.join(map(str, train_metrics['class_accuracy'])), - ', '.join(map(str, valid_metrics['class_accuracy'])), + ', '.join(map(str, valid_metrics['class_accuracy'])) \ + if valid_metrics.get('class_accuracy') else '', test_metrics['accuracy'], test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) @@ -555,4 +556,12 @@ def train(features_dir, output_dir, fold_num, update_experiment(service, gsheet_id, config, 'R', 'AB', update_values, 'classifier') + if search_param and 
search_param in CLASSIFIER_FIELD_NAMES: + # Also update search parameter + # ASSUMES that parameter values will be in the first 26 columns + col = chr(CLASSIFIER_FIELD_NAMES.index(search_param) + 67) + best_param = train_metrics['search_param_best_value'] + update_experiment(service, gsheet_id, config, col, col, + [best_param], 'classifier') + LOGGER.info('Done!') From c4bf79cd5ee2f32e0c1f33596569d043a1fe06df Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 15 Apr 2018 21:16:06 -0400 Subject: [PATCH 179/201] Allow for multiple parameters to be used in paramter search, remove user provided search space and use preset search spaces for different models --- 06_train_classifier.py | 19 +++-------- classifier/train.py | 76 +++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 52 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 5e952b1..cc6a25e 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -29,20 +29,11 @@ def parse_arguments(): default=64, help='(MLP) Number of training examples per batch') - parser.add_argument('-cv', - '--cross-validation', - dest='search_param', - action='store', - type=str, - help='Name of parameter to cross-validate over. If not provided, cross validation will not be done') - - parser.add_argument('-cvss', - '--cross-validation-search_space', - dest='search_space', - action='store', - nargs="*", - type=float, - help='List of numeric possible parameter values used in cross validation') + parser.add_argument('-ps', + '--parameter-search', + dest='parameter_search', + action='store_true', + help='If True, parameter search will be run') parser.add_argument('-lr', '--learning-rate', diff --git a/classifier/train.py b/classifier/train.py index 4d340b2..ef4de45 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -5,6 +5,7 @@ import pickle as pk import random import git +from itertools import product import keras import keras.regularizers as regularizers @@ -316,21 +317,13 @@ def train_mlp(train_data, valid_data, test_data, model_dir, def train_param_search(train_data, valid_data, test_data, model_dir, train_func, - search_param, search_space=None, valid_ratio=0.15, **kwargs): + search_space, valid_ratio=0.15, **kwargs): search_train_metrics = {} search_valid_metrics = {} - if not search_space: - if search_param == 'learning_rate': - search_space = [1e-5, 1e-4, 1e-3, 1e-2] - elif search_param == 'C': - search_space = [0.1, 1, 10, 100, 1000] - else: - raise ValueError('Defaults for parameter {} not implemented'.format(search_param)) - - - LOGGER.info('Starting hyperparameter search on {}.'.format(search_param)) + search_params = list(search_space.keys()) + LOGGER.info('Starting hyperparameter search on {}.'.format(search_params)) splitter = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio) train_idxs, valid_idxs = splitter.split(train_data['features'], train_data['labels']) @@ -338,8 +331,9 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, best_valid_loss = float('inf') best_param = None - for param in search_space: - LOGGER.info('Evaluating {} = {}'.format(search_param, param)) + + for params in product(*[search_space[p] for p in search_params]): + LOGGER.info('Evaluating {} = {}'.format(search_params, params)) train_data_skf = { 'features': train_data['features'][train_idxs], 'labels': train_data['labels'][train_idxs] @@ -349,7 +343,7 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, 'labels': 
train_data['labels'][valid_idxs] } - kwargs[search_param] = param + kwargs.update(dict(zip(search_params, params))) LOGGER.info('') _, train_metrics, valid_metrics, _ \ @@ -357,26 +351,26 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, if valid_metrics['accuracy'] < best_valid_loss: best_valid_loss = valid_metrics['accuracy'] - best_param = param + best_params = params - search_train_metrics[param] = train_metrics - search_valid_metrics[param] = valid_metrics + search_train_metrics[best_params] = train_metrics + search_valid_metrics[best_params] = valid_metrics - LOGGER.info('Best {} = {}, ave valid loss = {}'.format(search_param, best_param, + LOGGER.info('Best {} = {}, ave valid loss = {}'.format(search_params, best_param, best_valid_loss)) - kwargs[search_param] = best_param + kwargs.update(dict(zip(search_params, best_params))) clf, train_metrics, _, test_metrics, \ = train_func(train_data, None, test_data, model_dir, **kwargs) train_metrics['search'] = search_train_metrics - train_metrics['search_param'] = search_param - train_metrics['search_param_best_value'] = best_param + train_metrics['search_params'] = search_params + train_metrics['search_params_best_values'] = best_params valid_metrics = { 'search': search_valid_metrics, - 'search_param': search_param, - 'search_param_best_value': best_param + 'search_params': search_params, + 'search_params_best_values': best_params } valid_metrics.update(search_valid_metrics[best_param]) @@ -385,8 +379,8 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, def train(features_dir, output_dir, fold_num, model_type='svm', feature_mode='framewise', - train_batch_size=64, random_state=20171021, search_param=None, - search_space=None, gsheet_id=None, google_dev_app_name=None, + train_batch_size=64, random_state=20171021, parameter_search=False, + gsheet_id=None, google_dev_app_name=None, verbose=False, non_overlap=False, non_overlap_chunk_size=10, use_min_max=False, **model_args): init_console_logger(LOGGER, verbose=verbose) @@ -488,11 +482,11 @@ def train(features_dir, output_dir, fold_num, LOGGER.info('Training {} with fold {} held out'.format(model_type, fold_num)) # Fit the model if model_type == 'svm': - if search_param: + if parameter_search: + search_space = { 'C': [0.1, 1, 10, 100, 1000] } model, train_metrics, valid_metrics, test_metrics \ = train_param_search(train_data, valid_data, test_data, model_dir, - train_func=train_svm, search_param=search_param, - search_space=search_space, + train_func=train_svm, search_space=search_space, num_classes=DATASET_NUM_CLASSES[dataset_name], random_state=random_state, verbose=verbose, **model_args) else: @@ -502,11 +496,14 @@ def train(features_dir, output_dir, fold_num, random_state=random_state, verbose=verbose, **model_args) elif model_type == 'mlp': - if search_param: + if parameter_search: + search_space = { + 'learning_rate': [1e-5, 1e-4, 1e-3], + 'weight_decay': [1e-5, 1e-4, 1e-3], + } model, train_metrics, valid_metrics, test_metrics \ = train_param_search(train_data, valid_data, test_data, model_dir, - train_func=train_mlp, search_param=search_param, - search_space=search_space, + train_func=train_mlp, search_space=search_space, batch_size=train_batch_size, random_state=random_state, num_classes=DATASET_NUM_CLASSES[dataset_name], verbose=verbose, **model_args) @@ -556,12 +553,15 @@ def train(features_dir, output_dir, fold_num, update_experiment(service, gsheet_id, config, 'R', 'AB', update_values, 'classifier') - if 
search_param and search_param in CLASSIFIER_FIELD_NAMES: - # Also update search parameter - # ASSUMES that parameter values will be in the first 26 columns - col = chr(CLASSIFIER_FIELD_NAMES.index(search_param) + 67) - best_param = train_metrics['search_param_best_value'] - update_experiment(service, gsheet_id, config, col, col, - [best_param], 'classifier') + if parameter_search: + for param, param_value in zip(train_metrics['search_params'], + train_metrics['search_params_best_values']): + if param not in CLASSIFIER_FIELD_NAMES: + continue + # Also update search parameter + # ASSUMES that parameter values will be in the first 26 columns + col = chr(CLASSIFIER_FIELD_NAMES.index(param) + 67) + update_experiment(service, gsheet_id, config, col, col, + [param_value], 'classifier') LOGGER.info('Done!') From 11bab985eab9e2ec012d1a76ac3078d2599ed036 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 18 Apr 2018 20:55:58 -0400 Subject: [PATCH 180/201] Allow user to choose whether to use a fold or subset of training set as validation, update gsheet, shuffle train set --- 06_train_classifier.py | 14 +++++++ classifier/train.py | 64 ++++++++++++++++++++++++------ data/usc/features.py | 15 ++++--- data/usc/folds.py | 21 ++++++---- gsheets.py | 3 ++ jobs/classifier-train-array.sbatch | 7 +++- 6 files changed, 97 insertions(+), 27 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index cc6a25e..f9bf5d7 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -35,6 +35,20 @@ def parse_arguments(): action='store_true', help='If True, parameter search will be run') + parser.add_argument('-psnv', + '--parameter-search-no-valid-fold', + dest='parameter_search_valid_fold', + action='store_false', + help='If True, include validation set in train set and instead get the validation set as a ratio of the training set') + + parser.add_argument('-psvr', + '--parameter-search-valid-ratio', + dest='parameter_search_valid_ratio', + action='store', + type=float, + default=0.15, + help='If no validation fold is used, the ratio of the extended training set to set aside for validation') + parser.add_argument('-lr', '--learning-rate', dest='learning_rate', diff --git a/classifier/train.py b/classifier/train.py index ef4de45..948081a 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -317,7 +317,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, def train_param_search(train_data, valid_data, test_data, model_dir, train_func, - search_space, valid_ratio=0.15, **kwargs): + search_space, valid_ratio=0.15, train_with_valid=True, **kwargs): search_train_metrics = {} search_valid_metrics = {} @@ -331,9 +331,10 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, best_valid_loss = float('inf') best_param = None - - for params in product(*[search_space[p] for p in search_params]): - LOGGER.info('Evaluating {} = {}'.format(search_params, params)) + if valid_data: + train_data_skf = train_data + valid_data_skf = valid_data + else: train_data_skf = { 'features': train_data['features'][train_idxs], 'labels': train_data['labels'][train_idxs] @@ -343,6 +344,11 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, 'labels': train_data['labels'][valid_idxs] } + + + for params in product(*[search_space[p] for p in search_params]): + LOGGER.info('Evaluating {} = {}'.format(search_params, params)) + kwargs.update(dict(zip(search_params, params))) LOGGER.info('') @@ -360,8 +366,35 @@ def train_param_search(train_data, 
valid_data, test_data, model_dir, train_func, best_valid_loss)) kwargs.update(dict(zip(search_params, best_params))) - clf, train_metrics, _, test_metrics, \ - = train_func(train_data, None, test_data, model_dir, **kwargs) + + if valid_data: + if train_with_valid: + # If valid data was provided and we want to train the final model + # with it, we need to merge the train and valid data and shuffle + num_examples = train_data['labels'].size + valid_data['labels'].size + idxs = np.random.permutation(num_examples) + clf, train_metrics, _, test_metrics \ + = train_func({ + 'features': np.vstack((train_data['features'], + valid_data['features']))[idxs], + 'labels': np.concatenate((train_data['labels'], + valid_data['labels']))[idxs] + }, None, test_data, model_dir, **kwargs) + else: + # If valid data was provided but we just want to train the final + # model with train, just do that + clf, train_metrics, _, test_metrics \ + = train_func(train_data, None, test_data, model_dir, **kwargs) + + else: + if train_with_valid: + # If valid data was not provided, train with entire training set + clf, train_metrics, _, test_metrics \ + = train_func(train_data, None, test_data, model_dir, **kwargs) + else: + # If valid data was not provided, train with just the sub-train split + clf, train_metrics, _, test_metrics \ + = train_func(train_data_skf, None, test_data, model_dir, **kwargs) train_metrics['search'] = search_train_metrics train_metrics['search_params'] = search_params @@ -380,6 +413,7 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, def train(features_dir, output_dir, fold_num, model_type='svm', feature_mode='framewise', train_batch_size=64, random_state=20171021, parameter_search=False, + parameter_search_valid_fold=True, parameter_search_valid_ratio=0.15, gsheet_id=None, google_dev_app_name=None, verbose=False, non_overlap=False, non_overlap_chunk_size=10, use_min_max=False, **model_args): @@ -420,6 +454,9 @@ def train(features_dir, output_dir, fold_num, 'model_dir': model_dir, 'model_id': model_id, 'fold_num': fold_num, + 'parameter_search': parameter_search, + 'parameter_search_valid_fold': parameter_search_valid_fold, + 'parameter_search_valid_ratio': parameter_search_valid_ratio, 'model_type': model_type, 'feature_mode': feature_mode, 'train_batch_size': train_batch_size, @@ -465,7 +502,8 @@ def train(features_dir, output_dir, fold_num, fold_idx = fold_num - 1 LOGGER.info('Loading data for configuration with test fold {}...'.format(fold_num)) - train_data, valid_data, test_data = get_split(features_dir, fold_idx, dataset_name) + train_data, valid_data, test_data = get_split(features_dir, fold_idx, dataset_name, + valid=(not parameter_search or parameter_search_valid_fold)) LOGGER.info('Preprocessing data...') min_max_scaler, stdizer = preprocess_split_data(train_data, valid_data, test_data, @@ -488,6 +526,7 @@ def train(features_dir, output_dir, fold_num, = train_param_search(train_data, valid_data, test_data, model_dir, train_func=train_svm, search_space=search_space, num_classes=DATASET_NUM_CLASSES[dataset_name], + valid_ratio=parameter_search_valid_ratio, random_state=random_state, verbose=verbose, **model_args) else: model, train_metrics, valid_metrics, test_metrics \ @@ -503,10 +542,11 @@ def train(features_dir, output_dir, fold_num, } model, train_metrics, valid_metrics, test_metrics \ = train_param_search(train_data, valid_data, test_data, model_dir, - train_func=train_mlp, search_space=search_space, - batch_size=train_batch_size, 
random_state=random_state, - num_classes=DATASET_NUM_CLASSES[dataset_name], - verbose=verbose, **model_args) + train_func=train_mlp, search_space=search_space, + batch_size=train_batch_size, random_state=random_state, + num_classes=DATASET_NUM_CLASSES[dataset_name], + valid_ratio=parameter_search_valid_ratio, + verbose=verbose, **model_args) else: model, train_metrics, valid_metrics, test_metrics \ = train_mlp(train_data, valid_data, test_data, model_dir, @@ -550,7 +590,7 @@ def train(features_dir, output_dir, fold_num, test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'R', 'AB', + update_experiment(service, gsheet_id, config, 'U', 'AE', update_values, 'classifier') if parameter_search: diff --git a/data/usc/features.py b/data/usc/features.py index fe2c4d6..5fd5068 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -103,7 +103,8 @@ def preprocess_split_data(train_data, valid_data, test_data, # Remove overlapping frames if no overlap if non_overlap: remove_data_overlap(train_data, chunk_size=non_overlap_chunk_size) - remove_data_overlap(valid_data, chunk_size=non_overlap_chunk_size) + if valid_data: + remove_data_overlap(valid_data, chunk_size=non_overlap_chunk_size) remove_data_overlap(test_data, chunk_size=non_overlap_chunk_size) # Apply min max scaling to data @@ -111,17 +112,20 @@ def preprocess_split_data(train_data, valid_data, test_data, if use_min_max: train_data['features'] = min_max_scaler.fit_transform( train_data['features']) - valid_data['features'] = min_max_scaler.transform(valid_data['features']) + if valid_data: + valid_data['features'] = min_max_scaler.transform(valid_data['features']) test_data['features'] = min_max_scaler.transform(test_data['features']) if feature_mode == 'framewise': # Expand training and validation labels to apply to each frame expand_framewise_labels(train_data) - expand_framewise_labels(valid_data) + if valid_data: + expand_framewise_labels(valid_data) elif feature_mode == 'stats': # Summarize frames in each file using summary statistics framewise_to_stats(train_data) - framewise_to_stats(valid_data) + if valid_data: + framewise_to_stats(valid_data) framewise_to_stats(test_data) else: raise ValueError('Invalid feature mode: {}'.format(feature_mode)) @@ -129,7 +133,8 @@ def preprocess_split_data(train_data, valid_data, test_data, # Standardize features stdizer = StandardScaler() train_data['features'] = stdizer.fit_transform(train_data['features']) - valid_data['features'] = stdizer.transform(valid_data['features']) + if valid_data: + valid_data['features'] = stdizer.transform(valid_data['features']) test_data['features'] = stdizer.transform(test_data['features']) return min_max_scaler, stdizer diff --git a/data/usc/folds.py b/data/usc/folds.py index 0f71cc6..d2e128e 100644 --- a/data/usc/folds.py +++ b/data/usc/folds.py @@ -57,12 +57,15 @@ def get_fold(feature_dir, fold_idx): return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} -def get_split(feature_dir, test_fold_idx, dataset_name): +def get_split(feature_dir, test_fold_idx, dataset_name, valid=True): if dataset_name not in DATASET_NUM_FOLDS: raise ValueError('Invalid dataset: {}'.format(dataset_name)) num_folds = DATASET_NUM_FOLDS[dataset_name] - train_data = get_train_folds(feature_dir, test_fold_idx, num_folds) - valid_data = get_fold(feature_dir, get_valid_fold_idx(test_fold_idx, num_folds)) + train_data = get_train_folds(feature_dir, test_fold_idx, num_folds, 
valid=valid) + if valid: + valid_data = get_fold(feature_dir, get_valid_fold_idx(test_fold_idx, num_folds)) + else: + valid_data = None test_data = get_fold(feature_dir, test_fold_idx) return train_data, valid_data, test_data @@ -72,7 +75,7 @@ def get_valid_fold_idx(test_fold_idx, num_folds): return (test_fold_idx - 1) % num_folds -def get_train_folds(feature_dir, test_fold_idx, num_folds): +def get_train_folds(feature_dir, test_fold_idx, num_folds, valid=True): X = [] y = [] file_idxs = [] @@ -81,7 +84,7 @@ def get_train_folds(feature_dir, test_fold_idx, num_folds): valid_fold_idx = get_valid_fold_idx(test_fold_idx, num_folds) for fold_idx in range(num_folds): - if fold_idx in (valid_fold_idx, test_fold_idx): + if fold_idx == test_fold_idx or (valid and fold_idx == valid_fold_idx): continue fold_data = get_fold(feature_dir, fold_idx) @@ -97,9 +100,11 @@ def get_train_folds(feature_dir, test_fold_idx, num_folds): filenames += fold_data['filenames'] - X = np.vstack(X) - y = np.concatenate(y) - file_idxs = np.vstack(file_idxs) + num_examples = len(y) + shuffle_idxs = np.random.permutation(num_examples) + X = np.vstack(X)[shuffle_idxs] + y = np.concatenate(y)[shuffle_idxs] + file_idxs = np.vstack(file_idxs)[shuffle_idxs] return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} diff --git a/gsheets.py b/gsheets.py index 121a60f..6992068 100644 --- a/gsheets.py +++ b/gsheets.py @@ -44,6 +44,9 @@ 'git_commit', 'features_dir', 'fold_num', + 'parameter_search', + 'parameter_search_valid_fold', + 'parameter_search_valid_ratio', 'model_type', 'feature_mode', 'train_batch_size', diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 9a116ae..1aac102 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -15,7 +15,7 @@ source ~/.bashrc source activate l3embedding-cpu SRCDIR=$HOME/dev/l3embedding -FEATURES_DIR=/scratch/jtc440/sonyc-usc/features/us8k/l3/original/music/cnn_L3_orig +FEATURES_DIR=/scratch/jtc440/sonyc-usc/features/esc50/l3/original/music/cnn_L3_orig OUTPUT_DIR=/scratch/jtc440/sonyc-usc MODEL_TYPE='svm' FEATURE_MODE='framewise' @@ -29,10 +29,13 @@ python $SRCDIR/06_train_classifier.py \ --random-state 20171021 \ --model-type $MODEL_TYPE \ --feature-mode $FEATURE_MODE \ - --num-epochs 150 \ + --num-epochs 50 \ --train-batch-size 32 \ --gsheet-id $GSHEET_ID \ --google-dev-app-name $GOOGLE_DEV_APP_NAME \ + --parameter-search \ + --parameter-search-no-valid-fold \ + --parameter-search-valid-ratio 0.15 \ --verbose \ $FEATURES_DIR \ $OUTPUT_DIR \ From 66941bc32e7f570867812c38423e5649b9463cb4 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 19 Apr 2018 09:50:02 -0400 Subject: [PATCH 181/201] Fix issues with shuffling data and splitting train set --- .gitignore | 2 +- classifier/train.py | 14 ++++++-------- data/usc/features.py | 9 +++++++++ data/usc/folds.py | 8 +++----- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 3e85c7f..398f615 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -.idea/* *.pyc *.err *.out *.pyc \ No newline at end of file +.idea/* *.pyc *.err *.out \ No newline at end of file diff --git a/classifier/train.py b/classifier/train.py index 948081a..2f1e3d7 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -326,9 +326,10 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, LOGGER.info('Starting hyperparameter search on {}.'.format(search_params)) splitter = 
StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio) - train_idxs, valid_idxs = splitter.split(train_data['features'], train_data['labels']) + train_idxs, valid_idxs = next(splitter.split(train_data['features'], + train_data['labels'])) - best_valid_loss = float('inf') + best_valid_acc = float('inf') best_param = None if valid_data: @@ -351,19 +352,18 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, kwargs.update(dict(zip(search_params, params))) - LOGGER.info('') _, train_metrics, valid_metrics, _ \ = train_func(train_data_skf, valid_data_skf, None, model_dir, **kwargs) - if valid_metrics['accuracy'] < best_valid_loss: - best_valid_loss = valid_metrics['accuracy'] + if valid_metrics['accuracy'] < best_valid_acc: + best_valid_acc = valid_metrics['accuracy'] best_params = params search_train_metrics[best_params] = train_metrics search_valid_metrics[best_params] = valid_metrics LOGGER.info('Best {} = {}, ave valid loss = {}'.format(search_params, best_param, - best_valid_loss)) + best_valid_acc)) kwargs.update(dict(zip(search_params, best_params))) @@ -496,8 +496,6 @@ def train(features_dir, output_dir, fold_num, credentials = get_credentials(google_dev_app_name) service = discovery.build('sheets', 'v4', credentials=credentials) append_row(service, gsheet_id, config, 'classifier') - else: - LOGGER.error(gsheet_id) fold_idx = fold_num - 1 diff --git a/data/usc/features.py b/data/usc/features.py index 5fd5068..958acc4 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -137,6 +137,15 @@ def preprocess_split_data(train_data, valid_data, test_data, valid_data['features'] = stdizer.transform(valid_data['features']) test_data['features'] = stdizer.transform(test_data['features']) + # Shuffle training data + num_train_examples = len(train_data['labels']) + shuffle_idxs = np.random.permutation(num_train_examples) + reverse_shuffle_idxs = np.argsort(shuffle_idxs) + train_data['features'] = train_data['features'][shuffle_idxs] + train_data['labels'] = train_data['labels'][shuffle_idxs] + train_data['file_idxs'] = np.vstack([reverse_shuffle_idxs[slice(*pair)] + for pair in train_data['file_idxs']]) + return min_max_scaler, stdizer diff --git a/data/usc/folds.py b/data/usc/folds.py index d2e128e..a21fb9e 100644 --- a/data/usc/folds.py +++ b/data/usc/folds.py @@ -100,11 +100,9 @@ def get_train_folds(feature_dir, test_fold_idx, num_folds, valid=True): filenames += fold_data['filenames'] - num_examples = len(y) - shuffle_idxs = np.random.permutation(num_examples) - X = np.vstack(X)[shuffle_idxs] - y = np.concatenate(y)[shuffle_idxs] - file_idxs = np.vstack(file_idxs)[shuffle_idxs] + X = np.vstack(X) + y = np.concatenate(y) + file_idxs = np.vstack(file_idxs) return {'features': X, 'labels': y, 'file_idxs': file_idxs, 'filenames': filenames} From ed926919d45b90262e8193e0fb2aae3f974a3311 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 19 Apr 2018 16:47:13 -0400 Subject: [PATCH 182/201] Add some more fixes to parameter search code --- classifier/train.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 2f1e3d7..f876a20 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -325,17 +325,17 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, search_params = list(search_space.keys()) LOGGER.info('Starting hyperparameter search on {}.'.format(search_params)) - splitter = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio) 
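The next(...) wrapper introduced in the patch above (and kept in the lines below) is needed because StratifiedShuffleSplit.split() returns a generator even when n_splits=1; iterating it once yields the single (train, valid) index pair, whereas unpacking the generator directly raises "not enough values to unpack". A minimal sketch of the corrected usage, with toy features and labels standing in for the repository's train_data dictionaries:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

features = np.random.rand(100, 8)        # stand-in for train_data['features']
labels = np.repeat(np.arange(4), 25)     # four balanced classes, stand-in for train_data['labels']

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.15)
train_idxs, valid_idxs = next(splitter.split(features, labels))

print(len(train_idxs), len(valid_idxs))  # 85 15
print(np.bincount(labels[valid_idxs]))   # class counts in the validation split stay roughly balanced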
- train_idxs, valid_idxs = next(splitter.split(train_data['features'], - train_data['labels'])) - - best_valid_acc = float('inf') - best_param = None + best_valid_acc = float('-inf') + best_params = None if valid_data: train_data_skf = train_data valid_data_skf = valid_data else: + splitter = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio) + train_idxs, valid_idxs = next(splitter.split(train_data['features'], + train_data['labels'])) + train_data_skf = { 'features': train_data['features'][train_idxs], 'labels': train_data['labels'][train_idxs] @@ -345,8 +345,6 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, 'labels': train_data['labels'][valid_idxs] } - - for params in product(*[search_space[p] for p in search_params]): LOGGER.info('Evaluating {} = {}'.format(search_params, params)) @@ -355,18 +353,21 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, _, train_metrics, valid_metrics, _ \ = train_func(train_data_skf, valid_data_skf, None, model_dir, **kwargs) - if valid_metrics['accuracy'] < best_valid_acc: + if valid_metrics['accuracy'] > best_valid_acc: best_valid_acc = valid_metrics['accuracy'] best_params = params search_train_metrics[best_params] = train_metrics search_valid_metrics[best_params] = valid_metrics - LOGGER.info('Best {} = {}, ave valid loss = {}'.format(search_params, best_param, + LOGGER.info('Best {} = {}, valid accuracy = {}'.format(search_params, + best_params, best_valid_acc)) kwargs.update(dict(zip(search_params, best_params))) + LOGGER.info('Training model with chosen parameters...') + # Retrain final model if valid_data: if train_with_valid: # If valid data was provided and we want to train the final model @@ -405,7 +406,7 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, 'search_params': search_params, 'search_params_best_values': best_params } - valid_metrics.update(search_valid_metrics[best_param]) + valid_metrics.update(search_valid_metrics[best_params]) return clf, train_metrics, valid_metrics, test_metrics From 4d85c2b8ba6c0e86e3d6d6c22826d564a967f7fb Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 22 Apr 2018 22:11:46 -0400 Subject: [PATCH 183/201] Update valid metrics instead of overwriting --- classifier/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classifier/train.py b/classifier/train.py index f876a20..f74e35e 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -294,7 +294,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, if valid_data: valid_pred = m.predict(validation_data[0]) - valid_metrics = compute_metrics(validation_data[1], valid_pred) + valid_metrics.update(compute_metrics(validation_data[1], valid_pred)) valid_metrics.update({ 'class_accuracy': valid_metrics['class_accuracy'], 'average_class_accuracy': valid_metrics['average_class_accuracy'] From 261fbbbe849e98346971e61d55e9534253bcd76c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 24 Apr 2018 13:57:20 -0400 Subject: [PATCH 184/201] Add random forest and fix metrics for class-wise classification metrics --- 06_train_classifier.py | 10 ++++- classifier/metrics.py | 4 +- classifier/train.py | 88 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 93 insertions(+), 9 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index f9bf5d7..13df66d 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -98,6 +98,14 @@ def parse_arguments(): choices=['rbf', 'sigmoid', 'linear', 
'poly'], help='(SVM) kernel type') + parser.add_argument('-rfne', + '--rf-num-estimators', + dest='n_estimators', + action='store', + type=int, + default=100, + help='(RF) Number of decision trees in the random forest') + parser.add_argument('-gsid', '--gsheet-id', dest='gsheet_id', @@ -140,7 +148,7 @@ def parse_arguments(): action='store', type=str, default='svm', - choices=['svm', 'mlp'], + choices=['svm', 'mlp', 'rf'], help='Type of model used for training classifier') parser.add_argument('-no', diff --git a/classifier/metrics.py b/classifier/metrics.py index 25852ad..e052514 100644 --- a/classifier/metrics.py +++ b/classifier/metrics.py @@ -5,7 +5,7 @@ LOGGER.setLevel(logging.DEBUG) -def compute_metrics(y, pred): +def compute_metrics(y, pred, num_classes=10): """ Compute perfomance metrics given the predicted labels and the true labels @@ -29,7 +29,7 @@ def compute_metrics(y, pred): acc = (y == pred).mean() class_acc = [] - for class_idx in range(10): + for class_idx in range(num_classes): idxs = (y == class_idx) class_acc.append((y[idxs] == pred[idxs]).mean()) diff --git a/classifier/train.py b/classifier/train.py index f74e35e..659eaaa 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -19,6 +19,7 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import StratifiedShuffleSplit from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier from classifier.metrics import compute_metrics from data.usc.features import preprocess_split_data @@ -130,7 +131,7 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', # Compute new metrics classes = np.arange(num_classes) train_loss = hinge_loss(y_train, clf.decision_function(X_train), labels=classes) - train_metrics = compute_metrics(y_train, y_train_pred) + train_metrics = compute_metrics(y_train, y_train_pred, num_classes=num_classes) train_metrics['loss'] = train_loss train_msg = 'Train - hinge loss: {}, acc: {}' LOGGER.info(train_msg.format(train_loss, train_metrics['accuracy'])) @@ -140,7 +141,7 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', y_valid = valid_data['labels'] y_valid_pred = clf.predict(X_valid) valid_loss = hinge_loss(y_valid, clf.decision_function(X_valid), labels=classes) - valid_metrics = compute_metrics(y_valid, y_valid_pred) + valid_metrics = compute_metrics(y_valid, y_valid_pred, num_classes=num_classes) valid_metrics['loss'] = valid_loss valid_msg = 'Valid - hinge loss: {}, acc: {}' LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy'])) @@ -157,7 +158,68 @@ def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf', y_test_pred.append(class_pred) y_test_pred = np.array(y_test_pred) - test_metrics = compute_metrics(test_data['labels'], y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred, num_classes=num_classes) + else: + test_metrics = {} + + return clf, train_metrics, valid_metrics, test_metrics + + +def train_rf(train_data, valid_data, test_data, model_dir, n_estimators=100, + num_classes=10, random_state=12345678, **kwargs): + """ + Train a Support Vector Machine model on the given data + """ + np.random.seed(random_state) + random.seed(random_state) + + X_train = train_data['features'] + y_train = train_data['labels'] + + model_output_path = os.path.join(model_dir, "model.pkl") + + # Create classifier + clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, + random_state=random_state) + + # Fit data and get output for 
train and valid batches + LOGGER.debug('Fitting model to data...') + clf.fit(X_train, y_train) + + LOGGER.info('Saving model...') + joblib.dump(clf, model_output_path) + + y_train_pred = clf.predict(X_train) + # Compute new metrics + train_loss = 0 + train_metrics = compute_metrics(y_train, y_train_pred, num_classes=num_classes) + train_metrics['loss'] = train_loss + train_msg = 'Train - acc: {}' + LOGGER.info(train_msg.format(train_loss, train_metrics['accuracy'])) + + if valid_data: + X_valid = valid_data['features'] + y_valid = valid_data['labels'] + y_valid_pred = clf.predict(X_valid) + valid_loss = 0 + valid_metrics = compute_metrics(y_valid, y_valid_pred, num_classes=num_classes) + valid_metrics['loss'] = valid_loss + valid_msg = 'Valid - acc: {}' + LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy'])) + else: + valid_metrics = {} + + # Evaluate model on test data + if test_data: + X_test = test_data['features'] + y_test_pred_frame = clf.predict_proba(X_test) + y_test_pred = [] + for start_idx, end_idx in test_data['file_idxs']: + class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() + y_test_pred.append(class_pred) + + y_test_pred = np.array(y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred, num_classes=num_classes) else: test_metrics = {} @@ -278,7 +340,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, # Compute metrics for train and valid train_pred = m.predict(X_train) - train_metrics = compute_metrics(y_train, train_pred) + train_metrics = compute_metrics(y_train, train_pred, num_classes=num_classes) # Set up train and validation metrics train_metrics = { 'loss': metric_cb.train_loss[-1], @@ -294,7 +356,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, if valid_data: valid_pred = m.predict(validation_data[0]) - valid_metrics.update(compute_metrics(validation_data[1], valid_pred)) + valid_metrics.update(compute_metrics(validation_data[1], valid_pred, num_classes=num_classes)) valid_metrics.update({ 'class_accuracy': valid_metrics['class_accuracy'], 'average_class_accuracy': valid_metrics['average_class_accuracy'] @@ -309,7 +371,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax() y_test_pred.append(class_pred) y_test_pred = np.array(y_test_pred) - test_metrics = compute_metrics(test_data['labels'], y_test_pred) + test_metrics = compute_metrics(test_data['labels'], y_test_pred, num_classes=num_classes) else: test_metrics = {} @@ -533,6 +595,20 @@ def train(features_dir, output_dir, fold_num, num_classes=DATASET_NUM_CLASSES[dataset_name], random_state=random_state, verbose=verbose, **model_args) + elif model_type == 'rf': + if parameter_search: + search_space = { 'n_estimators': [100, 500, 1000] } + model, train_metrics, valid_metrics, test_metrics \ + = train_param_search(train_data, valid_data, test_data, model_dir, + train_func=train_rf, search_space=search_space, + num_classes=DATASET_NUM_CLASSES[dataset_name], + valid_ratio=parameter_search_valid_ratio, + random_state=random_state, verbose=verbose, **model_args) + else: + model, train_metrics, valid_metrics, test_metrics \ + = train_rf(train_data, valid_data, test_data, model_dir, + num_classes=DATASET_NUM_CLASSES[dataset_name], + random_state=random_state, verbose=verbose, **model_args) elif model_type == 'mlp': if parameter_search: search_space = { From f0fe9e7218621d7214e483e6fe2c02130a8612c5 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 24 
Apr 2018 14:06:29 -0400 Subject: [PATCH 185/201] Expose option for retraining with validation in command line arguments --- 06_train_classifier.py | 6 ++++++ classifier/train.py | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 13df66d..858156d 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -49,6 +49,12 @@ def parse_arguments(): default=0.15, help='If no validation fold is used, the ratio of the extended training set to set aside for validation') + parser.add_argument('-pstwv', + '--parameter-search-train-with-valid', + dest='parameter_search_train_with_valid', + action='store_true', + help='If True, retrain with validation set') + parser.add_argument('-lr', '--learning-rate', dest='learning_rate', diff --git a/classifier/train.py b/classifier/train.py index 659eaaa..d77f24e 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -477,7 +477,7 @@ def train(features_dir, output_dir, fold_num, model_type='svm', feature_mode='framewise', train_batch_size=64, random_state=20171021, parameter_search=False, parameter_search_valid_fold=True, parameter_search_valid_ratio=0.15, - gsheet_id=None, google_dev_app_name=None, + parameter_search_train_with_valid=False, gsheet_id=None, google_dev_app_name=None, verbose=False, non_overlap=False, non_overlap_chunk_size=10, use_min_max=False, **model_args): init_console_logger(LOGGER, verbose=verbose) @@ -520,6 +520,7 @@ def train(features_dir, output_dir, fold_num, 'parameter_search': parameter_search, 'parameter_search_valid_fold': parameter_search_valid_fold, 'parameter_search_valid_ratio': parameter_search_valid_ratio, + 'parameter_search_train_with_valid': parameter_search_train_with_valid, 'model_type': model_type, 'feature_mode': feature_mode, 'train_batch_size': train_batch_size, @@ -588,6 +589,7 @@ def train(features_dir, output_dir, fold_num, train_func=train_svm, search_space=search_space, num_classes=DATASET_NUM_CLASSES[dataset_name], valid_ratio=parameter_search_valid_ratio, + train_with_valid=parameter_search_train_with_valid, random_state=random_state, verbose=verbose, **model_args) else: model, train_metrics, valid_metrics, test_metrics \ @@ -603,6 +605,7 @@ def train(features_dir, output_dir, fold_num, train_func=train_rf, search_space=search_space, num_classes=DATASET_NUM_CLASSES[dataset_name], valid_ratio=parameter_search_valid_ratio, + train_with_valid=parameter_search_train_with_valid, random_state=random_state, verbose=verbose, **model_args) else: model, train_metrics, valid_metrics, test_metrics \ @@ -621,6 +624,7 @@ def train(features_dir, output_dir, fold_num, batch_size=train_batch_size, random_state=random_state, num_classes=DATASET_NUM_CLASSES[dataset_name], valid_ratio=parameter_search_valid_ratio, + train_with_valid=parameter_search_train_with_valid, verbose=verbose, **model_args) else: model, train_metrics, valid_metrics, test_metrics \ From 67b95a56a82fb319a57978102d7356aa5df8180c Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 24 Apr 2018 14:09:16 -0400 Subject: [PATCH 186/201] Negate command line option for retraining with validation fold --- 06_train_classifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/06_train_classifier.py b/06_train_classifier.py index 858156d..572c53c 100644 --- a/06_train_classifier.py +++ b/06_train_classifier.py @@ -50,10 +50,10 @@ def parse_arguments(): help='If no validation fold is used, the ratio of the extended training set to set aside for validation') 
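The renamed flag just below relies on argparse's store_false behaviour: the destination defaults to True and passing the flag switches it off, so retraining with the validation set remains the default unless the user opts out. A minimal standalone sketch of that pattern (the toy parser only carries this one argument; the real script defines many more):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-pstwv',
                    '--parameter-search-train-without-valid',
                    dest='parameter_search_train_with_valid',
                    action='store_false',
                    help='If set, do not retrain with validation set')

args = parser.parse_args([])
print(args.parameter_search_train_with_valid)   # True: default when the flag is omitted

args = parser.parse_args(['--parameter-search-train-without-valid'])
print(args.parameter_search_train_with_valid)   # False: the flag negates the destination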
parser.add_argument('-pstwv', - '--parameter-search-train-with-valid', + '--parameter-search-train-without-valid', dest='parameter_search_train_with_valid', - action='store_true', - help='If True, retrain with validation set') + action='store_false', + help='If set, do not retrain with validation set') parser.add_argument('-lr', '--learning-rate', From 141d1926071da7c37e008237874452c317bca039 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 24 Apr 2018 14:15:41 -0400 Subject: [PATCH 187/201] Add param search train with valid to spreadsheet, fix spreadsheet issue, generalize some gsheets things --- classifier/train.py | 4 ++-- gsheets.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index d77f24e..0ca48b9 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -669,7 +669,7 @@ def train(features_dir, output_dir, fold_num, test_metrics['average_class_accuracy'], ', '.join(map(str, test_metrics['class_accuracy'])) ] - update_experiment(service, gsheet_id, config, 'U', 'AE', + update_experiment(service, gsheet_id, config, 'V', 'AF', update_values, 'classifier') if parameter_search: @@ -679,7 +679,7 @@ def train(features_dir, output_dir, fold_num, continue # Also update search parameter # ASSUMES that parameter values will be in the first 26 columns - col = chr(CLASSIFIER_FIELD_NAMES.index(param) + 67) + col = chr(CLASSIFIER_FIELD_NAMES.index(param) + 65) update_experiment(service, gsheet_id, config, col, col, [param_value], 'classifier') diff --git a/gsheets.py b/gsheets.py index 6992068..b0d447b 100644 --- a/gsheets.py +++ b/gsheets.py @@ -47,6 +47,7 @@ 'parameter_search', 'parameter_search_valid_fold', 'parameter_search_valid_ratio', + 'parameter_search_train_with_valid', 'model_type', 'feature_mode', 'train_batch_size', @@ -154,7 +155,7 @@ def request_with_retry(request, num_retries=50): return response -def get_row(service, spreadsheet_id, param_dict, sheet_name): +def get_row(service, spreadsheet_id, param_dict, sheet_name, id_field='model_dir'): range_ = '{}!C:C'.format(sheet_name) major_dimension = 'COLUMNS' @@ -165,7 +166,7 @@ def get_row(service, spreadsheet_id, param_dict, sheet_name): response = request_with_retry(request) try: - row_idx = response['values'][0].index(param_dict['model_dir']) + row_idx = response['values'][0].index(param_dict[id_field]) return row_idx + 1 except ValueError: return None From b04fed27843d2609e01193150e560f2c9108ed93 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 24 Apr 2018 14:38:43 -0400 Subject: [PATCH 188/201] Use best results instead of retrain when not retraining with validation data, and fix bug where only best parameter results are updated in the metrics history --- classifier/train.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 0ca48b9..4dcb992 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -389,6 +389,8 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, best_valid_acc = float('-inf') best_params = None + best_clf = None + best_test_metrics = None if valid_data: train_data_skf = train_data @@ -412,15 +414,17 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, kwargs.update(dict(zip(search_params, params))) - _, train_metrics, valid_metrics, _ \ - = train_func(train_data_skf, valid_data_skf, None, model_dir, **kwargs) + clf, train_metrics, valid_metrics, test_metrics \ + = 
train_func(train_data_skf, valid_data_skf, test_data, model_dir, **kwargs) if valid_metrics['accuracy'] > best_valid_acc: best_valid_acc = valid_metrics['accuracy'] best_params = params + best_clf = clf + best_test_metrics = test_metrics - search_train_metrics[best_params] = train_metrics - search_valid_metrics[best_params] = valid_metrics + search_train_metrics[params] = train_metrics + search_valid_metrics[params] = valid_metrics LOGGER.info('Best {} = {}, valid accuracy = {}'.format(search_params, best_params, @@ -445,9 +449,10 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, }, None, test_data, model_dir, **kwargs) else: # If valid data was provided but we just want to train the final - # model with train, just do that - clf, train_metrics, _, test_metrics \ - = train_func(train_data, None, test_data, model_dir, **kwargs) + # model with train, just use best results + clf = best_clf + train_metrics = dict(search_train_metrics[best_params]) + test_metrics = best_test_metrics else: if train_with_valid: @@ -455,9 +460,10 @@ def train_param_search(train_data, valid_data, test_data, model_dir, train_func, clf, train_metrics, _, test_metrics \ = train_func(train_data, None, test_data, model_dir, **kwargs) else: - # If valid data was not provided, train with just the sub-train split - clf, train_metrics, _, test_metrics \ - = train_func(train_data_skf, None, test_data, model_dir, **kwargs) + # If valid data was not provided, just use results from the sub-train split + clf = best_clf + train_metrics = dict(search_train_metrics[best_params]) + test_metrics = best_test_metrics train_metrics['search'] = search_train_metrics train_metrics['search_params'] = search_params From ae2b517eb0a5302c59ced3a038eeafacf902513d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 26 Apr 2018 17:08:46 -0400 Subject: [PATCH 189/201] Update classifier training script --- jobs/classifier-train-array.sbatch | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jobs/classifier-train-array.sbatch b/jobs/classifier-train-array.sbatch index 1aac102..fc1d6ef 100644 --- a/jobs/classifier-train-array.sbatch +++ b/jobs/classifier-train-array.sbatch @@ -3,7 +3,7 @@ #SBATCH --job-name=us8k-classifier-train #SBATCH --nodes=1 #SBATCH --cpus-per-task=2 -#SBATCH --mem=64GB +#SBATCH --mem=16GB #SBATCH --time=7-0 #SBATCH --mail-type=ALL #SBATCH --mail-user=name@email.org @@ -17,7 +17,7 @@ source activate l3embedding-cpu SRCDIR=$HOME/dev/l3embedding FEATURES_DIR=/scratch/jtc440/sonyc-usc/features/esc50/l3/original/music/cnn_L3_orig OUTPUT_DIR=/scratch/jtc440/sonyc-usc -MODEL_TYPE='svm' +MODEL_TYPE='mlp' FEATURE_MODE='framewise' GOOGLE_DEV_APP_NAME='' GSHEET_ID='' @@ -34,8 +34,10 @@ python $SRCDIR/06_train_classifier.py \ --gsheet-id $GSHEET_ID \ --google-dev-app-name $GOOGLE_DEV_APP_NAME \ --parameter-search \ + --parameter-search-train-without-valid \ --parameter-search-no-valid-fold \ --parameter-search-valid-ratio 0.15 \ + --svm-kernel-type linear \ --verbose \ $FEATURES_DIR \ $OUTPUT_DIR \ From 8d265f538d4c79de8edbebd5d82a3c2c13c098a8 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 1 May 2018 13:19:44 -0400 Subject: [PATCH 190/201] Fix file idxs for us8k --- data/usc/features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/usc/features.py b/data/usc/features.py index 958acc4..b0b7514 100644 --- a/data/usc/features.py +++ b/data/usc/features.py @@ -143,8 +143,8 @@ def preprocess_split_data(train_data, valid_data, test_data, 
reverse_shuffle_idxs = np.argsort(shuffle_idxs) train_data['features'] = train_data['features'][shuffle_idxs] train_data['labels'] = train_data['labels'][shuffle_idxs] - train_data['file_idxs'] = np.vstack([reverse_shuffle_idxs[slice(*pair)] - for pair in train_data['file_idxs']]) + train_data['file_idxs'] = [reverse_shuffle_idxs[slice(*pair)] + for pair in train_data['file_idxs']] return min_max_scaler, stdizer From 154ef202f47d72cb10d1ec32ee77d62d06f8b64c Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Fri, 4 May 2018 16:01:01 -0400 Subject: [PATCH 191/201] Change to verbose=2 for mlp.fit() to avoid giant slurm output files --- classifier/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/classifier/train.py b/classifier/train.py index 4dcb992..4ce0671 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -336,7 +336,8 @@ def train_mlp(train_data, valid_data, test_data, model_dir, m.compile(Adam(lr=learning_rate), loss=loss, metrics=metrics) LOGGER.debug('Fitting model to data...') m.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=num_epochs, - validation_data=validation_data, validation_split=valid_split, callbacks=cb) + validation_data=validation_data, validation_split=valid_split, + callbacks=cb, verbose=2) # Compute metrics for train and valid train_pred = m.predict(X_train) From f24fc84a911cde8e730ca1d7cfb0134abd3b670b Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Thu, 17 May 2018 14:38:10 -0400 Subject: [PATCH 192/201] DCASE 2013 results (data and notebook) --- notebooks/data/dcase2013_results.csv | 67 ++++++++++++ notebooks/dcase2013_results.ipynb | 151 +++++++++++++++++++++++++++ 2 files changed, 218 insertions(+) create mode 100644 notebooks/data/dcase2013_results.csv create mode 100644 notebooks/dcase2013_results.ipynb diff --git a/notebooks/data/dcase2013_results.csv b/notebooks/data/dcase2013_results.csv new file mode 100644 index 0000000..14cdc15 --- /dev/null +++ b/notebooks/data/dcase2013_results.csv @@ -0,0 +1,67 @@ +Username,Model ID,Model Dir,Commit,"Features +Dir","Test +Fold +Num","Param +Search","Param +Search +Valid +Fold +","Param +Search +Valid +Ratio +","Param +Search +Train +With +Valid +","Model +Type","Feature +Mode","Train +Batch +Size","Non +Overlap","Random +State","Num +Epochs +","Learning +Rate (MLP)","Weight +Decay +(MLP) +","Reg. 
+Factor +(SVM) +","Conv +Tolerance +(SVM) +","Max +Iterations +(SVM)","Train +Loss","Valid +Loss","Train +Accuracy","Valid +Accuracy","Train Average +Class Accuracy","Valid Average +Class Accuracy","Train Class +Accuracy","Valid Class +Accuracy","Test +Accuracy","Test Average +Class Accuracy","Test Class +Accuracy" +js7561,dcase2013/l3/original/environmental/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/environmental/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/original/environmental/cnn_L3_melspec1,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.000521011,0.000581202,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.96,0.96,"1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.8" +js7561,dcase2013/l3/original/music/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/music/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/original/music/cnn_L3_melspec1,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.000723378,0.000865292,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.96,0.96,"1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.8" +js7561,dcase2013/l3/original/music/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/music/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/original/music/cnn_L3_melspec2,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.000673865,0.000870263,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.96,0.96,"1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.8" +js7561,dcase2013/l3/short/environmental/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/environmental/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/short/environmental/cnn_L3_melspec1,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.004461471,0.004684339,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.96,0.96,"0.9, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.9" +js7561,dcase2013/l3/short/environmental/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/environmental/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/short/environmental/cnn_L3_melspec2,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.004963388,0.005131662,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.96,0.96,"1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 0.9, 0.9, 0.9" 
+js7561,dcase2013/l3/original/environmental/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/environmental/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/original/environmental/cnn_L3_kapredbinputbn,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.001746329,0.001793127,0.999797857,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.95,0.95,"0.9, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.8" +js7561,dcase2013/l3/original/environmental/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/environmental/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/original/environmental/cnn_L3_melspec2,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.000570133,0.000643959,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.95,0.95,"0.9, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.8" +js7561,dcase2013/l3/short/environmental/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/environmental/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/short/environmental/cnn_L3_kapredbinputbn,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.0001,1.00E-05,1,1.00E-05,-1,0.002521921,0.003925787,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.95,0.95,"0.9, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 0.9, 1.0, 0.9" +js7561,dcase2013/l3/short/environmental/cnn_L3_orig/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/environmental/cnn_L3_orig/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/short/environmental/cnn_L3_orig,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.0001,1.00E-04,1,1.00E-05,-1,0.01040496,0.012845931,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.95,0.95,"1.0, 0.9, 1.0, 1.0, 1.0, 0.9, 1.0, 0.9, 1.0, 0.8" +js7561,dcase2013/l3/short/music/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/music/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/short/music/cnn_L3_kapredbinputbn,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.0001,1.00E-04,1,1.00E-05,-1,0.01225311,0.016584669,1,0.998854525,1,0.998853211,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 0.9977064220183486, 1.0, 1.0, 0.9931192660550459, 1.0, 0.9977064220183486",0.95,0.95,"1.0, 1.0, 1.0, 1.0, 0.9, 0.9, 1.0, 1.0, 0.9, 0.8" 
+js7561,dcase2013/l3/short/music/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/music/cnn_L3_melspec1/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/short/music/cnn_L3_melspec1,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.006562674,0.00741586,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.95,0.95,"0.9, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.9, 0.8" +js7561,dcase2013/l3/short/music/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/music/cnn_L3_melspec2/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/short/music/cnn_L3_melspec2,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.0001,1.00E-04,1,1.00E-05,-1,0.01072127,0.012126957,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.95,0.95,"1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.8, 0.8" +js7561,dcase2013/l3/short/music/cnn_L3_orig/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/short/music/cnn_L3_orig/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/short/music/cnn_L3_orig,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.0001,1.00E-04,1,1.00E-05,-1,0.013331991,0.019654844,1,0.99862543,1,0.998626477,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 0.9977116704805492, 1.0, 1.0, 1.0, 0.9954233409610984, 0.9954233409610984, 1.0, 1.0, 0.9977064220183486",0.95,0.95,"1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.8, 0.8" +js7561,dcase2013/l3/original/environmental/cnn_L3_orig/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/environmental/cnn_L3_orig/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/original/environmental/cnn_L3_orig,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.000875714,0.001198803,0.999919143,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.94,0.94,"1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 0.8, 0.8" +js7561,dcase2013/l3/original/music/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/music/cnn_L3_kapredbinputbn/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/hhw230/sonyc-usc/features/dcase2013/l3/original/music/cnn_L3_kapredbinputbn,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-05,1,1.00E-05,-1,0.00101895,0.001729461,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.94,0.94,"0.9, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.8, 0.8" 
+js7561,dcase2013/l3/original/music/cnn_L3_orig/framewise/overlap/no-min-max/mlp,/scratch/js7561/l3_cls/classifier/dcase2013/l3/original/music/cnn_L3_orig/framewise/overlap/no-min-max/mlp/fold2/20180427174607,ae2b517eb0a5302c59ced3a038eeafacf902513d,/scratch/jtc440/sonyc-usc/features/dcase2013/l3/original/music/cnn_L3_orig,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,1.00E-05,1.00E-03,1,1.00E-05,-1,0.042081074,0.049192403,1,1,1,1,"1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0","1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0",0.94,0.94,"1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 0.8, 0.8" +jtc440,dcase2013/soundnet/framewise/overlap/no-min-max/mlp,/scratch/jtc440/sonyc-usc/classifier/dcase2013/soundnet/framewise/overlap/no-min-max/mlp/fold2/20180509031303,8d265f538d4c79de8edbebd5d82a3c2c13c098a8,/scratch/hhw230/sonyc-usc/features/dcase2013/soundnet,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.001,1.00E-04,1,1.00E-05,-1,0.847743234,1.438365181,0.831563845,0.681300813,0.843039225,0.681253305,"0.8735632183908046, 0.9109195402298851, 0.8218390804597702, 0.8595988538681948, 0.7679083094555874, 0.7614942528735632, 0.8390804597701149, 0.9283667621776505, 0.9369627507163324, 0.7306590257879656","0.5645161290322581, 0.7903225806451613, 0.7580645161290323, 0.6065573770491803, 0.6721311475409836, 0.6612903225806451, 0.6612903225806451, 0.7049180327868853, 0.7868852459016393, 0.6065573770491803",0.82,0.82,"0.8, 1.0, 0.9, 0.9, 0.8, 0.9, 0.7, 0.9, 0.9, 0.4" +jtc440,dcase2013/vggish/framewise/overlap/no-min-max/mlp,/scratch/jtc440/sonyc-usc/classifier/dcase2013/vggish/framewise/overlap/no-min-max/mlp/fold2/20180509031303,8d265f538d4c79de8edbebd5d82a3c2c13c098a8,/scratch/jtc440/sonyc-usc/features/dcase2013/vggish,2,TRUE,FALSE,0.15,FALSE,mlp,framewise,32,FALSE,20171021,50,0.0001,1.00E-04,1,1.00E-05,-1,0.059105595,0.174368465,0.999595715,0.959221077,0.999838319,0.959213675,"1.0, 1.0, 1.0, 1.0, 0.9995957962813258, 1.0, 1.0, 0.9987873888439773, 1.0, 1.0","0.9701834862385321, 0.988558352402746, 0.988558352402746, 0.9564220183486238, 0.9793577981651376, 0.9473684210526315, 0.954233409610984, 0.9105504587155964, 0.9496567505720824, 0.9472477064220184",0.91,0.91,"0.9, 1.0, 0.9, 0.8, 1.0, 1.0, 1.0, 1.0, 0.7, 0.8" \ No newline at end of file diff --git a/notebooks/dcase2013_results.ipynb b/notebooks/dcase2013_results.ipynb new file mode 100644 index 0000000..084984a --- /dev/null +++ b/notebooks/dcase2013_results.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('data/dcase2013_results.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_subset = []\n", + "embedding_model = []\n", + "embedding_feature = []\n", + "\n", + "for p in df['Model ID'].values: \n", + " \n", + " # training set\n", + " subset = p.split('/')[3]\n", + " subset = subset[:3]\n", + " embedding_subset.append(subset)\n", + " \n", + " # model name\n", + " model = p.split('/')[4]\n", + " if model=='cnn_L3_orig':\n", + " model = '1_orig'\n", + " elif model=='cnn_L3_kapredbinputbn':\n", + " model = '2_norm'\n", + " elif model=='cnn_L3_melspec1':\n", + " model = '3_mel1'\n", + " elif 
model=='cnn_L3_melspec2':\n", + " model = '4_mel2'\n", + " \n", + " embedding_model.append(model)\n", + " embedding_feature.append(p.split('/')[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df['embedding_subset'] = embedding_subset\n", + "df['embedding_model'] = embedding_model\n", + "df['embedding_feature'] = embedding_feature" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df_sorted = df.sort_values(by=['embedding_subset', 'embedding_model', 'embedding_feature'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtoAAAHjCAYAAAAdc7jLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzs3X2cVeV57//PxciTjIqCHZXRaH9VDkQtBoLVpDqoiQ/V2AQNGiSlxwSPD7STSFTyU2vVxEShJa3KCUaj0bxEq22PKKlG6pQmenqAiKJwsNZaGIxWwJiMPDPX+WNvkp0Js9dFuRdrb/b3/XrxYu+97r32Nd+5ueZmzdprm7sjIiIiIiJp9Sm6ABERERGRvZEW2iIiIiIiOdBCW0REREQkB1poi4iIiIjkQAttEREREZEcaKEtIiIiIpKDXBfaZnaWma00s9fN7LqdbP+QmS0ws5fNrMPMWiu23W5mr5rZCjP7KzOzPGsVEREREUkpt4W2mTUBdwFnAyOBi81sZI9hM4DvufvxwM3AbeXnngx8DDgeOBb4KHBqXrWKiIiIiKSW5xHtscDr7v6Gu28B5gLn9xgzElhQvv1cxXYHBgD9gP5AX+CdHGsVEREREUkqz4X2MGB1xf3O8mOVXgLGl29/GtjPzIa4+wuUFt4/Lf952t1X5FiriIiIiEhS++S4752dU93z896nAXea2WRgIbAG2GZmvwOMAHacs/1DMzvF3Rf+2guYTQGmAAwcOHD04YcfnqTw7u5u+vSprfeJpqpp9erS/31SZFWLOUGaulLmBLWZleZUTC3mBHtvVsopbm//tweaU1Gpatq3nNWGvXROpazptddeW+vuB2cOdPdc/gAnUToSveP+dGB6lfHNQGf59leAGyq23QhcU+31Ro8e7ak899xzyfaVSqqaTj31VD/11FOT7KsWc3JPU1fKnNxrMyvNqZhazMl9781KOcXt7f/23DWnopLVdOqppT8J7NU5uTuw2APr4Tz/q7EIONrMjjKzfsBFwBOVA8xsqJntqGE6cF/59irgVDPbx8z6UnojpE4dEREREZG6kdupI+6+zcyuAp4GmoD73P1VM7uZ0v8CngDagNvMzCmdOnJl+emPAacByyidbvIP7j4vr1obyYMPPlh0CXVBOcUpK0lJ8ylOWcUoJylSnudo4+7zgfk9Hrux4vZjlBbVPZ+3Hbgsz9oaVapz1PZ2yilOWUlKmk9xyipGOUmRaussdcndI488wiOPPFJ0GTVPOcUpK0lJ8ylOWcUoJylSrke0pfbMnj0bgAkTJhRcSW1TTnHKSlLSfIpTVjHKSYqkI9oiIiIiIjnQQltEREREJAdaaIuIiIiI5EALbRERERGRHOjNkA3mscd+42qKshPKKU5ZSUqaT3HKKkY5SZG00G4wQ4cOLbqEuqCc4pSVpKT5FKesYpSTFEmnjjSY+++/n/vvv7/oMmqecopTVpKS5lOcsopRTlIkLbQbjBpOjHKKU1aSkuZTnLKKUU5SJC20RURERERyoIW2iIiIiEgOtNAWEREREcmBFtoiIiIiIjnQ5f0azPz584suoS4opzhlJSlpPsUpqxjlJEXSQrvB7LvvvkWXUBeUU5yykpQ0n+KUVYxykiLp1JEGc/fdd3P33XcXXUbNU05xykpS0nyKU1YxykmKpIV2g3n00Ud59NFHiy6j5imnOGUlKWk+xSmrGOUkRdJCW0REREQkB1poi4iIiIjkQAttEREREZEcaKEtIiIiIpIDXd6vwXR0dBRdQl1QTnHKSlLSfIpTVjHKSYqkI9oiIiIiIjnQQrvBzJgxgxkzZhRdRs1TTnHKSlLSfIpTVjHKSYqkhXaDefLJJ3nyySeLLqPmKac4ZSUpaT7FKasY5SRF0kJbRERERCQHuS60zewsM1tpZq+b2XU72f4hM1tgZi+bWYeZtVZsO8LMnjGzFWa23MyOzLNWEREREZGUcltom1kTcBdwNjASuNjMRvYYNgP4nrsfD9wM3Fax7XvAHe4+AhgL/GdetYqIiIiIpJbn5f3GAq+7+xsAZjYXOB9YXjFmJPCl8u3ngL8vjx0J7OPuPwRw964c62woAwcOLLqEuqCc4pSVpKT5FKesYpSTFCnPhfYwYHXF/U7gxB5jXgLGA98CPg3sZ2ZDgGOAn5nZ3wJHAc8C17n79hzrbQg/+MEPii6hLiinOGUlKWk+xSmrGOUkRTJ3z2fHZhcCZ7r7F8r3JwFj3X1qxZjDgDspLaYXUlp0fxj4BHAvcAKwCngEmO/u9/Z4jSnAFICWlpbRc+fOTVJ7V1cXzc3NSfaVimqK2+N1rV8P3d1Vh3Q1NdG8PeP/iX36wEEHJSlp6/IVsG1b1TGbDmlhwNvvVN/RPvvQd+SIJDVF7PHv3TsvQ/fWqkO6+rfSvLkze199+kLL8b1ubm9vB2DWrFm7VGKvddXgvz/VFFO3PQqS9qmIPZ7VB29BxjG9ru2DaG76oPp+rAkGHZawsOpS5TSq3KeWJuhTe/u/vXHjxi1x9zFZ4/I8ot0JHF5xvxV4q3KAu78FfAbAzJqB8e7+vpl1Ai9WnHby98DvUVp8Vz5/DjAHYMyYMd7W1pak8I6ODlLtK5VUNd1yyy0A3HDDDbu9r1rMCdLUtSs5bbtnTuaYHx14EB9/b33muH0+85ns4gI6J07KHLPi2msY8c3bM8e1rllddXtdz6mbxmUO6Rg+g7aV02L7m9D7gYvBgwcDJPv6avHf357+txext+YE8axS9ihI16ci9vSc6u5ozxyzsGs0pzQvyRzXpy3Nf6ojks3zhH1
qb/63tyvyvOrIIuBoMzvKzPoBFwFPVA4ws6FmtqOG6cB9Fc890MwOLt8/jV8/t1v+ixYsWMCCBQuKLqPmKac4ZSUpaT7FKasY5SRFym2h7e7bgKuAp4EVwKPu/qqZ3WxmnyoPawNWmtlrQAvwtfJztwPTgAVmtgww4J68ahURERERSS3PU0dw9/nA/B6P3Vhx+zHgsV6e+0Og9xMeRURERERqmD4ZUkREREQkB7ke0ZbaM2TIkKJLqAvKKU5ZSUqaT3HKKkY5SZG00G4wjz/+eNEl1AXlFKesJCXNpzhlFaOcpEg6dUREREREJAdaaDeY6dOnM3369KLLqHnKKU5ZSUqaT3HKKkY5SZF06kiDeeGFF4ouoS4opzhlJSlpPsUpqxjlJEXSEW0RERERkRxooS0iIiIikgMttEVEREREcqBztBtMa2tr0SXUBeUUp6wkJc2nOGUVo5ykSFpoN5iHHnqo6BLqgnKKU1aSkuZTnLKKUU5SJJ06IiIiIiKSAy20G0x7ezvt7e1Fl1HzlFOcspKUNJ/ilFWMcpIi6dSRGjDmaz9kbdeWqmOuPnYbk6c/lbmvoc39WPz/f6LX7UuXLg3V9Myqu9jcvaHqmO4tRzDvzTsy99W/z7588ogrQ69bzScfaWP9pnWZ4y5tvpxpD0ytOuagAUN4ZkJHr9ujOUk8q+4f3wBbf1F9UNdoujsCPxD77kefj90Sel1JI1WfStWjalWkT0V6FOzdfeqtUR+h+913M8dtvfYaOidOqjqmz8EHc9jSn/S6vZ5zAuCOQ+CDd6qPGT4DbhpXfcygFvjK2+nqkhAd0a4BWT+8ithX1iK7iH1FFtlF7EuCshbZRe1LQlL1lpT9rhapT8VEFtlF7KsmZS2y9/R+ZJdooS0iIiIikgMttEVEREREcqBztBvMMcccU3QJdUE5xSkrSUnzKU5ZxSgnKZIW2g1mzpw5RZdQF5RTnLKSlDSf4pRVjHKSIunUERERERGRHGih3WCmTJnClClTii6j5imnOGUlKWk+xSmrGOUkRdKpIw3mtddeK7qEuqCc4pSVpKT5FKesYpSTFElHtEVEREREcqCFtoiIiIhIDrTQFhERERHJgc7RbjCjRo0quoS6oJzilJWkpPkUp6xilJMUSQvtBjNr1qyiS6gLyilOWUlKmk9xyipGOUmRdOqIiIiIiEgOcl1om9lZZrbSzF43s+t2sv1DZrbAzF42sw4za+2xfX8zW2Nmd+ZZZyO55JJLuOSSS4ouo+YppzhlJSlpPsUpqxjlJEXK7dQRM2sC7gI+AXQCi8zsCXdfXjFsBvA9d3/AzE4DbgMmVWy/BfinvGpsRJ2dnUWXUBeUU5yykpQ0n+KUVYxykiLleUR7LPC6u7/h7luAucD5PcaMBBaUbz9Xud3MRgMtwDM51igiIiIikos8F9rDgNUV9zvLj1V6CRhfvv1pYD8zG2JmfYCZwFdyrE9EREREJDfm7vns2OxC4Ex3/0L5/iRgrLtPrRhzGHAncBSwkNKi+8OUTh/Z191vN7PJwBh3v2onrzEFmALQ0tIyeu7cuZl1/WzjVrozxvTZuonuvgOqjwEGD+yb+XoRy9a8nzmmZSC8szG2v+OGHdDrtvb2diD7Xdjvb3kn+4U29YMBW0I1HdCvJTSumhXrlmcPAoY2Hcza7e9mjhsxZGSv26I5AbB2beaQrqYmmrdvz97X0KHZYwK2vrwsc8ymQ1oY8Hb297nv8cdV3R7Oqmt19e1A1/ZBNDd9kDkOgObDY+Oq+emSzCFd/Vtp3hz81fOho3vdFM0p0qOgvvtUih4F8PMt7+JZaQX6lNGH/fsdnPl6EZE+Fe1RkKhPpexRkKRPRXoUpOlTu9TPU/apFD0K0vapKj0KYFQ5q6UJrtTS1dVFc3Pzbu8npZQ1jRs3bom7j8kal+fl/TqBylnWCrxVOcDd3wI+A2BmzcB4d3/fzE4Cft/MrgCagX5m1uXu1/V4/hxgDsCYMWO8ra0ts6iHX8yeiP3fXsHmQ0ZkjvvDE1ozx0RMnv5U5pirj93GzFdi3643J7b1uu3ss88GICureW/ekfk63a8dQZ9jVoVqajtyQmhcNdMemJo9CLi0+XLu7ZqdOW7x+N4bfTQngG33zMkc86MDD+Lj763PHLfPBRdkjononDgpc8yKa69hxDdvzxzXuqb6D55oVt0d7ZmvtbBrNKc0Z/9QAejTlv01ZrppXOaQjuEzaFs5Lba/i3s/cDF48GAgO6dIj4L67lMpehSU+pRljIn2qbYjL8wcExHpU9EeBWn6VMoeBWn6VKRHQZo+tStzKmWfStKjIG2fqtKjAAj2qYiOjo4k+0mpiJryXGgvAo42s6OANcBFwOcqB5jZUGC9u3cD04H7ANx9YsWYyZSOaP/GVUtk1912221Fl1AXlFOcspKUNJ/ilFWMcpIi5XaOtrtvA64CngZWAI+6+6tmdrOZfao8rA1YaWavUXrj49fyqkdEREREZE/K9ZMh3X0+ML/HYzdW3H4MeCxjH/cD9+dQXkMaP7703tPHH3+84Epqm3KKU1aSkuZTnLKKUU5SJH0Ee4NZt25d0SXUBeUUp6wkJc2nOGUVo5ykSPoIdhERERGRHGihLSIiIiKSAy20RURERERyoHO0G8zpp59edAl1QTnFKStJSfMpTlnFKCcpkhbaDeaGG24ouoS6oJzilJWkpPkUp6xilJMUSaeOiIiIiIjkQAvtBnP22Wf/8uNopXfKKU5ZSUqaT3HKKkY5SZF06kiD2bhxY9El1AXlFKesJCXNpzhlFaOcpEg6oi0iIiIikgMttEVEREREcqCFtoiIiIhIDnSOdoM599xziy6hLiinOGUlKWk+xSmrGOUkRdJCu8FMmzat6BLqgnKKU1aSkuZTnLKKUU5SJJ06IiIiIiKSAy20G0xbWxttbW1Fl1HzlFOcspKUNJ/ilFWMcpIiaaEtIiIiIpIDLbRFRERERHKghbaIiIiISA600BYRERERyYEu79dgPvvZzxZdQl1QTnHKSlLSfIpTVjHKSYqkhXaDueKKK4ouoS4opzhlJSlpPsUpqxjlJEXSqSMNZsOGDWzYsKHoMmqecopTVpKS5lOcsopRTlIkHdFuMOeccw4AHR0dxRZS45RTnLKSlDSf4pRVjHKSIumItoiIiIhIDrTQFhERERHJgRbaIiIiIiI5yHWhbWZnmdlKM3vdzK7byfYPmdkCM3vZzDrMrLX8+Cgze8HMXi1vm5BnnSIiIiIiqeX2ZkgzawLuAj4BdAKLzOwJd19eMWwG8D13f8DMTgNuAyYBG4DPu/u/mtlhwBIze9rdf5ZXvY1i8uTJRZdQF5RTnLKSlDSf4pRVjHKSImUutM3sXGC+u3fv4r7HAq+7+xvl/cwFzgcqF9ojgS+Vbz8H/D2Au7+2Y4C7v2Vm/wkcDGihvZvUcGKUU5yykpQ0n+KUVYxykiJFTh25CPhXM7vdzEbswr6HAasr7neWH6v0EjC+fPvTwH5mNqRygJmNBfoB/7YLry29WLt2LWvXri26jJqnnOKUlaSk+RSnrG
KUkxTJ3D17kNn+wMXAHwMOfBd42N1/UeU5FwJnuvsXyvcnAWPdfWrFmMOAO4GjgIWUFt0fdvf3y9sPBTqAP3L3/72T15gCTAFoaWkZPXfu3MyvZf3GrZlj+mzdRHffAZnjDhrYN3NMxLI172eOaRkI72yM7e+4YQf0uq29vR2AWbNmVd3H+1veyX6hTf1gwJZQTQf0awmNq2bFuuXZg4ChTQezdvu7meNGDBnZ67ZoTgAEGnhXUxPN27dn72vo0OwxAVtfXpY5ZtMhLQx4O/v73Pf446puD2fVtbr6dqBr+yCamz7IHAdA8+GxcdX8dEnmkK7+rTRv7ozt79DRvW6K5hTpUVDffSpFj4K0fSpFj4JYn4r2KEjUp1L2KEjSpyI9CtL0qV3q5yn7VIoeBWn7VJUeBTCqnNXSSFZZNXV10dzcvNv7SSllTePGjVvi7mOyxoXO0Xb3n5vZ48BAoJ3S0eevmNlfuftf9/K0TqBylrUCb/XY71vAZwDMrBkYX7HI3h94Crh+Z4vs8vPnAHMAxowZ421tbZlfy8MvZk/E/m+vYPMh2Qfv205ozRwTMXn6U5ljrj52GzNfiZ1S/+bEtl63DR48GICsrOa9eUfm63S/dgR9jlkVqqntyN1/P+u0B6ZmDwIubb6ce7tmZ45bPL73Rh/NCWDbPXMyx/zowIP4+HvrM8ftc8EFmWMiOidOyhyz4tprGPHN2zPHta6p/oMnmlV3R3vmay3sGs0pzdk/VAD6tGV/jZluGpc5pGP4DNpWTovt7+LeD1xEc4r0KKjvPpWiR0HaPpWiR0GsT0V7FKTpUyl7FKTpU5EeBWn61K7MqZR9KkmPgrR9qkqPAmAXssqsqaMjyX5SKqKmzFNHzOw8M/s74B+BvpSOSp8N/C5Q7bu6CDjazI4ys36UTkF5ose+h5rZjhqmA/eVH+8H/B2lN0r+zS5+TSIiIiIihYscIr0Q+Et3X1j5oLtvMLP/3tuT3H2bmV0FPA00Afe5+6tmdjOw2N2fANqA28zMKZ06cmX56Z8FTgGGmNnk8mOT3X1p/EsTERERESlOZKH9Z8BPd9wxs4FAi7u/6e4Lqj3R3ecD83s8dmPF7ceAx3byvIeAhwK1iYiIiIjUpMhC+2+Akyvuby8/9tFcKpJcXX755UWXUBeUU5yykpQ0n+KUVYxykiJFFtr7uPsv37Lt7lvK51BLHZowQR+yGaGc4pSVpKT5FKesYpSTFClyHe13zexTO+6Y2fmALkhZp1avXs3q1dmXL2p0yilOWUlKmk9xyipGOUmRIke0/wfwfTO7EzBKH0Lz+VyrktxMmlS63FBHR0exhdQ45RSnrCQlzac4ZRWjnKRImQttd/834PfK17m2ah9SIyIiIiIiJaFPQDGzPwA+DAwwMwDc/eYc6xIRERERqWuRD6z5n8AEYCqlU0cuBD6Uc10iIiIiInUt8mbIk93988B77v7nwEn8+keri4iIiIhID5FTRzaV/95gZocB64Cj8itJ8nT11VcXXUJdUE5xykpS0nyKU1YxykmKFFlozzOzwcAdwE8AB+7JtSrJzXnnnVd0CXVBOcUpK0lJ8ylOWcUoJylS1YW2mfUBFrj7z4DHzexJYIC7v79HqpPkVq5cCcDw4cMLrqS2Kac4ZSUpaT7FKasY5SRFqrrQdvduM5tJ6bxs3H0zsHlPFCb5uOyyywBdTzSLcopTVpKS5lOcsopRTlKkyJshnzGz8bbjun4iIiIiIpIpco72l4FBwDYz20TpEn/u7vvnWpmIiIiISMB1f/gQv1i/seqYEyceyJU3Zr/NcL+DBvKNv78kSV2RT4bcL8kriYiIiIjkIGuRXdS+MhfaZnbKzh5394XJqhARERER2ctETh35SsXtAcBYYAlwWi4VSa6uv/76okuoC8opTllJSppPccoqRjlJkSKnjvzaBSjN7HDg9twqklydccYZRZdQF5RTnLKSlDSf4pRVjHKSIkWuOtJTJ3Bs6kJkz1i6dClLly4tuoyap5zilJWkpPkUp6xilJMUKXKO9l9T+jRIKC3MRwEv5VmU5Ke9vR3Q9USzKKc4ZSUpaT7FKasY5SRFipyjvbji9jbgYXf/cU71iIiIiIjsFSIL7ceATe6+HcDMmsxsX3ffkG9pIiIiIiL1K3KO9gJgYMX9gcCz+ZQjIiIiIrJ3iCy0B7h714475dv75leSiIiIiEj9i5w68oGZfcTdfwJgZqOBdB+ZI3vU17/+9aJLqAvKKU5ZSUqaT3HKKkY5SZEiC+124G/M7K3y/UOBCfmVJHk6+eSTiy6hLiinOGUlKWk+xSmrGOUkRYp8YM0iM/tvwHDAgP/r7ltzr0xy8fzzzwNqPFmUU5yykpQ0n+KUVYxykiJFrqN9JfB9d3+lfP9AM7vY3e8OPPcs4FtAE/Add/9Gj+0fAu4DDgbWA5e4e2d52x8BOz439VZ3fyD+ZUlvvvrVrwK6nmgW5RSnrCQlzac4ZRWjnKRIkTdDftHdf7bjjru/B3wx60lm1gTcBZwNjAQuNrORPYbNAL7n7scDNwO3lZ97EPBnwInAWODPzOzAQK0iIiIiIjUhstDuY2a24055Ad0v8LyxwOvu/oa7bwHmAuf3GDOS0uUDAZ6r2H4m8EN3X19e2P8QOCvwmiIiIiIiNSGy0H4aeNTMTjez04CHgX8IPG8YsLrifmf5sUovAePLtz8N7GdmQ4LPFRERERGpWebu1QeY9QEuA06n9GbIZyidb70943kXAme6+xfK9ycBY919asWYw4A7gaOAhZQW3R8GpgD93f3W8rgbgA3uPrPHa0wpj6WlpWX03LlzM7/g9Ruz38fZZ+smuvsOyBx30MC+mWMilq15P3NMy0B4J3hRxeOGHdDrtvb2dgBmzZpVdR/vb3kn+4U29YMBW0I1HdCvJTSumhXrlofGDW06mLXb380cN2JIzzOZfiWaEwBr12YO6Wpqonl71X8yJUOHZo8J2Prysswxmw5pYcDb2d/nvscfV3V7OKuu1dW3A13bB9Hc9EHmOACaD4+Nq+anSzKHdPVvpXlzZ2x/h47udVM0p0iPgvruUyl6FKTtUyl6FMT6VLRHQaI+lbJHQZI+FelRkKZP7VI/T9mnUvQoSNunqvQogFHlrJZGssqqqauL5ubm3d5P1KqV2fN80JAmPlgXm+dHDK8+z8eNG7fE3cdk7Sdzof1fZWYnATe5+5nl+9MB3P22XsY3U7qiSauZXQy0uftl5W3fBjrc/eHeXm/MmDG+ePHizLoefjF7IvZ/ewWbDxmROe7iE1ozx0QcOf2pzDFXH7uNma9ErsYIb972B71uW7p0KQCjRo2quo95b96R+Trdrx1Bn2NWhWo678ivhMZVM+aB6gu+HS5tvpx7u2Znjlv8R703+mhOANvumZM55kcHHsTH31ufOW6fL07JHBPROSy7wa+49hpGfPP2zHGta6r/4Ilm1d3RnvlaC7tGc0pz9g8VgD5tu/+DgJssc0jH8Bm0rZwW3F/v/bStra20v4w3ZEV6FNR3n0rRoyBtn0rRoyDWp6I9CtL0q
ZQ9CtL0qUiPgjR9alfmVMo+laRHQdo+VaVHAVDuUyR442hHR8cv+96ecOUp92SOOXHigfzL998L7e+uhdXfjmhmoYV25KojR1N6k+JI4JeHT9z9tzOeugg42syOAtYAFwGf67HvocB6d+8GplO6AgmUTlf5esUbID9Z3i67KdJoRDntCmUlKWk+xSmrGOUkRYqco/1dYDawDRgHfA94MOtJ7r4NuIrSonkF8Ki7v2pmN5vZp8rD2oCVZvYa0AJ8rfzc9cAtlBbri4Cby4/Jbnr22Wd59tlniy6j5imnOGUlKWk+xSmrGOUkRYqcizDQ3ReYmbn7fwA3mdk/U7r8XlXuPh+Y3+OxGytuPwY81stz7+NXR7glkVtvvRWAM844o+BKaptyilNWkpLmU5yyilFOUqTIQntT+Q2R/2pmV1E6DeS38i1LRERERKS+RU4daQf2Bf4EGA1cAvxRnkWJiIiIiNS7zCPa7r6ofLML+ON8yxERERER2TtEjmiLiIiIiMguil2YWfYa3/72t4suoS4opzhlJSlpPsUpqxjlJEXSQrvBDB8+vOgS6oJyilNWkpLmU5yyilFOUqTIB9YcDHwROLJyvLv/9/zKkrzMmzcPgPPOO6/gSmqbcopTVpKS5lOcsopRTlKkyBHt/wX8M/AsEPuAeKlZM2fOBNRwsiinOGUlKWk+xSmrGOUkRYostPd192tzr0REREREZC8SuerIk2Z2Tu6ViIiIiIjsRSIL7T+ltNjeZGa/KP/5ed6FiYiIiIjUs8gH1uy3JwoREREREdmbhC7vZ2afAk4p3+1w9yfzK0ny9OCDDxZdQl1QTnHKSlLSfIpTVjHKSYoUubzfN4CPAt8vP/SnZvZxd78u18okF4cffnjRJdQF5RSnrCQlzac4ZRWjnKRIkSPa5wCj3L0bwMweAF4EtNCuQ4888ggAEyZMKLiS2qac4pSVpKT5FKesYpSTFCn6yZCDgfXl2wfkVIvsAbNnzwbUcLIopzhlJSlpPsUpqxjlJEWKLLRvA140s+cAo3Su9vRcqxIRERERqXORq448bGYdlM7TNuBad38778JEREREROpZr9fRNrP/Vv77I8ChQCewGjis/JiIiIiIiPSi2hHtLwNTgJk72ebAablUJCIiIiKyF+h1oe3uU8o3z3b3TZXbzGxArlVJbh577LFOzeoiAAAgAElEQVSiS6gLyilOWUlKmk9xyipGOUmRIm+GfB7oearIzh6TOjB06NCiS6gLyilOWUlKmk9xyipGOUmRel1om9khwDBgoJmdQOmNkAD7A/vugdokB/fffz8AkydPLrSOWqec4pSVpKT5FKesYpSTFKnaEe0zgclAK/AXFY//AvhqjjVJjtRwYpRTnLKSlDSf4pRVjHKSIlU7R/sB4AEzG+/uj+/BmkRERERE6l7kOtqPm9kfAB8GBlQ8fnOehYmIiIiI1LNer6O9g5n9T2ACMJXSedoXAh/KuS4RERERkbqWudAGTnb3zwPvufufAycBh0d2bmZnmdlKM3vdzK7byfYjzOw5M3vRzF42s3PKj/c1swfMbJmZrTAzfeS7iIiIiNSVyOX9Npb/3mBmhwHrgKOynmRmTcBdwCcofarkIjN7wt2XVwy7HnjU3Web2UhgPnAkpaPm/d39ODPbF1huZg+7+5vBr0t6MX/+/KJLqAvKKU5ZSUqaT3HKKkY5SZEiC+0nzWwwcAfwE0qfCvmdwPPGAq+7+xsAZjYXOB+oXGg7pcsFAhwAvFXx+CAz2wcYCGwBfh54Tcmw7766MmOEcopTVpKS5lOcsopRTlKkyJshbynffNzMngQGuPv7gX0PA1ZX3O8ETuwx5ibgGTObCgwCzig//hilRflPKV2z+0vuvj7wmpLh7rvvBuCKK64ouJLappzilJWkpPkUp6xilJMUydy9+gCzK4Hvu/vPyvcPBC5297sznnchcKa7f6F8fxIw1t2nVoz5crmGmWZ2EnAvcCyl88CvoHQd7wOBf6b0UfBv9HiNKcAUgJaWltFz587N/ILXb9yaOabP1k10983+lPmDBvbNHBOxbE32/1taBsI7GzOHAXDcsAN63dbe3g7ArFmzqu7j/S3vZL/Qpn4wYEuopgP6tYTGVbNi3fLsQcDQpoNZu/3dzHEjhozsdVs0JwDWrs0c0tXURPP27dn7SvQJZltfXpY5ZtMhLQx4O/v73Pf446puD2fVtbr6dqBr+yCamz7IHAdAc+itItX9dEnmkK7+rTRv7ozt79DRvW6K5hTpUVDffSpFj4K0fSpFj4JYn4r2KEjUp1L2KEjSpyI9CtL0qV3q5yn7VIoeBWn7VJUeBTCqnNXSSFZZNXV10dzcvNv7iVq1MnueDxrSxAfrYvP8iOHV5/m4ceOWuPuYrP1ETh35orvfteOOu79nZl8Eqi60KR3Brpxlrfzq1JAdLgXOKu/3BTMbAAwFPgf8g7tvBf7TzH4MjAF+baHt7nOAOQBjxozxtra2zC/m4RezJ2L/t1ew+ZARmePaTmjNHBMxefpTmWOuPnYbM1+JfLvgzYltvW4bPHgwAFlZzXvzjszX6X7tCPocsypUU9uRE0Ljqpn2wNTsQcClzZdzb9fszHGLx/fe6KM5AWy7Z07mmB8deBAffy/7lzL7XHBB5piIzomTMsesuPYaRnzz9sxxrWuq/+CJZtXd0Z75Wgu7RnNKc/YPFYA+bdlfY6abxmUO6Rg+g7aV02L7u7j3AxfRnCI9Cuq7T6XoUZC2T6XoURDrU9EeBWn6VMoeBWn6VKRHQZo+tStzKmWfStKjIG2fqtKjANiFrDJr6uhIsp+oK2+8J3PMiRMP5F++/15of59fmObnceSqI33MbMfHr+94k2O/wPMWAUeb2VFm1g+4CHiix5hVwOnl/Y6gdJ3ud8uPn2Ylg4DfA/5v4DVFRERERGpCZKH9NPComZ1uZqcBDwP/kPUkd98GXFV+/gpKVxd51cxuNrNPlYddDXzRzF4q73eyl85luQtoBl6htGD/rru/vItfm4iIiIhIYSLnIlwLXAZcTukDa54hdtUR3H0+pUv2VT52Y8Xt5cDHdvK8LkqX+BMRERERqUuRq450A7PLf6TOdXR0FF1CXVBOccpKUtJ8ilNWMcpJitTrQtvMHnX3z5rZMkrXtf417n58rpWJiIiIiNSxake0d7z19tw9UYjsGTNmzABg2rTgVRQalHKKU1aSkuZTnLKKUU5SpGpvhnyy/Pet7v4fPf/sieIkvSeffJInn3wye2CDU05xykpS0nyKU1YxykmKVO2Idj8z+yPgZDP7TM+N7v63+ZUlIiIiIlLfqi20/wcwERgMnNdjmwNaaIuIiIiI9KLXhba7/wj4kZktdvd792BNIiIiIiJ1r9pVR05z938E3tOpI3uPgQMHFl1CXVBOccpKUtJ8ilNWMcpJilTt1JFTgX/kN08bAZ06Urd+8IMfFF1CXVBOccpKUtJ8ilNWMcpJilTt1JE/K//9x3uuHBERERGRvUO1y/sBYGZ/amb7W8l3zOwnZvbJPVGcpHfLLbdwyy23FF1G
zVNOccpKUtJ8ilNWMcpJipS50Ab+u7v/HPgk8FvAHwPfyLUqyc2CBQtYsGBB0WXUPOUUp6wkJc2nOGUVo5ykSJGFtpX/Pgf4rru/VPGYiIiIiIjsRGShvcTMnqG00H7azPYDuvMtS0RERESkvlW76sgOlwKjgDfcfYOZHUTp9BEREREREelFZKF9ErDU3T8ws0uAjwDfyrcsycuQIUOKLqEuKKc4ZSUpaT7FKasY5SRFiiy0ZwO/a2a/C1wD3At8j9J1tqXOPP7440WXUBeUU5yykpQ0n+KUVYxykiJFztHe5u4OnA98y92/BeyXb1kiIiIiIvUtckT7F2Y2HbgEOMXMmoC++ZYleZk+fToAt912W8GV1DblFKesJCXNpzhlFaOcpEiRhfYE4HPApe7+tpkdAdyRb1mSlxdeeKHoEuqCcopTVpKS5lOcsopRTlKkzIW2u78N/EXF/VWUztEWEREREZFeRD6C/ffMbJGZdZnZFjPbbmbv74niRERERETqVeTNkHcCFwP/CgwEvgDclWdRIiIiIiL1LnKONu7+upk1uft24Ltm9nzOdUlOWltbiy6hLiinOGUlKWk+xSmrGOUkRYostDeYWT9gqZndDvwUGJRvWZKXhx56qOgS6oJyilNWkpLmU5yyilFOUqTIqSOTgCbgKuAD4HBgfJ5FiYiIiIjUu8hVR/6jfHMj8Of5liN5a29vB2DWrFkFV1LblFOcspKUNJ/ilFWMcpIi9brQNrNlgPe23d2Pz6UiydXSpUuLLqEuKKc4ZSUpaT7FKasY5SRFqnZE+9zd3bmZnQV8i9KpJ99x92/02H4E8AAwuDzmOnefX952PPBtYH+gG/iou2/a3ZpERERERPaEagvtvkCLu/+48kEz+33grawdlz+q/S7gE0AnsMjMnnD35RXDrgcedffZZjYSmA8caWb7AA8Bk9z9JTMbAmzdlS9MRERERKRI1d4MOQv4xU4e31jelmUs8Lq7v+HuW4C5wPk9xjilI9YAB/CrBfwngZfd/SUAd19XvrSgiIiIiEhdqHZE+0h3f7nng+6+2MyODOx7GLC64n4ncGKPMTcBz5jZVEqXDDyj/PgxgJvZ08DBwFx3vz3wmpLhmGOOKbqEuqCc4pSVpKT5FKesYpSTFMncd/5+RzN73d1/Z1e3VYy5EDjT3b9Qvj8JGOvuUyvGfLlcw0wzOwm4FzgW+DJwJfBRYAOwALje3Rf0eI0pwBSAlpaW0XPnzs38gtdvzD4Dpc/WTXT3HZA57qCBfTPHRCxbk/2J9i0D4Z2Nsf0dN+yA3awI3t/yTvagTf1gwJbQ/g7o17KbFcGKdcuzBwFDmw5m7fZ3M8eNGDJyd0sqWbs2c0hXUxPN2wO/lBk6NEFBsPXlZZljNh3SwoC3s7/PfY8/LkVJ0LU6e8j2QTQ3fRDbX/Phu1kQ8NMlmUO6+rfSvLkztr9DR/e6KXrlg0iPgvruUyl6FKTtUyl6FMT6VLRHQaI+lbJHQZI+FelRUOd9KkWPgrR9qkqPAhhV7lNLE1yhpauri+bm5t3eT9SqldnzfNCQJj5YF5vnRwyvPs/HjRu3xN3HZO2n2hHtRWb2RXe/p/JBM7sUyP6ul45gV86yVn7z3O5LgbMA3P0FMxsADC0/95/cfW35NecDH6G04P4ld58DzAEYM2aMt7W1ZRb18IvZE7H/2yvYfMiIzHFtJ6T5tKnJ05/KHHP1sduY+Urogzx5c2LbblYE8968I3NM92tH0OeYVaH9tR05YXdLYtoDU7MHAZc2X869XbMzxy0eH2v0WbbdMydzzI8OPIiPv7c+c9w+F1yQoiQ6J07KHLPi2msY8c3sXxS1rsn+wRPR3dGeOWZh12hOaY60F+jTlv01ZrppXOaQjuEzaFs5Lba/i3u9UBODBw8GIKtPRXoU1HefStGjIG2fStGjINanoj0K0vSplD0K0vSpSI+C+u5TSXoUpO1TVXoUAME+FdHR0ZFkP1FX3nhP5pgTJx7Iv3z/vdD+Pr8wzc/jah2xHfg7M5vIrxbWY4B+wKcD+14EHG1mRwFrgIuAz/UYswo4HbjfzEYAA4B3gaeBa8xsX2ALcCrwl6GvSKqaMmUKAHPmZDfeRqac4pSVpKT5FKesYpSTFKnXhba7vwOcbGbjKJ3OAfCUu/9jZMfuvs3MrqK0aG4C7nP3V83sZmCxuz8BXA3cY2ZfovTGyMleOpflPTP7C0qLdQfmu3v24RTJ9NprrxVdQl1QTnHKSlLSfIpTVjHKSYoU+WTI54Dn/is7L18Te36Px26suL0c+Fgvz32I0iX+RERERETqTrXL+4mIiIiIyH+RFtoiIiIiIjmIXcZC9hqjRo0quoS6oJzilJWkpPkUp6xilJMUSQvtBpN1DV8pUU5xykpS0nyKU1YxykmKpFNHRERERERyoIV2g7nkkku45JJLii6j5imnOGUlKWk+xSmrGOUkRdKpIw2mszP4UdINTjnFKStJSfMpTlnFKCcpko5oi4iIiIjkQAttEREREZEcaKEtIiIiIpIDnaPdYE466aSiS6gLyilOWUlKmk9xyipGOUmRtNBuMLfddlvRJdQF5RSnrCQlzac4ZRWjnKRIOnVERERERCQHWmg3mPHjxzN+/Piiy6h5yilOWUlKmk9xyipGOUmRdOpIg1m3bl3RJdQF5RSnrCQlzac4ZRWjnKRIOqItIiIiIpIDLbRFRERERHKghbaIiIiISA50jnaDOf3004suoS4opzhlJSlpPsUpqxjlJEXSQrvB3HDDDUWXUBeUU5yykpQ0n+KUVYxykiLp1BERERERkRxood1gzj77bM4+++yiy6h5yilOWUlKmk9xyipGOUmRdOpIg9m4cWPRJdQF5RSnrCQlzac4ZRWjnKRIOqItIiIiIpIDLbRFRERERHKghbaIiIiISA50jnaDOffcc4suoS4opzhlJSlpPsUpqxjlJEXKdaFtZmcB3wKagO+4+zd6bD8CeAAYXB5znbvP77F9OXCTu8/Is9ZGMW3atKJLqAvKKU5ZSUqaT3HKKkY5SZFyO3XEzJqAu4CzgZHAxWY2ssew64FH3f0E4CLg7h7b/xL4QV41ioiIiIjkJc9ztMcCr7v7G+6+BZgLnN9jjAP7l28fALy1Y4OZ/SHwBvBqjjU2nLa2Ntra2oouo+YppzhlJSlpPsUpqxjlJEXKc6E9DFhdcb+z/Film4BLzKwTmA9MBTCzQcC1wJ/nWJ+IiIiISG7M3fPZsdmFwJnu/oXy/UnAWHefWjHmy+UaZprZScC9wLHA7cD/cfdHzewmoGtn52ib2RRgCkBLS8vouXPnZta1fuPWzDF9tm6iu++AzHEHDeybOSZi2Zr3M8e0DIR3gtfcP27YAb1ua29vB2DWrFlV9/H+lneyX2hTPxiwJVTTAf1aQuOqWbFueWjc0KaDWbv93cxxI4b0PJPpV6I5AbB2beaQrqYmmrdvz97X0KHZYwK2vrwsc8ymQ1oY8Hb297nv8cdV3R7Oqmt19e1A1/ZBNDd9kDkOgObDY+Oq+emSzCFd/Vtp3twZ29+
ho3vdFM0p0qOgvvtUih4FaftUih4FsT4V7VGQqE+l7FGQpE9FehSk6VO71M9T9qkUPQrS9qkqPQpgVDmrpZGssmrq6qK5uXm39xO1amX2PB80pIkP1sXm+RHDq8/zcePGLXH3MVn7yfPNkJ1A5SxrpeLUkLJLgbMA3P0FMxsADAVOBC4ws9spvVGy28w2ufudlU929znAHIAxY8Z45FdDD7+YPRH7v72CzYeMyBzXdkJr5piIydOfyhxz9bHbmPlK7Nv15sS2XrcNHjwYIPPXaPPevCPzdbpfO4I+x6wK1dR25ITQuGqmPTA1exBwafPl3Ns1O3Pc4vG9N/poTgDb7pmTOeZHBx7Ex99bnzlunwsuyBwT0TlxUuaYFddew4hv3p45rnVN9R880ay6O9ozX2th12hOac7+oQLQpy37a8x007jMIR3DZ9C2Mvhmqot7P3ARzSnSo6C++1SKHgVp+1SKHgWxPhXtUZCmT6XsUZCmT0V6FKTpU7syp1L2qSQ9CtL2qSo9CoBdyCqzpo6OPXrKzpU33pM55sSJB/Iv338vtL/PL0zz8zjPhfYi4GgzOwpYQ+nNjp/rMWYVcDpwv5mNAAYA77r77+8YUHFE+05EREREROpEbgttd99mZlcBT1O6dN997v6qmd0MLHb3J4CrgXvM7EuU3hg52fM6l0UA+OxnP1t0CXVBOcUpK0lJ8ylOWcUoJylSrtfRLl8Te36Px26suL0c+FjGPm7KpbgGdcUVVxRdQl1QTnHKSlLSfIpTVjHKSYqkj2BvMBs2bGDDhg1Fl1HzlFOcspKUNJ/ilFWMcpIi6SPYG8w555wDlN6kIL1TTnHKSlLSfIpTVjHKSYqkI9oiIiIiIjnQQltEREREJAdaaIuIiIiI5EALbRERERGRHOjNkA1m8uTJRZdQF5RTnLKSlDSf4pRVjHKSImmh3WDUcGKUU5yykpQ0n+KUVYxykiLp1JEGs3btWtauXVt0GTVPOcUpK0lJ8ylOWcUoJymSjmg3mAsuuADQ9USzKKc4ZSUpaT7FKasY5SRF0hFtEREREZEcaKEtIiIiIpIDLbRFRERERHKghbaIiIiISA70ZsgGc/nllxddQl1QTnHKSlLSfIpTVjHKSYqkhXaDmTBhQtEl1AXlFKesJCXNpzhlFaOcpEg6daTBrF69mtWrVxddRs1TTnHKSlLSfIpTVjHKSYqkI9oNZtKkSYCuJ5pFOcUpK0lJ8ylOWcUoJymSjmiLiIiIiORAC20RERERkRxooS0iIiIikgMttEVEREREcqA3QzaYq6++uugS6oJyilNWkpLmU5yyilFOUiQttBvMeeedV3QJdUE5xSkrSUnzKU5ZxSgnKZJOHWkwK1euZOXKlUWXUfOUU5yykpQ0n+KUVYxykiLpiHaDueyyywBdTzSLcopTVpKS5lOcsopRTlIkHdEWEREREclBrgttMzvLzFaa2etmdt1Oth9hZs+Z2Ytm9rKZnVN+/BNmtsTMlpX/Pi3POkVEREREUsvt1BEzawLuAj4BdAKLzOwJd19eMex64FF3n21mI4H5wJHAWuA8d3/LzI4FngaG5VWriIiIiEhqeR7RHgu87u5vuPsWYC5wfo8xDuxfvn0A8BaAu7/o7m+VH38VGGBm/XOsVUREREQkqTzfDDkMWF1xvxM4sceYm4BnzGwqMAg4Yyf7GQ+86O6b8yiy0Vx//fVFl1AXlFOcspKUNJ/ilFWMcpIimbvns2OzC4Ez3f0L5fuTgLHuPrVizJfLNcw0s5OAe4Fj3b27vP3DwBPAJ93933byGlOAKQAtLS2j586dm1nX+o1bM8f02bqJ7r4DMscdNLBv5piIZWvezxzTMhDe2Rjb33HDDtjNiuD9Le9kD9rUDwZsCe3vgH4tu1kRrFi3PHsQMLTpYNZufzdz3IghI3e3pJK1azOHdDU10bx9e/a+hg5NUBBsfXlZ5phNh7Qw4O3s73Pf449LURJ0rc4esn0QzU0fxPbXfPhuFgT8dEnmkK7+rTRv7ozt79DRvW5qb28HYNasWVV3EelRUN99KkWPgrR9KkWPglifivYoSNSnUvYoSNKnIj0K6rxPpehRkLZPVelRAKPKfWppRp/a+PPNZK0ft7GFfeiXWZKZMXD/3T9pYdXK7Hk+aEgTH6yLzfMjhlef5+PGjVvi7mOy9pPnEe1OoHKWtVI+NaTCpcBZAO7+gpkNAIYC/2lmrcDfAZ/f2SK7/Jw5wByAMWPGeFtbW2ZRD7+YPRH7v72CzYeMyBzXdkJr5piIydOfyhxz9bHbmPlK7Nv15sS2XrctXboUgFGjRlXdx7w378h8ne7XjqDPMatCNbUdOSE0rpppD0zNHgRc2nw593bNzhy3eHzvjT6aE8C2e+ZkjvnRgQfx8ffWZ47b54ILMsdEdE6clDlmxbXXMOKbt2eOa11T/QdPNKvujvbM11rYNZpTmrN/qAD0acv+GjPdNC5zSMfwGbStnBbb38W9/+AZPHgwAFl9KtKjoL77VIoeBWn7VIoeBbE+Fe1RkKZPpexRkKZPRXoUpOlTuzKnUvapJD0K0vapKj0KgGCfWjRvBVj1Xb3b/R8c3OdD2TUBH23L7mVZrrzxnswxJ048kH/5/nuh/X1+YZqfx3kutBcBR5vZUcAa4CLgcz3GrAJOB+43sxHAAOBdMxsMPAVMd/cf51hjw9lxVE3XE61OOcUpK0lJ8ylOWcUoJylSbm+GdPdtwFWUrhiygtLVRV41s5vN7FPlYVcDXzSzl4CHgcle+l3EVcDvADeY2dLyn9/Kq1YRERERkdRy/WRId59P6ZJ9lY/dWHF7OfCxnTzvVuDWPGsTEREREcmTPhlSRERERCQHWmiLiIiIiOQg11NHpPZ8/etfL7qEuqCc4pSVpKT5FKesYpSTFEkL7QZz8sknF11CXVBOccpKUtJ8ilNWMcpJiqRTRxrM888/z/PPP190GTVPOcUpK0lJ8ylOWcUoJymSjmg3mK9+9auArieaRTnFKStJSfMpTlnFKCcpko5oi4iIiIjkQAttEREREZEcaKEtIiIiIpIDLbRFRERERHKgN0M2mFmzZhVdQl1QTnHKSlLSfIpTVjHKSYqkhXaDGTVqVNEl1AXlFKesJCXNpzhlFaOcpEg6daTBPPvsszz77LNFl1HzlFOcspKUNJ/ilFWMcpIi6Yh2g7n11lsBOOOMMwqupLYppzhlJSlpPsUpqxjlJEXSEW0RERERkRxooS0iIiIikgMttEVEREREcqCFtoiIiIhIDvRmyAbz7W9/u+gS6oJyilNWkpLmU5yyilFOUiQttBvM8OHDiy6hLiinOGUlKWk+xSmrGOUkRdKpIw1m3rx5zJs3r+gyap5yilNWkpLmU5yyilFOUiQd0W4wM2fOBOC8884ruJLappzilJWkpPkUp6xilJMUSUe0RURERERyoIW2iIiIiEgOtNAWEREREcmBFtoiIiIiIjnQmyEbzIMPPlh0CXVBOcUpK0lJ8ylOWcUoJylSrgttMzsL+BbQBHzH3b/RY/sRwAPA4PKY69x9fnnbdO
BSYDvwJ+7+dJ61NorDDz+86BLqgnKKU1aSkuZTnLKKUU5SpNwW2mbWBNwFfALoBBaZ2RPuvrxi2PXAo+4+28xGAvOBI8u3LwI+DBwGPGtmx7j79rzqbRSPPPIIABMmTCi4ktqmnOKUlaSk+RSnrGKUkxQpzyPaY4HX3f0NADObC5wPVC60Hdi/fPsA4K3y7fOBue6+Gfh3M3u9vL8Xcqy3IcyePRtQw8minOKUlaSk+RSnrGKUkxQpzzdDDgNWV9zvLD9W6SbgEjPrpHQ0e+ouPFdEREREpGaZu+ezY7MLgTPd/Qvl+5OAse4+tWLMl8s1zDSzk4B7gWOBvwZecPeHyuPuBea7++M9XmMKMKV8dziwMlH5Q4G1ifaVimqKq8W6VFOMaoqrxbpUU4xqiqvFulRTzN5e04fc/eCsQXmeOtIJVL4DoZVfnRqyw6XAWQDu/oKZDaAUQuS5uPscYE7CmgEws8XuPib1fneHaoqrxbpUU4xqiqvFulRTjGqKq8W6VFOMairJ89SRRcDRZnaUmfWj9ObGJ3qMWQWcDmBmI4ABwLvlcReZWX8zOwo4Gvg/OdYqIiIiIpJUbke03X2bmV0FPE3p0n33ufurZnYzsNjdnwCuBu4xsy9RemPkZC+dy/KqmT1K6Y2T24ArdcUREREREaknuV5Hu3xN7Pk9Hrux4vZy4GO9PPdrwNfyrK+K5KejJKCa4mqxLtUUo5riarEu1RSjmuJqsS7VFKOayPHNkCIiIiIijSzPc7RFRERERBqWFtoiIiIiIjnI9RztemNmB1L6yPeNwJvu3l1wSTVZk4gUx8wGAZtq6Q3itViTiEgtaPhztM3sAOBK4GKgH6XLCw4AWoD/Ddzt7s81ek3luk4CLgF+HziU0uL/FeAp4CF3f181Qfl68OeWa9rxn6RXgKfc/dU9XU8t16WaQvX0oXR51InAR4HNQH9KfWE+MMfd/7XRa6qorbVc2298/4AfFHGwohZrKtc1Zic1Pevu64uoRzXVf11m9k13vzbrsUaihbbZD4HvAfPc/Wc9to0GJgHL3P3eGqlpDKWF5Z6u6QeUPjTofwGLgf+ktPg/BhgHnAf8RfmyjY1c003l1+0AluykpgHA1e7+8p6qqVbrUk3hmv4JeJbSPH9lx6LMzA4q1/Q54O92fJJuo9ZUfv3vAsOAJ9l5TxgNXOfuCxu8psnAnwD/zm/O849RWrDd4O6rVFNt1VTLdZVr+4m7f6THYy+7+/F7upaK1/8M8E3gtwAr/3F333+PvH6jL7QlxsyGunvVjy2NjGmAmv7A3Z+qsv23gCPcffGeqqn8ujVXl2oK19TX3bfu7pi9vabyax7r7q9U2d6P0vfv9Qav6UpKn22xsZfto4Ah7r5ANdVWTbVal5ldDlwB/DbwbxWb9gN+7O6X7KlaejKz14Hz3H1FIa/f6AttM/tIte3u/pM9VUtPZjVEA4QAACAASURBVLbA3U/PeqzR6VdVcWZ2obv/TdZjja6WciofJe5Vwb8mftDdJ2U9JmBmf+ru38p6rNGZ2cfc/cdZj0ltZVU+5fVA4DbguopNv6iBU2x+7O47/cyWPfL6WmhbtXOd3d1P22PFlJXPEd0XeA5oo/RrDoD9KZ3PN6KAmn5B6dM7f2MTe/BXMDtTS7+qMrN57DwnANz9U3uwnN/QS1a/8dgeqqVms6qxnP6dUk62k83u7r+9h0v6pZ6ZmFkTpdPaRhZUzzKq96kif329szn1orufUEAtf1Vtu7v/yZ6qpaca+7dXszlBbWXVo4aPA0e7+3fNbCiwn7v/ewF1fKZ881TgEODvKb2fBAB3/9s9UUfDX3XE3ccVXcNOXAa0U3qDwxJ+9QP258BdRRTk7vsV8brVVPyq6v8zs8pzZvcDijr6MaOg163KzM4GzgGG9fjhsT+wrZiqai+rWszJ3Y8q4nWrMbPpwFeBgWb28x0PA1so9tPgzi3wtXfKzC6mdM76b5tZ5ftF9gPWFVMVSwp63V6V39h+MnCwmX25YtP+QFMxVdVeTlCzWQFgZn8GjAGGA9+ldEGHh+jlU8Bzdl7F7Q3AJyvuO6CF9p5mZscCIym9qQAAd//enq7D3b9lZncCX3X3W/5fe+cdLllVpe/3a2wyyIiIKHGQ8GMkZ2GUGYExERQEBcwoZtQBlTGAogIt6jggCMoAgkQRkFZy6AaRTDfd5AFEUdBBhyix+X5/7H36nltddW9doGvt693v89znVp2qOuerVad2rbP3CoM+fj/kWNW2nQaedAGcBJxLQUtVtqdFHLcP/khKxNqe4T8ejwKfjRBUqK2Ks1MbpXKfqzH8uzewJLrWMQ+SdAjwY9sfHPTxe2H73mgNXbgSuB94OfCd1vZHgYEmRTfYPj7iuKOwILA4yS9pT+w8AuwcIahQO0GBtmrxdmB94AYA23+UFDJRZ/sDEcftZMKHjjTkq7CtSI72r4A3A1fYDjtpJf3G9uZRx++GpO1JPxavImU6rwTcavufgvRMAm6y/dqI4/dC0mqkC4DOC7fIZf4FgJ/Y3j1KQzdKs1XBdtoT2BtYHpgBbAb8JiK8raXpetsbRh2/F5I2Aw4D/h/JKVkAeDwqxC2fU+fb3jri+L2QtAzwBeb97oWcU9lOp0b+7najNDtlTaXa6hrbmzQhLEo19n8THLY1BfgGqQTiecC6wGc8oKpItTPkEDsDbwQeyFdB65LqwkZygaSdJHWLzYziQNIP/B15SfuNxIVp4FRWbKakFaM09OBY4EhSuMG/kMo1nhApyKmZyNK5ykFJFGWrgu20N6lm9b055G19Ut3qSK6StHGwhm4cTupDcCewCLAnyfEOIZ9Tf8sJYyXxU+BWYBXga8BvgWujxGQ7jZj8G0RRdoKibXWapKOApSR9mFQG9EfBmra1/QgptOw+UhnEfQd18Bo6MsQTtp+T9KykJUmztWGzj5nPAYsBcyQ9QQGJh8Aztv8iaZKkSbYvzUvIkSwH3CzpGuDxZmNw4uEiti+WpLycfYCky4H9AzUB3Av8OseKtm313ThJRdqqRDs9aftJSUhayPZtktYI1APpwmgvSfeS7BSedNhg+38kLZAdkmMlXRks6UlgllKfhPY5FZlQt7TtY3L1k2nANKUa6ZHcmL93pzPcTgOJp+1BiXaCAm1l+1BJ25DCWNYAvmr7wig9mcn5/1uAk23/dZDzl9XRHuI6SUuRrryuBx4DrokUVGICIvCQpMWB6cBPJf2ZuGS6hq8FH78bT+awljslfRL4A6lYfjR/zH+TGB7bF0mJtirRTvflMeos4EJJ/0fSGMmbg4/fi7/lFYkZedn4ftKkRSS/zH8l0dQ5v1/SW0nn0/KBeiDN0v4FaIdlDCxxrQcl2gnKtBXZsY52rtucI+k2UujIx3Mo0JODOniN0e6CpJWBJT3g7n3dyDHRr893L7M9NVjPYqQTVKT2yy8Ffmo7Knu+0bUsaVkd4Brbfw7WszFpqXEpUrjNS4Eptq+K1NWQk1Ns+7ECtBRrq5Ls1EbSG0h2Os/208Fa1iW1gQa43PbMSD0AklYC/kSKz
/4syVZHeIBNYXroWpC0bA1wuwfc0KeLnrcBlwMrkEJrlgS+5gF20x0PVDuNTqsMaTdse9VB6ukkJ5I/YnuOpEVJPt4DAzl2dbSHkPRqUnLf3Jn+iIz+lp6DSc7jT/OmdwPX2/5i71dNPCTtAnyb1DZbpB/9fW3/LFJXieTKOicwFNv3IPBe2zfHqSqPUu2UE6CWZfgYFVHxp9GzN/BhhmbQ3g4cbTssHrpUJG0FHE+K7xXJaXtf5G9MiUhanuTMbkFy3K4A9rZ9X6iwAinJVpKW7tg0CdgF2Ae4wfZOAZr+1fYlrXrawxhUiE11tDM5znhX4BZgTt7syDjfXBt6vZzw1/zI3hicvfsO4BDS0r4oIG5c0kxgm2YWOy8LXWR73UBNTbJF54VbWJY6QI5T/ZLtS/P9rYBv2X5doKbibFWonT5Filv/E/Bc3hwaD53HqM1tP57vh1cYyDq2AA5g3nMqsurP9cButm/P91cnxYuGVW3JY+WHgZUZbqewko05hv0khhKi9wB2t71NoKbi7ATF2moS8B7SmD6DNG7eEqTla7b3l3Rsl4c9qM+vxmgPsSOwhu2nRn3mYFkKaGpCl5CxPgXYzvat0UJaTOoIFfkL8RV1Tgd+SIr5nzPKcwfJYo3zCGD7suwcRVKirUq0096kMSo0TKsDMfwzm0P3DpaD5hhSyMj1lHNOTW6cbADbd0iaPNILBsDZpJCIiyjHTsvYbjtGx0n6TJiaRIl2goJslc/lD5K+d1cAO9i+K0JLg+398//QetrV0R7iblJmakmO9kGkrOJLST9erwf2i5XEnwpzsgHOk3Q+cHK+vyupFnokz9o+MlhDN+6W9BWGz4AMvDVuByXaqkQ7/R54OFhDJ8cCV0s6M9/fkeTkRvOw7XOjRXRwnaRjGDqndie+8+Citr8QrKGTByXtwdB4/m7iOmg2lGgnKMtW95AKI/wn8Dtg3Zy/AcRUQsmhNSvbviLf/xyp0Q/ASYPK2aihIxlJZ5BqZ19My9kOLr2EpOVIcdoCrh5U8P4Ier4PvJJU+aBtp9AsZ0k7keLUBEy3feYoL5nfeg4glYg8k+F2CulY2ZATQr4GbEm2FXCA7f8L1HQAhdmqUDsdQyqX9UuG2ymy5CCSNmT4d+/GSD0wN79lAVLseNtWNwRqWgj4BMPPqSMiV1ElfQO40nb0xMRclHoiHA5sToo7vpIUdxzW9bNEO0FZtpJ0HCMnQw48zEbSyaRiDVPz/duBo4FFgTU9oKZk1dHOSHpft+0ObsFaYIJmaKzTeCFnYHfiyBjRUqm26g+l7rXzYDu0vGVpCZoAeRWwE0fnSJSGpEdJZQ+fIpWwC8+5KZFqp/GJcnfK1v0bba+fb19u+597v/pF1FEd7XJpJWjezPDkp8hGLMVRYoJmqeQErH2YN6mnOiAtqp36oyNBs4nPDk3QLJUSEzRLpNTEwxIpyVY5LKMnEStvkm6xvVbr/sualdLOx+YnEz5GW9JptneRNIsuyx7BPxjFJGhK+rztKZIOo7udIkNsiknQLKWc0Ag0iYc/Jjipp3BblWSn/7T9GUnn0P27F3nhXVSCpqQ9bJ/Y60c/OMymmARNSWs6dRbdoNvjkSE2FJR4WLidoCBbMdTYaw1SuGtTY3w7UphUBI9KWt32HTAUjihpTVJTwoEw4R1t0g8FwNtCVXSnpATNxom9LlRFd0pK0HwDcAlpcOkkvGMXZSUelmyrkuzUJM8dGqqiO6UlaDaVYUrp5tmmpATNzwEfAb7T5TEzvNPgoCkp8bBkO0FBtmpC2CRdAGxg+9F8/wDSxEUE+wNTJX0TaC6KNgT+gyHfb75TQ0cKpDVr/GoKTNAshdZM6BsoMEGzJCQ1jVc+TWGJhyVR7dQfrRnjf6LABM2SaM2G7kJhCZolUmriYYmUaCulVufrNivxOQl4pu01g/S8Fvg8aayCFIo7xfbsgWmojnYiJzt0GuNh0gzuv9u+e4BauiZmNkQmaPZYvm7sdJTtJweopVtiZkNogmaPpeuHSZ09ZwToadrjdqtxHJp4WJKtCrdTt/C25rv3jUGGb/RKzGwoIEHzv7psfhi4zvbZA9bSLTGzITRBs0fY1sPALA/vTTAILc1vsCgs8bAkO2U9JdvqS6SLyjOzxrcDp9n+VpSmaKqjnZH0NeCPpC5LAt5FmiW9HfiY7a3i1M0tN7aC7ZuCdXwfWIbhNasfABYBlrT9nihtJSHpJGAj4Jy86a3AtcCawOm2p0RpK41qq/6QNIUUh3lS3vQu0lj1MLCl7W4hOAMjd4Rb3PYjkTqylqPJ50/etBNpJmsF4G7b0Q1QikDSL0ml4ZqLga2Aq4DVga/bPqHHSycU1U5jI5f83DLfDSv5KWlL4B9t/yTf/xnQrFp+w/YlAxFiu/6li42ru2y7Kv+fGaTpMmDJfGL8jpRE891gO03vtQ24OUjTlGynyaQwmweBPYLtdD7J6WjuLw6cR7oguSVQ1zuBJfLtL5OWsdevthoXdvp1r22kmbUITSfl795iwG3A/cC+kXbKui4BXtK6/5K8bYHAc2rvbCuRkmxvALYNttM5wLKt+8vmc/1lwOwgTVuQOrNCahT1XWDFaqfxYausZQHgVcCKzV+QjouBtVr3Z5FitF8PnDcoHdFtqkviOUm7SJqU/3ZpPRY17f9Sp9mhdwDH2t4Q2DpIS8MyuUg+MLdg/svz3adjJLFtttPbgPtIswz7BmlpWJHh9ngGWMn2E8Qmt37F9qP5Sv/fgONJ1TUiKdFWJdppcUmbNnckbcJQl7NnYySxVv7u7UjqxroiUMKq1qsZSowk336V7TnEnVMfzLballSK9APAwUFaGla2/afW/T8DqzvlIjwTpOlI4G+5q+DngXsZSgiOokQ7QYG2yiU//wRcCEwl5W9MDZKzpO1bWvfvtH29Uy+SgSVM16ojQ+wOfB84guRYXwXsIWkR4JNBml6SO0PuAnwpSEMn/w5cIeku0szMKsDHJS1GckYimJz/vwU42fZfpW4htgPlJOAqSU086HbAydlOt/R+2XynKQH1VuBI22fnrPBISrRViXbaE/hvSYuTvnuPAHtmOx0UpGmypMkkR/tw289IKiEecQowQ9JlJFu9HvhWttVFQZqaQektpImTmYofqC6XNJXhITbTs50eCtL0rG1L2gH4vu1jRstbGgAl2gnKtFVJJT+Xat+x3Y61X3ZQImqMdsFIeifwFeAK2x+X9I/At23vFKxrIVL8o4DbPMAEyB56Dib90D8BbEL6ck21vemIL5z/upo4NZE+w/DSiPnH4g+klZENSTa7xva6wbqKslWpdgKQ9FLS2B35A99o+TTwBWAm6aJkReBED6jj2kjkSYpNSOfUNbb/GKznWNJM+yqkalILAJfllcooTSI5jVuQv3vAGQ50DCRNI4WOfYB0gfS/wAzbawdqKs5OWVeJtroU2MZ21CpbW8s5wA9t/7Jj+9tIuXdvHYiOie5oF96IpRgKby7SJIs+YntOnmVYwvYDATqWtP1Iq0zcMBxcHk7SosCbSDG9d2ZnZG3bFwRoKdZW
hdmp5CYsw8gOyQJRP7IlNxjJyaLrkZIxH5K0NPBqBye4l4akVwK7AdfavjyHJ27lnNBWGaJEW0k6hkJKfkp6TdZxJcPraL8OeJtzI5v5rqM62trO9jm9llscWEqvJCR9zfb+PUrq2bU9LpBmQm2/rVUmbu5DBJeHK41qq/6QtJfto3qV1HNwKb2SkHS07Y/0KKlnB5bSKwlJV9jeUvOWtQ0vD1cS1U5jp6RxStLhpAppazC8jvZJg1yJn/CONoCkBYCDbUcn0BVNnpHZ2fZp0VpKJs/qrWD7d9FaSqfaqj/yGPVp29+L1lI6eZza3Pavo7VUKpU4JO1NKoO6HHAqKYdr8H0sqqOdkHRJne0YHUnTbb8+WkfpSLo+MvZyPFFt1R+SLrX9L9E6xgOSfmN782gdJZMvSG6y/dpoLSVT7TQ28mpStzDcyMZMK5Ec7ncBC5NmuU8ZVOhIrToyxI2SfkHKKn682Rgde9wmZxY/YPvqQBkXStqHdHXYtlMxralzPO1fnVvABnGVpI1tXxuoYbxQbdUfV+al0M7vXjEtvCVtBNxv+w/BUi6QtBPw8+iEtVKx/ZykmZJWrCtKval2GjP7tG4vTEoiDU2MtH0vcAhwiKT1gf8G9iclJM936ox2ZjzEHkv6FrA2qRHDm4M03NNlc1HxtJIuAlYlZYXvM9rz55OGW0j1vO8lOUVNTN86EXp6kW31DPAD2yG1TseDrQqxU/Fxx5KOB9YB7rC9a6COR0m1s+eQKsYUF1Mr6dZ88we2Dw/ScAmwMXANwy/eto/Q0418Tv2NZKfZQRqKtxOUYatuSJpm+w2Bx59MSm5/F/BGYBopjOSsgRy/OtqVv0dy7O9atm8OOv5K3bbnK+tikPQqUvzaZrZ/EKSheFuVYKfxhKQlbD8araN0cuWRzTrLjw3w+F2dH9vTBq2lF5I2JpWN3MT2F4I0FG8nKMZW7SpSk0hVPv7L9hoBWrYB3k0qPXoNcApwlu3HR3zhi62jOtoJScsDh5HqZJpUJ3Nv2/cFaupWSu9hUsmxPw9aD8y9MvwYqWYnpDbxR9mO7I7VlPdbgVY4VPSSulK3rqae8OW2Z0bqKZlqq9FRqp+9P0PfvWnA120/HKipWxm9h4F7o0r8NUjantY4FbUS0YmkJRk+TkWX/FyWNFsLqd54yG9L6VQ79UeripRIISP3kMapKwK0XEpqiHZG5PesOtoZSReSPpCmfekewO62twnU9Etgc6BZMt6K1LFyddKJO/BWq5J+TOrE2JQ9fA8wx/aeg9bS0nQg8H7gLoaSMEKX1HO284eBJsb/7cDRtg+L0tQLSedGhSLl4xdjq+wE7QcsD5xr+6TWY0fY/vigNbWOfwYwm+HfvXU9vNvZoDVdBWwA3ET6YX1tvr008FEH1B3Pug4mOUU/zZveDVxv+4sRerKmvYCvk0JZ2uNUWNidpF2Ab5MmTES62N3X9s+iNHWjKdsYePyi7JSrEO1JGqfOa1fYkfRl29+I0FXpTnW0M5Jm2F5vtG0D1nQOsKftP+X7ywJHkr5g0yOyoCXNdEd3vG7bBqzpdlIzkaejNHQi6SZSibHH8/3FgN9ExR33mHmE9KMx1fZyg9QzTEBBtsrO7J2kC9oPkuKyd7P9lKQbbPey4yC0lThGnQIc2IRoSVoL2Bc4kJSIGKItn1Pr2X4u318AuDEy7l/SnaTz/MEoDZ1Imknq4vfnfH8Z4KKI8Vw9GleRxqiZtpcfpJ5hAgqyUz7+j4FFSeEQ7wGm2f5cfix6nCpy1TuSWnVkiAcl7UEq+wJpBuQvgXoAVm6c7MyfgdVt/1VS1Ek7R9Kqtu8CUGoLPydIS8NsUtv1kpbyxHC7zMnboriWFGrQTcNSA9bSSUm2WtX2Tvn2WZK+BFySwxCieULSls0SrKQtSLOjkazZzoOwfYuk9W3fndIkQlkKaJaLXxopJHMXKVGtJCZ1hED8hRRXG8H/khKi2ydOE4LwihBFQ5RkJ0gx2OsATVOWIyT9nOS3RH/xjiSteh+R77+HoQnCCUl1tIf4IHA48D3Sl/vKvC2SyyVNJZUchFQmZ3qe8XsoSNO+wKWS7iZ9oVcCPhCkpeEgUnnG2Qxv+RrpHB0LXC3pzHx/R+CYQD23AnvZvrPzAUm/D9DTpiRbLSRpUjMTavubku4DpgOLB2lq+CjwkxyrDfB/QNeOtgPkdklHkpKMAHYF7pC0EGk1IIpmTLiUNE69nhQSFMl+pBKNVzN8nPp0nCTOk3Q+QxNMuwK/CtJyN/DGbiX0ChijSrITwILNjZwL8RFJXwUuIX6c2rhjpv+SvCIwYamhIwWTK2fsRErQFClB8wwHf2j5R3SNrOk2x9arRtLNwFHALOC5Znt0RngO19iSZKfptm8M1LIzKYn29i6P7TioMke9KMVWkqYAF9i+qGP7m4DDbK8WoatDy5IAth8pQMsiwMcZ+uyuIM1kPQksavuxQG3LkeK0BVxt+4EoLVnPNST7dI5Tx/d80QDISfft796Zo7xkfun4BHBFt0RoSZ+Kzm8pxU5Zy4nAibbP69i+J3Ck7ckxylLoCvDOjlXvn0WGs0RTHe3KuEfBNTorlUplNCRdaft10ToqlfmJpDeSVinvzptWBj5gu1sfgAlBdbQLRNIVtrdUarrQ/oCKa7pQApK+S1qK/QXDl2SL6ZgXjaTPjfS47e8OSkvJVDv1h6TTbO8iaRbd2y0X02yoFCR9kxSDfA7Dx6liuupG0qOc7VxcUJfmaEq0lVIN79/bfiCveu8FbA08AHxxIp/nNUa7QGxvmf8vEa1lnLB+/r9Za5uBYjrmFUA9l/qj2qk/9s7/3xaqYnyxW/7fjhU3UExX3WC2G+ExM1T+s1KmrY4iOdYAmwJfBD4FrAccDewcoKkI6ox2DyTtADxg++qg408Cbooo4TcWchzkX6PitLOddrZ9WsTxK5UoJG0E3G/7D0HHXwA43/bWoz55gpPHqc3b9Y4rlb8n2mV+Jf0A+F/bB+T7oWVIo4ksT1M6mwJflnRuxMFz1YOZklaMOP4YOAG4TdKhEQfPdvpkxLHHgqSLJJ0rKXQGUNLqki7OFVqQtI6kL0dq6qQEW40HO5Fmi6ZKOjXi4LbnAH9rVUEpFkm35r+QsSKPUyFj5FiQdLykIyWFTfBIWlbSMc1vr6S1JH0oSk83SrBT1lGSrRaQ1ERJvJFUAaVhQkdP1BntgpF0CSlr/hrg8WZ7cNm6ecjVUdZq19Md8PG/QqonfCrD7VRMTJikVwHLAZvZ/kGgjmmkEo1H2V4/b5td0spJCbYaD3ZqkLSE7UeDjn0aKWTrQoZ/9yJL1nVF0tKkc+qXQcf/Gqlr5s+jK0f1IsfZrkiq0/yFIA3nkpLpvmR73ey83Wh77Qg93SjBTllHMbZS6jnwFuBBkm02sG1JrwGOt73FoDWVQnW0Mz2SCx4mlUQLaYQiqWsljQLK1v0DsAKtq9TIxENJ93TZbAe2Ni4VSdfa3ljSjS0HckIv63WjRDupe3fPh4F7cy3dgSOpax3v6JJ1DbkUYnucCrv4zsnti5EaMj1
BTW7vSonfvVIpzVaSNiNNklzgoU6/qwOLT+TiBBN6Or+DDwGbA00Jmq1IbZhXl/R12ycMWpDtaUpt1zfOm66JcvobJB0IvJ/U5ay5SgtNPLS9StSxx4Kkc22/OVjGg5JWJX92SvW1748Qkp2g/YDlgXNtn9R67AjbH4/QlSnGTi2OADYgzYoKeG2+vbSkj9q+YNCCbB8vaUFg9bzpdhfQalnSXsDXSQ5te5wKu/geL8ntko62/ZFACY/n1Yfmu7cZ6YJy4OQ8hD1JY9R57Rh7SV+2/Y0IXS2KsRWA7au6bLsjQktJ1BntjKRzgD2dW55nB7dpGzo9YslY0i7At4HLSD+s/wzsa/tng9bS0nQ7sLbtp6M0dCJpMvAxUvc3SPY6KuIHv8esI6TPb6rt5QapZx4RqXnA0cDrSJ0F7wH2sP3bAC1nAHeSLmg/SOokuJvtpyTdENngoCQ7tTSdAhzYhGhJWosU3nIgKRxh4LNYkrYCjgd+SzrHVwDeZ3v6oLW0kXQnKfnwwUgdnUjantY4ZXtqkI6X9XoImGl7+UHqGSYgjaGHkS4kZwPLkBLebwrQ8mNgUVL45nuAabY/lx8LHaOyhmJsVelNdbQzkma145py3PEs269tL8sMWNNMYJtmFlvSMsBFHt7edNCazgA+Fj2z3iYPhpNJP/iQBsQ5tvcM0DIHmEb6wepkM9uLDFhSVyQtBkyKiu3NGoYtcbZi/LYHLoz+EYMy7NTSMs+ScLMtarlY0vWki6Pb8/3VgZNtbzhoLR26zgPeYftvkTraSDqYtDr507zp3cD1tr8YoGUOqaZ3e5xyvv9q2wt2feGAyLHGTffhsFUSSTc514TPmo4AXk767K6K8As6KcVWld7U0JEhLpc0FTg9398JmJ5/aB8K0jSpw6H9C/GVYg4CblSqxtBuuhCZoLlxx8XHJfkiJYJbgb1s39n5gKTfB+jp1LAU8F5St66XpOvJsOS1hSRNyhUZsP1NSfcB04HFA/TMpTA7Ndwu6UjglHx/V+AOpeYQUT+ukxsnG9IycV5himY/4EpJVzN8nIr8/N4CrNec75KOB24k1RseNHcDb7T9u84HosepHK7xFvJ3D9hWUlSzqLkXHDkP4iOSvkqqqBE6RkFxtqr0oDraQ3yC5FxvQboy/AlwRs4O/5cgTedJOh84Od/fFfhVkJaG44FDgFnAc8FaGuZIWtX2XTB32X9OkJYD6H0x9KkB6ujFr0ihGiV8fueQYvsvajbkmN8/kZZDIynJTg3vBz4OfIY0Rl0B7ENysqPGqOskHUMq8wmwO3B9kJY2R5GcoZI+P4ClgCYhM7Is4n8C/wDM42gDUwaspZNzgCcp47O7TtKbbJ/XbLD9dUl/JIWWRlOSrSo9qKEjhZOroWxJ+mGdbvvMYD3TbHethhKFpDeSShzdTbLTSsAHbF864gsnICXEFY4Hqp36I8+mf4LWGAUc4aAGVi1dV9p+XaSGTiS9GziYlHAvUqz2frZPGfGFE4x2uEZlZKqtxgcT3tGWdIXtLXPppbYxii+9JOk3tjcf8DG/S1qK/QXDl2RDS/fkH/wmTu22qB96SZ8b6fHoJT1JnwUeA6Yy/PMbeNmzkm1VmJ1Os72LpFkMH6MaTcX+0Eo6w/ZOAcf9JikG+RyCP782Sp10NyaNU1fbfiBIR7dytnOxHdbuXNIhwMURVXS6aCnWTlCWrSq9mfChI7a3zP/HRemlDhYOOGaT9mkm3wAAH7ZJREFU/LFZa1toeb/MhgzFqa2b49R+EqCj9PPoaVIlmy8RX/asZFuVZKe98//QrqLPk6hyervl//u1toWW98tMIjX0eAmpdOzqQRVathvhMQORDuRVwJlKbeufIXbSq2Q7QVm2qvRgws9oA+ST9KaIEn4vhEEvb2c77Wz7tEEdsx8knQCsCsxgKDbbwYlPRSLpLmDT0sqelUZpdspJT+fb3jpay1iICMHJ49Tm7ZrHJZBnH3cFbmYontbBieTFIeluYEdS1a/qoIxAtdX4YMLPaAPYfk7STEkrdsvCriSynT4JFOVoAxuRWsAXM9DkMmdHAsvmEpHrANs7vsHBzUAxJc+gWFsVZSfbcyT9TdJLbYc1pBgP5HHqUFIDspLYEVgjOn69Te4X8S3gVbbfnGuzb277mEBZdwKzCxvPS7QTFGiryrxUR3uI5YCbJV0DPN5sLHy2oVut5vnNhZL2AU5luJ0iYx9nA68kvnNfmx+RmokcBWD7JkknAdGO9hxghqRLKafsWYm2KtFOTwKzJF3I8O9eySs3EWMUwAWSdiI18inFCbmbVO+/GEcbOI6USP6lfP8O0tge6UDeD1wm6VyGf/ci81uOozw7QZm2qnRQHe0hvhYt4HnwnoBjfjD//0RrW3Ts48uBW/JFUim1vRe1fU1TfznzbJSYFmflv5Io0VYl2umX+W888YWg434OWIxU+vMJyohd/Rvp4u1iyrl4e7nt0yTtl7U8m5vZRHJP/luQVh3rYEq0E5Rpq0oH1dHO2J6Wl4c2zpuucUHdDxvU6mBpe/agj297lUEfsw8OiBbQhQclrUpOpJO0MwXMuNs+fqTHg6pEFGerEu2Ua4wvCKyeNxXZBU7SubbfDBBVDaHQ5PZf5L+SeFzS0gx99zYDQkOTbI846SXpMNuD7klQnJ2gWFtVOqiOdkbSLqQqA5eRZj8Ok7Sv7Z8FaOlVUkikEIkwcte3j5FqwEKy11GRP/i2p430eEQZRNKM/9HAmpL+QJp12GPAGp4PESsT49FWA7eTpK1IDaN+SxoLVpD0voiqFZJ6JTgKGHgr+G5I2p7WOGV7aqSeEi/eSDP/vwBWlfRrYBlg5wFrGCtbBBxzPNoJYmxV6aBWHckotezeppnFlrQMcJGHt/YelJZngJ/SpWYuqepH2GyNpB+T4gybH433AHNs7xmlaTQk3Wh7/dGfOV+OvRgwyfajEccfK5GNWsaTrYKqaVwP7Obc8jwnkZ5se8NB6sjHngNMo3sM9ma2FxmwpGFIOpi0OvnTvOndwPW2I9qd90XUOCXpJQz1IChylaRN1Bg13uwEtfFWKdQZ7SEmdYSK/IXerbTnNzcBh3YLDZEUXd5r446Lj0vyRUrJDPxqUtJSwHvJtb2b+OPCE9dCqLbqm8mNkw1g+468whTBrcBetu/sfEDS7wP0dPIWYD3bzwFIOh64ESjW0SZmnFqAZKuVSf7AtrkHQU2ma1HtVHkhVEd7iPMknQ+cnO/vCvwqSMtngEd6PPb2QQrpwhxJq9q+C0DSPzJUu7oyxK9IzQRmMVQzdzwQUSViPNoqwk7XSToGOCHf3x24PkAHpLyIXhMRpcSELgU01ZBeGimkYM4hV7OhfvdGYjzaCeKq/lRaVEc7Y3vfHBu9JenkPNr2mUFaLh/hsesGqaUL+wKX5kL5AlYCPhAraVQiBpuFbY/YYrxQIqpEFG0rSRvYvqFjc4SdPkaKZ/806ZyeDhwRoIORcldsl1Ct5SDgxlyeUaRY7f1Gfkk4EePU8rbXCTjuC+H7Accs3k6SXtGlgEOErSod1BjtPglKqC
sSSQsxFKt2W0kNGLoh6bWDrtAi6bPAY8BUhpfyCqk3LmlN4Huk2ZhPA18hNdC4A3if7VsjdGVtxdiqS5KfgLNJrZjVxeEuhqBkumKRtBwpTlvA1bYfCJY0IpK2HXSVltyt8uKo6jAdWl4J7E8ao75KWhnZiRSmtLftsEpEJdkJQNLLOjeRVrfWJ41TkX0tKh1UR7tPIhPqSkPS6xiKVQPA9k8CdKxAqhTzauBc4NtNgoqks2zvOGhNLW2fAL4JPMRQ7KVth9QblzSdZKvFgYNJM7KnAm8DPmP7jRG6srZibCXpOVIYS/vicbO8zbb/ddCa+qWOUcOR9GrSilt7nIqo0LIkaTZ9eeBc2ye1HjvC9scHral1/LcDJ5LCgJ4hsN64pPNIdeIXA3YjJbKeDOwAbG17h0Framkrxk5Zz3PAvR2blwfuI/B3ptKd6mj3Sc3eTUg6AVgVmMFQbLYjEtdyh7wzSE7Qh4ANge1s/yXa6ZB0F7Cp7QejNLRp20PS/9h+Teux0HO7JFvlGt6fAg6x/au87Z5C68cPI/pzLIk8A7krcDNDMbV2QBMrSWeQWmVfRWr49QypesxT0Z9ZDgHcEZjlYGegY4z6ne0VW4/NsB1WNrIkO2U9+wBbA/vanpW3jYtxaiJSY7THEZJ2AB6wfXWgjI2AtUoYbIBlbP8w3/6UpD2A6bl+brS+m0md4Ephgdbtzkz56I5ixdjK9s/yzNqBkj4A/Dvx59K4QdJGwP22/xAsZUdgjULC2lZthfScJelLpGpNkZ1rG+4EZhcynreTaztXSKMqgDWUZCdsHyrpFOB7ucrP/tRxqliqo90/JWTvbgqsLeklzp3XAphNapoT3uUQmCxpYdtPAtg+UdIDwPmk5cdI5pDaLV9KGe2WfyBpcduP2Z6bQCfpNcBFQZoairKV7ceAz0paj1QvfvEIHc+DEsaoTwHrSLrD9q6BOu4m1fsvwdFeSNKkptSg7W9Kuo+UzBp9bt0PXCbpXIZ/9yLK1p3dGqO+3GzMY9QdAXralGSn5tj3Ae+UtB1wIbBolJbKyNTQkT6JSKgrkewMrQdcw/ABJ2JJ9rPADe7oDClpfWCK7W0Graml4X3dtnuU7nATkZJtpVTUewnbvcptFkNEMl0vJC3hwMZDOVxjXeBigi/eJE0BLrB9Ucf2NwGH2V5t0JpaGvbvtt2jtPaeaJRuJ0mLkFZOJryPUiLV0R4BSbNsrx14/JmkhLVTm7rV0Uh6Q7ftnc5uZWRqhYj+qbbqjaRzA1e3kLQFMMP24zl0awPg+7Y7E7UGravYi7fxhKTDbJdSF71Yqp0qIzHhQ0dy7eyuD5FCJCLZnpTQc1rOMj4VOM3276IEjeZQ1zKIfVOzwvtnQtuqS8nBuQ+RVpciORJYV9K6wOeBY0jxtV0vyAfFaA51vXjrmy2iBYwTqp0qPZnwjjbJef0p3RMJFh6wlmHkWaEpwBRJq5FqHx/C8MS20gi12TiiLiX1z0S31bXANLrHYC81YC2dPGvbOVH7+7aP6TWbXBgT+uKtUqkMjupow03Aod1imyRtHaCnU8PKwC6kme05pFmjkpnoTtG4opBKNsUTbKdbgb1s39n5QK44EMmjkvYD9gBeL2kBUhJi6dRxapxQUCWb4qm2KpPqaMNngF6JTm8fpJBOJF1N+tE6HXin7bsj9YwXxonzWEKFCCijks1olGCrSDsdQO/yZtFxobuSmot8yPYDklYkNUaqjMI4cYpK+O6VUslmJEqwE4wPW004ajJkwUha0/Zt0TrGQnSjmKzhW8DaQLHOY0kVIkpC0gbuaHNebVV5sSlknDoeWAco1imS9H7bx0XrgPhKNi0dr7D9545txdgJyrFVJVEd7YKRtBCwE/O2O/96lKbRmOhlECWtCXyP1I3u06S4+h1JdWDfZ/vWQHlFVbLpkuQn4GxgO9LYdMO8rxoMJdmpZCQ9ylAYxoKkFbjHbL80TtXolHTxFuEUSXolqcnJc8BXSTOhO5HClPa2HdYnoaRKNpJe1rkJuB5YnzRG/XXQmoaJKchWld5Ed1uqjMzZwA7As8Djrb+BI2kFSadIulzSf0ia3HrsrOZ2hJMtaWbWtOqgj92Fo4EjgBOBS4DzgH8ADgQOD9TVsD3pfDpN0rWS9snL/RFcR7LJd/LfocDSpM6VhwZpaijJTsViewnbS+a/hUnO2g+i9EhaUtJBkk6QtFvHY3MbNUU42ZK2kLRYvr2HpO9KWilo5vE44Bbg98ClwBPAW4HLgR/2ftlAOBL4W6uSzb3M2ylyUDxIcqybv+uAVwM35NvRlGSrSg/qjHbBSJpt+7XROgAkXQicAVwFfAjYENjO9l+il2ElrUSKFd2VNEMTVgaxbQtJ/2P7Na3HbrDdq1TbwGlVstnd9sAr2UjamTSTdojtX+Vt99heZdBaRiLaTuMNSVfZ3izo2GeQ2mVfBXwQeAbYzfZT0d8/STeRmuisA5xAKoX4DtsDL4XYMU79zvaKrcdm2A4rG9l8TpK+CvwhV7IJ+ewk7QNsDexre1beVswYVZKtKr2pM9o9kLSDpE2DZVwpKaxhTgfL2P6h7Rm5MP8RwPQ8ixx6tWb7XttTbG9ISsxaB7gnSE7bEetsz7vgIIX0QtLKkj4PnAKsSVAlG9s/I82ibSPp9DxjXMyVfyl26oWkjSS9OljDO1p/O0s6mNjPcFXbX7R9Vu5WewNwiaSlAzU1POs0s9WUQvw+sESQlvZvf+cMaLRf0K5k88vISja2DwX2BL6aVyCWoKAxioJsVelNrTrSmxKqMWwJvF/SPaQ2wgJse50ALZMlLWz7SZKIEyU9AJwPLBagZxgFlUH8gaTFbT9me+5StaTXABeN8LqBUFolG9uPAZ+VtB5wPLB4pJ6G0uzUgxIqDGzXuv0s8FuSIxnFQpIm2X4OwPY3Jd0HTCf+3CqpFOLZrXHqy83GPE7dEaSpoahKNrbvA94paTvgQmDRKC1dKMpWle7U0JGCySER8xCUFPJZ4IbOzpCS1gem2N5m0JpaGtpO0amFOkVFoIIr2UgSsITtXuU2B6mlWDt1UisMDCFpCnCB7Ys6tr8JOMz2ajHK5iYg7gZca/vy7BRtZbvG1I4TJC1CWjWZsAn/lbFTHe1MqVUGJG0JrGb7WEnLAIvbjgqLKJLx5BRFo3FYySaCEu1UYoUBSauQZtZXZridto/SVBnfaJxWsomg2mp8UENHhtietAxzmqTQhLoGSfsDGwFrAMeSvkQnAltEaSqUe3KFgZUpxCkqmLOBh0kZ9E8FaymZEu10JLBuq8LAMaT42oEn07U4K+s4h5SIXOlBdYr6w/awuHVJOwKbBMkpmmqr8UGd0e5CKVUGJM0g1eu8oZUhflNQjHaxSDqPIadoTrPd9nfCRBVKSZVsSqZEO5VYYUDS1bajk8bHJY1TZPs/orWUTmQlm/FGtVV51BntFgUl1DU8bduSDNDUYK3Mw/K23xQtYiRUTlv4KyWt3ZSqKpFCbFWinUpKpmv4fl55u4DWzL8Dmw2NF2yfJemL0TraqIC28JLe0bo7ibSqW
9SMYAl2yjqKt1WlOtpzKbTKwGmSjgKWkvRhUl3YHwVrGkZ1ivqmhCo2UFYlm16UYKsS7VRihYG1gfcA/8pQ6Ijz/WIowTEaJ05RrWTTHyXYCcaHrSY8NXQkU2pCnaRtgG1JP/Tn274wWNIwJH2L9GMb5hRJugV4Dal2dilOUZGUVMmmZKqd+kPSbcA6tp+O1jISko4n1dcPc4wkHdu62zhFP7L95wg9I1Er2fRHtVOlH6qjnSmpyoAkeZQPpp/nTBRKdIpKrWID5VWyKdVWBdqpuGQ6SacCnyrRWexGdYyGUyvZ9K2pODtlXcXZqjIv1dHOlJRQJ+kyUrvzs9tVTyQtSFrSfh9wqe3jArRVp6g/PcW0he/QNbeSje3VJb0KON12WCWbEm1Vop06KSGZLo9V6wDXMjxGO/SHvkTHqESnSAW1hW9pmpl1zKJVyaazh8OANRVnp6yrOFtV5qU62pmSqgxIWpgUj707sArwELAIKa7vAuAHtmcEaatO0RgppYpN1lJ0JZtSbFW6nRqiKwxI6upoRP/Ql+gYlegU1Uo2/VGinbKu4mxVmZeaDDlEMQl1Tm3OjwCOkDQZeDnwhO2HYpXNDceYAkxpOUWHAJEO5NvJThGA7T9KWmLkl8x/CqxiA4VWsinQVsXZqaRkuiZ0bSQnMTi87dn8+e1Amsk+RtL7grQ0PGn7v4I1dFIr2fRHiXaCMm1V6aA62kOUWGUA288A90dq6KQ6RaNTaBUbKLCSTaG2Ks5OlFVh4FJJo4a3AcfFyCvSMSrRKaqVbPqjRDtBmbaqdFBDRzIlJtSVSIdTdGoJTpGkfYDVgG2Ag0hO0Um2DwvUVGQVGyivkk2ptirNTiXRI7xtYdLKVmh4W9b3SpJjdK3ty7NjtJXtnwRqOojkFN1FyymyXZ2iFuOlkk0JVFuND6qj3aK0hLoSqU5R33qKqWKT9RRbyaYkWxVup+KS6QBKC28rlRKdolrJpj9KtBOUaavKvNTQkUw7oQ44lvRFOhEoIqGuIO6RtBuFOUXZsZ7HuQ6MEz2boSo2T43y3EFQ8lJ/SbYq2U5nkZLpzqGVTBdNoeFtJTpGM4GlgGKcItvDclmaSjZBchqWBW6TVEwlm0LtBAXaqjIv1dEeosiEugKpTlF/lNYW/k2kpf6T88xoZyWb7wUu9Zdkq5LtVGIyXZEU6hgV7xS5jLbw+wcff1QKsROMA1tVaujIXCRdY3uTVhmfxYDfRCdDlkYtg9i3tqOBw0qoYtNJaUv9pdqqQDvtRspFKCmZbtxQSyHOS49KNm+wvXmAlpLDtoqxU9ZTrK0q81JntIcoscpAidQyiP1RZBUbKHKpv0hbFWinWmGgT2opxL6plWz6oyQ7Qdm2qnRQZ7RblJZQVyKSbgFeAxTlFJVGrWLTP9VW/VFiMl2pSDq2dbdxjH4UkTSmgjv9lkTplWxKotpqfDHhHe26BDM2qlPUP7WKTf9UW41OrTAwPinZKaqVbPrWU6SdoDxbVealOtp1tmHMVKdodFR4W/iSqLbqjzxWrQMUm0xXCqU6RqU5RSqwLXyJVDtVXgg1RrvsKgPFUcsg9k2tYtM/1Vb9USsM9E8thdgftZJNf1Q7VZ43E97RLjyhrkSqU9QfxbWFL5hqqxEoPJmuVKpj1B8ltoUvkWqnyvNmwjvabQqcbSiR6hT1R61i0z/VViNTKwyMneoY9UetZNMf1U6V582Ej9GujA1J+5Bq+W4DHERyik6yfViosAKpVWz6p9qqNyUn05WKpINIjtFdtBwj29UxalEr2fRHtVPlhVAd7cqYqU5Rb2oVm/6ptho7NbytP6pj1B+1kk1/VDtVXgg1dKTSF22HJzvW8zjX1SkC6jL/WKi2GiM1vK1vZgJLAdUxGpni28IXQrVT5XlTZ7QrfVHLIPZHyW3hS6PaqjK/qKUQ+6PEtvAlUu1UeSFUR7vSF9UpGjt1mb9/qq0qLybVMRqZGrbVH9VOlReD6mhXxkx1iiqVSolUx6g/6gplf1Q7VV4MqqNdqVQqlb8LqmPUH7WSTX9UO1VeDKqjXalUKpW/C6pjNHbqCmV/VDtVni/V0a5UKpXK3x3VMapUKiVQHe1KpVKpVCqVSmU+MClaQKVSqVQqlUql8vdIdbQrlUqlUqlUKpX5QHW0K5VKpVKpVCqV+UB1tCuVCYqk90s6fH68XtJj+f+rJP3s+R7jxaDR8gL3sZ6kt7wYerrs+zhJO7/Yr5e0laSp+fb2kr74QnTOT/qxwQu10wj7fUHnR6/Xt/VK+rGktV7Icbrs/9uSbpb07efx2vl2PlcqleG8JFpApVL5+8X2H4EX3TkKYD1gI+BX0UKeD7Z/AfwiWsdExfae82G3ewHL2H5q1GfOy5jPZ0kiFVB47nkcr1KZsNQZ7UplnCNpD0nXSJoh6ShJC0h6TNIhkq6XdJGkTSRdJuluSdu3Xr6CpPMk3S5p/5H2mbd/QNIdkqYBW7Sev4qk30i6VtKBre0rS5qdb79f0s/z8e6UNKX1vA/l/V4m6UcjzbRLeqek2ZJmSpre2vfhredMlbRV6/53JN0g6WJJy+Rtn5Z0i6SbJJ2Sty0m6b/z+7hR0g652cnXgV2zPXbt0LNAnl28Nu9rr7x9K0nTJJ2W39vBknbPdp0ladXWbraWdHl+3ttG2a8kHZ61/xJ4RUvLmyTdJukK4B2t7XPtk2da/0vSlfl8aGZdJ0k6Is+STpX0q5FmkCX9VtK38ud+naQNJJ0v6S5JH21p/Xb+vGY1thvlPWyY7XZ93t9yvTTk56+az6nrsw3XbL3PIyVdmt/nG/Jne6uk4zr20e386LXfVdT9XB/pPV0maaN8+zFJ31Q6f6+StGzreFfl/X5dI8y0S/oFsBhwtaRdJS0j6Yz82mslbZGft0n+nG/M/9dQl/NZ0gGS9mntf7bSd3flbK8jgBtI48W2+f3fIOl0SYuP9PlUKhMe2/Wv/tW/cfoH/D/gHGByvn8E8F7AwJvztjNJzTomA+sCM/L29wP3A0sDiwCzSbNcvfa5HPA7YBlgQeDXwOH5Ob8A3ptvfwJ4LN9eGZjdOt7dwEtJTUTuBVYAXgX8FnhZ1nh5s98e73kW8Op8e6nWvg9vPWcqsFW+bWD3fPurLc1/BBbq2M+3gD2abcAdJIdm2P479HwE+HK+vRBwHalZylakhinL5e1/AL6Wn7c38J/59nHAeaSJj9WA+7J9eu33HcCFpCYsr8rH2Dm/5vd5HwJOA6Z22icf7/R8vLWA/8nbdybNcE4CXgn8H7DzCJ/Db4GP5dvfA24CliCdH3/O23dqaV2WdP4sN8J7mAxcSZqpBdgV+O+W7nn0ABcDq+XbmwKXtJ5/SrbFDsAjwNr5/V0PrDfK+dFrv73O9a7vKT92GbBR63jb5dtTWp/xVODd+fZHm/2OYP/HWrdPArbMt1cEbs23lwRekm9vDZzR4/tyALBP6/5s0nd3ZeA5YLO8/eXAdGCxfP8LwFejx8H6V/9K/quhI5XK+OaNwIbAtZIgOcx/Bp4mOW+QHNOnbD8jaRbpx7PhQtt/AZD0
c1Kb6md77HNT4DLb/5uffyqwet7PFiSnCuAE4JAeei+2/XB+/S3ASqQf72m2/5q3n97abzd+DRwn6TTg5yM8r+E54NR8+8TWa24CfirpLOCsvG1bYPvW7N7CJMdlJLYF1mnN/r6U5Ow+DVxr+/78vu4iXfBA+kz+pbWP05yW5O+UdDew5gj7fT1wsu05wB8lXZIfXxO4x/ad+Xgnkpz1bpyVj3dLM6NK+uxPz9sfkHTpKO8bhsJRZgGL234UeFTSk5KWyvtstP5JaSVk4xHewxrAa4EL87m3AOlisCt5NvV1wOn5+ZAuShrOse183v/J9qz8uptJ34MZdDk/Rtlvr3O913vq5GmSUw3J4d8m394c2DHfPgk4tNf77sLWwFotrUtKWoJ0zhwvaTWSgz95DPtsuNf2Vfn2ZqSLs1/nYy0I/OZ57LNSmTBUR7tSGd8ION72fsM2SvvYbrpRPQc8BWD7OUnt731nxyqPsM8duzy/87Wj0Y4nnUMag9Tjud0PYn9U0qbAW4EZktYjXRy0Q+EW7kPnW0nO0fbAVyT9U9ayk+3b2y/Ix+uFgE/ZPr/jNVsx/P0+17r/HMPH316fQ7f9vqXL83vtpxdtXer4Pxba76fzvY722XbTKuBm25v3efxJwEO213ue+nrpGm2/L8T+z7S+m8134IUyCdjc9hPtjZIOAy61/XZJK5Nm1rsx0vfn8fYuSRfn736hgiuViUKN0a5UxjcXAztLegWApJdJWmkMr98mv2YR0mzar0fY59XAVpKWVmpv/c7Wfn4NvCvf3n2M7+Ea4A2S/iFfBOw00pMlrWr7attfBR4khZ/8FlhPKc54BWCT1ksmMZSQuRtwhaRJwAq2LwU+TwoTWRw4H/iU8nSdpPXz6x4lhUU0GjaR9JN893zgY9kmSFpd0mJjtME7s/ZVgX8Ebh9hv9OBdynFcC/H0Mz4bcAqGor9HqszdAWwU9axLCn05YUynRQLvIBS7PPrSZ93r/dwO7CMpM0htVHPF0DDkHSQpLfbfgS4R9I783ZJWneMGuc5P0bZb69zvdd76perGDr33zXSE7twAfDJ5k6++IQ0o/2HfPv9recPO59J358N8ms3IIUo9dK4haTX5OcuKmmk1adKZcJTHe1KZRxj+xbgy8AFkm4ixYiOmDzWwRWk5e8ZpPjN63rtM4dAHEBaKr6IlBzVsDfwCUnXkn7cx/Ie/kCKjb467/cW4OERXvJtpcS62STnZibJ+bmHFMJwaIe2x4F/knQ98K+kRLAFgBNzSMGNwPdsPwQcSFpevynvv0l2u5S0NN8kQ64INLOHP86ab8ivOYqxz1LeDkwDzgU+avvJEfZ7JnBnfq9H5teRX/MR4JdKyZD3jlHDGaT48OZYVzPy59APZ5JCdGYClwCft/3ACO/haZLTe4ikmaTz8nVd9rs28EC+vTvwofz8m0nx2GOh2/kx0n57netd39MY+AzwOUnXkL7DY7H9p4GNlJJmbyHFeEOKAT9I0q9J53xD5/l8BvAySTOAj5FyE+Yhh429Hzg5jw1XkUKWKpVKDzS0glWpVCoxSFrc9mN5RvtMUgLcmdG6eqFUu/gE2zdFa3kxaX0OS5NmnrfIjnFRSDrf9r9F63gxkbQo8ESOKX8XKTFyrBcNlUqlMKqjXalUwpF0KCmha2HSMvjeroPTwJF0GSmMZkFgiu3jQgVNICT9M3A4KQ76IeCDtv8nVlWlUnmhVEe7UqkUiaQvMTwOHFJVjG9G6JmoSDqTeWN2v9CZpFl58ZG0Nim0q81TtkdKzq1UKgVRHe1KpVKpVCqVSmU+UJMhK5VKpVKpVCqV+UB1tCuVSqVSqVQqlflAdbQrlUqlUqlUKpX5QHW0K5VKpVKpVCqV+UB1tCuVSqVSqVQqlfnA/wdW8cMQOQEAJQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ax = df_sorted.plot.bar(y='Test\\nAccuracy', figsize=(12, 6), \n", + " x=['embedding_subset', 'embedding_model', 'embedding_feature'])\n", + "plt.ylim([.8, 0.98])\n", + "plt.grid()\n", + "plt.vlines(7.5, 0, 1) # mus/env separator\n", + "plt.vlines(15.5, 0, 1, 'r') # vggish soundnet separator\n", + "plt.vlines([1.5, 3.5, 5.5, 9.5, 11.5, 13.5], 0, 1, linestyles='dashed')\n", + "labels = ax.get_xmajorticklabels()\n", + "labels_txt = [t.get_text() for t in labels]\n", + "labels_txt[-2] = 'SoundNet'\n", + "labels_txt[-1] = 'VGGish'\n", + "ax.set_xticklabels(labels_txt)\n", + "\n", + "# Set colors\n", + "pal = sns.color_palette(\"Paired\", n_colors=10)\n", + "colors = pal[:8] * 2\n", + "colors.extend(pal[-2:])\n", + "for idx in range(18):\n", + " ax.get_children()[idx+3].set_color(colors[idx])\n", + " \n", + "# remove legend()\n", + "ax.legend().set_visible(False)\n", + "\n", + "# axis labels\n", + "plt.ylabel('Classification accuracy')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (py35)", + "language": "python", + "name": "py35" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 
711ba4e1531f481369cc0f6298763c9132d81c78 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 3 Jun 2018 02:15:16 -0400 Subject: [PATCH 193/201] Add support for augmented data and remove setting of random seed --- classifier/train.py | 6 +++--- data/usc/folds.py | 8 ++++++-- data/usc/us8k.py | 17 ++++++++++++++--- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/classifier/train.py b/classifier/train.py index 4dcb992..ea18e72 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -295,7 +295,7 @@ def train_mlp(train_data, valid_data, test_data, model_dir, loss = 'categorical_crossentropy' metrics = ['accuracy'] monitor = 'val_loss' - set_random_seed(random_state) + #set_random_seed(random_state) # Set up data inputs enc = OneHotEncoder(n_values=num_classes, sparse=False) @@ -490,8 +490,8 @@ def train(features_dir, output_dir, fold_num, LOGGER.debug('Initialized logging.') # Set random state - np.random.seed(random_state) - random.seed(random_state) + #np.random.seed(random_state) + #random.seed(random_state) datasets = ['us8k', 'esc50', 'dcase2013'] diff --git a/data/usc/folds.py b/data/usc/folds.py index a21fb9e..2072256 100644 --- a/data/usc/folds.py +++ b/data/usc/folds.py @@ -21,7 +21,7 @@ def load_feature_file(feature_filepath): return X, y -def get_fold(feature_dir, fold_idx): +def get_fold(feature_dir, fold_idx, augment=False): X = [] y = [] file_idxs = [] @@ -31,6 +31,10 @@ def get_fold(feature_dir, fold_idx): start_idx = 0 for feature_filename in filenames: + # Hack for skipping augmented files for US8K + if 'us8k' in fold_dir and '_' in feature_filename and not augment: + continue + feature_filepath = os.path.join(fold_dir, feature_filename) file_X, file_y = load_feature_file(feature_filepath) @@ -87,7 +91,7 @@ def get_train_folds(feature_dir, test_fold_idx, num_folds, valid=True): if fold_idx == test_fold_idx or (valid and fold_idx == valid_fold_idx): continue - fold_data = get_fold(feature_dir, fold_idx) + fold_data = get_fold(feature_dir, fold_idx, augment=True) X.append(fold_data['features']) y.append(fold_data['labels']) diff --git a/data/usc/us8k.py b/data/usc/us8k.py index e72cbd3..d327249 100644 --- a/data/usc/us8k.py +++ b/data/usc/us8k.py @@ -1,6 +1,7 @@ import csv import logging import os +import glob import random import numpy as np @@ -121,9 +122,19 @@ def generate_us8k_fold_data(metadata, data_dir, fold_idx, output_dir, l3embeddin for idx, (fname, example_metadata) in enumerate(metadata[fold_idx].items()): desc = '({}/{}) Processed {} -'.format(idx+1, num_files, fname) with LogTimer(LOGGER, desc, log_level=logging.DEBUG): - generate_us8k_file_data(fname, example_metadata, audio_fold_dir, - output_fold_dir, features, - l3embedding_model, **feature_args) + # TODO: Make sure glob doesn't catch things with numbers afterwards + variants = [x for x in glob.glob(os.path.join(audio_fold_dir, + '**', os.path.splitext(fname)[0] + '[!0-9]*[wm][ap][v3]'), recursive=True) + if os.path.isfile(x) and not x.endswith('.jams')] + num_variants = len(variants) + for var_idx, var_path in enumerate(variants): + audio_dir = os.path.dirname(var_path) + var_fname = os.path.basename(var_path) + desc = '\t({}/{}) Variants {} -'.format(var_idx+1, num_variants, var_fname) + with LogTimer(LOGGER, desc, log_level=logging.DEBUG): + generate_us8k_file_data(var_fname, example_metadata, audio_dir, + output_fold_dir, features, + l3embedding_model, **feature_args) def generate_us8k_file_data(fname, example_metadata, audio_fold_dir, From 373cfca210e0baa7224bde831a654b76988bc3bc 
Mon Sep 17 00:00:00 2001 From: Justin Salamon Date: Mon, 4 Jun 2018 20:08:41 -0400 Subject: [PATCH 194/201] Add random time delay befor mkdirs to avoid parallel jobs colliding --- classifier/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/classifier/train.py b/classifier/train.py index 47b01e4..6b36540 100644 --- a/classifier/train.py +++ b/classifier/train.py @@ -6,6 +6,7 @@ import random import git from itertools import product +import time import keras import keras.regularizers as regularizers @@ -514,6 +515,8 @@ def train(features_dir, output_dir, fold_num, datetime.datetime.now().strftime("%Y%m%d%H%M%S")) # Make sure model directory exists + # Add random time delay to avoid parallel jobs colliding + time.sleep(np.random.random() * 10) if not os.path.isdir(model_dir): os.makedirs(model_dir) From 7b1b0f14d56ebb84e6570b27f71dc90c16362d15 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Sun, 17 Jun 2018 17:25:58 -0400 Subject: [PATCH 195/201] Add scripts for creating plots and significance tests --- create_plots_and_significance.py | 443 +++++++++++++++++++++++++++++++ 1 file changed, 443 insertions(+) create mode 100644 create_plots_and_significance.py diff --git a/create_plots_and_significance.py b/create_plots_and_significance.py new file mode 100644 index 0000000..c2259aa --- /dev/null +++ b/create_plots_and_significance.py @@ -0,0 +1,443 @@ +from gsheets import get_credentials, get_row, append_row, update_experiment, request_with_retry +from googleapiclient import discovery +from collections import Counter +import pickle as pk +import os +import numpy as np +import scipy.stats +import shutil +import json +import time +import pandas +import itertools +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import seaborn as sns +from pprint import pprint + +CLASSIFIER_FIELD_NAMES = [ + 'username', + 'model_id', + 'model_dir', + 'git_commit', + 'features_dir', + 'fold_num', + 'parameter_search', + 'parameter_search_valid_fold', + 'parameter_search_valid_ratio', + 'parameter_search_train_with_valid', + 'model_type', + 'feature_mode', + 'train_batch_size', + 'non_overlap', + 'random_state', + 'num_epochs', + 'learning_rate', + 'weight_decay', + 'C', + 'tol', + 'max_iterations', + 'train_loss', + 'valid_loss', + 'train_acc', + 'valid_acc', + 'train_avg_class_acc', + 'valid_avg_class_acc', + 'train_class_acc', + 'valid_class_acc', + 'test_acc', + 'test_avg_class_acc', + 'test_class_acc', +] + +def append_row(service, spreadsheet_id, param_dict, sheet_name): + # The A1 notation of a range to search for a logical table of data. + # Values will be appended after the last row of the table. + range_ = '{}!A1:A{}'.format(sheet_name, len(FIELD_NAMES)) + # How the input data should be interpreted. + value_input_option = 'USER_ENTERED' + # How the input data should be inserted. 
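    # NOTE (editorial comment, not part of the original patch): 'INSERT_ROWS'
    # asks the Sheets API to insert fresh rows for the appended values rather
    # than overwrite cells below the detected table. Also note that this local
    # helper builds its row from FIELD_NAMES, which does not appear to be
    # defined or imported in this script (only CLASSIFIER_FIELD_NAMES is), so
    # it would likely raise a NameError if it were actually called.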
+ insert_data_option = 'INSERT_ROWS' + + value_range_body = { + "range": range_, + "majorDimension": 'ROWS', + "values": [[str(param_dict[field_name]) for field_name in FIELD_NAMES ]] + } + + request = service.spreadsheets().values().append( + spreadsheetId=spreadsheet_id, + range=range_, + valueInputOption=value_input_option, + insertDataOption=insert_data_option, + body=value_range_body) + response = request_with_retry(request) + + +value_render_option = 'UNFORMATTED_VALUE' +date_time_render_option = 'FORMATTED_STRING' +GOOGLE_DEV_APP_NAME = 'l3embeddingexperiments' +GSHEET_ID = '1eyFv_jUWJeEuG1VKQNfAmhQUREMRDgEMxfVDffRaHDo' + +credentials = get_credentials(GOOGLE_DEV_APP_NAME) +service = discovery.build('sheets', 'v4', credentials=credentials) + +def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model=None): + range_ = '{}!A{}:AF'.format(sheet_name, first_row) + request = service.spreadsheets().values().get(spreadsheetId=GSHEET_ID, range=range_, valueRenderOption=value_render_option, dateTimeRenderOption=date_time_render_option) + response = request.execute() + + unique = Counter() + + if target_dataset == 'us8k': + limit = 2 + elif target_dataset == 'esc50': + limit = 5 + else: + limit = 1 + + data2 = {} + data = [] + for datum in response['values']: + if not datum: + continue + model_id = datum[1] + if target_dataset not in model_id: + continue + if datum[21] == '-': + continue + if l3_only and 'L3' not in model_id: + continue + + entry = {k: v for k,v in zip(CLASSIFIER_FIELD_NAMES, datum)} + + if 'L3' in model_id: + dataset, _, embedding_length, audioset_subset, embedding_model_type, \ + _, _, _, _ = model_id.split('/') + + if embedding_model_type == 'cnn_L3_orig': + embedding_model_type = '1_orig' + continue + elif embedding_model_type == 'cnn_L3_kapredbinputbn': + embedding_model_type = '2_norm' + elif embedding_model_type == 'cnn_L3_melspec1': + embedding_model_type = '3_mel1' + elif embedding_model_type == 'cnn_L3_melspec2': + embedding_model_type = '4_mel2' + else: + dataset, embedding_model_type, _, _, _, _ = model_id.split('/') + audioset_subset = 'na' + embedding_length = 'na' + + + fold_num = datum[5] + ident = (fold_num, dataset, embedding_length, audioset_subset, embedding_model_type) + + + if not l3_only and 'L3' in model_id and l3_model and ident[2:] != l3_model: + continue + + entry['dataset'] = dataset + entry['embedding_length'] = embedding_length + entry['audioset_subset'] = audioset_subset + entry['embedding_model_type'] = embedding_model_type + + if unique[ident] < limit: + unique[ident] += 1 + data.append(entry) + data2[ident] = entry['test_acc'] + + for ident, count in unique.items(): + if target_dataset == 'us8k': + if count != 2: + print('Missing trials for {}: {}'.format(ident, count)) + elif target_dataset == 'esc50': + if count != 5: + print('Missing trials for {}: {}'.format(ident, count)) + else: + if count != 1: + print('Missing trials for {}: {}'.format(ident, count)) + + return data, data2 + + +def compute_stat_test(data2): + keys = data2.keys() + + models = set() + folds = set() + a = {} + + for k in keys: + ident = k[1:] + fold = k[0] + models.add(ident) + folds.add(fold) + + if ident not in a: + a[ident] = {} + + a[ident][fold] = data2[k] + + folds = list(sorted(list(folds))) + models = list(models) + + num_folds = len(folds) + + measurements = {} + + for k1, k2 in itertools.product(models, models): + if k1 == k2: + continue + + k1_acc = [] + k2_acc = [] + for fold in range(1, num_folds+1): + k1_acc.append(a[k1][fold]) + 
k2_acc.append(a[k2][fold]) + + try: + T, p = scipy.stats.wilcoxon(k1_acc, k2_acc) + except: + import pdb + pdb.set_trace() + + measurements[(k1, k2)] = {'T': T, 'p': p} + return measurements + +us8k_data, us8k_data2 = get_entries('us8k', 'classifier', first_row=3) +esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3) +dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3) + +us8k_stat = compute_stat_test(us8k_data2) +esc50_stat = compute_stat_test(esc50_data2) + +print("US8K-L3") +pprint(us8k_stat) +print("ESC50-L3") +pprint(esc50_stat) + +font = {'size' : 18} +matplotlib.rc('font', **font) + +fig, axarr = plt.subplots(3, figsize=(10, 13), sharex=True) + +tick_labels = [] +for model_name in ["L", "M128", "M256"]: + for width in ["6K", "512"]: + for subset in ["Env", "Mus"]: + tick_labels.append("{}/{}/{}".format(model_name, width, subset)) + +for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.02, 0.02, 0.01]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], figsize=(12,14), showmeans=True, ax=ax) + + ax.set_xticklabels(tick_labels, ha='right') + ax.xaxis.set_tick_params(rotation=45) + start, end = ax.get_ylim() + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + ax.set_ylabel('Classification accuracy') + fig.suptitle("") + +# Set colors + pal = sns.color_palette("Paired", n_colors=10) + colors = [] + for c in pal[:8]: + colors.append(c) + colors.append(c) +#colors.extend(pal[-2:]) + children = ax.get_children() + for idx in range(12): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot.png') + + + + + + + + + + +l3_model = ('original', 'music', '4_mel2') +us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) +esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3, l3_only=False, l3_model=l3_model) +dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3, l3_only=False, l3_model=l3_model) + +us8k_stat = compute_stat_test(us8k_data2) +print("US8K-AllEmbeddings") +pprint(us8k_stat) + +esc50_stat = compute_stat_test(esc50_data2) +print("ESC50-AllEmbeddings") +pprint(esc50_stat) + + +""" +font = {'size' : 8} +matplotlib.rc('font', **font) + +fig, axarr = plt.subplots(1, 3, figsize=(6, 5), sharey=True) + +for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.05, 0.05, 0.01]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], showmeans=True, ax=ax) + ax.set_xticklabels(["L3-M256/6K/Mus", "SoundNet", "VGGish"], ha='right') + ax.xaxis.set_tick_params(rotation=45) + start, end = ax.get_ylim() + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + fig.suptitle("") + + +# Set colors + colors = sns.color_palette(None, n_colors=3) +#colors.extend(pal[-2:]) + children = ax.get_children() + 
for idx in range(3): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + +axarr[0].set_ylabel('Classification accuracy') + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_2.png') +""" + + + + + + + + + + + + + + + +font = {'size' : 8} +matplotlib.rc('font', **font) + +#fig, axarr = plt.subplots(3, figsize=(3, 6), sharex=True) +fig, axarr = plt.subplots(1, 3, figsize=(6, 4)) + +for data, ax, dataset_name, tick_interval, (start, end) in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.05, 0.05, 0.02], [(0.55, 0.9), (0.40, 0.85), (0.80, 1.)]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], showmeans=True, ax=ax) + ax.set_xticklabels(["L3-M256/6K/Mus", "SoundNet", "VGGish"], ha='right') + ax.xaxis.set_tick_params(rotation=45) + #tick_interval = 0.05 + #start, end = 0.4, 1.0 + ax.set_ylim(start, end) + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + #ax.set_ylabel('Classification accuracy') + + +# Set colors + colors = sns.color_palette(None, n_colors=3) +#colors.extend(pal[-2:]) + children = ax.get_children() + for idx in range(3): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + +axarr[0].set_ylabel('Classification accuracy') +fig.suptitle("") + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_2.png') + + + + + + + + + + + + + +l3_model = ('original', 'music', '4_mel2') +us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) +us8k_aug_data, us8k_aug_data2, = get_entries('us8k', 'classifier_augmented', first_row=3, l3_only=False, l3_model=l3_model) + +[x.update({'augmented': "false"}) for x in us8k_data] +[x.update({'augmented': "true"}) for x in us8k_aug_data] + +us8k_data2 = {k:v for k,v in us8k_data2.items() if k[1:] == ('us8k', 'original', 'music', '4_mel2')} +us8k_data += us8k_aug_data +us8k_data2.update({k + ('augment',): v for k, v in us8k_aug_data2.items()}) + +us8k_stat = compute_stat_test(us8k_data2) +print("Augmented Trials") +pprint(us8k_stat) + + +font = {'size' : 8} +matplotlib.rc('font', **font) + +fig = plt.figure(figsize=(4, 1.5)) + +df = pandas.DataFrame(us8k_data) +ax = df.boxplot(column='test_acc', by=['augmented'], showmeans=True, ax=fig.gca()) +ax.set_xticklabels(["L3-M256/6K/Mus", "L3-M256/6K/Mus-Aug"]) +start, end = ax.get_ylim() +ax.yaxis.set_ticks(np.arange(start, end, 0.04)) +ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) +ax.set_title("") +ax.set_xlabel('') +ax.set_ylabel('Classification accuracy') +fig.suptitle("") + + +# Set colors +colors = sns.color_palette(None, n_colors=3) +#colors.extend(pal[-2:]) +children = ax.get_children() +for idx in range(2): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_3.png') From 334991159dfecf5e4dab031b55698f4c8683ac1d Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 12 Jul 2018 14:22:43 
-0400 Subject: [PATCH 196/201] Add plotting and significance test script --- compute_us8k_boxplot.py | 575 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 575 insertions(+) create mode 100644 compute_us8k_boxplot.py diff --git a/compute_us8k_boxplot.py b/compute_us8k_boxplot.py new file mode 100644 index 0000000..9d1c894 --- /dev/null +++ b/compute_us8k_boxplot.py @@ -0,0 +1,575 @@ +from gsheets import get_credentials, get_row, append_row, update_experiment, request_with_retry +from googleapiclient import discovery +from collections import Counter +import pickle as pk +import os +import numpy as np +import scipy.stats +import shutil +import json +import time +import pandas +import itertools +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import seaborn as sns +from pprint import pprint + +CLASSIFIER_FIELD_NAMES = [ + 'username', + 'model_id', + 'model_dir', + 'git_commit', + 'features_dir', + 'fold_num', + 'parameter_search', + 'parameter_search_valid_fold', + 'parameter_search_valid_ratio', + 'parameter_search_train_with_valid', + 'model_type', + 'feature_mode', + 'train_batch_size', + 'non_overlap', + 'random_state', + 'num_epochs', + 'learning_rate', + 'weight_decay', + 'C', + 'tol', + 'max_iterations', + 'train_loss', + 'valid_loss', + 'train_acc', + 'valid_acc', + 'train_avg_class_acc', + 'valid_avg_class_acc', + 'train_class_acc', + 'valid_class_acc', + 'test_acc', + 'test_avg_class_acc', + 'test_class_acc', +] + +def append_row(service, spreadsheet_id, param_dict, sheet_name): + # The A1 notation of a range to search for a logical table of data. + # Values will be appended after the last row of the table. + range_ = '{}!A1:A{}'.format(sheet_name, len(FIELD_NAMES)) + # How the input data should be interpreted. + value_input_option = 'USER_ENTERED' + # How the input data should be inserted. 
+ insert_data_option = 'INSERT_ROWS' + + value_range_body = { + "range": range_, + "majorDimension": 'ROWS', + "values": [[str(param_dict[field_name]) for field_name in FIELD_NAMES ]] + } + + request = service.spreadsheets().values().append( + spreadsheetId=spreadsheet_id, + range=range_, + valueInputOption=value_input_option, + insertDataOption=insert_data_option, + body=value_range_body) + response = request_with_retry(request) + + +value_render_option = 'UNFORMATTED_VALUE' +date_time_render_option = 'FORMATTED_STRING' +GOOGLE_DEV_APP_NAME = 'l3embeddingexperiments' +GSHEET_ID = '1eyFv_jUWJeEuG1VKQNfAmhQUREMRDgEMxfVDffRaHDo' + +credentials = get_credentials(GOOGLE_DEV_APP_NAME) +service = discovery.build('sheets', 'v4', credentials=credentials) + +def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model=None): + range_ = '{}!A{}:AF'.format(sheet_name, first_row) + request = service.spreadsheets().values().get(spreadsheetId=GSHEET_ID, range=range_, valueRenderOption=value_render_option, dateTimeRenderOption=date_time_render_option) + response = request.execute() + + unique = Counter() + + if target_dataset == 'us8k': + limit = 2 + elif target_dataset == 'esc50': + limit = 5 + else: + limit = 1 + + data2 = {} + data = [] + for datum in response['values']: + if not datum: + continue + model_id = datum[1] + if target_dataset not in model_id: + continue + if datum[21] == '-': + continue + if l3_only and 'L3' not in model_id: + continue + + entry = {k: v for k,v in zip(CLASSIFIER_FIELD_NAMES, datum)} + + if 'L3' in model_id: + dataset, _, embedding_length, audioset_subset, embedding_model_type, \ + _, _, _, _ = model_id.split('/') + + if embedding_model_type == 'cnn_L3_orig': + embedding_model_type = '1_orig' + continue + elif embedding_model_type == 'cnn_L3_kapredbinputbn': + embedding_model_type = '2_norm' + elif embedding_model_type == 'cnn_L3_melspec1': + embedding_model_type = '3_mel1' + elif embedding_model_type == 'cnn_L3_melspec2': + embedding_model_type = '4_mel2' + else: + dataset, embedding_model_type, _, _, _, _ = model_id.split('/') + audioset_subset = 'na' + embedding_length = 'na' + + + fold_num = datum[5] + ident = (fold_num, dataset, embedding_length, audioset_subset, embedding_model_type) + + + if not l3_only and 'L3' in model_id and l3_model and ident[2:] != l3_model: + continue + + entry['dataset'] = dataset + entry['embedding_length'] = embedding_length + entry['audioset_subset'] = audioset_subset + entry['embedding_model_type'] = embedding_model_type + + if unique[ident] < limit: + unique[ident] += 1 + data.append(entry) + data2[ident] = entry['test_acc'] + + for ident, count in unique.items(): + if target_dataset == 'us8k': + if count != 2: + print('Missing trials for {}: {}'.format(ident, count)) + elif target_dataset == 'esc50': + if count != 5: + print('Missing trials for {}: {}'.format(ident, count)) + else: + if count != 1: + print('Missing trials for {}: {}'.format(ident, count)) + + return data, data2 + +pretty_print_map = { + 'us8k': 'US8K', + 'esc50': 'ESC50', + 'original': '6K', + 'short': '512', + '2_norm': 'Linear', + '3_mel1': 'Mel-128', + '4_mel2': 'Mel-256', + 'augment': 'Augmented', + 'music': 'Music', + 'environmental': 'Environmental' +} + +def get_dataset(ident): + if 'us8k' in ident: + dataset = 'US8K' + elif 'esc50' in ident: + dataset = 'ESC50' + elif 'dcase2013' in ident: + dataset = 'DCASE2013' + else: + dataset = None + return dataset + + +def get_embedding_size(ident): + if 'original' in ident: + emb_size = '6K' + elif 
'short' in ident: + emb_size = '512' + else: + emb_size = None + + return emb_size + +def get_subset(ident): + if 'music' in ident: + subset = 'Music' + elif 'environmental' in ident: + subset = 'Environmental' + else: + subset = None + return subset + +def get_tfrepr(ident): + if '2_norm' in ident: + tfrepr = 'Linear' + elif '3_mel1' in ident: + tfrepr = 'Mel-128' + elif '4_mel2' in ident: + tfrepr = 'Mel-256' + elif 'vggish' in ident: + tfrepr = "VGGish" + elif 'soundnet' in ident: + tfrepr = 'SoundNet' + else: + tfrepr = None + + return tfrepr + + +def get_print_parts(ident1, ident2): + intersection = set(ident1[1:]) & set(ident2[1:]) + + + intersection_tuple = [] + int_emb_size = get_embedding_size(intersection) + int_subset = get_subset(intersection) + int_tfrepr = get_tfrepr(intersection) + if int_emb_size: + intersection_tuple.append(int_emb_size) + if int_subset: + intersection_tuple.append(int_subset) + if int_tfrepr: + intersection_tuple.append(int_tfrepr) + + model1_list = [] + model1 = set(ident1) - set(ident2) + mod1_emb_size = get_embedding_size(model1) + mod1_subset = get_subset(model1) + mod1_tfrepr = get_tfrepr(model1) + mod1_augmented = 'Augmented' if 'augmented' in model1 else None + if mod1_emb_size: + model1_list.append(mod1_emb_size) + if mod1_subset: + model1_list.append(mod1_subset) + if mod1_tfrepr: + model1_list.append(mod1_tfrepr) + if mod1_augmented: + model1_list.append(mod1_augmented) + + model2_list = [] + model2 = set(ident2) - set(ident1) + mod2_emb_size = get_embedding_size(model2) + mod2_subset = get_subset(model2) + mod2_tfrepr = get_tfrepr(model2) + mod2_augmented = 'Augmented' if 'augmented' in model2 else None + if mod2_emb_size: + model2_list.append(mod2_emb_size) + if mod2_subset: + model2_list.append(mod2_subset) + if mod2_tfrepr: + model2_list.append(mod2_tfrepr) + if mod2_augmented: + model2_list.append(mod2_augmented) + + + return tuple(intersection_tuple), model1_list, model2_list + + +def compute_stat_test(data2, desc, dcase=False): + keys = data2.keys() + + models = set() + folds = set() + a = {} + + for k in keys: + ident = k[1:] + fold = k[0] + models.add(ident) + folds.add(fold) + + if ident not in a: + a[ident] = {} + + a[ident][fold] = data2[k] + + folds = list(sorted(list(folds))) + models = list(models) + + num_folds = len(folds) + + dataset = get_dataset(models[0]) + + + print("{} - {}".format(dataset, desc)) + measurements = set() + + for k1, k2 in itertools.product(models, models): + if k1 == k2: + continue + + pair = tuple(sorted((k1, k2))) + if pair in measurements: + continue + + k1, k2 = pair + k1_acc = [] + k2_acc = [] + if not dcase: + for fold in range(1, num_folds+1): + k1_acc.append(a[k1][fold]) + k2_acc.append(a[k2][fold]) + else: + k1_acc.append(a[k1][2]) + k2_acc.append(a[k2][2]) + T, p = scipy.stats.wilcoxon(k1_acc, k2_acc) + + if p < 0.05: + int_tuple, mod1_list, mod2_list = get_print_parts(*pair) + + k1_mean = np.mean(k1_acc) + k2_mean = np.mean(k2_acc) + diff = k1_mean - k2_mean + + if diff < 0: + tmp = mod2_list + mod2_list = mod1_list + mod1_list = tmp + + tmp = k2_mean + k2_mean = k1_mean + k1_mean = tmp + + print("({}): [{}] better than [{}] by {} - {} = {} ~~ T={}, p={}".format( + " + ".join(int_tuple), + " + ".join(mod1_list), + " + ".join(mod2_list), + k1_mean, k2_mean, np.abs(diff), T, p + )) + measurements.add(pair) + print("\n\n") + + #measurements[pair] = {'T': T, 'p': p} + #return measurements + +us8k_data, us8k_data2 = get_entries('us8k', 'classifier', first_row=3) +esc50_data, esc50_data2 = 
get_entries('esc50', 'ESC50', first_row=3) +dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3) + +compute_stat_test(us8k_data2, "L3") +compute_stat_test(esc50_data2, "L3") +#compute_stat_test(dcase2013_data2, "L3") + +font = {'size' : 18} +matplotlib.rc('font', **font) + +fig, axarr = plt.subplots(3, figsize=(10, 13), sharex=True) + +tick_labels = [] +for model_name in ["L", "M128", "M256"]: + for width in ["6K", "512"]: + for subset in ["Env", "Mus"]: + tick_labels.append("{}/{}/{}".format(model_name, width, subset)) + +for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.02, 0.02, 0.01]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], figsize=(12,14), showmeans=True, ax=ax) + + ax.set_xticklabels(tick_labels, ha='right') + ax.xaxis.set_tick_params(rotation=45) + start, end = ax.get_ylim() + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + ax.set_ylabel('Classification accuracy') + fig.suptitle("") + +# Set colors + pal = sns.color_palette("Paired", n_colors=10) + colors = [] + for c in pal[:8]: + colors.append(c) + colors.append(c) +#colors.extend(pal[-2:]) + children = ax.get_children() + for idx in range(12): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot.png') + + + + + + + + + + +l3_model = ('original', 'music', '4_mel2') +us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) +esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3, l3_only=False, l3_model=l3_model) +dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3, l3_only=False, l3_model=l3_model) + +compute_stat_test(us8k_data2, "All Embeddings") +compute_stat_test(esc50_data2, "All Embeddings") +compute_stat_test(dcase2013_data2, "All Embeddings", dcase=True) + + +""" +font = {'size' : 8} +matplotlib.rc('font', **font) + +fig, axarr = plt.subplots(1, 3, figsize=(6, 5), sharey=True) + +for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.05, 0.05, 0.01]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], showmeans=True, ax=ax) + ax.set_xticklabels(["L3-M256/6K/Mus", "SoundNet", "VGGish"], ha='right') + ax.xaxis.set_tick_params(rotation=45) + start, end = ax.get_ylim() + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + fig.suptitle("") + + +# Set colors + colors = sns.color_palette(None, n_colors=3) +#colors.extend(pal[-2:]) + children = ax.get_children() + for idx in range(3): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + +axarr[0].set_ylabel('Classification accuracy') + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_2.png') +""" + + + + + + + + + + + + 
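# NOTE (editorial sketch, not part of the original patch): compute_stat_test()
# above pairs the per-fold test accuracies of two embedding variants and runs
# the Wilcoxon signed-rank test on them. A minimal, self-contained version of
# the same idea, with made-up accuracies for ten folds (the numbers are
# illustrative only, not results from this repository):
import scipy.stats

_fold_acc_a = [0.78, 0.74, 0.71, 0.76, 0.73, 0.77, 0.72, 0.75, 0.70, 0.74]
_fold_acc_b = [0.75, 0.73, 0.69, 0.74, 0.70, 0.76, 0.70, 0.72, 0.68, 0.71]

# wilcoxon() is a paired test, so the two lists must line up fold-for-fold.
_T, _p = scipy.stats.wilcoxon(_fold_acc_a, _fold_acc_b)
if _p < 0.05:
    print('example comparison is significant: T={}, p={}'.format(_T, _p))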
+ + + +font = {'size' : 8} +matplotlib.rc('font', **font) + +#fig, axarr = plt.subplots(3, figsize=(3, 6), sharex=True) +fig, axarr = plt.subplots(1, 3, figsize=(6, 4)) + +for data, ax, dataset_name, tick_interval, (start, end) in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.05, 0.05, 0.02], [(0.55, 0.9), (0.40, 0.85), (0.80, 1.)]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], showmeans=True, ax=ax) + ax.set_xticklabels(["L3-M256/6K/Mus", "SoundNet", "VGGish"], ha='right') + ax.xaxis.set_tick_params(rotation=45) + #tick_interval = 0.05 + #start, end = 0.4, 1.0 + ax.set_ylim(start, end) + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + #ax.set_ylabel('Classification accuracy') + + +# Set colors + colors = sns.color_palette(None, n_colors=3) +#colors.extend(pal[-2:]) + children = ax.get_children() + for idx in range(3): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + +axarr[0].set_ylabel('Classification accuracy') +fig.suptitle("") + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_2.png') + + + + + + + + + + + + + +l3_model = ('original', 'music', '4_mel2') +us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) +us8k_aug_data, us8k_aug_data2, = get_entries('us8k', 'classifier_augmented', first_row=3, l3_only=False, l3_model=l3_model) + +[x.update({'augmented': "false"}) for x in us8k_data] +[x.update({'augmented': "true"}) for x in us8k_aug_data] + +us8k_data2 = {k:v for k,v in us8k_data2.items() if k[1:] == ('us8k', 'original', 'music', '4_mel2')} +us8k_data += us8k_aug_data +us8k_data2.update({k + ('augment',): v for k, v in us8k_aug_data2.items()}) + +compute_stat_test(us8k_data2, "Augmented Trials") + + +font = {'size' : 8} +matplotlib.rc('font', **font) + +fig = plt.figure(figsize=(4, 1.5)) + +df = pandas.DataFrame(us8k_data) +ax = df.boxplot(column='test_acc', by=['augmented'], showmeans=True, ax=fig.gca()) +ax.set_xticklabels(["L3-M256/6K/Mus", "L3-M256/6K/Mus-Aug"]) +start, end = ax.get_ylim() +ax.yaxis.set_ticks(np.arange(start, end, 0.04)) +ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) +ax.set_title("") +ax.set_xlabel('') +ax.set_ylabel('Classification accuracy') +fig.suptitle("") + + +# Set colors +colors = sns.color_palette(None, n_colors=3) +#colors.extend(pal[-2:]) +children = ax.get_children() +for idx in range(2): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) + + +# remove legend() +#ax.legend().set_visible(False) + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_3.png') From 92c5f71023c55914e5e11340ae2254af1bfdf614 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Thu, 12 Jul 2018 14:38:15 -0400 Subject: [PATCH 197/201] Rename plotting and sig test script to be more helpful --- compute_us8k_boxplot.py => generate_plots_and_sig_tests.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename compute_us8k_boxplot.py => generate_plots_and_sig_tests.py (100%) diff --git a/compute_us8k_boxplot.py b/generate_plots_and_sig_tests.py similarity index 100% rename from compute_us8k_boxplot.py 
rename to generate_plots_and_sig_tests.py From 0f811d1d0af687ce0f961e7958bd6839111e0584 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 17 Jul 2018 14:27:29 -0400 Subject: [PATCH 198/201] Add some fixes to the significance tests and plots --- generate_plots_and_sig_tests.py | 153 ++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 56 deletions(-) diff --git a/generate_plots_and_sig_tests.py b/generate_plots_and_sig_tests.py index 9d1c894..0054e94 100644 --- a/generate_plots_and_sig_tests.py +++ b/generate_plots_and_sig_tests.py @@ -147,7 +147,9 @@ def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model= if unique[ident] < limit: unique[ident] += 1 data.append(entry) - data2[ident] = entry['test_acc'] + if ident not in data2: + data2[ident] = [] + data2[ident].append(entry['test_acc']) for ident, count in unique.items(): if target_dataset == 'us8k': @@ -160,6 +162,7 @@ def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model= if count != 1: print('Missing trials for {}: {}'.format(ident, count)) + return data, data2 pretty_print_map = { @@ -272,15 +275,30 @@ def get_print_parts(ident1, ident2): return tuple(intersection_tuple), model1_list, model2_list -def compute_stat_test(data2, desc, dcase=False): +def compute_stat_test(data2, desc, var=None, dcase=False): keys = data2.keys() models = set() folds = set() a = {} + if var == 'dataset': + ident_idx = 1 + elif var == 'embedding_length': + ident_idx = 2 + elif var == 'audioset_subset': + ident_idx = 3 + elif var == 'embedding_model_type': + ident_idx = 4 + + if var: + print(var) + for k in keys: - ident = k[1:] + if var: + ident = (k[ident_idx],) + else: + ident = k[1:] fold = k[0] models.add(ident) folds.add(fold) @@ -288,15 +306,20 @@ def compute_stat_test(data2, desc, dcase=False): if ident not in a: a[ident] = {} - a[ident][fold] = data2[k] + if fold not in a[ident]: + a[ident][fold] = [] + + a[ident][fold] += data2[k] folds = list(sorted(list(folds))) - models = list(models) + models = list(set(models)) num_folds = len(folds) - dataset = get_dataset(models[0]) - + if var: + dataset = get_dataset(k[1:]) + else: + dataset = get_dataset(models[0]) print("{} - {}".format(dataset, desc)) measurements = set() @@ -314,11 +337,11 @@ def compute_stat_test(data2, desc, dcase=False): k2_acc = [] if not dcase: for fold in range(1, num_folds+1): - k1_acc.append(a[k1][fold]) - k2_acc.append(a[k2][fold]) + k1_acc += a[k1][fold] + k2_acc += a[k2][fold] else: - k1_acc.append(a[k1][2]) - k2_acc.append(a[k2][2]) + k1_acc += a[k1][2] + k2_acc += a[k2][2] T, p = scipy.stats.wilcoxon(k1_acc, k2_acc) if p < 0.05: @@ -353,55 +376,72 @@ def compute_stat_test(data2, desc, dcase=False): esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3) dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3) -compute_stat_test(us8k_data2, "L3") -compute_stat_test(esc50_data2, "L3") +compute_stat_test(us8k_data2, "L3", 'embedding_model_type') +compute_stat_test(us8k_data2, "L3", 'embedding_length') +compute_stat_test(us8k_data2, "L3", 'audioset_subset') +compute_stat_test(esc50_data2, "L3", 'embedding_model_type') +compute_stat_test(esc50_data2, "L3", 'embedding_length') +compute_stat_test(esc50_data2, "L3", 'audioset_subset') #compute_stat_test(dcase2013_data2, "L3") -font = {'size' : 18} -matplotlib.rc('font', **font) - -fig, axarr = plt.subplots(3, figsize=(10, 13), sharex=True) - -tick_labels = [] -for model_name in ["L", "M128", "M256"]: - for width in ["6K", 
"512"]: - for subset in ["Env", "Mus"]: - tick_labels.append("{}/{}/{}".format(model_name, width, subset)) - -for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.02, 0.02, 0.01]): - df = pandas.DataFrame(data) - ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], figsize=(12,14), showmeans=True, ax=ax) - - ax.set_xticklabels(tick_labels, ha='right') - ax.xaxis.set_tick_params(rotation=45) - start, end = ax.get_ylim() - ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) - ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) - ax.set_title(dataset_name) - ax.set_xlabel('') - ax.set_ylabel('Classification accuracy') - fig.suptitle("") - -# Set colors - pal = sns.color_palette("Paired", n_colors=10) - colors = [] - for c in pal[:8]: - colors.append(c) - colors.append(c) -#colors.extend(pal[-2:]) - children = ax.get_children() - for idx in range(12): - box = children[idx*8] - xs = box.get_xdata() - ys = box.get_ydata() - ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) +for var in ['embedding_model_type', 'embedding_length', 'audioset_subset']: + font = {'size' : 18} + matplotlib.rc('font', **font) + + fig, axarr = plt.subplots(3, figsize=(7, 10), sharex=True) + + tick_labels = [] + """ + for model_name in ["L", "M128", "M256"]: + for width in ["6K", "512"]: + for subset in ["Env", "Mus"]: + tick_labels.append("{}/{}/{}".format(model_name, width, subset)) + """ + if var == 'embedding_model_type': + tick_labels = ["Linear", "Mel-128", "Mel-256"] + elif var == 'embedding_length': + tick_labels = ["6144", "512"] + else: + tick_labels = ["Env.", "Music"] + + for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.03, 0.02, 0.01]): + df = pandas.DataFrame(data) + ax = df.boxplot(column='test_acc', by=var, figsize=(7,9), showmeans=True, ax=ax) + + ax.set_xticklabels(tick_labels, ha='right') + ax.xaxis.set_tick_params(rotation=45) + start, end = ax.get_ylim() + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + ax.set_ylabel('Classification accuracy') + fig.suptitle("") + + # Set colors + pal = sns.color_palette("Set2", n_colors=10) + colors = [] + for c in pal[:8]: + colors.append(c) + #colors.append(c) + #colors.extend(pal[-2:]) + children = ax.get_children() + if var == 'embedding_model_type': + num_boxes = 3 + else: + num_boxes = 2 + for idx in range(num_boxes): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[idx] + (0.5,)) -# remove legend() -#ax.legend().set_visible(False) + # remove legend() + #ax.legend().set_visible(False) -fig.tight_layout() -fig.savefig('us8k_test_boxplot.png') + fig.tight_layout() + fig.savefig('us8k_test_boxplot_{}.png'.format(var)) @@ -413,7 +453,7 @@ def compute_stat_test(data2, desc, dcase=False): l3_model = ('original', 'music', '4_mel2') -us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) +us8k_data, us8k_data2 = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3, l3_only=False, l3_model=l3_model) dcase2013_data, dcase2013_data2 = 
get_entries('dcase2013', 'DCASE', first_row=3, l3_only=False, l3_model=l3_model) @@ -534,6 +574,7 @@ def compute_stat_test(data2, desc, dcase=False): [x.update({'augmented': "true"}) for x in us8k_aug_data] us8k_data2 = {k:v for k,v in us8k_data2.items() if k[1:] == ('us8k', 'original', 'music', '4_mel2')} +us8k_data = [x for x in us8k_data if 'l3' in x['model_id']] us8k_data += us8k_aug_data us8k_data2.update({k + ('augment',): v for k, v in us8k_aug_data2.items()}) From eab2a8aa6c2d8d72e7b3cdf22a29e23cc8db1fbf Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Wed, 18 Jul 2018 14:15:40 -0400 Subject: [PATCH 199/201] Add epoch experiment plots to plotting script --- generate_plots_and_sig_tests.py | 62 +++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/generate_plots_and_sig_tests.py b/generate_plots_and_sig_tests.py index 0054e94..d20efcf 100644 --- a/generate_plots_and_sig_tests.py +++ b/generate_plots_and_sig_tests.py @@ -452,7 +452,7 @@ def compute_stat_test(data2, desc, var=None, dcase=False): -l3_model = ('original', 'music', '4_mel2') +l3_model = ('original', 'music', '3_mel1') us8k_data, us8k_data2 = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3, l3_only=False, l3_model=l3_model) dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3, l3_only=False, l3_model=l3_model) @@ -523,7 +523,7 @@ def compute_stat_test(data2, desc, var=None, dcase=False): for data, ax, dataset_name, tick_interval, (start, end) in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.05, 0.05, 0.02], [(0.55, 0.9), (0.40, 0.85), (0.80, 1.)]): df = pandas.DataFrame(data) ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], showmeans=True, ax=ax) - ax.set_xticklabels(["L3-M256/6K/Mus", "SoundNet", "VGGish"], ha='right') + ax.set_xticklabels(["L3-M128/6K/Mus", "SoundNet", "VGGish"], ha='right') ax.xaxis.set_tick_params(rotation=45) #tick_interval = 0.05 #start, end = 0.4, 1.0 @@ -614,3 +614,61 @@ def compute_stat_test(data2, desc, var=None, dcase=False): fig.tight_layout() fig.savefig('us8k_test_boxplot_3.png') + + + + + + + + + + + + + + + + + + + + + + + +us8k_epoch_df = pandas.read_csv('epoch_us8k.csv') +esc50_epoch_df = pandas.read_csv('epoch_esc50.csv') + +tick_labels = ['0.26M', '2.62M', '5.24M', '13.11M', '26.21M', '39.32M', '52.43M', '65.54M', '78.64M'] + +font = {'size' : 8} +matplotlib.rc('font', **font) + +fig, axarr = plt.subplots(2, 1, figsize=(5, 5), sharex=True) + +for df, ax, dataset_name, tick_interval, (start, end) in zip([us8k_epoch_df, esc50_epoch_df], axarr, ['UrbanSound8K', 'ESC-50'], [0.05, 0.05], [(0.55, 0.9), (0.55, 0.9)]): + ax = df.boxplot(column='test_acc', by='# of samples', showmeans=True, ax=ax) + ax.set_xticklabels(tick_labels, ha='right') + ax.xaxis.set_tick_params(rotation=45) + ax.set_ylim(start, end) + ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title(dataset_name) + ax.set_xlabel('') + ax.set_ylabel('Classification accuracy') + + # Set colors + colors = sns.color_palette(None, n_colors=1) + #colors.extend(pal[-2:]) + children = ax.get_children() + for idx in range(len(tick_labels)): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], ys[2], color=colors[0] + (0.5,)) + 
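        # NOTE (editorial comment, not part of the original patch): the
        # children[idx*8] stride here and in the earlier plots appears to rely
        # on each box drawn by ax.boxplot(..., showmeans=True) adding eight
        # Line2D artists (box outline, two whiskers, two caps, median, mean
        # marker, fliers); stepping through ax.get_children() in eights then
        # picks the box outline, whose y-data spans the quartiles, so
        # fill_between shades the interquartile box.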
+fig.suptitle("") + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_4.png') From 2f3352ccb1a6ddbce290bcd1a55cc360d41caad1 Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 28 Aug 2018 20:54:59 -0400 Subject: [PATCH 200/201] Update plot generation script --- generate_plots_and_sig_tests.py | 289 ++++++++++++++++++++++++++------ 1 file changed, 238 insertions(+), 51 deletions(-) diff --git a/generate_plots_and_sig_tests.py b/generate_plots_and_sig_tests.py index d20efcf..25e7faf 100644 --- a/generate_plots_and_sig_tests.py +++ b/generate_plots_and_sig_tests.py @@ -13,6 +13,7 @@ import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt +import matplotlib.patches as mpatches import matplotlib.ticker as ticker import seaborn as sns from pprint import pprint @@ -40,7 +41,6 @@ 'tol', 'max_iterations', 'train_loss', - 'valid_loss', 'train_acc', 'valid_acc', 'train_avg_class_acc', @@ -52,10 +52,11 @@ 'test_class_acc', ] +""" def append_row(service, spreadsheet_id, param_dict, sheet_name): # The A1 notation of a range to search for a logical table of data. # Values will be appended after the last row of the table. - range_ = '{}!A1:A{}'.format(sheet_name, len(FIELD_NAMES)) + range_ = '{}!A1:AF{}'.format(sheet_name, len(FIELD_NAMES)) # How the input data should be interpreted. value_input_option = 'USER_ENTERED' # How the input data should be inserted. @@ -74,6 +75,7 @@ def append_row(service, spreadsheet_id, param_dict, sheet_name): insertDataOption=insert_data_option, body=value_range_body) response = request_with_retry(request) +""" value_render_option = 'UNFORMATTED_VALUE' @@ -85,7 +87,7 @@ def append_row(service, spreadsheet_id, param_dict, sheet_name): service = discovery.build('sheets', 'v4', credentials=credentials) def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model=None): - range_ = '{}!A{}:AF'.format(sheet_name, first_row) + range_ = '{}!A{}:AF1561'.format(sheet_name, first_row) request = service.spreadsheets().values().get(spreadsheetId=GSHEET_ID, range=range_, valueRenderOption=value_render_option, dateTimeRenderOption=date_time_render_option) response = request.execute() @@ -111,12 +113,20 @@ def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model= if l3_only and 'L3' not in model_id: continue + if len(CLASSIFIER_FIELD_NAMES) != len(datum): + if len(datum) == len(CLASSIFIER_FIELD_NAMES) + 1: + datum.pop(22) + entry = {k: v for k,v in zip(CLASSIFIER_FIELD_NAMES, datum)} if 'L3' in model_id: dataset, _, embedding_length, audioset_subset, embedding_model_type, \ _, _, _, _ = model_id.split('/') + if embedding_length == 'short': + # Skip the short embedding length + continue + if embedding_model_type == 'cnn_L3_orig': embedding_model_type = '1_orig' continue @@ -129,18 +139,17 @@ def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model= else: dataset, embedding_model_type, _, _, _, _ = model_id.split('/') audioset_subset = 'na' - embedding_length = 'na' fold_num = datum[5] - ident = (fold_num, dataset, embedding_length, audioset_subset, embedding_model_type) + ident = (fold_num, dataset, audioset_subset, embedding_model_type) if not l3_only and 'L3' in model_id and l3_model and ident[2:] != l3_model: continue + entry['dataset'] = dataset - entry['embedding_length'] = embedding_length entry['audioset_subset'] = audioset_subset entry['embedding_model_type'] = embedding_model_type @@ -171,8 +180,8 @@ def get_entries(target_dataset, sheet_name, first_row=3, l3_only=True, l3_model= 'original': '6K', 
'short': '512', '2_norm': 'Linear', - '3_mel1': 'Mel-128', - '4_mel2': 'Mel-256', + '3_mel1': 'M128', + '4_mel2': 'M256', 'augment': 'Augmented', 'music': 'Music', 'environmental': 'Environmental' @@ -190,16 +199,6 @@ def get_dataset(ident): return dataset -def get_embedding_size(ident): - if 'original' in ident: - emb_size = '6K' - elif 'short' in ident: - emb_size = '512' - else: - emb_size = None - - return emb_size - def get_subset(ident): if 'music' in ident: subset = 'Music' @@ -213,9 +212,9 @@ def get_tfrepr(ident): if '2_norm' in ident: tfrepr = 'Linear' elif '3_mel1' in ident: - tfrepr = 'Mel-128' + tfrepr = 'M128' elif '4_mel2' in ident: - tfrepr = 'Mel-256' + tfrepr = 'M256' elif 'vggish' in ident: tfrepr = "VGGish" elif 'soundnet' in ident: @@ -231,11 +230,8 @@ def get_print_parts(ident1, ident2): intersection_tuple = [] - int_emb_size = get_embedding_size(intersection) int_subset = get_subset(intersection) int_tfrepr = get_tfrepr(intersection) - if int_emb_size: - intersection_tuple.append(int_emb_size) if int_subset: intersection_tuple.append(int_subset) if int_tfrepr: @@ -243,12 +239,9 @@ def get_print_parts(ident1, ident2): model1_list = [] model1 = set(ident1) - set(ident2) - mod1_emb_size = get_embedding_size(model1) mod1_subset = get_subset(model1) mod1_tfrepr = get_tfrepr(model1) mod1_augmented = 'Augmented' if 'augmented' in model1 else None - if mod1_emb_size: - model1_list.append(mod1_emb_size) if mod1_subset: model1_list.append(mod1_subset) if mod1_tfrepr: @@ -258,12 +251,9 @@ def get_print_parts(ident1, ident2): model2_list = [] model2 = set(ident2) - set(ident1) - mod2_emb_size = get_embedding_size(model2) mod2_subset = get_subset(model2) mod2_tfrepr = get_tfrepr(model2) mod2_augmented = 'Augmented' if 'augmented' in model2 else None - if mod2_emb_size: - model2_list.append(mod2_emb_size) if mod2_subset: model2_list.append(mod2_subset) if mod2_tfrepr: @@ -284,12 +274,10 @@ def compute_stat_test(data2, desc, var=None, dcase=False): if var == 'dataset': ident_idx = 1 - elif var == 'embedding_length': - ident_idx = 2 elif var == 'audioset_subset': - ident_idx = 3 + ident_idx = 2 elif var == 'embedding_model_type': - ident_idx = 4 + ident_idx = 3 if var: print(var) @@ -320,7 +308,6 @@ def compute_stat_test(data2, desc, var=None, dcase=False): dataset = get_dataset(k[1:]) else: dataset = get_dataset(models[0]) - print("{} - {}".format(dataset, desc)) measurements = set() @@ -377,18 +364,22 @@ def compute_stat_test(data2, desc, var=None, dcase=False): dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3) compute_stat_test(us8k_data2, "L3", 'embedding_model_type') -compute_stat_test(us8k_data2, "L3", 'embedding_length') compute_stat_test(us8k_data2, "L3", 'audioset_subset') compute_stat_test(esc50_data2, "L3", 'embedding_model_type') -compute_stat_test(esc50_data2, "L3", 'embedding_length') compute_stat_test(esc50_data2, "L3", 'audioset_subset') #compute_stat_test(dcase2013_data2, "L3") -for var in ['embedding_model_type', 'embedding_length', 'audioset_subset']: - font = {'size' : 18} +var_print_name = { + 'embedding_model_type': 'Input Representation', + 'audioset_subset': 'Embedding Training Data' +} + +#fig, axarr_arr = plt.subplots(2, 3, figsize=(8, 7)) +for var_idx, var in enumerate(['embedding_model_type', 'audioset_subset']): + font = {'size' : 10} matplotlib.rc('font', **font) - fig, axarr = plt.subplots(3, figsize=(7, 10), sharex=True) + fig, axarr = plt.subplots(1, 3, figsize=(8,4)) #axarr_arr[var_idx] tick_labels = [] """ @@ -398,23 
+389,27 @@ def compute_stat_test(data2, desc, var=None, dcase=False): tick_labels.append("{}/{}/{}".format(model_name, width, subset)) """ if var == 'embedding_model_type': - tick_labels = ["Linear", "Mel-128", "Mel-256"] - elif var == 'embedding_length': - tick_labels = ["6144", "512"] + tick_labels = ["Linear", "M128", "M256"] else: tick_labels = ["Env.", "Music"] for data, ax, dataset_name, tick_interval in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.03, 0.02, 0.01]): df = pandas.DataFrame(data) - ax = df.boxplot(column='test_acc', by=var, figsize=(7,9), showmeans=True, ax=ax) + ax = df.boxplot(column='test_acc', by=var, figsize=(7,9), showmeans=True, ax=ax, widths=0.65) ax.set_xticklabels(tick_labels, ha='right') ax.xaxis.set_tick_params(rotation=45) start, end = ax.get_ylim() ax.yaxis.set_ticks(np.arange(start, end, tick_interval)) ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + """ + if var_idx == 0: + ax.set_title(dataset_name) + else: + ax.set_title('') + """ ax.set_title(dataset_name) - ax.set_xlabel('') + ax.set_xlabel(var_print_name[var]) ax.set_ylabel('Classification accuracy') fig.suptitle("") @@ -441,7 +436,7 @@ def compute_stat_test(data2, desc, var=None, dcase=False): #ax.legend().set_visible(False) fig.tight_layout() - fig.savefig('us8k_test_boxplot_{}.png'.format(var)) + fig.savefig('us8k_test_boxplot_1_{}.png'.format(var)) @@ -452,7 +447,7 @@ def compute_stat_test(data2, desc, var=None, dcase=False): -l3_model = ('original', 'music', '3_mel1') +l3_model = ('music', '3_mel1') us8k_data, us8k_data2 = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) esc50_data, esc50_data2 = get_entries('esc50', 'ESC50', first_row=3, l3_only=False, l3_model=l3_model) dcase2013_data, dcase2013_data2 = get_entries('dcase2013', 'DCASE', first_row=3, l3_only=False, l3_model=l3_model) @@ -522,8 +517,8 @@ def compute_stat_test(data2, desc, var=None, dcase=False): for data, ax, dataset_name, tick_interval, (start, end) in zip([us8k_data, esc50_data, dcase2013_data], axarr, ['UrbanSound8K', 'ESC-50', 'DCASE 2013'], [0.05, 0.05, 0.02], [(0.55, 0.9), (0.40, 0.85), (0.80, 1.)]): df = pandas.DataFrame(data) - ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'embedding_length', 'audioset_subset'], showmeans=True, ax=ax) - ax.set_xticklabels(["L3-M128/6K/Mus", "SoundNet", "VGGish"], ha='right') + ax = df.boxplot(column='test_acc', by=['embedding_model_type', 'audioset_subset'], showmeans=True, ax=ax, widths=0.65) + ax.set_xticklabels(["L3-M128/Mus", "SoundNet", "VGGish"], ha='right') ax.xaxis.set_tick_params(rotation=45) #tick_interval = 0.05 #start, end = 0.4, 1.0 @@ -566,14 +561,14 @@ def compute_stat_test(data2, desc, var=None, dcase=False): -l3_model = ('original', 'music', '4_mel2') +l3_model = ('music', '3_mel1') us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) us8k_aug_data, us8k_aug_data2, = get_entries('us8k', 'classifier_augmented', first_row=3, l3_only=False, l3_model=l3_model) [x.update({'augmented': "false"}) for x in us8k_data] [x.update({'augmented': "true"}) for x in us8k_aug_data] -us8k_data2 = {k:v for k,v in us8k_data2.items() if k[1:] == ('us8k', 'original', 'music', '4_mel2')} +us8k_data2 = {k:v for k,v in us8k_data2.items() if k[1:] == ('us8k', 'music', '3_mel1')} us8k_data = [x for x in us8k_data if 'l3' in x['model_id']] us8k_data += us8k_aug_data us8k_data2.update({k + ('augment',): v for k, v in 
us8k_aug_data2.items()}) @@ -588,7 +583,7 @@ def compute_stat_test(data2, desc, var=None, dcase=False): df = pandas.DataFrame(us8k_data) ax = df.boxplot(column='test_acc', by=['augmented'], showmeans=True, ax=fig.gca()) -ax.set_xticklabels(["L3-M256/6K/Mus", "L3-M256/6K/Mus-Aug"]) +ax.set_xticklabels(["L3-M128/Mus", "L3-M128/Mus-Aug"]) start, end = ax.get_ylim() ax.yaxis.set_ticks(np.arange(start, end, 0.04)) ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) @@ -672,3 +667,195 @@ def compute_stat_test(data2, desc, var=None, dcase=False): fig.tight_layout() fig.savefig('us8k_test_boxplot_4.png') + + + + + + + + + + + + + + + + + + + + + + +us8k_data, us8k_data2 = get_entries('us8k', 'classifier', first_row=3) + +l3_model = ('music', '3_mel1') +us8k_data, us8k_data2, = get_entries('us8k', 'classifier', first_row=3, l3_only=False, l3_model=l3_model) +us8k_aug_data, us8k_aug_data2, = get_entries('us8k', 'classifier_augmented', first_row=3, l3_only=False, l3_model=l3_model) + +font = {'size' : 12} +matplotlib.rc('font', **font) +fig, axes = plt.subplots(ncols=10, figsize=(13, 5), sharey=True) +fig.subplots_adjust(wspace=0) +#repr_strs = ["Linear", "M128", "M256"] +#repr_ids = ["2_norm", "3_mel1", "4_mel2"] + +repr_strs = ["L3-M128/Mus", "L3-M128/Mus-Aug"] + +us8k_classes = [ + 'air conditioner', + 'car horn', + 'children playing', + 'dog bark', + 'drilling', + 'engine idling', + 'gun shot', + 'jackhammer', + 'siren', + 'street music' +] + +intervals = [ +0.1, +0.05, +0.02, +0.02, +0.05, +0.05, +0.05, +0.05, +0.05, +0.05 +] + + +for ax, class_idx, class_name in zip(axes, range(10), us8k_classes): + + """ + linear_acc_list = [] + m128_acc_list = [] + m256_acc_list = [] + + for repr_id, acc_list in zip(repr_ids, [linear_acc_list, m128_acc_list, m256_acc_list]): + for row in us8k_data: + if row['embedding_model_type'] == repr_id: + acc_list.append([float(x) for x in row['test_class_acc'].split(', ')][class_idx]) + + + ax.boxplot([linear_acc_list, m128_acc_list, m256_acc_list], showmeans=True, labels=repr_strs, widths=0.65) + ax.set_xticklabels(['' for _ in range(3)], ha='right') + ax.set_xlabel(class_name.replace(' ', '\n')) + ax.margins(0.05) # Optional + """ + + nonaug_acc_list = [] + aug_acc_list = [] + + for row in us8k_data: + nonaug_acc_list.append([float(x) for x in row['test_class_acc'].split(', ')][class_idx]) + + for row in us8k_aug_data: + aug_acc_list.append([float(x) for x in row['test_class_acc'].split(', ')][class_idx]) + + ax.boxplot([nonaug_acc_list, aug_acc_list], showmeans=True, labels=repr_strs, widths=0.65) + ax.set_xticklabels(['' for _ in range(2)], ha='right') + ax.set_xlabel(class_name.replace(' ', '\n')) + ax.margins(0.05) # Optional + + + ax.set_ylim(0, 1.0) + start, end = ax.get_ylim() + ax.yaxis.set_ticks(np.arange(start, end + 0.1, 0.1)) + """ + start, end = ax.get_ylim() + start = max(start, 0) + end = min(end, 1.0) + ax.set_ylim(start, end) + if end <= 1.0 and np.arange(start, end+intervals[class_idx], intervals[class_idx])[-1] <= 1.0: + end = end + intervals[class_idx] + ax.yaxis.set_ticks(np.arange(start, end, intervals[class_idx])) + """ + ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) + ax.set_title("") + ax.grid(True) + + if class_idx == 0: + ax.set_ylabel('Classification accuracy') + fig.suptitle("") + + + # Set colors + colors = sns.color_palette(None, n_colors=3) + children = ax.get_children() + for idx in range(2): + box = children[idx*8] + xs = box.get_xdata() + ys = box.get_ydata() + ax.fill_between((xs[0], xs[1]), ys[0], 
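# NOTE (editorial sketch, not part of the original patch): the fix above passes
# the audio sample rate explicitly because kapre's Melspectrogram layer builds
# its mel filterbank from `sr`; if the layer default differs from the rate of
# the audio actually fed to the model, the mel bands map to the wrong
# frequencies even though training still runs. A minimal illustration of the
# pattern (the sample rate and STFT parameters below are placeholders, not the
# repository's settings):
from keras.layers import Input
from kapre.time_frequency import Melspectrogram

_x = Input(shape=(1, 48000), dtype='float32')    # one second of mono audio, assumed 48 kHz
_y = Melspectrogram(sr=48000,                    # must match the true sample rate of _x
                    n_dft=512, n_hop=242, n_mels=128,
                    power_melgram=1.0, htk=True,
                    return_decibel_melgram=True, padding='same')(_x)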
ys[2], color=colors[idx] + (0.5,)) + + +colors = sns.color_palette(None, n_colors=2) +patches = [] +for idx in range(2): + patches.append(mpatches.Patch(color=colors[idx], label=repr_strs[idx])) + + +# Put a legend to the right of the current axis + +plt.tight_layout() +for ax in axes: + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width, box.height * 0.94]) +plt.figlegend(handles=patches, loc="upper center", ncol=len(patches), framealpha=1.0) + +fig.savefig('us8k_test_boxplot_5.png') + + + + + + + + + + + + + + + + + + +avc_epoch_results = [('1', 0.50244140625), + ('10', 0.58526611328125), + ('20', 0.6324615478515625), + ('50', 0.68865966796875), + ('100', 0.729644775390625), + ('150', 0.7453155517578125), + ('200', 0.7532501220703125), + ('250', 0.7622222900390625), + ('300', 0.77081298828125)] + +tick_labels = ['0M', '0.26M', '2.62M', '5.24M', '13.11M', '26.21M', '39.32M', '52.43M', '65.54M', '78.64M'] + +_, acc_list = zip(*avc_epoch_results) +num_examples = [float(x.replace('M', '')) for x in tick_labels] + +font = {'size' : 8} +matplotlib.rc('font', **font) + +fig = plt.figure(figsize=(5,3)) +ax = plt.subplot(111) +plt.plot(np.arange(len(acc_list)), acc_list) +ax.set_xticklabels(tick_labels, ha='right') +ax.xaxis.set_tick_params(rotation=45) +ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.2f')) +ax.set_xlabel('# training examples') +ax.set_ylabel('AVC accuracy') + +fig.tight_layout() +fig.savefig('us8k_test_boxplot_6.png') + From af9a5b514476767ae7a7ea5c66c45759f58c9d5b Mon Sep 17 00:00:00 2001 From: Jason Cramer Date: Tue, 28 Aug 2018 21:02:05 -0400 Subject: [PATCH 201/201] Add missing sample rate argument to Melspectrogram constructor --- l3embedding/audio_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/l3embedding/audio_model.py b/l3embedding/audio_model.py index fa6097e..72ab276 100644 --- a/l3embedding/audio_model.py +++ b/l3embedding/audio_model.py @@ -255,7 +255,7 @@ def construct_cnn_L3_melspec1_audio_model(): # MELSPECTROGRAM PREPROCESSING # 128 x 199 x 1 y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels, - power_melgram=1.0, htk=True, # n_win=n_win, + sr=asr, power_melgram=1.0, htk=True, # n_win=n_win, return_decibel_melgram=True, padding='same')(x_a) # CONV BLOCK 1 @@ -364,7 +364,7 @@ def construct_cnn_L3_melspec2_audio_model(): # MELSPECTROGRAM PREPROCESSING # 128 x 199 x 1 y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels, - power_melgram=1.0, htk=True, # n_win=n_win, + sr=asr, power_melgram=1.0, htk=True, # n_win=n_win, return_decibel_melgram=True, padding='same')(x_a) # CONV BLOCK 1