update

jtkim-kaist · Jan 25, 2018 · b2912cf · b2912cf
1 parent 5b67f99
commit b2912cf
Show file tree

Hide file tree

Showing 66 changed files with 1,799 additions and 366 deletions.
diff --git a/README.md b/README.md
@@ -43,28 +43,6 @@ The result will be like following figure.
 Note: To apply this toolkit to other speech data, the speech data must be sampled at a 16 kHz sampling frequency.
 
 ![alt tag](https://user-images.githubusercontent.com/24668469/32533149-5526a77e-c492-11e7-909f-a7c7983d9dd4.jpg)
-## Training
-We attached the sample database to 'path/to/project/data/raw'. Please refer to the database for understanding the data format. 
-The training procedure has 2 steps: (i) MRCG extraction; (ii) Model training.
-
-Note: Do not forget adding the path to this project in the matlab. Current version only supports DNN based training. We will update training script for other models.
-
-```
-# train.sh
-# train script options
-# m 0 : DNN
-# e : extract MRCG feature (1) or not (0). 
-# The MRCG extraction time is somewhat long so you can pass the feature extraction step if you already have MRCG feature.
-
-python3 $train -m 0 -e 1 --train_step=100 --prj_dir=$curdir
-
-# ckpt_update script options
-# u : update checkpoint from trained model (1) or restore checkpoint to default (0).
-# Note that when u==0, the normalization factor is also restored to default.
-# After training you should update the model checkpoint with the normalization factor.
-
-python3 $ckpt_update -u 1 --model=DNN --prj_dir=$curdir
-```
 
 ## Recorded Dataset
 Our recorded dataset is freely available: 
@@ -88,10 +66,6 @@ At each environment, conversational speech by two Korean male speakers was recor
 | Dur. (min)    | 30.02         | 30.03         | 30.07         | 30.05         | 120.17        |
 | Avg. SNR (dB) | 5.61          | 2.05          | 5.71          | 18.26         | 7.91          |
 | % of speech   | 40.12         | 26.71         | 26.85         | 30.44         | 31.03         |
-## TODO List
-1. Freezing the graph for running the model fast.
-2. Training script for bDNN, LSTM, ACAM --> will be updated until 2018-01-26
-3. Although MRCG show good performance but extraction time is somewhat long, therefore we will substitute it to other feature such as spectrogram.
 ## Trouble Shooting
 If you find any errors in the code, please contact us.
 

diff --git a/configure/ACAM/config.py b/configure/ACAM/config.py
@@ -0,0 +1,13 @@
+lr = 0.0001
+dropout_rate = 0.5
+max_epoch = 100
+batch_size = 128
+w = 19
+u = 9
+glimpse_hidden = 128
+bp_hidden = 128
+glimpse_out = 128
+nGlimpse = 7
+lstm_cell_size = 128
+action_hidden_1 = 256
+action_hidden_2 = 256
diff --git a/configure/DNN/config.py b/configure/DNN/config.py
@@ -0,0 +1,8 @@
+lr=0.0001
+dropout_rate=0.5
+max_epoch=100
+batch_size=128
+w=19
+u=9
+num_hidden_1=512
+num_hidden_2=512
diff --git a/configure/LSTM/config.py b/configure/LSTM/config.py
@@ -0,0 +1,9 @@
+lr=0.0001           # Learning rate
+max_epoch=100       # Max epoch
+dropout_rate=0.5    # Dropout rate
+target_delay=5      # Target delay of LSTM
+num_layers=3        # The number of layers of LSTM
+cell_size=256       # LSTM cell size
+seq_len=20          # Sequence length
+num_batches=200     # The number of batches
+                    # Note that batch_size=seq_len*num_batches
diff --git a/configure/bDNN/config.py b/configure/bDNN/config.py
@@ -0,0 +1,8 @@
+lr = 0.0001
+dropout_rate = 0.5
+max_epoch = 1000
+batch_size = 128
+w = 19
+u = 9
+num_hidden_1 = 512
+num_hidden_2 = 512
diff --git a/lib/matlab/vad_func.m b/lib/matlab/vad_func.m
@@ -1,4 +1,4 @@
-function [ result, pp ] = vad_func( audio_dir, mode, threshold, output_type )
+function [ result, pp ] = vad_func( audio_dir, mode, threshold, output_type, is_default )
 
     system('rm -rf result');
     system('rm -rf sample_data');
@@ -7,11 +7,11 @@
     [data_len, winlen, winstep] = mrcg_extract( audio_dir );
 
     if mode == 3
-        python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -b 100 --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ... 
-        mode, data_len);
+        python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -d %d --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ... 
+        mode, data_len, is_default);
     else
-        python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -b 4096 --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ... 
-        mode, data_len);
+        python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -d %d --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ... 
+        mode, data_len, is_default);
     end
 
     mkdir './result'
@@ -23,7 +23,7 @@
     pp = pred;
     result = zeros(length(pp), 1);
     result(pp>threshold) = 1;
-    
+
     if output_type == 1
         result = frame2rawlabel(result, winlen, winstep);
         pp = frame2inpt(pp, winlen, winstep);

diff --git a/lib/python/VAD_DNN.py b/lib/python/VAD_DNN.py
@@ -2,8 +2,8 @@
 import numpy as np
 import utils as utils
 import re
-import data_reader_DNN as dr
-import os
+import data_reader_DNN_v2 as dr
+import os, sys
 import matplotlib.pyplot as plt
 import matplotlib.image as mpimg
 from sklearn import metrics
@@ -70,33 +70,6 @@
 eval_type = 2
 
 
-def train_config(c_train_dir, c_valid_dir, c_logs_dir, c_batch_size_eval, c_max_epoch, c_mode):
-
-    global file_dir
-    global input_dir
-    global output_dir
-    global valid_file_dir
-    global norm_dir
-    global initial_logs_dir
-    global logs_dir
-    global ckpt_name
-    global batch_size
-    global valid_batch_size
-    global mode
-    global max_epoch
-
-    file_dir = c_train_dir
-    valid_file_dir = c_valid_dir
-    input_dir = file_dir
-    output_dir = file_dir + "/Labels"
-
-    norm_dir = file_dir
-    initial_logs_dir = logs_dir = c_logs_dir
-    batch_size = valid_batch_size = c_batch_size_eval + 2 * w
-    max_epoch = c_max_epoch
-    mode = c_mode
-
-
 def test_config(c_test_dir, c_norm_dir, c_initial_logs_dir, c_batch_size_eval, c_data_len):
 
     global test_file_dir
@@ -347,11 +320,12 @@ def __init__(self, is_training=True):
         self.logits = logits = inference(inputs, self.keep_probability, is_training=is_training)  # (batch_size, bdnn_outputsize)
         # set objective function
         pred = tf.argmax(logits, axis=1, name="prediction")
-        softpred = logits[:, 1]
+        softpred = tf.identity(logits[:, 1], name="soft_pred")
         pred = tf.cast(pred, tf.int32)
         truth = tf.cast(labels[:, 1], tf.int32)
 
-        self.raw_labels = truth
+        self.raw_labels = tf.identity(truth, name="raw_labels")
+
         self.softpred = softpred
         self.accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, truth), tf.float32))
         self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
@@ -365,7 +339,41 @@ def __init__(self, is_training=True):
         self.train_op = train(self.cost, trainable_var)
 
 
-def main(argv=None):
+def main(prj_dir=None, model=None, mode=None):
+
+    #                               Configuration Part                       #
+    if mode is 'train':
+        import path_setting as ps
+
+        set_path = ps.PathSetting(prj_dir, model)
+        logs_dir = initial_logs_dir = set_path.logs_dir
+        input_dir = set_path.input_dir
+        output_dir = set_path.output_dir
+        norm_dir = set_path.norm_dir
+        valid_file_dir = set_path.valid_file_dir
+
+        sys.path.insert(0, prj_dir+'/configure/DNN')
+        import config as cg
+
+        global learning_rate, dropout_rate, max_epoch, batch_size, valid_batch_size
+        learning_rate = cg.lr
+        dropout_rate = cg.dropout_rate
+        max_epoch = cg.max_epoch
+        batch_size = valid_batch_size = cg.batch_size
+
+        global w, u
+        w = cg.w
+        u = cg.u
+
+        global bdnn_winlen, bdnn_inputsize, bdnn_outputsize
+        bdnn_winlen = (((w-1) / u) * 2) + 3
+        bdnn_inputsize = int(bdnn_winlen * num_features)
+        bdnn_outputsize = 2
+
+        global num_hidden_1, num_hidden_2
+        num_hidden_1 = cg.num_hidden_1
+        num_hidden_2 = cg.num_hidden_2
+
     #                               Graph Part                               #
     print("Graph initialization...")
     with tf.device(device):
@@ -386,10 +394,6 @@ def main(argv=None):
         cost_summary_op = tf.summary.scalar("cost", summary_ph)
         accuracy_summary_op = tf.summary.scalar("accuracy", summary_ph)
 
-    if mode is 'train':
-        train_summary_writer = tf.summary.FileWriter(logs_dir + '/train/', max_queue=2)
-        valid_summary_writer = tf.summary.FileWriter(logs_dir + '/valid/', max_queue=2)
-
     # summary_dic = summary_generation(valid_file_dir)
 
     print("Done")
@@ -398,7 +402,7 @@ def main(argv=None):
 
     print("Setting up Saver...")
     saver = tf.train.Saver()
-    ckpt = tf.train.get_checkpoint_state(logs_dir)
+    ckpt = tf.train.get_checkpoint_state(logs_dir + '/DNN')
     print("Done")
 
     #                               Session Part                              #
@@ -407,13 +411,18 @@ def main(argv=None):
     sess_config.gpu_options.allow_growth = True
     sess = tf.Session(config=sess_config)
 
+    if mode is 'train':
+        train_summary_writer = tf.summary.FileWriter(logs_dir + '/train/', sess.graph, max_queue=2)
+        valid_summary_writer = tf.summary.FileWriter(logs_dir + '/valid/', max_queue=2)
+
     if ckpt and ckpt.model_checkpoint_path:  # model restore
         print("Model restored...")
 
         if mode is 'train':
             saver.restore(sess, ckpt.model_checkpoint_path)
         else:
             saver.restore(sess, initial_logs_dir+ckpt_name)
+            # saver.save(sess, logs_dir + "/model_DNN.ckpt", 0)  # model save
 
         print("Done")
     else:
@@ -459,8 +468,7 @@ def main(argv=None):
                 saver.save(sess, logs_dir + "/model.ckpt", itr)  # model save
                 print('validation start!')
                 valid_accuracy, valid_cost = \
-                    utils.do_validation(m_valid, sess, valid_batch_size, valid_file_dir, norm_dir,
-                                        model_config, type='DNN')
+                    utils.do_validation(m_valid, sess, valid_file_dir, norm_dir, type='DNN')
 
                 print("valid_cost: %.4f, valid_accuracy=%4.4f" % (valid_cost, valid_accuracy * 100))
                 valid_cost_summary_str = sess.run(cost_summary_op, feed_dict={summary_ph: valid_cost})