Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
jtkim-kaist committed Jan 25, 2018
1 parent 5b67f99 commit b2912cf
Show file tree
Hide file tree
Showing 66 changed files with 1,799 additions and 366 deletions.
26 changes: 0 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,6 @@ The result will be like following figure.
Note: To apply this toolkit to other speech data, the speech data should be sampled at a 16 kHz sampling frequency.

![alt tag](https://user-images.githubusercontent.com/24668469/32533149-5526a77e-c492-11e7-909f-a7c7983d9dd4.jpg)
## Training
We have attached a sample database at 'path/to/project/data/raw'. Please refer to this database to understand the data format.
The training procedure has 2 steps: (i) MRCG extraction; (ii) Model training.

Note: Do not forget to add the path to this project in MATLAB. The current version only supports DNN-based training. We will update the training script for other models.

```
# train.sh
# train script options
# m 0 : DNN
# e : extract MRCG feature (1) or not (0).
# MRCG extraction takes a fairly long time, so you can skip the feature extraction step if you already have the MRCG features.
python3 $train -m 0 -e 1 --train_step=100 --prj_dir=$curdir
# ckpt_update script options
# u : update checkpoint from trained model (1) or restore checkpoint to default (0).
# Note that when u==0, the normalization factor is also restored to default.
# After training you should update the model checkpoint with the normalization factor.
python3 $ckpt_update -u 1 --model=DNN --prj_dir=$curdir
```

## Recorded Dataset
Our recorded dataset is freely available:
Expand All @@ -88,10 +66,6 @@ At each environment, conversational speech by two Korean male speakers was recor
| Dur. (min) | 30.02 | 30.03 | 30.07 | 30.05 | 120.17 |
| Avg. SNR (dB) | 5.61 | 2.05 | 5.71 | 18.26 | 7.91 |
| % of speech | 40.12 | 26.71 | 26.85 | 30.44 | 31.03 |
## TODO List
1. Freezing the graph for running the model fast.
2. Training script for bDNN, LSTM, ACAM --> will be updated by 2018-01-26
3. Although MRCG shows good performance, its extraction time is somewhat long; therefore, we will replace it with another feature such as the spectrogram.
## Trouble Shooting
If you find any errors in the code, please contact us.

Expand Down
13 changes: 13 additions & 0 deletions configure/ACAM/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Hyper-parameters for the ACAM model, exposed as plain module-level constants.
# NOTE(review): the code that consumes these values is not visible here, so the
# grouping below is based on the names only -- confirm against the ACAM model code.

# Training settings.
lr, dropout_rate = 0.0001, 0.5
max_epoch, batch_size = 100, 128

# Window parameters (presumably the same w/u context-window pair used by the
# other models' configs -- verify).
w, u = 19, 9

# Layer / cell sizes.
glimpse_hidden = bp_hidden = glimpse_out = lstm_cell_size = 128
action_hidden_1 = action_hidden_2 = 256

# Number of glimpses.
nGlimpse = 7
8 changes: 8 additions & 0 deletions configure/DNN/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Hyper-parameters for the DNN model.
# In train mode, lib/python/VAD_DNN.py puts configure/DNN on sys.path, imports
# this module as `cg`, and copies the values below into its own globals.

# Training settings.
lr = 0.0001            # becomes `learning_rate` in VAD_DNN.main
dropout_rate = 0.5
max_epoch = 100
batch_size = 128       # VAD_DNN.main also uses this as the validation batch size

# Window parameters; VAD_DNN.main derives bdnn_winlen = (((w-1) / u) * 2) + 3.
w, u = 19, 9

# Hidden-layer sizes.
num_hidden_1 = num_hidden_2 = 512
9 changes: 9 additions & 0 deletions configure/LSTM/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Hyper-parameters for the LSTM model.

# Training settings.
lr = 0.0001          # learning rate
max_epoch = 100      # maximum number of epochs
dropout_rate = 0.5   # dropout rate

# Network layout.
target_delay = 5     # target delay of the LSTM
num_layers = 3       # number of LSTM layers
cell_size = 256      # LSTM cell size

# Batching: sequence length and number of batches.
# Note that the effective batch size is seq_len * num_batches.
seq_len, num_batches = 20, 200
8 changes: 8 additions & 0 deletions configure/bDNN/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Hyper-parameters for the bDNN model, exposed as plain module-level constants.
# NOTE(review): the consuming bDNN code is not visible here; the comments below
# are based on the names only -- confirm against the bDNN training script.

# Training settings.
lr, dropout_rate = 0.0001, 0.5
max_epoch = 1000
batch_size = 128

# Window parameters (presumably the w/u context-window pair -- verify).
w, u = 19, 9

# Hidden-layer sizes.
num_hidden_1 = num_hidden_2 = 512
12 changes: 6 additions & 6 deletions lib/matlab/vad_func.m
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
function [ result, pp ] = vad_func( audio_dir, mode, threshold, output_type )
function [ result, pp ] = vad_func( audio_dir, mode, threshold, output_type, is_default )

system('rm -rf result');
system('rm -rf sample_data');
Expand All @@ -7,11 +7,11 @@
[data_len, winlen, winstep] = mrcg_extract( audio_dir );

if mode == 3
python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -b 100 --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ...
mode, data_len);
python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -d %d --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ...
mode, data_len, is_default);
else
python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -b 4096 --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ...
mode, data_len);
python_command = sprintf('python3 ./lib/python/VAD_test.py -m %d -l %d -d %d --data_dir=./sample_data --model_dir=./saved_model --norm_dir=./norm_data', ...
mode, data_len, is_default);
end

mkdir './result'
Expand All @@ -23,7 +23,7 @@
pp = pred;
result = zeros(length(pp), 1);
result(pp>threshold) = 1;

if output_type == 1
result = frame2rawlabel(result, winlen, winstep);
pp = frame2inpt(pp, winlen, winstep);
Expand Down
86 changes: 47 additions & 39 deletions lib/python/VAD_DNN.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import numpy as np
import utils as utils
import re
import data_reader_DNN as dr
import os
import data_reader_DNN_v2 as dr
import os, sys
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn import metrics
Expand Down Expand Up @@ -70,33 +70,6 @@
eval_type = 2


def train_config(c_train_dir, c_valid_dir, c_logs_dir, c_batch_size_eval, c_max_epoch, c_mode):
    """Copy the caller's training settings into this module's globals.

    Args:
        c_train_dir: training data directory; stored as ``file_dir``,
            ``input_dir`` and ``norm_dir``, and with ``"/Labels"`` appended
            as ``output_dir``.
        c_valid_dir: stored as ``valid_file_dir``.
        c_logs_dir: stored as both ``logs_dir`` and ``initial_logs_dir``.
        c_batch_size_eval: base batch size; ``2 * w`` is added to it (``w`` is
            a module-level name defined outside this function) and the sum is
            stored as both ``batch_size`` and ``valid_batch_size``.
        c_max_epoch: stored as ``max_epoch``.
        c_mode: stored as ``mode``.

    Returns:
        None. The function only rebinds module-level globals.
    """
    global file_dir, input_dir, output_dir, valid_file_dir, norm_dir
    global initial_logs_dir, logs_dir, ckpt_name
    global batch_size, valid_batch_size, mode, max_epoch

    # Data locations: features and normalisation data share the training directory.
    file_dir = input_dir = c_train_dir
    valid_file_dir = c_valid_dir
    output_dir = c_train_dir + "/Labels"
    norm_dir = c_train_dir

    # Checkpoint / log location.
    initial_logs_dir = c_logs_dir
    logs_dir = c_logs_dir

    # Training and validation use the same padded batch size.
    padded_batch_size = c_batch_size_eval + 2 * w
    batch_size = padded_batch_size
    valid_batch_size = padded_batch_size

    max_epoch, mode = c_max_epoch, c_mode


def test_config(c_test_dir, c_norm_dir, c_initial_logs_dir, c_batch_size_eval, c_data_len):

global test_file_dir
Expand Down Expand Up @@ -347,11 +320,12 @@ def __init__(self, is_training=True):
self.logits = logits = inference(inputs, self.keep_probability, is_training=is_training) # (batch_size, bdnn_outputsize)
# set objective function
pred = tf.argmax(logits, axis=1, name="prediction")
softpred = logits[:, 1]
softpred = tf.identity(logits[:, 1], name="soft_pred")
pred = tf.cast(pred, tf.int32)
truth = tf.cast(labels[:, 1], tf.int32)

self.raw_labels = truth
self.raw_labels = tf.identity(truth, name="raw_labels")

self.softpred = softpred
self.accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, truth), tf.float32))
self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
Expand All @@ -365,7 +339,41 @@ def __init__(self, is_training=True):
self.train_op = train(self.cost, trainable_var)


def main(argv=None):
def main(prj_dir=None, model=None, mode=None):

# Configuration Part #
if mode is 'train':
import path_setting as ps

set_path = ps.PathSetting(prj_dir, model)
logs_dir = initial_logs_dir = set_path.logs_dir
input_dir = set_path.input_dir
output_dir = set_path.output_dir
norm_dir = set_path.norm_dir
valid_file_dir = set_path.valid_file_dir

sys.path.insert(0, prj_dir+'/configure/DNN')
import config as cg

global learning_rate, dropout_rate, max_epoch, batch_size, valid_batch_size
learning_rate = cg.lr
dropout_rate = cg.dropout_rate
max_epoch = cg.max_epoch
batch_size = valid_batch_size = cg.batch_size

global w, u
w = cg.w
u = cg.u

global bdnn_winlen, bdnn_inputsize, bdnn_outputsize
bdnn_winlen = (((w-1) / u) * 2) + 3
bdnn_inputsize = int(bdnn_winlen * num_features)
bdnn_outputsize = 2

global num_hidden_1, num_hidden_2
num_hidden_1 = cg.num_hidden_1
num_hidden_2 = cg.num_hidden_2

# Graph Part #
print("Graph initialization...")
with tf.device(device):
Expand All @@ -386,10 +394,6 @@ def main(argv=None):
cost_summary_op = tf.summary.scalar("cost", summary_ph)
accuracy_summary_op = tf.summary.scalar("accuracy", summary_ph)

if mode is 'train':
train_summary_writer = tf.summary.FileWriter(logs_dir + '/train/', max_queue=2)
valid_summary_writer = tf.summary.FileWriter(logs_dir + '/valid/', max_queue=2)

# summary_dic = summary_generation(valid_file_dir)

print("Done")
Expand All @@ -398,7 +402,7 @@ def main(argv=None):

print("Setting up Saver...")
saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state(logs_dir)
ckpt = tf.train.get_checkpoint_state(logs_dir + '/DNN')
print("Done")

# Session Part #
Expand All @@ -407,13 +411,18 @@ def main(argv=None):
sess_config.gpu_options.allow_growth = True
sess = tf.Session(config=sess_config)

if mode is 'train':
train_summary_writer = tf.summary.FileWriter(logs_dir + '/train/', sess.graph, max_queue=2)
valid_summary_writer = tf.summary.FileWriter(logs_dir + '/valid/', max_queue=2)

if ckpt and ckpt.model_checkpoint_path: # model restore
print("Model restored...")

if mode is 'train':
saver.restore(sess, ckpt.model_checkpoint_path)
else:
saver.restore(sess, initial_logs_dir+ckpt_name)
# saver.save(sess, logs_dir + "/model_DNN.ckpt", 0) # model save

print("Done")
else:
Expand Down Expand Up @@ -459,8 +468,7 @@ def main(argv=None):
saver.save(sess, logs_dir + "/model.ckpt", itr) # model save
print('validation start!')
valid_accuracy, valid_cost = \
utils.do_validation(m_valid, sess, valid_batch_size, valid_file_dir, norm_dir,
model_config, type='DNN')
utils.do_validation(m_valid, sess, valid_file_dir, norm_dir, type='DNN')

print("valid_cost: %.4f, valid_accuracy=%4.4f" % (valid_cost, valid_accuracy * 100))
valid_cost_summary_str = sess.run(cost_summary_op, feed_dict={summary_ph: valid_cost})
Expand Down
Loading

0 comments on commit b2912cf

Please sign in to comment.