Commit d1ad811
Read RTTM output from Yunitator, and deliver another RTTM file with VCM
MilesICL committed Dec 3, 2018
1 parent a813d5f commit d1ad811
Showing 12 changed files with 144 additions and 30 deletions.
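For context: the new vcm_evaluate.py (added below) reads each SPEAKER line of the Yunitator RTTM, and for every CHI segment it cuts the audio, extracts eGeMAPS features, and appends a parallel SPEAKER line carrying the VCM class (NONL or LING) to the output RTTM. Hypothetical tab-separated lines, with illustrative values (field layout inferred from the indices used in vcm_evaluate.py):

input, yunitator_example.rttm (hypothetical):
SPEAKER	example	1	12.34	0.56	<NA>	<NA>	CHI	0.87	<NA>

output, vcm_example.rttm (hypothetical):
SPEAKER	example	1	12.34	0.56	<NA>	<NA>	LING	0.92	<NA>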
Empty file modified HTK.py (mode 100644 → 100755; contents unchanged)
4 changes: 2 additions & 2 deletions Net.py (mode 100644 → 100755)
@@ -15,7 +15,7 @@ def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
         x = self.fc3(x)
-        return F.log_softmax(x, dim=1)
+        return F.softmax(x, dim=1)
 
 
 class NetSyll(nn.Module):
@@ -29,4 +29,4 @@ def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
         x = self.fc3(x)
-        return F.log_softmax(x, dim=1)
+        return F.softmax(x, dim=1)
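Both hunks make the networks return softmax probabilities instead of log-probabilities. This matters for the new vcm_evaluate.py below, which reports output_ling.max() as a posterior-probability confidence: softmax and log_softmax share the same argmax, but only softmax values lie in [0, 1]. A minimal sketch (illustrative values, not from the repo):

import torch
import torch.nn.functional as F

logits = torch.tensor([[1.2, -0.3]])      # hypothetical NetLing output for one segment
probs = F.softmax(logits, dim=1)          # tensor([[0.8176, 0.1824]]) -- sums to 1
log_probs = F.log_softmax(logits, dim=1)  # tensor([[-0.2014, -1.7014]]) -- negative log-probs

# the predicted class is identical either way...
assert probs.argmax(dim=1).item() == log_probs.argmax(dim=1).item()
# ...but only the softmax maximum is directly usable as a confidence in [0, 1]
print(probs.max().item())  # ~0.82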
Binary file removed data/VCMtemp/example1_0.htk
1 change: 0 additions & 1 deletion data/VCMtemp/example1_0.rttm_sorted
This file was deleted.
Binary file removed data/example1_0.wav
Binary file removed data/example2_1.wav
Binary file removed data/example3_0.wav
Binary file removed data/example4_1.wav
Empty file modified evaluate_vcm.py (mode 100644 → 100755; contents unchanged)
57 changes: 30 additions & 27 deletions runVCM.sh
@@ -1,41 +1,44 @@
 #!/bin/bash
-# runDiarNoisemes.sh
 
-# run OpenSAT with hard coded models & configs found here and in /vagrant
-# assumes Python environment in /home/${user}/
-# usage: runDiarNoisemes.sh <folder containing .wav files to process>
-
-# Absolute path to this script. /home/user/bin/foo.sh
-SCRIPT=$(readlink -f $0)
-# Absolute path this script is in. /home/user/bin
-BASEDIR=`dirname $SCRIPT`
-
-
-filename=$(basename "$1")
-dirname=$(dirname "$1")
-extension="${filename##*.}"
-basename="${filename%.*}"
-
-# this is set in user's login .bashrc
-export PATH=/home/${USER}/anaconda/bin:$PATH
-
 # usage introduction
 if [ $# -ne 1 ]; then
     echo "Usage: runVCM.sh <audiofile>"
     exit 1;
 fi
 
-# let's get our bearings: set CWD to path of this script
+# directory of scripts; set path
+export PATH=/home/${USER}/anaconda/bin:$PATH
+SCRIPT=$(readlink -f $0)
+BASEDIR=`dirname $SCRIPT`
 cd $BASEDIR
 echo $BASEDIR
 
-# make output folder for features, below input folder
-mkdir -p $dirname/VCMtemp/
+# check for results from Yunitator; if missing, run Yunitator first to obtain yunitator_rttm_file
+audio_file=$1
+bn=$(basename $audio_file)
+dn=$(dirname $audio_file)
+yunitator_rttm_file=$dn"/yunitator_"${bn//wav/rttm}  # Yunitator output
+if [ ! -e $yunitator_rttm_file ]; then
+    echo "Error: Cannot find corresponding SAD outputs. Please run Yunitator first!"
+    exit 1;
+fi
+vcm_rttm_file=$dn"/vcm_"${bn//wav/rttm}  # VCM output
 
+
+# # make output folder for features, below input folder
+# KEEPTEMP=false
+# if [ $BASH_ARGV == "--keep-temp" ]; then
+#     KEEPTEMP=true
+# fi
+# VCMTEMP=$dn/VCMtemp
+# mkdir -p $VCMTEMP
+
+# do vcm recognition
+python2 ./vcm_evaluate.py ${audio_file} ${yunitator_rttm_file} ${vcm_rttm_file}
+
 echo $dirname/VCMtemp
+# # simply remove segmented waves and acoustic features
+# if ! $KEEPTEMP; then
+#     rm -rf $VCMTEMP
+# fi
 
-# first features
-./extract-htk-vm2.sh $1
-
-# # then confidences
-python2 evaluate_vcm.py $dirname/VCMtemp/$basename.htk $dirname/VCMtemp/vcm_$basename.rttm
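With this rework, runVCM.sh takes a single audio file instead of driving feature extraction itself, and delegates segmentation, feature extraction, and prediction to vcm_evaluate.py. A hypothetical invocation (file names illustrative):

./runVCM.sh data/example.wav
(expects data/yunitator_example.rttm to exist; writes data/vcm_example.rttm next to it)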
Empty file modified structNN.py (mode 100644 → 100755; contents unchanged)
112 changes: 112 additions & 0 deletions vcm_evaluate.py (new file)
@@ -0,0 +1,112 @@
import sys, os, os.path
import torch
from torch.autograd import Variable
from Net import NetLing, NetSyll
try:
    import _pickle as pickle  # Python 3
except ImportError:
    import cPickle as pickle  # Python 2
from HTK import HTKFile
import numpy as np
import subprocess


def seg_audio(input_audio, output_audio, onset, duration):
    # cut one segment out of the long recording with sox
    cmd_seg = 'sox {} {} trim {} {}'.format(input_audio, output_audio, onset, duration)
    subprocess.call(cmd_seg, shell=True)


def extract_feature(audio, feature):
    # run openSMILE to extract eGeMAPS features in HTK format
    config = './config/gemaps/eGeMAPSv01a.conf'
    opensmile = '~/repos/opensmile-2.3.0/bin/linux_x64_standalone_static/SMILExtract'
    # opensmile = '~/tools/opensmile-2.3.0/bin/linux_x64_standalone_static/SMILExtract'
    cmd = '{} -C {} -I {} -htkoutput {}'.format(opensmile, config, audio, feature)
    subprocess.call(cmd, shell=True)


def predict_vcm(model, input, mean_var):
    ### read normalisation parameters
    assert os.path.exists(mean_var)
    with open(mean_var, 'rb') as f:
        mv = pickle.load(f)
    m, v = mv['mean'], mv['var']
    std = lambda feat: (feat - m) / v  # normalise with the stored mean/variance

    # load the input features and predict
    htk_reader = HTKFile()
    htk_reader.load(input)
    feat = std(np.array(htk_reader.data))
    input = Variable(torch.from_numpy(feat.astype('float32')))  # .cuda()
    output_ling = model(input).data.cpu().numpy()
    prediction_confidence = output_ling.max()  # posterior probability

    class_names_ling = ['NONL', 'LING']
    cls_ling = np.argmax(output_ling)
    prediction_vcm = class_names_ling[cls_ling]  # predicted class

    return prediction_vcm, prediction_confidence


def main(audio_file, yun_rttm_file, vcm_rttm_file, mean_var, vcm_model):
    ### check that the temporary folder exists
    tmpdir = os.path.dirname(audio_file) + '/VCMtemp'
    assert os.path.exists(tmpdir)

    with open(vcm_rttm_file, 'w+') as vf:
        # process segments one by one; if a segment is an infant vocalisation, run VCM on it
        with open(yun_rttm_file, 'r') as yf:
            for line in yf.readlines():
                els = line.split('\t')
                file, onset, dur, cls, conf = els[1], els[3], els[4], els[7], els[8]
                if 'CHI' in cls:
                    audio_segment = '{}/{}_{}_{}.wav'.format(tmpdir, file.replace('.rttm', ''), onset, dur)
                    print(audio_segment)
                    feature_file = audio_segment.replace('wav', 'htk')

                    ### segment the audio file into small subsegments according to the Yunitator output
                    try:
                        seg_audio(audio_file, audio_segment, onset, dur)
                    except Exception:
                        print("Error: Cannot segment the audio: {}, from: {}, length: {}".format(audio_file, onset, dur))
                        sys.exit(1)

                    ### extract acoustic features
                    try:
                        extract_feature(audio_segment, feature_file)
                    except Exception:
                        print("Error: Cannot extract the acoustic features from: {}".format(audio_segment))
                        sys.exit(1)

                    ### do VCM prediction
                    try:
                        vcm_prediction, vcm_confidence = predict_vcm(vcm_model, feature_file, mean_var)
                    except Exception:
                        print("Error: Cannot perform VCM prediction on: {}".format(audio_segment))
                        sys.exit(1)

                    ### save the prediction into the rttm file
                    line = 'SPEAKER\t{}\t1\t{}\t{}\t<NA>\t<NA>\t{}\t{:.2f}\t<NA>\n'.format(file, onset, dur, vcm_prediction, float(vcm_confidence))
                    vf.write(line)


if __name__ == '__main__':
    ### global parameters
    audio_file = sys.argv[1]     # input audio file (daylong recording)
    yun_rttm_file = sys.argv[2]  # input rttm file, results from Yunitator
    # audio_file = '/data/work2/DiViMe/vcm/data/example.wav'
    # yun_rttm_file = '/data/work2/DiViMe/vcm/data/yunitator_example.rttm'
    vcm_rttm_file = yun_rttm_file.replace('yunitator', 'vcm') if len(sys.argv) < 4 else sys.argv[3]
    mean_var = './ling.eGeMAPS.func_utt.meanvar'

    ### models
    net_ling = NetLing(88, 1024, 2)  # .cuda()
    net_ling.load_state_dict(torch.load('modelLing.pt', map_location=lambda storage, loc: storage))
    # net_syll = NetSyll(88, 1024, 2)  # .cuda()
    # net_syll.load_state_dict(torch.load('modelSyll.pt', map_location=lambda storage, loc: storage))
    vcm_model = net_ling

    main(audio_file, yun_rttm_file, vcm_rttm_file, mean_var, vcm_model)
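For reference, a hypothetical direct invocation matching the argument handling above (paths illustrative): the VCMtemp folder must already exist next to the audio, and the third argument may be omitted, in which case the output name is derived by replacing 'yunitator' with 'vcm'.

python2 vcm_evaluate.py data/example.wav data/yunitator_example.rttm data/vcm_example.rttm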

