Skip to content

Commit

Permalink
subtools-1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Snowdar committed Mar 2, 2020
1 parent 1d1cb5f commit c43984c
Show file tree
Hide file tree
Showing 913 changed files with 207,742 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/BeDeprecated
/challenge
/ToDo
__pycache__
*.pyc
63 changes: 63 additions & 0 deletions addPrefixForUttID.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

# Copyright xmuspeech (Author:Snowdar 2018-12-16)

# Rewrite the first column (the utt-id) of the standard files of a data dir,
# either by attaching the speaker id read from utt2spk or by attaching a fixed
# string. A backup <file>.bk is created the first time a file is seen; every
# later run first restores the file from that backup, so the ids are always
# rebuilt from the original ones rather than being prefixed twice.
#
# usage: addPrefixForUttID.sh <data-dir>
#   <data-dir> is word-split, so several dirs can be passed in one quoted argument.

prefix="" # If NULL, add spk-id to correspondent utt-id (concatenated by -).
          # If -, just back up (or restore from backup) the data dir and stop.
as_suffix=false # if true, add a suffix rather than prefix

extra_files= # could be exp/model/tdnn6/train/xvector.scp etc. (space-separated list)

# The variables above are presumably overridable from the command line by this
# helper (Kaldi-style option parsing) -- confirm against subtools/parse_options.sh.
. subtools/parse_options.sh

if [[ $# != 1 ]];then
  echo "[exit] Num of parameters is not equal to 1"
  echo "usage:$0 <data-dir>"
  exit 1
fi

datas=$1

# $datas, $extra_files and $files are intentionally left unquoted: they are
# space-separated lists that rely on word splitting.
for data in $datas;do

  [ ! -d "$data" ] && echo "[exit] No such dir $data" && exit 1

  # Restore every known file from its backup, or create the backup if none exists yet,
  # and collect the files that are actually present.
  files=""
  for x in wav.scp utt2spk text utt2dur utt2num_frames utt2len feats.scp vad.scp cmvn.scp;do
    [ -f "$data/$x.bk" ] && cp -f "$data/$x.bk" "$data/$x"
    [ ! -f "$data/$x.bk" ] && [ -f "$data/$x" ] && cp -f "$data/$x" "$data/$x.bk"
    [ -f "$data/$x" ] && files="$files $data/$x"
  done

  # Same backup/restore handling for the user-supplied extra files.
  for x in $extra_files;do
    [ -f "$x.bk" ] && cp -f "$x.bk" "$x"
    [ ! -f "$x.bk" ] && [ -f "$x" ] && cp -f "$x" "$x.bk"
    [ -f "$x" ] && files="$files $x"
  done

  # NOTE(review): this path ends the script with status 1 after the FIRST dir even
  # though nothing failed; kept unchanged because callers may rely on it -- confirm.
  [ "$prefix" == "-" ] && echo "Prefix is - , then just do backup or recovering and exit now." && exit 1

  if [ "$prefix" == "" ];then
    echo "Prefix is NULL, so add spk-id to utt-id..."
    # The mapping is read from utt2spk.bk, which only exists when utt2spk exists,
    # so a missing utt2spk has to stop the script here (the original only printed
    # the message and carried on).
    [ ! -f "$data/utt2spk" ] && echo "[exit] $data/utt2spk is expected to exist." && exit 1
    for x in $files;do
      # First input (utt2spk.bk) fills a[utt]=spk; every line of the second input
      # gets its first field joined with the speaker id by "-".
      awk -v suffix="$as_suffix" 'NR==FNR{a[$1]=$2}NR>FNR{if(suffix=="false"){$1=a[$1]"-"$1;}else{$1=$1"-"a[$1];}print $0}' "$data/utt2spk.bk" "$x" > "$x.tmp" && mv -f "$x.tmp" "$x"
      echo "$x done."
    done
  else
    for x in $files;do
      # Join the fixed string with the first field of every line by "-".
      awk -v suffix="$as_suffix" -v prefix="$prefix" '{if(suffix=="false"){$1=prefix"-"$1;}else{$1=$1"-"prefix;}print $0}' "$x" > "$x.tmp" && mv -f "$x.tmp" "$x"
      echo "$x done."
    done
  fi

  subtools/kaldi/utils/fix_data_dir.sh "$data"
  rm -rf "$data/.backup"
  echo "$data done."
done
echo "All done."




181 changes: 181 additions & 0 deletions augmentDataByNoise.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/bin/bash

# Copyright xmuspeech (Author: Snowdar 2019-09-08)

# This script is used to augment data by some noise and it refers to kaldi/egs/sre16/v2/run.sh
#
# usage: augmentDataByNoise.sh <data-dir> [<aug-data-dir>]
#   For every enabled augmentation type a dir <dirname of data-dir>/augment/<name>_<type>
#   is generated. When more than one type is enabled they are combined into
#   <dirname>/augment/<name>_<type1>_<type2>... . If <aug-data-dir> is given, it
#   receives the original data plus (a subset of) the augmented data.

set -e

rirs_noises=/data1/data/RIRS_NOISES/
musan=/data1/data/musan/

reverb=true
noise=true
music=true
babble=true # a.k.a speech

sampling_rate=16000
frame_shift=0.01
factor=1 # The ratio of augmented data to origin data. It is capped at the number of enabled
         # augmentation types (e.g. 4 means using all augmented data when all four types are on).
         # Fractional values are accepted. Only used if aug-data-dir is provided.
nj=20 # Num-jobs
force_clear=true # if true, regenerate an augmented dir even when it already exists

. subtools/parse_options.sh
. subtools/path.sh

if [[ $# != 1 && $# != 2 ]];then
  echo "[exit] Num of parameters is not equal to 1 or 2"
  echo "usage:$0 <data-dir> [<aug-data-dir>]"
  echo "[note] if <aug-data-dir> is provided, it will contains all the data from <data-dir>"
  exit 1
fi

data=$1

[ $# -eq 2 ] && aug_data_dir=$2

[[ "$reverb" != "true" && "$noise" != "true" && "$music" != "true" && "$babble" != "true" ]] && \
  echo "[exit] There should be one augmentation type form [reverb|noise|music|babble]" && exit 1

# reco2dur is needed below (its line count is the number of original utts);
# derive it from frame counts when possible, otherwise let the Kaldi helper compute it.
if [ ! -f "$data/reco2dur" ];then
  echo "...$data/reco2dur is not exist, so get it automatically..."
  if [ -f "$data/utt2num_frames" ];then
    awk -v frame_shift="$frame_shift" '{print $1, $2*frame_shift;}' "$data/utt2num_frames" > "$data/reco2dur"
  elif [ -f "$data/feats.scp" ];then
    feat-to-len "scp:$data/feats.scp" "ark,t:$data/utt2num_frames"
    awk -v frame_shift="$frame_shift" '{print $1, $2*frame_shift;}' "$data/utt2num_frames" > "$data/reco2dur"
  else
    subtools/kaldi/utils/data/get_reco2dur.sh --nj "$nj" --frame-shift "$frame_shift" "$data"
  fi
fi

all_data="" # space-separated list of the generated augmented dirs
dir=$(dirname "$data")
name=$(basename "$data")

augment_dir=$dir/augment
sdata=$augment_dir/$name
additive_aug_data="$sdata" # grows a _<type> suffix for every enabled type

mkdir -p "$augment_dir"

num=0 # number of enabled augmentation types

if $reverb;then
  [ ! -d "$rirs_noises" ] && echo "[check reverb] No such dir $rirs_noises" && exit 1

  echo "...add reverb..."

  if [[ ! -d ${sdata}_reverb || $force_clear == "true" ]];then
    rvb_opts=()
    rvb_opts+=(--rir-set-parameters "0.5, $rirs_noises/simulated_rirs/smallroom/rir_list")
    rvb_opts+=(--rir-set-parameters "0.5, $rirs_noises/simulated_rirs/mediumroom/rir_list")

    python3 subtools/kaldi/steps/data/reverberate_data_dir.py \
      "${rvb_opts[@]}" \
      --speech-rvb-probability 1 \
      --pointsource-noise-addition-probability 0 \
      --isotropic-noise-addition-probability 0 \
      --num-replications 1 \
      --source-sampling-rate "$sampling_rate" \
      "${data}" "${sdata}_reverb" || exit 1

    # Add suffix
    subtools/kaldi/utils/copy_data_dir.sh --utt-suffix "-reverb" "${sdata}_reverb" "${sdata}_reverb.new"
    rm -rf "${sdata}_reverb"
    mv "${sdata}_reverb.new" "${sdata}_reverb"
    # Turn the " RIRS_NOISES" entries of wav.scp into paths below the parent dir of
    # $rirs_noises; slashes are escaped first because "/" is the sed delimiter.
    top_dir=$(dirname "$rirs_noises" | sed 's/\//\\\//g')
    sed -i 's/ RIRS_NOISES/ '"$top_dir"'\/RIRS_NOISES/g' "${sdata}_reverb/wav.scp"
    [ -f "$data/vad.scp" ] && awk '{print $1"-reverb",$2}' "$data/vad.scp" > "${sdata}_reverb/vad.scp"
  fi

  all_data="$all_data ${sdata}_reverb"
  additive_aug_data="${additive_aug_data}"_reverb
  num=$((num + 1))
fi

musan_dir=data/musan_$sampling_rate

if $noise;then
  [ ! -d "$musan/noise" ] && echo "[check noise] No such dir $musan/noise" && exit 1

  if [ ! -d "$musan_dir/musan_noise" ];then
    subtools/kaldi/steps/data/make_musan.sh --sampling-rate "$sampling_rate" "$musan" "$musan_dir" || exit 1
  fi

  echo "...add noise..."

  if [[ ! -d ${sdata}_noise || $force_clear == "true" ]];then
    python3 subtools/kaldi/steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "$musan_dir/musan_noise" "${data}" "${sdata}_noise" || exit 1
    [ -f "$data/vad.scp" ] && awk '{print $1"-noise",$2}' "$data/vad.scp" > "${sdata}_noise/vad.scp"
  fi

  all_data="$all_data ${sdata}_noise"
  additive_aug_data="${additive_aug_data}"_noise
  num=$((num + 1))
fi

if $music;then
  [ ! -d "$musan/music" ] && echo "[check music] No such dir $musan/music" && exit 1

  if [ ! -d "$musan_dir/musan_music" ];then
    subtools/kaldi/steps/data/make_musan.sh --sampling-rate "$sampling_rate" "$musan" "$musan_dir" || exit 1
  fi

  echo "...add music..."
  if [[ ! -d ${sdata}_music || $force_clear == "true" ]];then
    python3 subtools/kaldi/steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "$musan_dir/musan_music" "${data}" "${sdata}_music" || exit 1
    [ -f "$data/vad.scp" ] && awk '{print $1"-music",$2}' "$data/vad.scp" > "${sdata}_music/vad.scp"
  fi

  all_data="$all_data ${sdata}_music"
  additive_aug_data="${additive_aug_data}"_music
  num=$((num + 1))
fi

if $babble;then
  [ ! -d "$musan/speech" ] && echo "[check babble] No such dir $musan/speech" && exit 1

  if [ ! -d "$musan_dir/musan_speech" ];then
    subtools/kaldi/steps/data/make_musan.sh --sampling-rate "$sampling_rate" "$musan" "$musan_dir" || exit 1
  fi

  echo "...add babble/speech..."

  if [[ ! -d ${sdata}_babble || $force_clear == "true" ]];then
    python3 subtools/kaldi/steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "$musan_dir/musan_speech" "${data}" "${sdata}_babble" || exit 1
    [ -f "$data/vad.scp" ] && awk '{print $1"-babble",$2}' "$data/vad.scp" > "${sdata}_babble/vad.scp"
  fi

  all_data="$all_data ${sdata}_babble"
  additive_aug_data="${additive_aug_data}"_babble
  num=$((num + 1))
fi

# With a single enabled type $additive_aug_data already names that type's dir,
# so combining is only needed for two or more types.
if [ "$num" -gt 1 ];then
  echo "...combine additive aug data to $additive_aug_data..."
  # $all_data is a space-separated list and must be word-split.
  subtools/kaldi/utils/combine_data.sh "$additive_aug_data" $all_data
fi

num_origin_utts=$(wc -l "$data/reco2dur" | awk '{print $1}')

# Get min(factor, num). The comparison is done in awk because the shell's integer
# tests reject a fractional factor such as 0.5.
if awk -v f="$factor" -v n="$num" 'BEGIN{exit !(f+0 > n+0)}';then
  factor=$num
fi
# "/ 1" makes bc (default scale 0) truncate the product to an integer.
num_additive_utts=$(echo "$num_origin_utts * $factor / 1" | bc)

[ "$num_additive_utts" -eq 0 ] && echo "[exit] The factor $factor is too small" && exit 1

if [ $# -eq 2 ];then
  subset_data=${additive_aug_data}

  # Only a part of the augmented data is wanted when factor is below the number of types.
  if awk -v f="$factor" -v n="$num" 'BEGIN{exit !(f+0 != n+0)}';then
    echo "...get subset from $additive_aug_data to ${additive_aug_data}_$num_additive_utts..."
    subtools/kaldi/utils/subset_data_dir.sh "$additive_aug_data" "$num_additive_utts" "${additive_aug_data}_$num_additive_utts"
    subset_data=${additive_aug_data}_$num_additive_utts
  fi

  echo "...generate augmented data to $aug_data_dir..."
  subtools/kaldi/utils/combine_data.sh "$aug_data_dir" "$data" "$subset_data"
fi

echo "All done."


37 changes: 37 additions & 0 deletions combineVectordir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash

# Copyright xmuspeech (Author:Snowdar 2019-02-22)

# Merge two vector scp files of the same type into <out-vector-dir>/<type>.scp,
# where <type> is the scp basename without its extension.
#
# usage: combineVectordir.sh <in-vector-scp1> <in-vector-scp2> <out-vector-dir>

share=true # true : only concatenate the two scp files (entries keep pointing at the source arks).
           # false: additionally write a copy of the vectors into <out-vector-dir>, which costs disk space.

. subtools/parse_options.sh
. subtools/path.sh

if [[ $# != 3 ]];then
  echo "[exit] Num of parameters is not equal to 3"
  echo "$0 <in-vector-scp1> <in-vector-scp2> <out-vector-dir>"
  exit 1
fi

first_scp=$1
second_scp=$2
target_dir=$3

# Both inputs must exist; the output dir must not exist yet.
for scp in "$first_scp" "$second_scp";do
  if [ ! -f "$scp" ];then
    echo "[exit] No such file $scp"
    exit 1
  fi
done

if [ -d "$target_dir" ];then
  echo "[exit] $target_dir is exist."
  exit 1
fi

# The vector type is the file name with the last extension removed.
type1=$(basename ${first_scp%.*})
type2=$(basename ${second_scp%.*})

if [ "$type1" != "$type2" ];then
  echo "[exit] the vector type of $first_scp is not equal to $second_scp"
  exit 1
fi

mkdir -p $target_dir/log

case "$share" in
  true)
    cat $first_scp $second_scp > $target_dir/$type1.scp
    ;;
  *)
    # The pipe is escaped so that it is handed to run.pl instead of being
    # interpreted by this shell.
    run.pl $target_dir/log/combine.log \
      cat $first_scp $second_scp \| copy-vector scp:- ark,scp:$target_dir/$type1.ark,$target_dir/$type1.scp
    ;;
esac
echo "Combine done."
49 changes: 49 additions & 0 deletions computeAugmentedVad.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash

# Build vad.scp for an augmented data dir by reusing the VAD entry of the clean
# utterance for each of its augmented copies (<utt>-<aug_suffix>).
#
# usage: computeAugmentedVad.sh <aug-data-dir> <clean-list|clean-vad>
#   If the second argument is a file named vad.scp it is used directly as the clean VAD.
#   Otherwise it is handed to subtools/filterDataDir.sh to extract <aug-data-dir>/clean,
#   on which subtools/computeVad.sh is run first.

aug_suffixes="reverb noise music babble"
vad_conf=subtools/conf/vad-5.5.conf

. subtools/parse_options.sh

if [[ $# != 2 ]];then
  echo "[exit] Num of parameters is not equal to 2"
  echo "usage:$0 <aug-data-dir> <clean-list|clean-vad>"
  exit 1
fi

datadir=$1
clean_list=$2

name=$(basename "$clean_list")
if [ "$name" != "vad.scp" ];then
  echo "$clean_list is not vad.scp, so compute vad for clean data firstly."

  [ ! -f "$vad_conf" ] && echo "Expected vad conf to exist." && exit 1
  [ ! -f "$datadir/feats.scp" ] && echo "Expected $datadir/feats.scp to exist." && exit 1

  subtools/filterDataDir.sh "$datadir" "$clean_list" "$datadir/clean"
  subtools/computeVad.sh "$datadir/clean" "$vad_conf"

  clean_vad=$datadir/clean/vad.scp
else
  clean_vad=$clean_list
fi

# aug.vad = clean entries + one renamed copy of every clean entry per augmentation suffix.
cat "$clean_vad" > "$datadir/aug.vad"
for aug_suffix in $aug_suffixes;do
  awk -v suffix="$aug_suffix" '{print $1"-"suffix, $2}' "$clean_vad" >> "$datadir/aug.vad"
done

# Look up every utt of utt2spk in aug.vad: hits go to vad.scp (stdout), misses are
# appended to lost_clean.utts. The redirection target is parenthesized because an
# unparenthesized concatenation after ">>" is ambiguous across awk implementations.
: > "$datadir/lost_clean.utts"
awk -v data="$datadir" 'NR==FNR{a[$1]=$2}NR>FNR{if(!a[$1]){print $1 >> (data"/lost_clean.utts")}else{print $1,a[$1]}}' \
  "$datadir/aug.vad" "$datadir/utt2spk" > "$datadir/vad.scp"

num=$(wc -l "$datadir/lost_clean.utts" | awk '{print $1}')

# On failure lost_clean.utts is kept on purpose so the user can inspect the missing utts.
[ "$num" -gt 0 ] && echo "[exit] Could not find $num clean items for augmented utts which are in $datadir/lost_clean.utts." && \
  rm -rf "$datadir/clean" "$datadir/aug.vad" && exit 1

# On success remove all temporary files. The original removed "lost.clean.utts",
# which never exists, so the (empty) lost_clean.utts was left behind.
rm -rf "$datadir/clean" "$datadir/aug.vad" "$datadir/lost_clean.utts"

echo "Compute VAD for augmented data done."

Loading

0 comments on commit c43984c

Please sign in to comment.