-
Notifications
You must be signed in to change notification settings - Fork 130
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
913 changed files
with
207,742 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
/BeDeprecated | ||
/challenge | ||
/ToDo | ||
__pycache__ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash

# Copyright xmuspeech (Author:Snowdar 2018-12-16)

# Rewrite the first field (the utt-id) of the standard files of a data dir,
# and of any extra files, by attaching either the speaker id or a fixed
# string to it, as a prefix or as a suffix.
#
# Every processed file <f> is backed up once as <f>.bk. When <f>.bk already
# exists, <f> is first restored from it, so repeated runs always start from
# the original content.
#
# Usage: $0 <data-dir>
# The variables below are the options; they are set on the command line
# through subtools/parse_options.sh.

prefix=""        # If NULL, add spk-id to the corresponding utt-id (joined by -).
                 # If -, only back up or recover the data dir, then exit.
as_suffix=false  # If true, add a suffix rather than a prefix.

extra_files=     # Could be exp/model/tdnn6/train/xvector.scp etc.

. subtools/parse_options.sh

if [[ $# != 1 ]]; then
  echo "[exit] Num of parameters is not equal to 1"
  echo "usage:$0 <data-dir>"
  exit 1
fi

datas=$1

# $datas is intentionally unquoted: the single argument is word-split, so a
# quoted, space-separated list of data dirs is processed one by one.
for data in $datas; do

  if [ ! -d "$data" ]; then
    echo "[exit] No such dir $data"
    exit 1
  fi

  # Collect the files to rewrite, restoring from / creating backups on the way.
  files=""
  for x in wav.scp utt2spk text utt2dur utt2num_frames utt2len feats.scp vad.scp cmvn.scp; do
    if [ -f "$data/$x.bk" ]; then
      cp -f "$data/$x.bk" "$data/$x"
    elif [ -f "$data/$x" ]; then
      cp -f "$data/$x" "$data/$x.bk"
    fi
    [ -f "$data/$x" ] && files="$files $data/$x"
  done

  # Same backup/recover handling for the user-supplied extra files
  # ($extra_files is a space-separated list, hence unquoted).
  for x in $extra_files; do
    if [ -f "$x.bk" ]; then
      cp -f "$x.bk" "$x"
    elif [ -f "$x" ]; then
      cp -f "$x" "$x.bk"
    fi
    [ -f "$x" ] && files="$files $x"
  done

  # NOTE(review): backup-only mode leaves with status 1 and stops at the first
  # data dir; kept as in the original -- confirm whether 0 was intended.
  if [ "$prefix" == "-" ]; then
    echo "Prefix is - , then just do backup or recovering and exit now."
    exit 1
  fi

  if [ "$prefix" == "" ]; then
    echo "Prefix is NULL, so add spk-id to utt-id..."
    # The speaker map is required below; stop here instead of only printing
    # the message and running on without it.
    if [ ! -f "$data/utt2spk" ]; then
      echo "[exit] $data/utt2spk is expected to exist."
      exit 1
    fi
    for x in $files; do
      # First input (utt2spk.bk) fills the utt-id -> spk-id map; the second
      # input is rewritten with the spk-id attached to its first field.
      awk -v suffix="$as_suffix" 'NR==FNR{a[$1]=$2}NR>FNR{if(suffix=="false"){$1=a[$1]"-"$1;}else{$1=$1"-"a[$1];}print $0}' "$data/utt2spk.bk" "$x" > "$x.tmp" && mv -f "$x.tmp" "$x"
      echo "$x done."
    done
  else
    for x in $files; do
      # Attach the fixed string to the first field of every line.
      awk -v suffix="$as_suffix" -v prefix="$prefix" '{if(suffix=="false"){$1=prefix"-"$1;}else{$1=$1"-"prefix;}print $0}' "$x" > "$x.tmp" && mv -f "$x.tmp" "$x"
      echo "$x done."
    done
  fi

  subtools/kaldi/utils/fix_data_dir.sh "$data"
  # Presumably fix_data_dir.sh leaves this directory behind -- TODO confirm.
  rm -rf "$data/.backup"
  echo "$data done."
done
echo "All done."
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
#!/bin/bash

# Copyright xmuspeech (Author: Snowdar 2019-09-08)

# This script augments a data dir with reverb / noise / music / babble copies
# and it refers to kaldi/egs/sre16/v2/run.sh
#
# Usage: $0 <data-dir> [<aug-data-dir>]
# The augmented copies are written next to <data-dir>, under
# $(dirname <data-dir>)/augment/. When <aug-data-dir> is given, it receives
# <data-dir> combined with (a subset of) the augmented data.

set -e

rirs_noises=/data1/data/RIRS_NOISES/
musan=/data1/data/musan/

reverb=true
noise=true
music=true
babble=true # a.k.a speech

sampling_rate=16000
frame_shift=0.01
factor=1 # Ratio of augmented data to original data. It is capped at the number
         # of enabled augmentation types, so 4 means using all augmented data
         # when all four types are on and aug-data-dir is provided.
nj=20 # Num-jobs
force_clear=true # If true, regenerate an augmented dir even when it exists.

. subtools/parse_options.sh
. subtools/path.sh

if [[ $# != 1 && $# != 2 ]]; then
  echo "[exit] Num of parameters is not equal to 1 or 2"
  echo "usage:$0 <data-dir> [<aug-data-dir>]"
  echo "[note] if <aug-data-dir> is provided, it will contains all the data from <data-dir>"
  exit 1
fi

data=$1

if [ $# -eq 2 ]; then
  aug_data_dir=$2
fi

if [[ "$reverb" != "true" && "$noise" != "true" && "$music" != "true" && "$babble" != "true" ]]; then
  echo "[exit] There should be one augmentation type form [reverb|noise|music|babble]"
  exit 1
fi

# reco2dur is needed below; derive it from frame counts when possible.
if [ ! -f "$data/reco2dur" ]; then
  echo "...$data/reco2dur is not exist, so get it automatically..."
  if [ -f "$data/utt2num_frames" ]; then
    awk -v frame_shift="$frame_shift" '{print $1, $2*frame_shift;}' "$data/utt2num_frames" > "$data/reco2dur"
  elif [ -f "$data/feats.scp" ]; then
    feat-to-len "scp:$data/feats.scp" "ark,t:$data/utt2num_frames"
    awk -v frame_shift="$frame_shift" '{print $1, $2*frame_shift;}' "$data/utt2num_frames" > "$data/reco2dur"
  else
    subtools/kaldi/utils/data/get_reco2dur.sh --nj "$nj" --frame-shift "$frame_shift" "$data"
  fi
fi

all_data=""
dir=$(dirname "$data")
name=$(basename "$data")

augment_dir=$dir/augment
sdata=$augment_dir/$name
# Grows one "_<type>" suffix per enabled augmentation type; it names the
# combined augmented dir.
additive_aug_data="$sdata"

mkdir -p "$augment_dir"

# Number of enabled augmentation types.
num=0

# The switches are compared as strings (as in the sanity check above) rather
# than executed as commands.
if [[ "$reverb" == "true" ]]; then
  if [ ! -d "$rirs_noises" ]; then
    echo "[check reverb] No such dir $rirs_noises"
    exit 1
  fi

  echo "...add reverb..."

  if [[ ! -d ${sdata}_reverb || $force_clear == "true" ]]; then
    rvb_opts=()
    rvb_opts+=(--rir-set-parameters "0.5, $rirs_noises/simulated_rirs/smallroom/rir_list")
    rvb_opts+=(--rir-set-parameters "0.5, $rirs_noises/simulated_rirs/mediumroom/rir_list")

    python3 subtools/kaldi/steps/data/reverberate_data_dir.py \
      "${rvb_opts[@]}" \
      --speech-rvb-probability 1 \
      --pointsource-noise-addition-probability 0 \
      --isotropic-noise-addition-probability 0 \
      --num-replications 1 \
      --source-sampling-rate "$sampling_rate" \
      "${data}" "${sdata}_reverb" || exit 1

    # Add suffix
    subtools/kaldi/utils/copy_data_dir.sh --utt-suffix "-reverb" "${sdata}_reverb" "${sdata}_reverb.new"
    rm -rf "${sdata}_reverb"
    mv "${sdata}_reverb.new" "${sdata}_reverb"
    # Prepend the parent dir of $rirs_noises to every " RIRS_NOISES" occurrence
    # in wav.scp; the slashes are escaped for use inside the sed expression.
    top_dir=$(dirname "$rirs_noises" | sed 's/\//\\\//g')
    sed -i 's/ RIRS_NOISES/ '"$top_dir"'\/RIRS_NOISES/g' "${sdata}_reverb/wav.scp"
    # Reuse the source vad.scp entries under the suffixed utt-ids.
    if [ -f "$data/vad.scp" ]; then
      awk '{print $1"-reverb",$2}' "$data/vad.scp" > "${sdata}_reverb/vad.scp"
    fi
  fi

  all_data="$all_data ${sdata}_reverb"
  additive_aug_data="${additive_aug_data}"_reverb
  num=$((num + 1))
fi

musan_dir=data/musan_$sampling_rate

if [[ "$noise" == "true" ]]; then
  if [ ! -d "$musan/noise" ]; then
    echo "[check noise] No such dir $musan/noise"
    exit 1
  fi

  if [ ! -d "$musan_dir/musan_noise" ]; then
    subtools/kaldi/steps/data/make_musan.sh --sampling-rate "$sampling_rate" "$musan" "$musan_dir" || exit 1
  fi

  echo "...add noise..."

  if [[ ! -d ${sdata}_noise || $force_clear == "true" ]]; then
    python3 subtools/kaldi/steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "$musan_dir/musan_noise" "${data}" "${sdata}_noise" || exit 1
    if [ -f "$data/vad.scp" ]; then
      awk '{print $1"-noise",$2}' "$data/vad.scp" > "${sdata}_noise/vad.scp"
    fi
  fi

  all_data="$all_data ${sdata}_noise"
  additive_aug_data="${additive_aug_data}"_noise
  num=$((num + 1))
fi

if [[ "$music" == "true" ]]; then
  if [ ! -d "$musan/music" ]; then
    echo "[check music] No such dir $musan/music"
    exit 1
  fi

  if [ ! -d "$musan_dir/musan_music" ]; then
    subtools/kaldi/steps/data/make_musan.sh --sampling-rate "$sampling_rate" "$musan" "$musan_dir" || exit 1
  fi

  echo "...add music..."
  if [[ ! -d ${sdata}_music || $force_clear == "true" ]]; then
    python3 subtools/kaldi/steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "$musan_dir/musan_music" "${data}" "${sdata}_music" || exit 1
    if [ -f "$data/vad.scp" ]; then
      awk '{print $1"-music",$2}' "$data/vad.scp" > "${sdata}_music/vad.scp"
    fi
  fi

  all_data="$all_data ${sdata}_music"
  additive_aug_data="${additive_aug_data}"_music
  num=$((num + 1))
fi

if [[ "$babble" == "true" ]]; then
  if [ ! -d "$musan/speech" ]; then
    echo "[check babble] No such dir $musan/speech"
    exit 1
  fi

  if [ ! -d "$musan_dir/musan_speech" ]; then
    subtools/kaldi/steps/data/make_musan.sh --sampling-rate "$sampling_rate" "$musan" "$musan_dir" || exit 1
  fi

  echo "...add babble/speech..."

  if [[ ! -d ${sdata}_babble || $force_clear == "true" ]]; then
    python3 subtools/kaldi/steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "$musan_dir/musan_speech" "${data}" "${sdata}_babble" || exit 1
    if [ -f "$data/vad.scp" ]; then
      awk '{print $1"-babble",$2}' "$data/vad.scp" > "${sdata}_babble/vad.scp"
    fi
  fi

  all_data="$all_data ${sdata}_babble"
  additive_aug_data="${additive_aug_data}"_babble
  num=$((num + 1))
fi

# With a single type, $additive_aug_data already names that type's own dir.
if [ "$num" -gt 1 ]; then
  echo "...combine additive aug data to $additive_aug_data..."
  # $all_data is a space-separated list of dirs, hence unquoted.
  subtools/kaldi/utils/combine_data.sh "$additive_aug_data" $all_data
fi

num_origin_utts=$(wc -l "$data/reco2dur" | awk '{print $1}')

# Get min(factor, num). awk does the comparison because factor may be
# fractional, which the integer-only [ -gt ] test cannot handle.
if awk -v f="$factor" -v n="$num" 'BEGIN{exit !(f > n)}'; then
  factor=$num
fi
# "/ 1" makes bc truncate the product to an integer.
num_additive_utts=$(echo "$num_origin_utts * $factor / 1" | bc)

if [ "$num_additive_utts" -eq 0 ]; then
  echo "[exit] The factor $factor is too small"
  exit 1
fi

if [ $# -eq 2 ]; then
  subset_data=${additive_aug_data}

  # Only subset when less than all of the augmented data is requested.
  if awk -v f="$factor" -v n="$num" 'BEGIN{exit !(f != n)}'; then
    echo "...get subset from $additive_aug_data to ${additive_aug_data}_$num_additive_utts..."
    subtools/kaldi/utils/subset_data_dir.sh "$additive_aug_data" "$num_additive_utts" "${additive_aug_data}_$num_additive_utts"
    subset_data=${additive_aug_data}_$num_additive_utts
  fi

  echo "...generate augmented data to $aug_data_dir..."
  subtools/kaldi/utils/combine_data.sh "$aug_data_dir" "$data" "$subset_data"
fi

echo "All done."
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash

# Copyright xmuspeech (Author:Snowdar 2019-02-22)

# Merge two vector scp files that share the same base name into a new
# output dir.
#
# Usage: $0 <in-vector-scp1> <in-vector-scp2> <out-vector-dir>
# <out-vector-dir> must not exist yet.

share=true # if false, generate a copy of ark for out-vector-dir as a single dir but it will need some space.

. subtools/parse_options.sh
. subtools/path.sh

if [ $# -ne 3 ]; then
  echo "[exit] Num of parameters is not equal to 3"
  echo "$0 <in-vector-scp1> <in-vector-scp2> <out-vector-dir>"
  exit 1
fi

inscp1=$1
inscp2=$2
outdir=$3

# Both inputs have to be regular files.
for in_scp in "$inscp1" "$inscp2"; do
  if [ ! -f "$in_scp" ]; then
    echo "[exit] No such file $in_scp"
    exit 1
  fi
done

# Refuse to write into an existing output dir.
if [ -d "$outdir" ]; then
  echo "[exit] $outdir is exist."
  exit 1
fi

# Base name without the last extension, used as the "vector type".
name1=$(basename ${inscp1%.*})
name2=$(basename ${inscp2%.*})

if [ "$name1" != "$name2" ]; then
  echo "[exit] the vector type of $inscp1 is not equal to $inscp2"
  exit 1
fi

mkdir -p $outdir/log

case "$share" in
  true)
    # Only concatenate the scp entries; the ark data is not copied.
    cat $inscp1 $inscp2 > $outdir/$name1.scp
    ;;
  *)
    # Copy the vectors into a fresh ark/scp pair inside the output dir.
    run.pl $outdir/log/combine.log \
      cat $inscp1 $inscp2 \| copy-vector scp:- ark,scp:$outdir/$name1.ark,$outdir/$name1.scp
    ;;
esac
echo "Combine done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/bin/bash

# Build <aug-data-dir>/vad.scp for an augmented data dir by reusing the VAD
# entries of the clean utterances: every clean entry is duplicated once per
# suffix in $aug_suffixes under the key "<utt-id>-<suffix>".
#
# Usage: $0 <aug-data-dir> <clean-list|clean-vad>
# If the second argument is not a file named vad.scp, it is passed to
# subtools/filterDataDir.sh to create <aug-data-dir>/clean, and VAD is
# computed on that dir first.

aug_suffixes="reverb noise music babble"
vad_conf=subtools/conf/vad-5.5.conf

. subtools/parse_options.sh

if [[ $# != 2 ]]; then
  echo "[exit] Num of parameters is not equal to 2"
  echo "usage:$0 <aug-data-dir> <clean-list|clean-vad>"
  exit 1
fi

datadir=$1
clean_list=$2

name=$(basename "$clean_list")
if [ "$name" != "vad.scp" ]; then
  echo "$clean_list is not vad.scp, so compute vad for clean data firstly."

  if [ ! -f "$vad_conf" ]; then
    echo "Expected vad conf to exist."
    exit 1
  fi
  if [ ! -f "$datadir/feats.scp" ]; then
    echo "Expected $datadir/feats.scp to exist."
    exit 1
  fi

  subtools/filterDataDir.sh "$datadir" "$clean_list" "$datadir/clean"
  subtools/computeVad.sh "$datadir/clean" "$vad_conf"

  clean_vad=$datadir/clean/vad.scp
else
  clean_vad=$clean_list
fi

# aug.vad = the clean entries plus one renamed copy per augmentation suffix
# ($aug_suffixes is a space-separated list, hence unquoted).
cat "$clean_vad" > "$datadir/aug.vad"
for aug_suffix in $aug_suffixes; do
  awk -v suffix="$aug_suffix" '{print $1"-"suffix, $2}' "$clean_vad" >> "$datadir/aug.vad"
done

# Look up every utt-id of utt2spk in aug.vad: hits go to vad.scp, misses are
# logged in lost_clean.utts (truncated first so stale entries do not count).
: > "$datadir/lost_clean.utts"
awk -v data="$datadir" 'NR==FNR{a[$1]=$2}NR>FNR{if(!a[$1]){print $1 >> (data "/lost_clean.utts")}else{print $1,a[$1]}}' \
  "$datadir/aug.vad" "$datadir/utt2spk" > "$datadir/vad.scp"

num=$(wc -l "$datadir/lost_clean.utts" | awk '{print $1}')

if [ "$num" -gt 0 ]; then
  echo "[exit] Could not find $num clean items for augmented utts which are in $datadir/lost_clean.utts."
  # lost_clean.utts is kept on purpose: the message above points to it.
  rm -rf "$datadir/clean" "$datadir/aug.vad"
  exit 1
fi

# Clean up the temporary files, including the (empty) lost_clean.utts.
rm -rf "$datadir/clean" "$datadir/aug.vad" "$datadir/lost_clean.utts"

echo "Compute VAD for augmented data done."
|
Oops, something went wrong.