Skip to content

Commit

Permalink
voxceleb recipe
Browse files Browse the repository at this point in the history
  • Loading branch information
Snowdar committed May 4, 2020
1 parent 9141186 commit 866196e
Show file tree
Hide file tree
Showing 10 changed files with 850 additions and 56 deletions.
2 changes: 1 addition & 1 deletion filterDataDir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ done
subtools/kaldi/utils/fix_data_dir.sh $outdata

rm -rf ${idlist}_aug $outdata/.backup
echo "Filter done."
echo "Filter $outdata done."
2 changes: 1 addition & 1 deletion filterVectorDir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ else
run.pl $outdir/log/filter.log \
awk -v f=$f '{print $f}' $idlist \| subtools/kaldi/utils/filter_scp.pl $exclude_string - $inscp \| copy-vector scp:- ark,scp:$outdir/$name.ark,$outdir/$name.scp
fi
echo "Filter done."
echo "Filter $outdir done."
99 changes: 56 additions & 43 deletions recipe/voxceleb/gather_results_from_epochs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Copyright xmuspeech (Author: Snowdar 2020-02-27 2019-12-22)

prefix=mfcc_23_pitch
epochs="7 14 21"
epochs="21"
positions="far near"

vectordir=exp/standard_xv_baseline_warmR_voxceleb1_adam
Expand Down Expand Up @@ -57,38 +57,6 @@ if [ "$prenorm" == "true" ];then
plda_process="norm-"$plda_process
fi

if [[ "$test_set" == "voxceleb1_test" && "$enroll_set" == "voxceleb1_enroll" ]];then
[ "$force" == "true" ] && rm -rf data/$prefix/voxceleb1_test/enroll.list data/$prefix/voxceleb1_enroll

[ ! -f data/$prefix/voxceleb1_test/enroll.list ] && awk '{print $1}' $trials | sort -u > data/$prefix/voxceleb1_test/enroll.list
[[ ! -d data/$prefix/voxceleb1_enroll ]] && subtools/filterDataDir.sh data/$prefix/voxceleb1_test \
data/$prefix/voxceleb1_test/enroll.list data/$prefix/voxceleb1_enroll
fi

if [ "$score_norm" == "true" ];then
if [ "$cohort_set" == "" ];then
[[ "$force" == "true" ]] && rm -rf data/$prefix/$cohort_set
if [ "$cohort_method" == "sub" ];then
cohort_set=${cohort_set_from}_cohort_sub_${sub_num}$sub_option
[ ! -d data/$prefix/$cohort_set ] && subtools/kaldi/utils/subset_data_dir.sh $sub_option \
data/$prefix/$cohort_set_from $sub_num data/$prefix/$cohort_set
elif [ "$cohort_method" == "mean" ];then
cohort_set=${cohort_set_from}_cohort_mean
[ ! -d data/$prefix/$cohort_set ] && mkdir -p data/$prefix/$cohort_set && \
awk '{print $1,$1}' data/$prefix/$cohort_set_from/spk2utt > data/$prefix/$cohort_set/spk2utt && \
awk '{print $1,$1}' data/$prefix/$cohort_set_from/spk2utt > data/$prefix/$cohort_set/utt2spk
fi
fi

[ ! -f data/$prefix/$cohort_set/utt2spk ] && echo "Expected cohort_set to exist." && exit 1
[ "$force" == "true" ] && rm -rf data/$prefix/$cohort_set/enroll.list data/$prefix/$cohort_set/test.list \
data/$prefix/$cohort_set/enroll.cohort.trials data/$prefix/$cohort_set/test.cohort.trials

[ ! -f data/$prefix/$cohort_set/enroll.list ] && awk '{print $1}' $trials | sort -u > data/$prefix/$cohort_set/enroll.list
[ ! -f data/$prefix/$cohort_set/test.list ] && awk '{print $2}' $trials | sort -u > data/$prefix/$cohort_set/test.list
[ ! -f data/$prefix/$cohort_set/enroll.cohort.trials ] && sh subtools/getTrials.sh 3 data/$prefix/$cohort_set/enroll.list data/$prefix/$cohort_set/utt2spk data/$prefix/$cohort_set/enroll.cohort.trials
[ ! -f data/$prefix/$cohort_set/test.cohort.trials ] && sh subtools/getTrials.sh 3 data/$prefix/$cohort_set/test.list data/$prefix/$cohort_set/utt2spk data/$prefix/$cohort_set/test.cohort.trials
fi

name="$test_set/score/${score}_${enroll_set}_${test_set}${prenorm_string}${submean_string}${lda_string}_norm"

Expand All @@ -102,16 +70,37 @@ for position in $positions;do
for epoch in $epochs;do
obj_dir=$vectordir/${position}_epoch_${epoch}

# Prepare task for scoring. Here it is only needed to extract voxceleb1_test/voxceleb xvectors and then it will split subsets.
# voxceleb1_test -> voxceleb1_enroll
# voxceleb -> voxceleb1-O/E/H[-clean]_enroll/test
if [[ "$test_set" == "voxceleb1_test" && "$enroll_set" == "voxceleb1_enroll" ]];then
[ "$force" == "true" ] && rm -rf $obj_dir/voxceleb1_enroll
[[ ! -d $obj_dir/voxceleb1_enroll ]] && subtools/filterVectorDir.sh $obj_dir/voxceleb1_test/xvector.scp \
data/$prefix/voxceleb1_test/enroll.list $obj_dir/voxceleb1_enroll
elif [[ "$test_set" == "voxceleb1-O_test" && "$enroll_set" == "voxceleb1-O_enroll" ]];then
subtools/recipe/voxceleb/get_voxceleb1_task.sh --force $force --prefix $prefix --tasks voxceleb1-O --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1-E_test" && "$enroll_set" == "voxceleb1-E_enroll" ]];then
subtools/recipe/voxceleb/get_voxceleb1_task.sh --force $force --prefix $prefix --tasks voxceleb1-E --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1-H_test" && "$enroll_set" == "voxceleb1-H_enroll" ]];then
subtools/recipe/voxceleb/get_voxceleb1_task.sh --force $force --prefix $prefix --tasks voxceleb1-H --vectordir $obj_dir
[ "$force" == "true" ] && rm -rf data/$prefix/voxceleb1_test/enroll.list data/$prefix/voxceleb1_enroll \
$obj_dir/voxceleb1_enroll
if [ ! -f $trials ];then
[ ! -f data/$prefix/voxceleb1_test/voxceleb1-O.trials ] && \
echo "[exit] Expected data/$prefix/voxceleb1_test/voxceleb1-O.trials to exist." && exit 1
cp data/$prefix/voxceleb1_test/voxceleb1-O.trials data/$prefix/voxceleb1_test/trials
fi

[ ! -f data/$prefix/voxceleb1_test/enroll.list ] && awk '{print $1}' $trials | sort -u > \
data/$prefix/voxceleb1_test/enroll.list
[[ ! -d data/$prefix/voxceleb1_enroll ]] && subtools/filterDataDir.sh data/$prefix/voxceleb1_test \
data/$prefix/voxceleb1_test/enroll.list data/$prefix/voxceleb1_enroll
[[ ! -d $obj_dir/voxceleb1_enroll ]] && subtools/filterVectorDir.sh $obj_dir/voxceleb1_test/xvector.scp \
data/$prefix/voxceleb1_test/enroll.list $obj_dir/voxceleb1_enroll

elif [[ "$test_set" == "voxceleb1_O_test" && "$enroll_set" == "voxceleb1_O_enroll" ]];then
subtools/recipe/voxcelebSRC/prepare_task_for_scoring.sh --force $force --prefix $prefix --tasks voxceleb1-O --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1_E_test" && "$enroll_set" == "voxceleb1_E_enroll" ]];then
subtools/recipe/voxcelebSRC/prepare_task_for_scoring.sh --force $force --prefix $prefix --tasks voxceleb1-E --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1_H_test" && "$enroll_set" == "voxceleb1_H_enroll" ]];then
subtools/recipe/voxcelebSRC/prepare_task_for_scoring.sh --force $force --prefix $prefix --tasks voxceleb1-H --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1_O_clean_test" && "$enroll_set" == "voxceleb1_O_clean_enroll" ]];then
subtools/recipe/voxcelebSRC/prepare_task_for_scoring.sh --force $force --prefix $prefix --tasks voxceleb1-O-clean --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1_E_clean_test" && "$enroll_set" == "voxceleb1_E_clean_enroll" ]];then
subtools/recipe/voxcelebSRC/prepare_task_for_scoring.sh --force $force --prefix $prefix --tasks voxceleb1-E-clean --vectordir $obj_dir
elif [[ "$test_set" == "voxceleb1_H_clean_test" && "$enroll_set" == "voxceleb1_H_clean_enroll" ]];then
subtools/recipe/voxcelebSRC/prepare_task_for_scoring.sh --force $force --prefix $prefix --tasks voxceleb1-H-clean --vectordir $obj_dir
fi

[[ "$force" == "true" || ! -f $obj_dir/$name.eer ]] && \
Expand All @@ -121,6 +110,30 @@ for position in $positions;do
--lda-data-config "$lda_data_config" --submean-data-config "$submean_data_config" --plda-trainset $train_set

if [[ "$score_norm" == "true" && -f $obj_dir/$name.score ]];then
if [ "$cohort_set" == "" ];then
[[ "$force" == "true" ]] && rm -rf data/$prefix/$cohort_set
if [ "$cohort_method" == "sub" ];then
cohort_set=${cohort_set_from}_cohort_sub_${sub_num}$sub_option
[ ! -d data/$prefix/$cohort_set ] && subtools/kaldi/utils/subset_data_dir.sh $sub_option \
data/$prefix/$cohort_set_from $sub_num data/$prefix/$cohort_set
elif [ "$cohort_method" == "mean" ];then
cohort_set=${cohort_set_from}_cohort_mean
[ ! -d data/$prefix/$cohort_set ] && mkdir -p data/$prefix/$cohort_set && \
awk '{print $1,$1}' data/$prefix/$cohort_set_from/spk2utt > data/$prefix/$cohort_set/spk2utt && \
awk '{print $1,$1}' data/$prefix/$cohort_set_from/spk2utt > data/$prefix/$cohort_set/utt2spk
fi
fi

[ ! -f data/$prefix/$cohort_set/utt2spk ] && echo "Expected cohort_set to exist." && exit 1
[ "$force" == "true" ] && rm -rf data/$prefix/$cohort_set/enroll.list data/$prefix/$cohort_set/test.list \
data/$prefix/$cohort_set/enroll.cohort.trials data/$prefix/$cohort_set/test.cohort.trials

[ ! -f data/$prefix/$cohort_set/enroll.list ] && awk '{print $1}' $trials | sort -u > data/$prefix/$cohort_set/enroll.list
[ ! -f data/$prefix/$cohort_set/test.list ] && awk '{print $2}' $trials | sort -u > data/$prefix/$cohort_set/test.list
[ ! -f data/$prefix/$cohort_set/enroll.cohort.trials ] && sh subtools/getTrials.sh 3 data/$prefix/$cohort_set/enroll.list \
data/$prefix/$cohort_set/utt2spk data/$prefix/$cohort_set/enroll.cohort.trials
[ ! -f data/$prefix/$cohort_set/test.cohort.trials ] && sh subtools/getTrials.sh 3 data/$prefix/$cohort_set/test.list \
data/$prefix/$cohort_set/utt2spk data/$prefix/$cohort_set/test.cohort.trials

[[ "$force" == "true" ]] && rm -rf $obj_dir/$cohort_set
if [ "$cohort_method" == "sub" ];then
Expand Down Expand Up @@ -166,7 +179,7 @@ for position in $positions;do
$obj_dir/$enroll_cohort_name.score $obj_dir/$test_cohort_name.score \
$obj_dir/$output_name.score

#[ ! -f "$obj_dir/$output_name.eer" ] && \
[ ! -f "$obj_dir/$output_name.eer" ] && \
subtools/computeEER.sh --write-file $obj_dir/$output_name.eer $obj_dir/$output_name.score 3 $trials 3

eer=""
Expand Down
36 changes: 36 additions & 0 deletions recipe/voxceleb/prepare/get_trials.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# Copyright xmuspeech (Author: Snowdar 2020-05-04)

# Fetch the VoxCeleb1 verification lists (if they are not already in $dir) and
# convert each of them to a trials file $dir/<task>.trials whose lines have the
# form "<enroll-id> <test-id> target|nontarget".
#
# Variables (set before parse_options.sh is sourced; presumably overridable
# from the command line as --dir / --tasks -- confirm against parse_options.sh):
#   dir    directory that holds the downloaded lists and the generated trials
#   tasks  space-separated subset of voxceleb1-O/E/H[-clean]

dir=data/voxceleb1
tasks="voxceleb1-O voxceleb1-O-clean voxceleb1-E voxceleb1-E-clean voxceleb1-H voxceleb1-H-clean"

. subtools/parse_options.sh
. subtools/path.sh

[ "$dir" == "" ] && echo "[exit] Expected a dir to save trials, but got nothing." && exit 1

mkdir -p "$dir"

for task in $tasks;do
    # Map the task name to the list file published with the corpus.
    name=""
    case "$task" in
        voxceleb1-O)       name="veri_test.txt" ;;
        voxceleb1-O-clean) name="veri_test2.txt" ;;
        voxceleb1-H)       name="list_test_hard.txt" ;;
        voxceleb1-H-clean) name="list_test_hard2.txt" ;;
        voxceleb1-E)       name="list_test_all.txt" ;;
        voxceleb1-E-clean) name="list_test_all2.txt" ;;
    esac

    [ "$name" == "" ] && echo "The $task task is invalid here. Please select from voxceleb1-O/E/H[-clean]." && exit 1

    if [ ! -f "$dir/$name" ];then
        echo "The $dir/$name is not exist, so download it now...(If failed, download the list from" \
             "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta by yourself.)"
        # Remove a partially downloaded list if the user interrupts the download.
        trap "rm -f $dir/$name && exit 1" INT
        # Use a { } group rather than ( ): an "exit" inside ( ) would only leave
        # the subshell, and the script would go on to build an empty trials file.
        wget -P "$dir" "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/$name" || { rm -f "$dir/$name"; exit 1; }
        trap - INT
    fi

    # List line: "<label> <spk>/<rec>/<utt>.wav <spk>/<rec>/<utt>.wav".
    # Turn the paths into utterance ids ("/" -> "-", drop ".wav") and map the
    # label "1" to "target", anything else to "nontarget".
    sed 's/\//-/g;s/\.wav//g' "$dir/$name" | awk '{if($1=="1"){print $2,$3,"target"}else{print $2,$3,"nontarget"}}' > "$dir/${task}.trials"
    echo "Generate $dir/${task}.trials done."
done
125 changes: 125 additions & 0 deletions recipe/voxceleb/prepare/make_voxceleb1_v2.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/perl
#
# Copyright   2018  Ewald Enzinger
#             2018  David Snyder
#             2019  Soonshin Seo
#
# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev
#
# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format.
# The script 'make_voxceleb1.pl' works for the oldest version of the corpus.
# This script should be used if you've downloaded the corpus recently.
#
# Walks <path-to-voxceleb1>/<dataset>/wav/<speaker>/<recording>/*.wav and writes
# utt2spk and wav.scp (utterance id: "<speaker>-<recording>-<file>") into the
# output directory. For the "test" dataset it additionally converts
# voxceleb1_test_v2.txt (downloaded if absent) into a "trials" file.
# Finally spk2utt is generated and the directory is fixed and validated.

use strict;
use warnings;

# Return the names of the sub-directories of $dir, excluding "." and "..".
sub list_subdirs {
  my ($dir) = @_;
  opendir(my $dh, $dir) or die "Cannot open directory: $!";
  my @subdirs = grep { -d "$dir/$_" && !/^\.{1,2}$/ } readdir($dh);
  closedir($dh);
  return @subdirs;
}

# Return the base names (".wav" removed) of the .wav files found in $dir.
sub list_wav_names {
  my ($dir) = @_;
  opendir(my $dh, $dir) or die "Cannot open directory: $!";
  my @names = map { (my $base = $_) =~ s/\.wav$//; $base } grep { /\.wav$/ } readdir($dh);
  closedir($dh);
  return @names;
}

# Write one wav.scp line and one utt2spk line per .wav file found below
# $wav_root/<speaker>/<recording>/ for every speaker listed in @$spkr_dirs.
sub write_utt_lists {
  my ($wav_root, $spkr_dirs, $wav_fh, $spkr_fh) = @_;
  foreach my $spkr_id (@$spkr_dirs) {
    foreach my $rec_id (list_subdirs("$wav_root/$spkr_id")) {
      foreach my $name (list_wav_names("$wav_root/$spkr_id/$rec_id")) {
        my $wav = "$wav_root/$spkr_id/$rec_id/$name.wav";
        my $utt_id = "$spkr_id-$rec_id-$name";
        print {$wav_fh} "$utt_id $wav\n";
        print {$spkr_fh} "$utt_id $spkr_id\n";
      }
    }
  }
  return;
}

# Convert a trial path "<speaker>/<recording>/<file>.wav" to the utterance id
# "<speaker>-<recording>-<file>".
sub path_to_utt_id {
  my ($path) = @_;
  my ($spkr_id, $rec_id, $name) = split('/', $path);
  $name =~ s/\.wav$//g;
  return "$spkr_id-$rec_id-$name";
}

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-voxceleb1> <dataset> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n";
  exit(1);
}

my ($data_base, $dataset, $out_dir) = @ARGV;

if ($dataset ne "dev" && $dataset ne "test") {
  die "dataset parameter must be 'dev' or 'test'!";
}

# List form: the directory name is passed to mkdir without going through a shell.
if (system("mkdir", "-p", $out_dir) != 0) {
  die "Error making directory $out_dir";
}

my $wav_root = "$data_base/$dataset/wav";
# Read the speaker directories before any output file is created, so a wrong
# corpus path fails without leaving empty files behind.
my @spkr_dirs = list_subdirs($wav_root);

if ($dataset eq "dev") {
  open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
  open(my $wav_fh, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";

  write_utt_lists($wav_root, \@spkr_dirs, $wav_fh, $spkr_fh);

  close($spkr_fh) or die;
  close($wav_fh) or die;
}

if ($dataset eq "test") {
  my $trials_list = "$data_base/voxceleb1_test_v2.txt";
  if (! -e $trials_list) {
    # "wget -O" leaves an empty file behind when the download fails; remove it
    # so that neither this run nor a later one mistakes it for a valid list.
    if (system("wget", "-O", $trials_list, "http://www.openslr.org/resources/49/voxceleb1_test_v2.txt") != 0) {
      unlink($trials_list);
      die "Error downloading $trials_list";
    }
  }

  open(my $trial_in, "<", $trials_list) or die "could not open the verification trials file $trials_list";
  open(my $trial_out, ">", "$out_dir/trials") or die "Could not open the output file $out_dir/trials";
  open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk";
  open(my $wav_fh, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp";

  # Input line:  "<1|0> <path1> <path2>"
  # Output line: "<utt_id1> <utt_id2> target|nontarget"
  while (my $line = <$trial_in>) {
    chomp $line;
    next if $line =~ /^\s*$/;  # ignore blank lines instead of emitting a bogus trial
    my ($tar_or_non, $path1, $path2) = split(' ', $line);
    my $utt_id1 = path_to_utt_id($path1);
    my $utt_id2 = path_to_utt_id($path2);
    my $target = ($tar_or_non eq "1") ? "target" : "nontarget";
    print {$trial_out} "$utt_id1 $utt_id2 $target\n";
  }

  write_utt_lists($wav_root, \@spkr_dirs, $wav_fh, $spkr_fh);

  close($spkr_fh) or die;
  close($wav_fh) or die;
  close($trial_out) or die;
  close($trial_in) or die;
}

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
# The exit status of fix_data_dir.sh is not checked (as in the original); the
# validation below is what decides whether the directory is usable.
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
70 changes: 70 additions & 0 deletions recipe/voxceleb/prepare/make_voxceleb2.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#
# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev
#
# Note: This script requires ffmpeg to be installed and its location included in $PATH.
#
# Walks <path-to-voxceleb2>/<dataset>/aac/<speaker>/<recording>/*.m4a and writes
# utt2spk and wav.scp (utterance id: "<speaker>-<recording>-<file>") into the
# output directory. Each wav.scp entry is an ffmpeg command ending in "|" that
# writes the .m4a file as 16-bit PCM wav to its standard output.
# Finally spk2utt is generated and the directory is fixed and validated.

use strict;
use warnings;

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-voxceleb2> <dataset> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n";
  exit(1);
}

# Check that ffmpeg is installed.
if (`which ffmpeg` eq "") {
  die "Error: this script requires that ffmpeg is installed.";
}

my ($data_base, $dataset, $out_dir) = @ARGV;

if ($dataset ne "dev" && $dataset ne "test") {
  die "dataset parameter must be 'dev' or 'test'!";
}

my $aac_root = "$data_base/$dataset/aac";

# Read the speaker directories before the output directory is created, so a
# wrong corpus path fails without leaving anything behind.
opendir(my $spkr_dh, $aac_root) or die "Cannot open directory: $!";
my @spkr_dirs = grep { -d "$aac_root/$_" && !/^\.{1,2}$/ } readdir($spkr_dh);
closedir($spkr_dh);

# List form: the directory name is passed to mkdir without going through a shell.
if (system("mkdir", "-p", $out_dir) != 0) {
  die "Error making directory $out_dir";
}

open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(my $wav_fh, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";

foreach my $spkr_id (@spkr_dirs) {
  opendir(my $rec_dh, "$aac_root/$spkr_id/") or die "Cannot open directory: $!";
  my @rec_dirs = grep { -d "$aac_root/$spkr_id/$_" && !/^\.{1,2}$/ } readdir($rec_dh);
  closedir($rec_dh);

  foreach my $rec_id (@rec_dirs) {
    opendir(my $file_dh, "$aac_root/$spkr_id/$rec_id/") or die "Cannot open directory: $!";
    # Keep only the .m4a files and strip that extension.
    my @names = map { (my $base = $_) =~ s/\.m4a$//; $base } grep { /\.m4a$/ } readdir($file_dh);
    closedir($file_dh);

    foreach my $name (@names) {
      # The trailing "|" marks the wav.scp entry as a command whose output is the audio.
      my $wav = "ffmpeg -v 8 -i $aac_root/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|";
      my $utt_id = "$spkr_id-$rec_id-$name";
      print {$wav_fh} "$utt_id $wav\n";
      print {$spkr_fh} "$utt_id $spkr_id\n";
    }
  }
}
close($spkr_fh) or die;
close($wav_fh) or die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
# The exit status of fix_data_dir.sh is not checked (as in the original); the
# validation below is what decides whether the directory is usable.
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
Loading

0 comments on commit 866196e

Please sign in to comment.