Skip to content

Commit

Permalink
add conformer-xv (#59)
Browse files Browse the repository at this point in the history
* fix data processing

* add margin_warm, support chunksize to extract xv

* add conformer-xv

* add large-margin fine-tune

* fix name

Co-authored-by: YANOrange <[email protected]>
  • Loading branch information
wangers and YanOrange1 authored Nov 13, 2022
1 parent ea4db02 commit 8dbafa6
Show file tree
Hide file tree
Showing 51 changed files with 8,254 additions and 790 deletions.
85 changes: 85 additions & 0 deletions conf/egs_pre_sre_lm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright xmuspeech (Author: Leo 2022-01-23)
feat_dim: 80 # the num_mel_bins of fbank and the num_ceps of mfcc
data_type: 'shard' # shard or raw
# feature extraction
dataset_conf:
# asv_target: true
filter: false
filter_conf:
max_length: 15.0
min_length: 0.2
max_cut: true
# resample
resample: false
resample_conf:
resample_rate: 16000

# pre speed_perturb
pre_speed_perturb: false
perturb_conf:
speeds: [90, 100, 110] # larger->slower
sample_rate: 16000
# random_chunk
random_chunk: true
random_chunk_size: 6.015

# waveform level config
speech_aug: true
speech_aug_conf: subtools/conf/speech_aug_lm.yaml
csv_aug_folder: ''

# There seems to be a bug here: do NOT set dither and use_energy together.
feature_extraction_conf:
# feature_type: 'mfcc'
# kaldi_featset:
# num_ceps: 23
# num_mel_bins: 23
# frame_shift: 10
# frame_length: 25
# low_freq: 40.0
# high_freq: -200
# energy_floor: 0.0
# dither: 0.0 # conflicts with use_energy=true.
# use_energy: true # set this to true if you want to use energy-based VAD.

feature_type: 'fbank'
kaldi_featset:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
low_freq: 40
high_freq: -200
energy_floor: 0.0
use_energy: false

mean_var_conf:
mean_norm: true
std_norm: false



# spec level config
spec_aug: false
spec_aug_conf:
aug: specaugment # None or specaugment
aug_params:
frequency: 0.2
frame: 0.2
rows: 4
cols: 4
random_rows: true
random_cols: true


shuffle: true
shuffle_conf:
shuffle_size: 3000
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 64

# attention: Do not specify batch size in dataloader.
data_loader_conf:
num_workers: 8
pin_memory: false
prefetch_factor: 20 # pf(20) * bs(64) = 1280 samples, i.e. a bit under half of a 3000-sample shard.
85 changes: 85 additions & 0 deletions conf/egs_pre_sre_transformer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright xmuspeech (Author: Leo 2022-01-23)
feat_dim: 80 # the num_mel_bins of fbank and the num_ceps of mfcc
data_type: 'shard' # shard or raw
# feature extraction
dataset_conf:
# asv_target: true
filter: false
filter_conf:
max_length: 15.0
min_length: 0.2
max_cut: true
# resample
resample: false
resample_conf:
resample_rate: 16000

# pre speed_perturb
pre_speed_perturb: false
perturb_conf:
speeds: [90, 100, 110] # larger->slower
sample_rate: 16000
# random_chunk
random_chunk: true
random_chunk_size: 3.015

# waveform level config
speech_aug: true
speech_aug_conf: subtools/conf/speech_aug_transformer.yaml
csv_aug_folder: ''

# There seems to be a bug here: do NOT set dither and use_energy together.
feature_extraction_conf:
# feature_type: 'mfcc'
# kaldi_featset:
# num_ceps: 23
# num_mel_bins: 23
# frame_shift: 10
# frame_length: 25
# low_freq: 40.0
# high_freq: -200
# energy_floor: 0.0
# dither: 0.0 # conflicts with use_energy=true.
# use_energy: true # set this to true if you want to use energy-based VAD.

feature_type: 'fbank'
kaldi_featset:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
low_freq: 40
high_freq: -200
energy_floor: 0.0
use_energy: false

mean_var_conf:
mean_norm: true
std_norm: false



# spec level config
spec_aug: false
spec_aug_conf:
aug: specaugment # None or specaugment
aug_params:
frequency: 0.2
frame: 0.2
rows: 4
cols: 4
random_rows: true
random_cols: true


shuffle: true
shuffle_conf:
shuffle_size: 3000
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 128

# attention: Do not specify batch size in dataloader.
data_loader_conf:
num_workers: 8
pin_memory: false
prefetch_factor: 50 # pf(50) * bs(128) = 6400 samples, i.e. about 2 shards of 3000 samples each.
73 changes: 73 additions & 0 deletions conf/speech_aug_lm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
speechaug:
mod: random # chain,concat,random
aug_classes:
-
aug_name: add_noise # Name of this speech augmentation
aug_type: Env # Env or Time
random_mod_weight: 0.5
reverb_prob: 0.0
noise_prob: 1.0
noise_snr_low: 5
noise_snr_high: 15
noise_csv: exp/aug_csv/combine_music_noise.csv
add_filt_min: 0.5
pad_noise: true
noise_num_workers: 0

-
aug_name: add_babble_noise
aug_type: Env
random_mod_weight: 0.2
reverb_prob: 0.0
noise_prob: 0.0
babble_prob: 1.0
babble_speaker_count: 4
babble_snr_low: 13
babble_snr_high: 20
babble_csv: exp/aug_csv/musan_speech.csv
babble_noise_max_len: 8.0
add_filt_min: 0.5
pad_noise: true
noise_num_workers: 0

-
aug_name: add_rev
aug_type: Env
random_mod_weight: 0.3
reverb_prob: 1.0
noise_prob: 0.0
babble_prob: 0.0
reverb_csv: exp/aug_csv/combine_sim_small_medium_rev.csv
rir_scale_factor: 1.0


-
aug_name: add_rev_noise
aug_type: Env
random_mod_weight: 0.2
reverb_prob: 1.0
noise_prob: 0.5
noise_snr_low: 0
noise_snr_high: 15
noise_csv: exp/aug_csv/pointsrc_noise.csv
reverb_csv: exp/aug_csv/real_reverb.csv
add_filt_min: 0.5
pad_noise: true
noise_num_workers: 0
rir_scale_factor: 1.0

# You can define more augmentation strategies here.
# tail_speechaug:
# mod: chain
# aug_classes:
# -
# aug_name: augment_speed
# aug_type: Time
# perturb_type: resample # ['resample','sox_speed','sox_tempo']
# perturb_prob: 0.0
# drop_freq_prob: 0.0
# drop_chunk_prob: 0.0
# sample_rate: 16000
# speeds: [95, 100, 105]
# keep_shape: true
# change_spk: false
126 changes: 126 additions & 0 deletions conf/speech_aug_transformer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
speechaug:
mod: random # chain,concat,random
aug_classes:
-
aug_name: add_noise # Name of this speech augmentation
aug_type: Env # Env or Time
random_mod_weight: 0.5
reverb_prob: 0.0
noise_prob: 1.0
noise_snr_low: 5
noise_snr_high: 15
noise_csv: exp/aug_csv/combine_music_noise.csv
noise_num_workers: 0



# -
# aug_name: add_white_noise
# aug_type: Env
# random_mod_weight: 1
# reverb_prob: 0.0
# noise_prob: 1.0
# noise_snr_low: 0
# noise_snr_high: 15
# noise_csv: ~
# noise_num_workers: 0

-
aug_name: add_babble_noise
aug_type: Env
random_mod_weight: 0.2
reverb_prob: 0.0
noise_prob: 0.0
babble_prob: 1.0
babble_speaker_count: 4
babble_snr_low: 13
babble_snr_high: 20
babble_csv: exp/aug_csv/musan_speech.csv
babble_noise_max_len: 3.015
noise_num_workers: 0

-
aug_name: add_rev
aug_type: Env
random_mod_weight: 0.3
reverb_prob: 1.0
noise_prob: 0.0
babble_prob: 0.0
reverb_csv: exp/aug_csv/combine_sim_small_medium_rev.csv
rir_scale_factor: 1.0


-
aug_name: add_rev_noise
aug_type: Env
random_mod_weight: 0.2
reverb_prob: 1.0
noise_prob: 0.5
noise_snr_low: 0
noise_snr_high: 15
noise_csv: exp/aug_csv/pointsrc_noise.csv
reverb_csv: exp/aug_csv/real_reverb.csv
noise_num_workers: 0
rir_scale_factor: 1.0

# -
# aug_name: augment_wavedrop
# aug_type: Time
# random_mod_weight: 1
# perturb_prob: 0.0
# drop_freq_prob: 1.0
# drop_chunk_prob: 1.0
# drop_freq_count_low: 0
# drop_freq_count_high: 3
# drop_chunk_count_low: 0
# drop_chunk_count_high: 4
# drop_chunk_length_low: 1000
# drop_chunk_length_high: 2000
# sample_rate: 16000
# speeds: [100]

# -
# aug_name: augment_speed
# aug_type: Time
# random_mod_weight: 1
# perturb_prob: 1.0
# drop_freq_prob: 1.0
# drop_chunk_prob: 1.0
# drop_freq_count_low: 0
# drop_freq_count_high: 3
# drop_chunk_count_low: 0
# drop_chunk_count_high: 4
# drop_chunk_length_low: 1000
# drop_chunk_length_high: 2000
# sample_rate: 16000
# speeds: [95, 100, 105]
# keep_shape: true

# You can define more augmentation strategies here.
tail_speechaug:
mod: chain
aug_classes:
# -
# aug_name: augment_wavedrop
# aug_type: Time
# random_mod_weight: 1
# perturb_prob: 0.0
# drop_freq_prob: 1.0
# drop_chunk_prob: 1.0
# drop_freq_count_low: 0
# drop_freq_count_high: 3
# drop_chunk_count_low: 0
# drop_chunk_count_high: 4
# drop_chunk_length_low: 1000
# drop_chunk_length_high: 2000
# sample_rate: 16000
# speeds: [100]
-
aug_name: augment_speed
aug_type: Time
perturb_prob: 1.0
drop_freq_prob: 0.0
drop_chunk_prob: 0.0
sample_rate: 16000
speeds: [95, 100, 105]
keep_shape: true
Binary file added doc/papers/cntrans.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 8dbafa6

Please sign in to comment.