add conformer-xv (#59)

* fix data processing * add margin_warm, support chunksize to extract xv * add conformer-xv * add large-margin fine-tune * fix name Co-authored-by: YANOrange <[email protected]>
Snowdar · Nov 13, 2022 · 8dbafa6 · 8dbafa6
1 parent ea4db02
commit 8dbafa6
Show file tree

Hide file tree

Showing 51 changed files with 8,254 additions and 790 deletions.
diff --git a/conf/egs_pre_sre_lm.yaml b/conf/egs_pre_sre_lm.yaml
@@ -0,0 +1,85 @@
+# Copyright xmuspeech (Author: Leo 2022-01-23)
+feat_dim: 80 # the num_mel_bins of fbank and the num_ceps of mfcc
+data_type: 'shard'  # shard or raw
+# feature extraction
+dataset_conf:
+    # asv_target: true
+    filter: false
+    filter_conf:
+        max_length: 15.0
+        min_length: 0.2
+        max_cut: true
+    # resample
+    resample: false
+    resample_conf: 
+        resample_rate: 16000
+
+    # pre speed_perturb
+    pre_speed_perturb: false
+    perturb_conf:
+        speeds: [90, 100, 110]  # larger->slower
+        sample_rate: 16000
+    # random_chunk
+    random_chunk: true
+    random_chunk_size: 6.015
+
+    # waveform level config
+    speech_aug: true
+    speech_aug_conf: subtools/conf/speech_aug_lm.yaml
+    csv_aug_folder: ''
+
+    # There seems to be a bug: DO NOT set dither and use_energy together.
+    feature_extraction_conf:
+        # feature_type: 'mfcc'
+        # kaldi_featset:
+        #     num_ceps: 23
+        #     num_mel_bins: 23
+        #     frame_shift: 10
+        #     frame_length: 25
+        #     low_freq: 40.0
+        #     high_freq: -200
+        #     energy_floor: 0.0
+        #     dither: 0.0  # conflicted with use_energy=true.
+        #     use_energy: true  # if you want use energy-based vad, set it true.
+
+        feature_type: 'fbank'
+        kaldi_featset:
+            num_mel_bins: 80
+            frame_shift: 10
+            frame_length: 25
+            low_freq: 40
+            high_freq: -200
+            energy_floor: 0.0
+            use_energy: false
+
+        mean_var_conf:
+            mean_norm: true
+            std_norm: false
+
+
+
+    # spec level config
+    spec_aug: false
+    spec_aug_conf:
+        aug: specaugment # None or specaugment
+        aug_params:
+            frequency: 0.2
+            frame: 0.2
+            rows: 4
+            cols: 4
+            random_rows: true
+            random_cols: true
+
+
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 3000
+    batch_conf:
+        batch_type: 'static' # static or dynamic
+        batch_size: 64
+
+# attention: Do not specify batch size in dataloader.
+data_loader_conf:
+    num_workers: 8
+    pin_memory: false
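+    prefetch_factor: 20 # rule of thumb from the reference setup: pf(400) * bs(16) is about 2 shards of 3000 samples each; here pf=20 with batch_size=64 (TODO confirm the intended prefetch depth).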
diff --git a/conf/egs_pre_sre_transformer.yaml b/conf/egs_pre_sre_transformer.yaml
@@ -0,0 +1,85 @@
+# Copyright xmuspeech (Author: Leo 2022-01-23)
+feat_dim: 80 # the num_mel_bins of fbank and the num_ceps of mfcc
+data_type: 'shard'  # shard or raw
+# feature extraction
+dataset_conf:
+    # asv_target: true
+    filter: false
+    filter_conf:
+        max_length: 15.0
+        min_length: 0.2
+        max_cut: true
+    # resample
+    resample: false
+    resample_conf: 
+        resample_rate: 16000
+
+    # pre speed_perturb
+    pre_speed_perturb: false
+    perturb_conf:
+        speeds: [90, 100, 110]  # larger->slower
+        sample_rate: 16000
+    # random_chunk
+    random_chunk: true
+    random_chunk_size: 3.015
+
+    # waveform level config
+    speech_aug: true
+    speech_aug_conf: subtools/conf/speech_aug_transformer.yaml
+    csv_aug_folder: ''
+
+    # There seems to be a bug: DO NOT set dither and use_energy together.
+    feature_extraction_conf:
+        # feature_type: 'mfcc'
+        # kaldi_featset:
+        #     num_ceps: 23
+        #     num_mel_bins: 23
+        #     frame_shift: 10
+        #     frame_length: 25
+        #     low_freq: 40.0
+        #     high_freq: -200
+        #     energy_floor: 0.0
+        #     dither: 0.0  # conflicted with use_energy=true.
+        #     use_energy: true  # if you want use energy-based vad, set it true.
+
+        feature_type: 'fbank'
+        kaldi_featset:
+            num_mel_bins: 80
+            frame_shift: 10
+            frame_length: 25
+            low_freq: 40
+            high_freq: -200
+            energy_floor: 0.0
+            use_energy: false
+
+        mean_var_conf:
+            mean_norm: true
+            std_norm: false
+
+
+
+    # spec level config
+    spec_aug: false
+    spec_aug_conf:
+        aug: specaugment # None or specaugment
+        aug_params:
+            frequency: 0.2
+            frame: 0.2
+            rows: 4
+            cols: 4
+            random_rows: true
+            random_cols: true
+
+
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 3000
+    batch_conf:
+        batch_type: 'static' # static or dynamic
+        batch_size: 128
+
+# attention: Do not specify batch size in dataloader.
+data_loader_conf:
+    num_workers: 8
+    pin_memory: false
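+    prefetch_factor: 50 # rule of thumb from the reference setup: pf(400) * bs(16) is about 2 shards of 3000 samples each; here pf=50 with batch_size=128 (TODO confirm the intended prefetch depth).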
diff --git a/conf/speech_aug_lm.yaml b/conf/speech_aug_lm.yaml
@@ -0,0 +1,73 @@
+speechaug:
+    mod: random   # chain,concat,random
+    aug_classes:
+        - 
+            aug_name: add_noise  # Define the speech augment name
+            aug_type: Env        # Env or Time
+            random_mod_weight: 0.5
+            reverb_prob: 0.0
+            noise_prob: 1.0
+            noise_snr_low: 5
+            noise_snr_high: 15
+            noise_csv: exp/aug_csv/combine_music_noise.csv
+            add_filt_min: 0.5
+            pad_noise: true
+            noise_num_workers: 0
+
+        - 
+            aug_name: add_babble_noise
+            aug_type: Env
+            random_mod_weight: 0.2
+            reverb_prob: 0.0
+            noise_prob: 0.0
+            babble_prob: 1.0
+            babble_speaker_count: 4
+            babble_snr_low: 13
+            babble_snr_high: 20
+            babble_csv: exp/aug_csv/musan_speech.csv
+            babble_noise_max_len: 8.0
+            add_filt_min: 0.5
+            pad_noise: true
+            noise_num_workers: 0
+
+        -
+            aug_name: add_rev
+            aug_type: Env
+            random_mod_weight: 0.3
+            reverb_prob: 1.0
+            noise_prob: 0.0
+            babble_prob: 0.0
+            reverb_csv: exp/aug_csv/combine_sim_small_medium_rev.csv
+            rir_scale_factor: 1.0
+
+
+        -
+            aug_name: add_rev_noise
+            aug_type: Env
+            random_mod_weight: 0.2
+            reverb_prob: 1.0
+            noise_prob: 0.5
+            noise_snr_low: 0
+            noise_snr_high: 15
+            noise_csv: exp/aug_csv/pointsrc_noise.csv
+            reverb_csv: exp/aug_csv/real_reverb.csv
+            add_filt_min: 0.5
+            pad_noise: true
+            noise_num_workers: 0
+            rir_scale_factor: 1.0
+
+# You can define more augmentation strategies here.
+# tail_speechaug:
+#     mod: chain
+#     aug_classes:
+#         - 
+#             aug_name: augment_speed
+#             aug_type: Time
+#             perturb_type: resample   # ['resample','sox_speed','sox_tempo']
+#             perturb_prob: 0.0
+#             drop_freq_prob: 0.0
+#             drop_chunk_prob: 0.0
+#             sample_rate: 16000
+#             speeds: [95, 100, 105]
+#             keep_shape: true
+#             change_spk: false
diff --git a/conf/speech_aug_transformer.yaml b/conf/speech_aug_transformer.yaml
@@ -0,0 +1,126 @@
+speechaug:
+    mod: random   # chain,concat,random
+    aug_classes:
+        - 
+            aug_name: add_noise  # Define the speech augment name
+            aug_type: Env        # Env or Time
+            random_mod_weight: 0.5
+            reverb_prob: 0.0
+            noise_prob: 1.0
+            noise_snr_low: 5
+            noise_snr_high: 15
+            noise_csv: exp/aug_csv/combine_music_noise.csv
+            noise_num_workers: 0
+
+
+
+        # - 
+        #     aug_name: add_white_noise
+        #     aug_type: Env
+        #     random_mod_weight: 1
+        #     reverb_prob: 0.0
+        #     noise_prob: 1.0
+        #     noise_snr_low: 0
+        #     noise_snr_high: 15
+        #     noise_csv: ~
+        #     noise_num_workers: 0
+
+        - 
+            aug_name: add_babble_noise
+            aug_type: Env
+            random_mod_weight: 0.2
+            reverb_prob: 0.0
+            noise_prob: 0.0
+            babble_prob: 1.0
+            babble_speaker_count: 4
+            babble_snr_low: 13
+            babble_snr_high: 20
+            babble_csv: exp/aug_csv/musan_speech.csv
+            babble_noise_max_len: 3.015
+            noise_num_workers: 0
+
+        -
+            aug_name: add_rev
+            aug_type: Env
+            random_mod_weight: 0.3
+            reverb_prob: 1.0
+            noise_prob: 0.0
+            babble_prob: 0.0
+            reverb_csv: exp/aug_csv/combine_sim_small_medium_rev.csv
+            rir_scale_factor: 1.0
+
+
+        -
+            aug_name: add_rev_noise
+            aug_type: Env
+            random_mod_weight: 0.2
+            reverb_prob: 1.0
+            noise_prob: 0.5
+            noise_snr_low: 0
+            noise_snr_high: 15
+            noise_csv: exp/aug_csv/pointsrc_noise.csv
+            reverb_csv: exp/aug_csv/real_reverb.csv
+            noise_num_workers: 0
+            rir_scale_factor: 1.0
+
+        # -
+        #     aug_name: augment_wavedrop
+        #     aug_type: Time
+        #     random_mod_weight: 1
+        #     perturb_prob: 0.0
+        #     drop_freq_prob: 1.0
+        #     drop_chunk_prob: 1.0
+        #     drop_freq_count_low: 0
+        #     drop_freq_count_high: 3
+        #     drop_chunk_count_low: 0
+        #     drop_chunk_count_high: 4
+        #     drop_chunk_length_low: 1000
+        #     drop_chunk_length_high: 2000
+        #     sample_rate: 16000
+        #     speeds: [100]
+
+        # -
+        #     aug_name: augment_speed
+        #     aug_type: Time
+        #     random_mod_weight: 1
+        #     perturb_prob: 1.0
+        #     drop_freq_prob: 1.0
+        #     drop_chunk_prob: 1.0
+        #     drop_freq_count_low: 0
+        #     drop_freq_count_high: 3
+        #     drop_chunk_count_low: 0
+        #     drop_chunk_count_high: 4
+        #     drop_chunk_length_low: 1000
+        #     drop_chunk_length_high: 2000
+        #     sample_rate: 16000
+        #     speeds: [95, 100, 105]
+        #     keep_shape: true
+
+# You can define more augmentation strategies here.
+tail_speechaug:
+    mod: chain
+    aug_classes:
+        # -
+        #     aug_name: augment_wavedrop
+        #     aug_type: Time
+        #     random_mod_weight: 1
+        #     perturb_prob: 0.0
+        #     drop_freq_prob: 1.0
+        #     drop_chunk_prob: 1.0
+        #     drop_freq_count_low: 0
+        #     drop_freq_count_high: 3
+        #     drop_chunk_count_low: 0
+        #     drop_chunk_count_high: 4
+        #     drop_chunk_length_low: 1000
+        #     drop_chunk_length_high: 2000
+        #     sample_rate: 16000
+        #     speeds: [100]
+        - 
+            aug_name: augment_speed
+            aug_type: Time
+            perturb_prob: 1.0
+            drop_freq_prob: 0.0
+            drop_chunk_prob: 0.0
+            sample_rate: 16000
+            speeds: [95, 100, 105]
+            keep_shape: true
diff --git a/doc/papers/cntrans.jpg b/doc/papers/cntrans.jpg