# References

Please cite the following if you make use of the code.

```bibtex
@inproceedings{chung2020in,
  title={In defence of metric learning for speaker recognition},
  author={Chung, Joon Son and Huh, Jaesung and Mun, Seongkyu and Lee, Minjae and Heo, Hee Soo and Choe, Soyeon and Ham, Chiheon and Jung, Sunghwan and Lee, Bong-Jin and Han, Icksang},
  booktitle={Interspeech},
  year={2020}
}
```
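
For example, a minimal LaTeX sketch for citing the trainer, assuming the entry above is saved in a `references.bib` file (the filename and bibliography style below are only illustrative):

```latex
% Minimal sketch for citing the trainer from a LaTeX document.
% Assumes the BibTeX entry above has been saved in references.bib (illustrative name).
\documentclass{article}

\begin{document}
Our experiments build on the speaker recognition trainer of~\cite{chung2020in}.

\bibliographystyle{IEEEtran} % any installed BibTeX style works
\bibliography{references}    % references.bib containing the entries on this page
\end{document}
```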

This trainer uses a number of models and loss functions proposed in prior work. The suggested citations are as follows:

## Models

### VGGVox

```bibtex
@inproceedings{nagrani2017voxceleb,
  title={VoxCeleb: A Large-Scale Speaker Identification Dataset},
  author={Nagrani, Arsha and Chung, Joon Son and Zisserman, Andrew},
  booktitle={Interspeech},
  pages={2616--2620},
  year={2017}
}
```

### ResNet

```bibtex
@inproceedings{he2016deep,
  title={Deep residual learning for image recognition},
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
  pages={770--778},
  year={2016}
}
```

## Aggregation

### SAP

```bibtex
@inproceedings{bhattacharya2017deep,
  title={Deep Speaker Embeddings for Short-Duration Speaker Verification},
  author={Bhattacharya, Gautam and Alam, Md Jahangir and Kenny, Patrick},
  booktitle={Interspeech},
  pages={1517--1521},
  year={2017}
}
```

### ASP

```bibtex
@inproceedings{okabe2018attentive,
  title={Attentive Statistics Pooling for Deep Speaker Embedding},
  author={Okabe, Koji and Koshinaka, Takafumi and Shinoda, Koichi},
  booktitle={Interspeech},
  pages={2252--2256},
  year={2018}
}
```

## Loss functions

### Prototypical Networks

```bibtex
@inproceedings{snell2017prototypical,
  title={Prototypical networks for few-shot learning},
  author={Snell, Jake and Swersky, Kevin and Zemel, Richard},
  booktitle={Advances in Neural Information Processing Systems},
  pages={4077--4087},
  year={2017}
}
```

### GE2E

```bibtex
@inproceedings{wan2018generalized,
  title={Generalized end-to-end loss for speaker verification},
  author={Wan, Li and Wang, Quan and Papir, Alan and Moreno, Ignacio Lopez},
  booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
  pages={4879--4883},
  year={2018}
}
```

### Triplet loss

```bibtex
@inproceedings{schroff2015facenet,
  title={Facenet: A unified embedding for face recognition and clustering},
  author={Schroff, Florian and Kalenichenko, Dmitry and Philbin, James},
  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
  pages={815--823},
  year={2015}
}
```

### AM-Softmax

```bibtex
@inproceedings{wang2018cosface,
  title={Cosface: Large margin cosine loss for deep face recognition},
  author={Wang, Hao and Wang, Yitong and Zhou, Zheng and Ji, Xing and Gong, Dihong and Zhou, Jingchao and Li, Zhifeng and Liu, Wei},
  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
  pages={5265--5274},
  year={2018}
}
```

### AAM-Softmax

```bibtex
@inproceedings{deng2019arcface,
  title={Arcface: Additive angular margin loss for deep face recognition},
  author={Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos},
  booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
  pages={4690--4699},
  year={2019}
}
```

## Data augmentation

### MUSAN database

```bibtex
@article{snyder2015musan,
  title={Musan: A music, speech, and noise corpus},
  author={Snyder, David and Chen, Guoguo and Povey, Daniel},
  journal={arXiv preprint arXiv:1510.08484},
  year={2015}
}
```

### Room Impulse Response database

```bibtex
@inproceedings{ko2017study,
  title={A study on data augmentation of reverberant speech for robust speech recognition},
  author={Ko, Tom and Peddinti, Vijayaditya and Povey, Daniel and Seltzer, Michael L and Khudanpur, Sanjeev},
  booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
  pages={5220--5224},
  year={2017}
}
```