Please cite the following if you make use of the code.
@inproceedings{chung2020in,
  title     = {In Defence of Metric Learning for Speaker Recognition},
  author    = {Chung, Joon Son and Huh, Jaesung and Mun, Seongkyu and Lee, Minjae and Heo, Hee Soo and Choe, Soyeon and Ham, Chiheon and Jung, Sunghwan and Lee, Bong-Jin and Han, Icksang},
  booktitle = {Interspeech},
  year      = {2020},
}
This trainer uses many models and loss functions that have been proposed in previous works. The suggested citations are as follows:
@inproceedings{nagrani2017voxceleb,
  title     = {{VoxCeleb}: A Large-Scale Speaker Identification Dataset},
  author    = {Nagrani, Arsha and Chung, Joon Son and Zisserman, Andrew},
  booktitle = {Interspeech},
  pages     = {2616--2620},
  year      = {2017},
}
@inproceedings{he2016deep,
  title     = {Deep Residual Learning for Image Recognition},
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}
@inproceedings{bhattacharya2017deep,
  title     = {Deep Speaker Embeddings for Short-Duration Speaker Verification},
  author    = {Bhattacharya, Gautam and Alam, Md Jahangir and Kenny, Patrick},
  booktitle = {Interspeech},
  pages     = {1517--1521},
  year      = {2017},
}
@inproceedings{okabe2018attentive,
  title     = {Attentive Statistics Pooling for Deep Speaker Embedding},
  author    = {Okabe, Koji and Koshinaka, Takafumi and Shinoda, Koichi},
  booktitle = {Interspeech},
  pages     = {2252--2256},
  year      = {2018},
}
@inproceedings{snell2017prototypical,
  title     = {Prototypical Networks for Few-Shot Learning},
  author    = {Snell, Jake and Swersky, Kevin and Zemel, Richard},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {4077--4087},
  year      = {2017},
}
@inproceedings{wan2018generalized,
  title     = {Generalized End-to-End Loss for Speaker Verification},
  author    = {Wan, Li and Wang, Quan and Papir, Alan and Moreno, Ignacio Lopez},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing},
  pages     = {4879--4883},
  year      = {2018},
}
@inproceedings{schroff2015facenet,
  title     = {{FaceNet}: A Unified Embedding for Face Recognition and Clustering},
  author    = {Schroff, Florian and Kalenichenko, Dmitry and Philbin, James},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {815--823},
  year      = {2015},
}
@inproceedings{wang2018cosface,
  title     = {{CosFace}: Large Margin Cosine Loss for Deep Face Recognition},
  author    = {Wang, Hao and Wang, Yitong and Zhou, Zheng and Ji, Xing and Gong, Dihong and Zhou, Jingchao and Li, Zhifeng and Liu, Wei},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {5265--5274},
  year      = {2018},
}
@inproceedings{deng2019arcface,
  title     = {{ArcFace}: Additive Angular Margin Loss for Deep Face Recognition},
  author    = {Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {4690--4699},
  year      = {2019},
}
@article{snyder2015musan,
  title         = {{MUSAN}: A Music, Speech, and Noise Corpus},
  author        = {Snyder, David and Chen, Guoguo and Povey, Daniel},
  journal       = {arXiv preprint arXiv:1510.08484},
  eprint        = {1510.08484},
  archiveprefix = {arXiv},
  year          = {2015},
}
@inproceedings{ko2017study,
  title     = {A Study on Data Augmentation of Reverberant Speech for Robust Speech Recognition},
  author    = {Ko, Tom and Peddinti, Vijayaditya and Povey, Daniel and Seltzer, Michael L. and Khudanpur, Sanjeev},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing},
  pages     = {5220--5224},
  year      = {2017},
}