From b7adb84f34965cb8fea5c21e2fa8f5a871552ddf Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Fri, 5 Mar 2021 10:33:10 +0100 Subject: [PATCH] Add MTEDx valid and test data Add MTEDx valid and test data, update ChangeLog Co-authored-by: Matt Post Co-authored-by: Ozan Caglayan --- CHANGELOG.md | 3 ++- sacrebleu/dataset.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25a83a6b..9fac42bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ # VERSION HISTORY -- 1.5.1 (2021-03-04) +- 1.5.1 (2021-03-05) - Fix extraction error for WMT18 extra test sets (test-ts) (#142) + - Validation and test datasets are added for multilingual TEDx - 1.5.0 (2021-01-15) - Fix an assertion error in chrF (#121) diff --git a/sacrebleu/dataset.py b/sacrebleu/dataset.py index 16219d6c..ba53ea84 100644 --- a/sacrebleu/dataset.py +++ b/sacrebleu/dataset.py @@ -22,6 +22,44 @@ # Many of these are *.sgm files, which are processed to produced plain text that can be used by this script. # The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET} DATASETS = { + "mtedx/valid": { + "data": ['https://raw.githubusercontent.com/esalesky/mtedx-eval/main/valid.tar.gz'], + "description": 'mTEDx evaluation data, valid: http://openslr.org/100', + "citation": "@misc{salesky2021multilingual,\n title={The Multilingual TEDx Corpus for Speech Recognition and Translation}, \n author={Elizabeth Salesky and Matthew Wiesner and Jacob Bremerman and Roldano Cattoni and Matteo Negri and Marco Turchi and Douglas W. Oard and Matt Post},\n year={2021},\n eprint={2102.01757},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}", + "md5": ['866e44c55657c6c485ef96a244c0fe07'], + "el-en": ['valid/mtedx-valid-elen.el','valid/mtedx-valid-elen.en'], + "es-en": ['valid/mtedx-valid-esen.es','valid/mtedx-valid-esen.en'], + "es-fr": ['valid/mtedx-valid-esfr.es','valid/mtedx-valid-esfr.fr'], + "es-it": ['valid/mtedx-valid-esit.es','valid/mtedx-valid-esit.it'], + "es-pt": ['valid/mtedx-valid-espt.es','valid/mtedx-valid-espt.pt'], + "fr-en": ['valid/mtedx-valid-fren.fr','valid/mtedx-valid-fren.en'], + "fr-es": ['valid/mtedx-valid-fres.fr','valid/mtedx-valid-fres.es'], + "fr-pt": ['valid/mtedx-valid-frpt.fr','valid/mtedx-valid-frpt.pt'], + "it-en": ['valid/mtedx-valid-iten.it','valid/mtedx-valid-iten.en'], + "it-es": ['valid/mtedx-valid-ites.it','valid/mtedx-valid-ites.es'], + "pt-en": ['valid/mtedx-valid-pten.pt','valid/mtedx-valid-pten.en'], + "pt-es": ['valid/mtedx-valid-ptes.pt','valid/mtedx-valid-ptes.es'], + "ru-en": ['valid/mtedx-valid-ruen.ru','valid/mtedx-valid-ruen.en'] + }, + "mtedx/test": { + "data": ['https://raw.githubusercontent.com/esalesky/mtedx-eval/main/test.tar.gz'], + "description": 'mTEDx evaluation data, test: http://openslr.org/100', + "citation": "@misc{salesky2021multilingual,\n title={The Multilingual TEDx Corpus for Speech Recognition and Translation}, \n author={Elizabeth Salesky and Matthew Wiesner and Jacob Bremerman and Roldano Cattoni and Matteo Negri and Marco Turchi and Douglas W. Oard and Matt Post},\n year={2021},\n eprint={2102.01757},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}", + "md5": ['ac79c5a5ab1b615eecd6dfbc163c0588'], + "el-en": ['test/mtedx-test-elen.el','test/mtedx-test-elen.en'], + "es-en": ['test/mtedx-test-esen.es','test/mtedx-test-esen.en'], + "es-fr": ['test/mtedx-test-esfr.es','test/mtedx-test-esfr.fr'], + "es-it": ['test/mtedx-test-esit.es','test/mtedx-test-esit.it'], + "es-pt": ['test/mtedx-test-espt.es','test/mtedx-test-espt.pt'], + "fr-en": ['test/mtedx-test-fren.fr','test/mtedx-test-fren.en'], + "fr-es": ['test/mtedx-test-fres.fr','test/mtedx-test-fres.es'], + "fr-pt": ['test/mtedx-test-frpt.fr','test/mtedx-test-frpt.pt'], + "it-en": ['test/mtedx-test-iten.it','test/mtedx-test-iten.en'], + "it-es": ['test/mtedx-test-ites.it','test/mtedx-test-ites.es'], + "pt-en": ['test/mtedx-test-pten.pt','test/mtedx-test-pten.en'], + "pt-es": ['test/mtedx-test-ptes.pt','test/mtedx-test-ptes.es'], + "ru-en": ['test/mtedx-test-ruen.ru','test/mtedx-test-ruen.en'] + }, "wmt20/robust/set1": { "data": ["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"], 'description': "WMT20 robustness task, set 1",