Skip to content

Commit

Permalink
Add MTEDx valid and test data
Browse files Browse the repository at this point in the history
Add MTEDx valid and test data, update ChangeLog

Co-authored-by: Matt Post <[email protected]>
Co-authored-by: Ozan Caglayan <[email protected]>
  • Loading branch information
3 people authored Mar 5, 2021
1 parent 5ce6b06 commit b7adb84
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 1 deletion.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# VERSION HISTORY

- 1.5.1 (2021-03-04)
- 1.5.1 (2021-03-05)
- Fix extraction error for WMT18 extra test sets (test-ts) (#142)
- Add validation and test datasets for multilingual TEDx

- 1.5.0 (2021-01-15)
- Fix an assertion error in chrF (#121)
Expand Down
38 changes: 38 additions & 0 deletions sacrebleu/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,44 @@
# Many of these are *.sgm files, which are processed to produce plain text that can be used by this script.
# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
DATASETS = {
"mtedx/valid": {
"data": ['https://raw.githubusercontent.com/esalesky/mtedx-eval/main/valid.tar.gz'],
"description": 'mTEDx evaluation data, valid: http://openslr.org/100',
"citation": "@misc{salesky2021multilingual,\n title={The Multilingual TEDx Corpus for Speech Recognition and Translation}, \n author={Elizabeth Salesky and Matthew Wiesner and Jacob Bremerman and Roldano Cattoni and Matteo Negri and Marco Turchi and Douglas W. Oard and Matt Post},\n year={2021},\n eprint={2102.01757},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}",
"md5": ['866e44c55657c6c485ef96a244c0fe07'],
"el-en": ['valid/mtedx-valid-elen.el','valid/mtedx-valid-elen.en'],
"es-en": ['valid/mtedx-valid-esen.es','valid/mtedx-valid-esen.en'],
"es-fr": ['valid/mtedx-valid-esfr.es','valid/mtedx-valid-esfr.fr'],
"es-it": ['valid/mtedx-valid-esit.es','valid/mtedx-valid-esit.it'],
"es-pt": ['valid/mtedx-valid-espt.es','valid/mtedx-valid-espt.pt'],
"fr-en": ['valid/mtedx-valid-fren.fr','valid/mtedx-valid-fren.en'],
"fr-es": ['valid/mtedx-valid-fres.fr','valid/mtedx-valid-fres.es'],
"fr-pt": ['valid/mtedx-valid-frpt.fr','valid/mtedx-valid-frpt.pt'],
"it-en": ['valid/mtedx-valid-iten.it','valid/mtedx-valid-iten.en'],
"it-es": ['valid/mtedx-valid-ites.it','valid/mtedx-valid-ites.es'],
"pt-en": ['valid/mtedx-valid-pten.pt','valid/mtedx-valid-pten.en'],
"pt-es": ['valid/mtedx-valid-ptes.pt','valid/mtedx-valid-ptes.es'],
"ru-en": ['valid/mtedx-valid-ruen.ru','valid/mtedx-valid-ruen.en']
},
"mtedx/test": {
"data": ['https://raw.githubusercontent.com/esalesky/mtedx-eval/main/test.tar.gz'],
"description": 'mTEDx evaluation data, test: http://openslr.org/100',
"citation": "@misc{salesky2021multilingual,\n title={The Multilingual TEDx Corpus for Speech Recognition and Translation}, \n author={Elizabeth Salesky and Matthew Wiesner and Jacob Bremerman and Roldano Cattoni and Matteo Negri and Marco Turchi and Douglas W. Oard and Matt Post},\n year={2021},\n eprint={2102.01757},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}",
"md5": ['ac79c5a5ab1b615eecd6dfbc163c0588'],
"el-en": ['test/mtedx-test-elen.el','test/mtedx-test-elen.en'],
"es-en": ['test/mtedx-test-esen.es','test/mtedx-test-esen.en'],
"es-fr": ['test/mtedx-test-esfr.es','test/mtedx-test-esfr.fr'],
"es-it": ['test/mtedx-test-esit.es','test/mtedx-test-esit.it'],
"es-pt": ['test/mtedx-test-espt.es','test/mtedx-test-espt.pt'],
"fr-en": ['test/mtedx-test-fren.fr','test/mtedx-test-fren.en'],
"fr-es": ['test/mtedx-test-fres.fr','test/mtedx-test-fres.es'],
"fr-pt": ['test/mtedx-test-frpt.fr','test/mtedx-test-frpt.pt'],
"it-en": ['test/mtedx-test-iten.it','test/mtedx-test-iten.en'],
"it-es": ['test/mtedx-test-ites.it','test/mtedx-test-ites.es'],
"pt-en": ['test/mtedx-test-pten.pt','test/mtedx-test-pten.en'],
"pt-es": ['test/mtedx-test-ptes.pt','test/mtedx-test-ptes.es'],
"ru-en": ['test/mtedx-test-ruen.ru','test/mtedx-test-ruen.en']
},
"wmt20/robust/set1": {
"data": ["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
'description': "WMT20 robustness task, set 1",
Expand Down

0 comments on commit b7adb84

Please sign in to comment.