Skip to content

Commit

Permalink
neuMARCO (#181)
Browse files Browse the repository at this point in the history
* neumarco - wip

* added documentation
  • Loading branch information
seanmacavaney authored Mar 17, 2022
1 parent c348cd4 commit 765c606
Show file tree
Hide file tree
Showing 6 changed files with 290 additions and 0 deletions.
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from . import msmarco_passage
from . import msmarco_passage_v2
from . import msmarco_qna
from . import neumarco
from . import nfcorpus
from . import natural_questions
from . import nyt
Expand Down
64 changes: 64 additions & 0 deletions ir_datasets/datasets/neumarco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import io
import codecs
import re
import ir_datasets
from ir_datasets.util import DownloadConfig, TarExtract, Cache
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.datasets import msmarco_passage
from ir_datasets.formats import TsvDocs

NAME = 'neumarco'


def _init():
    """Build every neuMARCO dataset and register it with ``ir_datasets``.

    For each target language a ``TsvDocs`` collection is created from the
    translated passage file inside the ``main`` download (wrapped in
    ``TarExtract`` and ``Cache``). Queries, qrels and, for the train splits,
    docpairs are taken from the handlers of the matching ``msmarco-passage``
    datasets already present in the registry.

    Returns:
        A ``(collection, subsets)`` tuple. ``collection`` is the docs handler
        created for the last language in the loop, and ``subsets`` maps each
        subset name (e.g. ``'fa'`` or ``'fa/dev/small'``) to its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)

    subsets = {}

    # (msmarco-passage split, whether that split also provides docpairs)
    msmarco_splits = (
        ('train', True),
        ('train/judged', True),
        ('dev', False),
        ('dev/small', False),
        ('dev/judged', False),
    )
    subsets_from_msmarco = {}
    for split, has_docpairs in msmarco_splits:
        source = ir_datasets.registry[f'msmarco-passage/{split}']
        handlers = [source.queries_handler(), source.qrels_handler()]
        if has_docpairs:
            handlers.append(source.docpairs_handler())
        subsets_from_msmarco[split] = handlers

    base_dlc = dlc['main']

    # (three-letter code used in the archive path, two-letter code used in dataset IDs)
    languages = (('fas', 'fa'), ('zho', 'zh'), ('rus', 'ru'))
    for lang3, lang2 in languages:
        tar_member = f'eng-{lang3}/msmarco.collection.20210731-scale21-sockeye2-tm1.tsv'
        corpus_dlc = Cache(TarExtract(base_dlc, tar_member), base_path/f'{lang2}.tsv')
        dataset_id = f'{NAME}/{lang2}'
        collection = TsvDocs(
            corpus_dlc,
            namespace=dataset_id,
            lang=lang2,
            count_hint=ir_datasets.util.count_hint(dataset_id),
        )
        # Corpus-only dataset for the language, then one dataset per msmarco split.
        subsets[lang2] = Dataset(collection, documentation(lang2))
        for split, handlers in subsets_from_msmarco.items():
            key = f'{lang2}/{split}'
            subsets[key] = Dataset(collection, *handlers, documentation(key))

    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return collection, subsets


# Importing this module runs _init(), which registers the neuMARCO datasets
# in ir_datasets.registry as a side effect.
collection, subsets = _init()
105 changes: 105 additions & 0 deletions ir_datasets/docs/neumarco.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
_:
pretty_name: "neuMARCO"
desc: '
<p>
A version of <a class="ds-ref">msmarco-passage</a> for cross-language
information retrieval, provided by <a href="https://hltcoe.jhu.edu/">JHU HLTCOE</a> with documents
translated to other languages using a <a href="https://www.amazon.science/publications/sockeye-2-a-toolkit-for-neural-machine-translation">
Sockeye 2</a> translation model.
</p>
<ul>
<li>Documents: Web passages machine-translated from English into Persian, Russian, and Chinese</li>
<li>Queries: Natural-language web queries in English</li>
</ul>
'

fa:
desc: '
<p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Persian (Farsi).</p>
'

fa/dev:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Persian (Farsi).</p>
'

fa/dev/judged:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Persian (Farsi).</p>
'

fa/dev/small:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Persian (Farsi).</p>
'

fa/train:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Persian (Farsi).</p>
'

fa/train/judged:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Persian (Farsi).</p>
'

ru:
desc: '
<p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Russian.</p>
'

ru/dev:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Russian.</p>
'

ru/dev/judged:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Russian.</p>
'

ru/dev/small:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Russian.</p>
'

ru/train:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Russian.</p>
'

ru/train/judged:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Russian.</p>
'

zh:
desc: '
<p>The <a class="ds-ref">msmarco-passage</a> corpus, translated to Chinese.</p>
'

zh/dev:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev</a>, with the corpus translated to Chinese.</p>
'

zh/dev/judged:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev/judged</a>, with the corpus translated to Chinese.</p>
'

zh/dev/small:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/dev/small</a>, with the corpus translated to Chinese.</p>
'

zh/train:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/train</a>, with the corpus translated to Chinese.</p>
'

zh/train/judged:
desc: '
<p>A version of <a class="ds-ref">msmarco-passage/train/judged</a>, with the corpus translated to Chinese.</p>
'

9 changes: 9 additions & 0 deletions ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -2861,6 +2861,15 @@
}
},

"neumarco": {
"main": {
"url": "https://livejohnshopkins-my.sharepoint.com/:u:/g/personal/dlawrie1_jh_edu/EQcICtPaSqFNoCZHtoeZszoB7FC362BvaPvieUSk2j30tA?download=1",
"size_hint": 3723728998,
"expected_md5": "733181c211959a7c09c695bfcddaea54",
"cache_path": "neuMSMARCO.tar.gz"
}
},

"nfcorpus": {
"main": {
"url": "https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz",
Expand Down
19 changes: 19 additions & 0 deletions ir_datasets/etc/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,25 @@
"neuclir/1/ru/hc4-filtered": {"docs": {"count": 964719, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 54}, "qrels": {"count": 3235, "fields": {"relevance": {"counts_by_value": {"0": 2483, "1": 478, "3": 274}}}}},
"neuclir/1/zh": {"docs": {"count": 3179209, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}},
"neuclir/1/zh/hc4-filtered": {"docs": {"count": 519945, "fields": {"doc_id": {"max_len": 36, "common_prefix": ""}}}, "queries": {"count": 60}, "qrels": {"count": 3217, "fields": {"relevance": {"counts_by_value": {"0": 2651, "3": 344, "1": 222}}}}},
"neumarco": {},
"neumarco/fa": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}},
"neumarco/fa/dev": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
"neumarco/fa/dev/judged": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/fa/dev"}},
"neumarco/fa/dev/small": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}},
"neumarco/fa/train": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}},
"neumarco/fa/train/judged": {"docs": {"_ref": "neumarco/fa"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/fa/train"}, "docpairs": {"_ref": "neumarco/fa/train"}},
"neumarco/ru": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}},
"neumarco/ru/dev": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
"neumarco/ru/dev/judged": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/ru/dev"}},
"neumarco/ru/dev/small": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}},
"neumarco/ru/train": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}},
"neumarco/ru/train/judged": {"docs": {"_ref": "neumarco/ru"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/ru/train"}, "docpairs": {"_ref": "neumarco/ru/train"}},
"neumarco/zh": {"docs": {"count": 8841823, "fields": {"doc_id": {"max_len": 7, "common_prefix": ""}}}},
"neumarco/zh/dev": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
"neumarco/zh/dev/judged": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 55578}, "qrels": {"_ref": "neumarco/zh/dev"}},
"neumarco/zh/dev/small": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}},
"neumarco/zh/train": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 808731}, "qrels": {"count": 532761, "fields": {"relevance": {"counts_by_value": {"1": 532761}}}}, "docpairs": {"count": 269919004}},
"neumarco/zh/train/judged": {"docs": {"_ref": "neumarco/zh"}, "queries": {"count": 502939}, "qrels": {"_ref": "neumarco/zh/train"}, "docpairs": {"_ref": "neumarco/zh/train"}},
"nfcorpus": {"docs": {"count": 5371, "fields": {"doc_id": {"max_len": 8, "common_prefix": "MED-"}}}},
"nfcorpus/dev": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 325}, "qrels": {"count": 14589, "fields": {"relevance": {"counts_by_value": {"3": 521, "2": 10864, "1": 3204}}}}},
"nfcorpus/dev/nontopic": {"docs": {"_ref": "nfcorpus"}, "queries": {"count": 144}, "qrels": {"count": 4353, "fields": {"relevance": {"counts_by_value": {"3": 521, "2": 3133, "1": 699}}}}},
Expand Down
92 changes: 92 additions & 0 deletions test/integration/neumarco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import re
import unittest
from ir_datasets.formats import TrecQrel, GenericQuery, GenericDoc, GenericDocPair
from .base import DatasetIntegrationTest


class TestNeuMarco(DatasetIntegrationTest):
    """Integration tests for the ``neumarco/*`` datasets.

    Each test hands an expected record count and a mapping from record
    position to expected record to the corresponding ``_test_*`` helper
    inherited from ``DatasetIntegrationTest``.
    """

    def test_docs(self):
        """Check the translated corpus of each language (fa, ru, zh)."""
        doc_count = 8841823
        expected_by_dataset = {
            'neumarco/fa': {
                0: GenericDoc('0', re.compile('^ حضور ارتباطات در میان ذهن \u200c های علمی برای موفقیت پروژه منهتن به همان اندازه مهم بود که هوش علمی بود.{57}می معلق است ، چیزی است که موفقیت آن \u200c ها به معنای واقعی آن است ؛ صدها هزار زندگی بی \u200c گناه نابود شد\\.$', flags=48)),
                9: GenericDoc('9', ' یکی از دلایل اصلی انتخاب هنفورد به عنوان یک مکان برای پروژه منهتن رآکتور B نزدیکی به رودخانه کلمبیا بود ، بزرگترین رودخانه \u200c ای که از ساحل آمریکای شمالی به اقیانوس آرام می \u200c ریزد.'),
                8841822: GenericDoc('8841822', re.compile('^ تصویر کامل اندازه را مشاهده کنید\\. پشت صحنه نور خیره کننده نشان می دهد که تماشاگران اووه و آه در چها.{272}ده نمک \u200c های فلزی و اکسید فلزی وجود دارد که برای تولید مجموعه \u200c ای از رنگ \u200c ها واکنش نشان می \u200c دهند\\.$', flags=48)),
            },
            'neumarco/ru': {
                0: GenericDoc('0', re.compile('^ Присутствие общения среди научных умов было не менее важным для успеха Манхэттенского проекта, как .{98}лей и инженеров, это то, что их успех действительно означал; сотни тысяч невинных жизней уничтожены\\.$', flags=48)),
                9: GenericDoc('9', re.compile('^ Одной из главных причин, по которой Хэнфорд был выбран в качестве объекта для « B Reactor » Манхэтт.{31}сть к реке Колумбия, самой большой реке, протекающей в Тихий океан с северо американского побережья\\.$', flags=48)),
                8841822: GenericDoc('8841822', re.compile('^ Просмотр полноразмерного изображения\\. За кулисами ослепительного света видно, что зрители ooh и ahh.{313}ми, в основном солями металлов и оксидами металлов, которые реагируют на получение множества цветов\\.$', flags=48)),
            },
            'neumarco/zh': {
                0: GenericDoc('0', ' 在科学头脑中的交流对曼哈顿项目的成功同样重要,因为科学智慧是科学智慧。 原子研究人员和工程师令人印象深刻的成就中唯一的云就是他们的成功真正意味着什么;数十万无辜的生命被消灭了。'),
                9: GenericDoc('9', ' 汉福德被选定为曼哈顿项目B反应堆的一个主要原因是它靠近哥伦比亚河,这是北美海岸流入太平洋的最大河流。'),
                8841822: GenericDoc('8841822', ' 查看全尺寸图像。 在耀眼的灯光的背后,7月4日的观众们都是精心制作的烟花。 不管是红色、白色和蓝色的喷泉还是紫色的火花,每个烟花都充满了正确的化学物质组合,以创造这些五颜六色的灯光。 在每一个手工烟花中,都有少量的特殊化学物质,主要是金属盐和金属氧化物,它们会反应产生一系列的颜色。'),
            },
        }
        for dataset_id, expected_docs in expected_by_dataset.items():
            self._test_docs(dataset_id, count=doc_count, items=expected_docs)

    def test_queries(self):
        """Check the (English) queries of every split, for every language."""
        # (split, expected query count, expected queries by position)
        expectations = (
            ('train', 808731, {
                0: GenericQuery('121352', 'define extreme'),
                9: GenericQuery('492875', 'sanitizer temperature'),
                808730: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.'),
            }),
            ('train/judged', 502939, {
                0: GenericQuery('121352', 'define extreme'),
                9: GenericQuery('54528', 'blood clots in urine after menopause'),
                502938: GenericQuery('50393', 'benefits of boiling lemons and drinking juice.'),
            }),
            ('dev', 101093, {
                0: GenericQuery('1048578', 'cost of endless pools/swim spa'),
                9: GenericQuery('1048587', 'what is patron'),
                101092: GenericQuery('524285', 'treadmill incline meaning'),
            }),
            ('dev/small', 6980, {
                0: GenericQuery('1048585', "what is paula deen's brother"),
                9: GenericQuery('524699', 'tricare service number'),
                6979: GenericQuery('1048565', 'who plays sebastian michaelis'),
            }),
            ('dev/judged', 55578, {
                0: GenericQuery('1048578', 'cost of endless pools/swim spa'),
                9: GenericQuery('1048601', 'what is pastoral medicine'),
                55577: GenericQuery('1048570', 'what is pearls before swine?'),
            }),
        )
        for lang in ('fa', 'ru', 'zh'):
            for split, expected_count, expected_queries in expectations:
                # Hand over a fresh dict per call, as separate dict literals would.
                self._test_queries(
                    f'neumarco/{lang}/{split}',
                    count=expected_count,
                    items=dict(expected_queries),
                )

    def test_qrels(self):
        """Check the qrels of every split, for every language."""
        # (split, expected qrel count, expected qrels by position)
        expectations = (
            ('train', 532761, {
                0: TrecQrel('1185869', '0', 1, '0'),
                9: TrecQrel('186154', '1160', 1, '0'),
                532760: TrecQrel('405466', '8841735', 1, '0'),
            }),
            ('train/judged', 532761, {
                0: TrecQrel('1185869', '0', 1, '0'),
                9: TrecQrel('186154', '1160', 1, '0'),
                532760: TrecQrel('405466', '8841735', 1, '0'),
            }),
            ('dev', 59273, {
                0: TrecQrel('1102432', '2026790', 1, '0'),
                9: TrecQrel('300674', '7067032', 1, '0'),
                59272: TrecQrel('371455', '8009476', 1, '0'),
            }),
            ('dev/small', 7437, {
                0: TrecQrel('300674', '7067032', 1, '0'),
                9: TrecQrel('54544', '7068203', 1, '0'),
                7436: TrecQrel('195199', '8009377', 1, '0'),
            }),
            ('dev/judged', 59273, {
                0: TrecQrel('1102432', '2026790', 1, '0'),
                9: TrecQrel('300674', '7067032', 1, '0'),
                59272: TrecQrel('371455', '8009476', 1, '0'),
            }),
        )
        for lang in ('fa', 'ru', 'zh'):
            for split, expected_count, expected_qrels in expectations:
                # Hand over a fresh dict per call, as separate dict literals would.
                self._test_qrels(
                    f'neumarco/{lang}/{split}',
                    count=expected_count,
                    items=dict(expected_qrels),
                )

    def test_docpairs(self):
        """Check the training docpairs, for every language."""
        for lang in ('fa', 'ru', 'zh'):
            expected_pairs = {
                0: GenericDocPair('662731', '193249', '2975302'),
                9: GenericDocPair('411362', '31018', '4238671'),
                269919003: GenericDocPair('88228', '5117891', '7075853'),
            }
            self._test_docpairs(f'neumarco/{lang}/train', count=269919004, items=expected_pairs)



# Run the tests in this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 765c606

Please sign in to comment.