Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Touche 2023 #218

Open
wants to merge 147 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
147 commits
Select commit Hold shift + click to select a range
3d2141b
Add Touché 22 downloads
janheinrichmerker Oct 4, 2022
4bb6a7a
Improve argsme docs
janheinrichmerker Oct 5, 2022
3886070
Add argsme docs
janheinrichmerker Oct 5, 2022
115895e
Improve Touché bib
janheinrichmerker Oct 5, 2022
4c13e84
Add Touché bib
janheinrichmerker Oct 5, 2022
49e7f5c
Improve Touché docs
janheinrichmerker Oct 5, 2022
b74ed1e
Add Touché docs
janheinrichmerker Oct 5, 2022
d5d44bf
Add argsme processed docs format
janheinrichmerker Oct 5, 2022
a60cb4d
Add argsme processed docs
janheinrichmerker Oct 5, 2022
8817aa1
Fix Touché downloads
janheinrichmerker Oct 5, 2022
619a630
Add Touché formats
janheinrichmerker Oct 5, 2022
19cb0f5
Add Touché datasets
janheinrichmerker Oct 5, 2022
27eadc4
Add argsme tests
janheinrichmerker Oct 5, 2022
d8cdcee
Fix Touché image parsing
janheinrichmerker Oct 5, 2022
bd76f4e
Fix Touché parsing
janheinrichmerker Oct 5, 2022
b98a54d
Add Touché tests
janheinrichmerker Oct 5, 2022
8a4176f
Re-organize Touché dataset paths
janheinrichmerker Oct 5, 2022
9968811
Add argsme sentence parsing
janheinrichmerker Oct 5, 2022
4d3d522
Prepare Touché 23 datasets
janheinrichmerker Oct 18, 2022
a842f8e
Add IO utils
janheinrichmerker Oct 19, 2022
879301f
Fix IO utils
janheinrichmerker Oct 19, 2022
05d710d
Add CW22 base records
janheinrichmerker Oct 19, 2022
f052f0f
Add CW22 doc records
janheinrichmerker Oct 19, 2022
f08d029
Improve IO utils typing
janheinrichmerker Oct 19, 2022
2f15e36
Fix concat IO util
janheinrichmerker Oct 19, 2022
bc08d5e
Add documentation
janheinrichmerker Oct 19, 2022
30bba6e
Add CW22 config types
janheinrichmerker Oct 19, 2022
b34f395
Add CW22 ID type
janheinrichmerker Oct 19, 2022
dbee8fd
Add CW22 record readers
janheinrichmerker Oct 19, 2022
56456cf
Add CW22 record readers
janheinrichmerker Oct 19, 2022
af074a1
Add CW22 combining doc iterators
janheinrichmerker Oct 19, 2022
bef1678
Re-structure CW22 format
janheinrichmerker Oct 19, 2022
7770ea4
Re-structure CW22 format
janheinrichmerker Oct 19, 2022
80b4265
Configure readers
janheinrichmerker Oct 19, 2022
288b481
Configure CW22 document combiners
janheinrichmerker Oct 19, 2022
d4b1861
Fix generics
janheinrichmerker Oct 19, 2022
e5bfe0b
Fix IO wrapper
janheinrichmerker Oct 19, 2022
1842ef6
Add CW22 iterator
janheinrichmerker Oct 19, 2022
b292548
Add CW22 docs
janheinrichmerker Oct 20, 2022
17d347a
Add CW22 docstore
janheinrichmerker Oct 20, 2022
210becb
Rename CW22 classes
janheinrichmerker Oct 20, 2022
b54ac03
Rename CW22 classes
janheinrichmerker Oct 20, 2022
e5e0d0c
Re-export CW22 classes
janheinrichmerker Oct 20, 2022
0aadd43
Prepare CW22 download instructions
janheinrichmerker Oct 20, 2022
6fa307b
Add CW22 bib
janheinrichmerker Oct 20, 2022
6cb7af6
Add CW22 docs
janheinrichmerker Oct 20, 2022
fefd059
Add CW22 docs
janheinrichmerker Oct 20, 2022
5ed17ff
Add CW22 datasets
janheinrichmerker Oct 20, 2022
9315cea
Merge branch 'master' into clueweb22
janheinrichmerker Oct 20, 2022
3873d78
Improve CW22 download description
janheinrichmerker Nov 25, 2022
4692dd3
Improve IO util
janheinrichmerker Nov 25, 2022
d290f9e
Add CW22 import
janheinrichmerker Nov 25, 2022
bc85b8f
Fix IO utils
janheinrichmerker Nov 25, 2022
31090d6
Fix CW offset file extension
janheinrichmerker Nov 25, 2022
c7eb1c3
Fix CW enum
janheinrichmerker Nov 25, 2022
7cb0cd1
Fix CW id parsing
janheinrichmerker Nov 25, 2022
5a7e827
Fix CW file glob chaining
janheinrichmerker Nov 25, 2022
69fe341
Add CW doc_store attribute
janheinrichmerker Nov 25, 2022
14ae5c3
Add CW docs_lang attribute
janheinrichmerker Nov 25, 2022
590c480
Fix CW file extension
janheinrichmerker Nov 25, 2022
bb32625
Fix CW offsets file path
janheinrichmerker Nov 25, 2022
0b1d40a
Fix CW gzip mode
janheinrichmerker Nov 25, 2022
be527d6
Fix CW context manager
janheinrichmerker Nov 25, 2022
b84a9cc
Implement CW22 fancy iter slicing
janheinrichmerker Nov 25, 2022
0c3c63c
Fix CW22 iterator
janheinrichmerker Nov 25, 2022
cbf61cb
Fix CW22 record counts
janheinrichmerker Nov 26, 2022
d3c09c1
Fix CW22 file glob patterns
janheinrichmerker Nov 26, 2022
9a1a59e
Fix IO utils
janheinrichmerker Nov 26, 2022
5a790b9
Rename functions
janheinrichmerker Nov 26, 2022
8778028
Simplify CW22 vdom structure
janheinrichmerker Nov 26, 2022
ac47dbd
Use warcio for CW22 WARC parsing
janheinrichmerker Nov 26, 2022
bcd57da
Fix CW22 outlink parsing
janheinrichmerker Nov 26, 2022
e04cf8a
Fix CW22 offset file extensions
janheinrichmerker Nov 26, 2022
5007a8e
Fix CW22 txt url
janheinrichmerker Nov 26, 2022
a32ac81
Fix CW22 empty inlink parsing
janheinrichmerker Nov 26, 2022
61bb814
Fix CW22 empty outlink parsing
janheinrichmerker Nov 26, 2022
fabd088
Catch CW22 URL inconsistency bug
janheinrichmerker Nov 26, 2022
d9a0013
Change sequence to generator
janheinrichmerker Nov 26, 2022
4f66a2d
Add CW22 logger
janheinrichmerker Nov 26, 2022
ca53246
Document CW22 bug
janheinrichmerker Nov 26, 2022
0ef2eda
Improve CW22 format error handling
janheinrichmerker Nov 26, 2022
3256a5d
Document CW22 bugs
janheinrichmerker Nov 27, 2022
0fddf14
Fix CW22 whole file gzip decompression
janheinrichmerker Nov 27, 2022
c82de32
Relax IO utils method
janheinrichmerker Nov 27, 2022
38706ff
Use FastWARC (2x faster than warcio)
janheinrichmerker Nov 27, 2022
e3f4c5c
Don't concat IO streams
janheinrichmerker Nov 27, 2022
5131201
Fix CW22 HTML parsing
janheinrichmerker Nov 27, 2022
ce92a69
Add CW22 tests
janheinrichmerker Nov 27, 2022
e43f02d
Add CW22 compatible subsets
janheinrichmerker Nov 27, 2022
df51c7f
Add CW22 version check
janheinrichmerker Nov 27, 2022
40b9363
Rename variables to avoid name conflict
janheinrichmerker Nov 27, 2022
4d78981
Remove obsolete property
janheinrichmerker Nov 27, 2022
a5259d9
Add CW22 subset views
janheinrichmerker Nov 27, 2022
dcc146f
Add CW22 subset view tests
janheinrichmerker Nov 27, 2022
57a73a5
Fix filter CW22 record counts by language
janheinrichmerker Nov 27, 2022
3e49a7b
Improve CW22 version and records count check
janheinrichmerker Nov 27, 2022
1aeebae
Simplify CW22 ID wrappers
janheinrichmerker Nov 27, 2022
5ba0e39
Flatten CW22 readers
janheinrichmerker Nov 28, 2022
c0b4890
Skip CW22 record files entirely when slicing
janheinrichmerker Nov 28, 2022
e735dc5
Fix CW22 record counts
janheinrichmerker Nov 29, 2022
97db8ba
Fix CW22 slice file skipping
janheinrichmerker Nov 29, 2022
56f3ab3
Sort CW22 file paths
janheinrichmerker Nov 29, 2022
64e3eac
Remove CW22 JPG format
janheinrichmerker Nov 29, 2022
8a73a2e
Fix CW22 subset B document alignment
janheinrichmerker Nov 29, 2022
78ed831
Finalize CW22 unit tests
janheinrichmerker Nov 29, 2022
65172ac
Merge remote-tracking branch 'origin/master' into clueweb22
janheinrichmerker Nov 29, 2022
6ba284a
Improve CW22 typing
janheinrichmerker Nov 29, 2022
6422955
Fix CW22 typing
janheinrichmerker Nov 29, 2022
69ab184
Add CW22 category B metadata
janheinrichmerker Nov 29, 2022
28900cd
Temporarily hide CW22 categories A and L
janheinrichmerker Nov 29, 2022
ae996ed
Hide CW22 language subsets for compatible category views
janheinrichmerker Nov 29, 2022
da75475
Add CW22 docs
janheinrichmerker Nov 29, 2022
af958a5
Hide CW22 language subsets for compatible category views
janheinrichmerker Nov 29, 2022
e0ba8f2
Temporarily hide CW22 categories A and L
janheinrichmerker Nov 29, 2022
85520b7
Backport replacesuffix
janheinrichmerker Nov 29, 2022
9bc9764
Sort metadata
janheinrichmerker Nov 29, 2022
6847c05
Merge remote-tracking branch 'origin/master' into touche2023
janheinrichmerker Nov 29, 2022
4d564d6
Merge branch 'clueweb22' into touche2023
janheinrichmerker Nov 29, 2022
de8eb7a
Remove double blank lines
janheinrichmerker Nov 29, 2022
df1e005
Reformat tests
janheinrichmerker Nov 29, 2022
6dce5b8
Normalize whitespace
janheinrichmerker Nov 29, 2022
92e400a
Add Touché causal queries
janheinrichmerker Nov 29, 2022
9ee1585
Add final topic downloads
janheinrichmerker Nov 29, 2022
f8c2e79
Remove Touché re-ranking datasets
janheinrichmerker Nov 29, 2022
7bb9ad0
Add Touché 23 queries
janheinrichmerker Nov 29, 2022
001d875
Add Touché 23 query tests
janheinrichmerker Nov 29, 2022
11a5829
Simplify Touché 23 query tests
janheinrichmerker Nov 29, 2022
d06390a
Fix incompatible Python 3.7 code
janheinrichmerker Nov 30, 2022
02c1ca5
Merge remote-tracking branch 'private/clueweb22' into clueweb22
janheinrichmerker Nov 30, 2022
8a9608a
Remove Final usages
janheinrichmerker Dec 1, 2022
2ba133c
Improve CW22 Python backwards compatibility
janheinrichmerker Dec 1, 2022
668edf7
Move CW22 docstrings to their public interfaces
janheinrichmerker Dec 1, 2022
e6c1ea0
Prepare CW22 screenshots
janheinrichmerker Dec 1, 2022
6e60492
Replace Protocol usages with Callable
janheinrichmerker Dec 1, 2022
2b7b063
Fix parameter name
janheinrichmerker Dec 1, 2022
a2e1d10
Add CW22 record ID and payload digest
janheinrichmerker Dec 8, 2022
df9c20d
Fix ClueWeb22 date format
janheinrichmerker Dec 13, 2022
3e17a30
Fix ClueWeb22 path bug
janheinrichmerker Dec 20, 2022
9e9819a
Add slice test for fixed paths
janheinrichmerker Dec 20, 2022
be741ba
Merge branch 'clueweb22' into touche2023
janheinrichmerker Dec 20, 2022
27ba440
Fix language tag
janheinrichmerker Dec 20, 2022
72bd72e
Add bug fix for missing line break in CW22 offset file
janheinrichmerker Jan 6, 2023
2dd9581
Merge remote-tracking branch 'origin/master' into clueweb22
janheinrichmerker Mar 14, 2023
b1f9836
Update requirement
janheinrichmerker Mar 14, 2023
845f344
Add CW22 default text
janheinrichmerker Mar 14, 2023
46cbdc4
Merge remote-tracking branch 'private/clueweb22' into touche2023
janheinrichmerker Mar 14, 2023
a3fef87
Add Touché default text
janheinrichmerker Mar 14, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from . import clirmatrix
from . import clueweb09
from . import clueweb12
from . import clueweb22
from . import codec
from . import cord19
from . import cranfield
Expand Down
82 changes: 82 additions & 0 deletions ir_datasets/datasets/clueweb22.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.formats import ClueWeb22Language, ClueWeb22Subset, \
ClueWeb22Docs
from ir_datasets.util import DownloadConfig, home_path

# Root identifier of this dataset family. It is used below as the top-level
# registry ID, the prefix of every sub-dataset ID, the stem of the YAML
# documentation file, the directory name under the home path, and the
# download-config context key.
NAME = "clueweb22"


def _init():
    """Register the ClueWeb22 dataset family in the dataset registry.

    The following IDs are registered, where ``<NAME>`` is the module-level
    ``NAME`` constant:

    - ``<NAME>``: base entry carrying only the ``"_"`` documentation.
    - ``<NAME>/<subset>``: one entry per ``ClueWeb22Subset`` member whose
      ``hide`` attribute is falsy.
    - ``<NAME>/<subset>/<language>``: one entry per such subset and
      ``ClueWeb22Language`` member.
    - ``<NAME>/<subset>/as-<view>``: one entry per such subset and each
      member of ``subset.subset_views`` other than the subset itself.

    Every non-base entry wraps a ``ClueWeb22Docs`` handler built from the
    ``"docs"`` download of this dataset's download config.
    """
    docs_yaml = YamlDocumentation(f"docs/{NAME}.yaml")
    dlc = DownloadConfig.context(NAME, home_path() / NAME)

    def _register(tag, subset, subset_view, language=None):
        # Single place that turns a tag into a registered dataset. The
        # ``language`` keyword is forwarded only when one was supplied, so
        # the handler's own default applies otherwise.
        dataset_id = f"{NAME}/{tag}"
        tag_documentation = docs_yaml(tag)
        handler_kwargs = {
            "name": NAME,
            "source": dlc["docs"],
            "subset": subset,
            "subset_view": subset_view,
        }
        if language is not None:
            handler_kwargs["language"] = language
        registry.register(
            dataset_id,
            Dataset(tag_documentation, ClueWeb22Docs(**handler_kwargs)),
        )

    # Documentation-only root entry.
    registry.register(NAME, Dataset(docs_yaml("_")))

    visible_subsets = (
        candidate for candidate in ClueWeb22Subset if not candidate.hide
    )
    for subset in visible_subsets:
        tag = subset.tag

        # The subset viewed as itself.
        _register(tag, subset, subset)

        # The subset restricted to a single language.
        for language in ClueWeb22Language:
            _register(f"{tag}/{language.tag}", subset, subset, language)

        # The subset exposed through each of its other subset views.
        for view in subset.subset_views - {subset}:
            _register(f"{tag}/as-{view.tag}", subset, view)
            # NOTE: per-language variants of subset views
            # (``<subset>/as-<view>/<language>``) are intentionally not
            # registered here; that registration is disabled.


# Run the registration once when this module is imported, so the datasets
# are available in the registry as a side effect of the import.
_init()
Loading