From 0d47d68f776fca66aec0ba32b5147a3d7484402b Mon Sep 17 00:00:00 2001 From: Nicola Tonellotto Date: Wed, 24 Apr 2024 13:03:30 +0200 Subject: [PATCH] Update tsv.py The following code doesn't run correctly if the docstore is not already cached: import ir_datasets import more_itertools dataset = ir_datasets.load('msmarco-passage') for batch in more_itertools.chunked(dataset.docs_iter(), 8196*4): print(len(batch)) With this quick fix, it does. --- ir_datasets/formats/tsv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ir_datasets/formats/tsv.py b/ir_datasets/formats/tsv.py index 4059e829..1d96dc90 100644 --- a/ir_datasets/formats/tsv.py +++ b/ir_datasets/formats/tsv.py @@ -26,6 +26,7 @@ def __next__(self): self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) else: self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream())) + line = '' while self.pos < self.start: line = self.stream.readline() if line != '\n':