Skip to content
This repository has been archived by the owner on Nov 1, 2024. It is now read-only.

fix: add support for wide characters when building index of dataset files #728

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions metaseq/data/jsonl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# LICENSE file in the root directory of this source tree.

import argparse
from io import TextIOWrapper
import json
import logging
import mmap
Expand Down Expand Up @@ -132,22 +133,17 @@ def _get_subshard_id(self):
# and then wraps around if the epoch id goes beyond the data_subshard_count
return (self.epoch - 1) % self.data_subshard_count

def _build_index(self, file_path: str):
    """Build an index of the byte offset at which each line starts.

    The handle returned by ``self._get_mmap()`` is scanned line by line.
    The position reported by ``tell()`` after each ``readline()`` is where
    the next line begins, so offsets are taken from the handle itself
    rather than computed from line lengths.

    Args:
        file_path: Path of the file being indexed. In this method it is
            only used for the log message; the data is read through
            ``self._get_mmap()``.

    Returns:
        A list with one start offset per line, in file order. The list is
        empty when the file has no content.
    """
    logger.info(f"Building index for file: {file_path}")
    data = self._get_mmap()
    # Rewind before scanning. NOTE(review): _get_mmap() is not visible here;
    # if it can return a handle that was already read from, skipping this
    # would shift every recorded offset.
    data.seek(0)

    offsets = []
    line_start = 0
    # The b"" sentinel stops the loop at end of file (readline() returns
    # bytes here, so an empty bytes object means nothing is left).
    for _ in iter(data.readline, b""):
        offsets.append(line_start)
        # After reading a line the handle sits at the start of the next one.
        line_start = data.tell()
    return offsets

def __setstate__(self, state):
self.__dict__ = state
Expand Down