forked from huggingface/datatrove
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jsonl.py
31 lines (24 loc) · 1.21 KB
/
jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
from typing import IO, Callable
from datatrove.io import DataFolderLike
from datatrove.pipeline.writers.disk_base import DiskWriter
class JsonlWriter(DiskWriter):
    """Write data to datafolder (local or remote) in JSONL format.

    Each document is serialized as a single JSON object followed by a
    newline, so the output file holds one document per line.

    Args:
        output_folder: a str, tuple or DataFolder where data should be saved
        output_filename: the filename to use when saving data, including
            extension. Can contain placeholders such as `${rank}` or metadata
            tags `${tag}`. Defaults to None, in which case the base class is
            expected to fall back to `default_output_filename`
            (NOTE(review): confirm against DiskWriter).
        compression: if any compression scheme should be used. Defaults to
            "gzip". Pass "infer" to have it guessed from the filename, or
            None for no compression (both handled by DiskWriter).
        adapter: a custom function to "adapt" the Document format to the
            desired output format
    """

    default_output_filename: str = "${rank}.jsonl"
    name = "🐿 Jsonl"

    def __init__(
        self,
        output_folder: DataFolderLike,
        output_filename: str | None = None,
        compression: str | None = "gzip",
        adapter: Callable | None = None,
    ):
        # All path, compression and adapter handling lives in DiskWriter;
        # this subclass only contributes the JSONL serialization in `_write`.
        super().__init__(
            output_folder,
            output_filename=output_filename,
            compression=compression,
            adapter=adapter,
        )

    def _write(self, document: dict, file_handler: IO, _filename: str):
        """Serialize one document as a JSON line and write it to the file.

        Args:
            document: the dict to serialize with `json.dumps`
            file_handler: open file object; must accept `str` writes, since
                the serialized line is a `str`
            _filename: name of the output file; unused by this writer
        """
        # ensure_ascii=False keeps non-ASCII characters as-is instead of
        # emitting \uXXXX escapes. A single write call keeps each record
        # (payload + newline) together.
        file_handler.write(json.dumps(document, ensure_ascii=False) + "\n")