Add support for loading dataset from jsonl files
luozhouyang committed Nov 19, 2021
1 parent 3cfc3d4 commit 45afa13
Showing 2 changed files with 59 additions and 33 deletions.
68 changes: 39 additions & 29 deletions datasets/qa/dataset.py
@@ -1,6 +1,5 @@
import abc
import logging
import os
from typing import Dict, List

import tensorflow as tf
@@ -10,7 +9,7 @@

from .example import ExampleForQuestionAnswering
from .parsers import ParserForQuestionAnswering
from .readers import read_dureader_checklist, read_dureader_rubost
from .readers import read_dureader_checklist, read_dureader_rubost, read_jsonl_files


class DatasetForQuestionAnswering(abc.ABC):
@@ -82,19 +81,34 @@ def from_tfrecord_files(cls, input_files, num_parallel_calls=None, buffer_size=N
return d(dataset, **kwargs)

@classmethod
def from_dureader_robust(cls, input_files, **kwargs) -> tf.data.Dataset:
def from_dureader_robust(cls, input_files, tokenizer=None, vocab_file=None, **kwargs) -> tf.data.Dataset:
instances = read_dureader_rubost(input_files, **kwargs)
return cls.from_instances(instances, **kwargs)
return cls.from_instances(instances, tokenizer=tokenizer, vocab_file=vocab_file, **kwargs)

@classmethod
def from_dureader_checklist(cls, input_files, **kwargs) -> tf.data.Dataset:
def from_dureader_checklist(cls, input_files, tokenizer=None, vocab_file=None, **kwargs) -> tf.data.Dataset:
instances = read_dureader_checklist(input_files, **kwargs)
return cls.from_instances(instances, **kwargs)
return cls.from_instances(instances, tokenizer=tokenizer, vocab_file=vocab_file, **kwargs)

@classmethod
def from_jsonl_files(cls, input_files, tokenizer=None, vocab_file=None, **kwargs) -> tf.data.Dataset:
instances = read_jsonl_files(input_files, **kwargs)
return cls.from_instances(instances, tokenizer=tokenizer, vocab_file=vocab_file, **kwargs)

@classmethod
def from_instances(
cls, instances: List[Dict], tokenizer: BertWordPieceTokenizer = None, vocab_file=None, **kwargs
) -> tf.data.Dataset:
"""Build tf.data.Dataset from json instances.
Args:
instances: List instance of dict, each instance contains keys `context`, `question`, `answer` and `id`
tokenizer: Tokenizer used to tokenize text
vocab_file: The vocab path to build tokenizer. The `tokenizer` or `vocab_file` must be provided!
Returns:
Instance of tf.data.Dataset, can be used to fit to tf.keras.Model directly.
"""
examples = []
parser = ParserForQuestionAnswering(tokenizer=tokenizer, vocab_file=vocab_file, **kwargs)
for instance in instances:
@@ -115,16 +129,32 @@ def from_dataset(cls, dataset: DatasetForQuestionAnswering, **kwargs) -> tf.data
@classmethod
def from_examples(cls, examples: List[ExampleForQuestionAnswering], **kwargs) -> tf.data.Dataset:
d = cls(**kwargs)
dataset = d._zip_dataset(examples, **kwargs)

def _to_dataset(x, dtype=tf.int32):
x = tf.ragged.constant(x, dtype=dtype)
d = tf.data.Dataset.from_tensor_slices(x)
d = d.map(lambda x: x)
return d

dataset = tf.data.Dataset.zip(
(
_to_dataset(x=[e.input_ids for e in examples], dtype=tf.int32),
_to_dataset(x=[e.segment_ids for e in examples], dtype=tf.int32),
_to_dataset(x=[e.attention_mask for e in examples], dtype=tf.int32),
_to_dataset(x=[e.start for e in examples], dtype=tf.int32),
_to_dataset(x=[e.end for e in examples], dtype=tf.int32),
)
)

return d(dataset, **kwargs)

def _filter(self, dataset: tf.data.Dataset, max_sequence_length=512, **kwargs) -> tf.data.Dataset:
dataset = dataset.filter(lambda a, b, c, x, y: tf.size(a) <= max_sequence_length)
return dataset

def _to_dict(self, dataset: tf.data.Dataset, **kwargs) -> tf.data.Dataset:
def _to_dict(self, dataset: tf.data.Dataset, start_key="start", end_key="end", **kwargs) -> tf.data.Dataset:
dataset = dataset.map(
lambda a, b, c, x, y: ({"input_ids": a, "segment_ids": b, "attention_mask": c}, {"head": x, "tail": y}),
lambda a, b, c, x, y: ({"input_ids": a, "segment_ids": b, "attention_mask": c}, {start_key: x, end_key: y}),
num_parallel_calls=kwargs.get("num_parallel_calls", utils.AUTOTUNE),
).prefetch(kwargs.get("buffer_size", utils.AUTOTUNE))
return dataset
@@ -162,23 +192,3 @@ def _bucket_padding(self, dataset: tf.data.Dataset, **kwargs) -> tf.data.Dataset
**kwargs,
)
return dataset

def _zip_dataset(self, examples: List[ExampleForQuestionAnswering], **kwargs) -> tf.data.Dataset:
"""Zip examples to tf.data.Dataset"""

def _to_dataset(x, dtype=tf.int32):
x = tf.ragged.constant(x, dtype=dtype)
d = tf.data.Dataset.from_tensor_slices(x)
d = d.map(lambda x: x)
return d

dataset = tf.data.Dataset.zip(
(
_to_dataset(x=[e.input_ids for e in examples], dtype=tf.int32),
_to_dataset(x=[e.segment_ids for e in examples], dtype=tf.int32),
_to_dataset(x=[e.attention_mask for e in examples], dtype=tf.int32),
_to_dataset(x=[e.start for e in examples], dtype=tf.int32),
_to_dataset(x=[e.end for e in examples], dtype=tf.int32),
)
)
return dataset
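
As an illustration of the new entry point, here is a minimal usage sketch (not taken from the diff itself): the import path, file names and vocab path are placeholders, and an instantiable subclass of DatasetForQuestionAnswering is assumed.

from datasets.qa.dataset import DatasetForQuestionAnswering

# Build a tf.data.Dataset directly from JSONL files (paths are placeholders).
# Either a tokenizer or a vocab_file must be supplied, as documented in from_instances.
train_dataset = DatasetForQuestionAnswering.from_jsonl_files(
    input_files=["train.jsonl"],   # a single path or a list of paths
    vocab_file="vocab.txt",        # or tokenizer=BertWordPieceTokenizer("vocab.txt")
)
# Elements are ({"input_ids", "segment_ids", "attention_mask"}, {"start", "end"}) pairs,
# which the docstring above says can be fed to tf.keras.Model.fit directly.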
24 changes: 20 additions & 4 deletions datasets/qa/readers.py
@@ -1,8 +1,4 @@
import abc
import json
import logging
import os
from typing import Dict, List


def read_dureader_rubost(input_files, **kwargs):
@@ -42,3 +38,23 @@ def read_dureader_checklist(input_files, **kwargs):
answer = qa["answers"][0]["text"]
instance = {"context": title + context, "question": question, "answer": answer, "id": qa["id"]}
yield instance


def read_jsonl_files(input_files, context_key="context", question_key="question", answers_key="answer", id_key="id", **kwargs):
if isinstance(input_files, str):
input_files = [input_files]
_id = 0
for input_file in input_files:
with open(input_file, mode="rt", encoding="utf-8") as fin:
for line in fin:
data = json.loads(line)
answers = data[answers_key]
if not answers:
continue
if not isinstance(answers, list):
answers = [answers]
answer = answers[0]
instance_id = data.get(id_key, _id)
_id += 1
instance = {"context": data[context_key], "question": data[question_key], "answer": answer, "id": instance_id}
yield instance
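
To show the expected input format, the following sketch (illustrative only; the file name and records are made up, and the module is assumed to be importable as datasets.qa.readers) writes a small JSONL file and iterates over it with read_jsonl_files.

import json

from datasets.qa.readers import read_jsonl_files

# Each line of the file is one JSON object. Records whose answer is empty are skipped,
# and when the answer field is a list only the first element is kept.
records = [
    {"id": "q1", "context": "TensorFlow is a machine learning framework.", "question": "What is TensorFlow?", "answer": "a machine learning framework"},
    {"context": "Records without an id fall back to a running counter.", "question": "What if there is no id?", "answer": ["a running counter"]},
    {"context": "This record has an empty answer and is skipped.", "question": "Skipped?", "answer": []},
]
with open("sample.jsonl", "w", encoding="utf-8") as fout:
    for record in records:
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

for instance in read_jsonl_files("sample.jsonl"):
    print(instance)  # {"context": ..., "question": ..., "answer": ..., "id": ...}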
