From c614314b2e2d83896b252670c6e3d8bd158f055b Mon Sep 17 00:00:00 2001 From: luozhouyang Date: Tue, 1 Feb 2022 19:02:33 +0800 Subject: [PATCH] Update README --- README.md | 264 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 176 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 0581985..b786550 100644 --- a/README.md +++ b/README.md @@ -20,10 +20,6 @@ If you want to load local, personal dataset with minimized boilerplate, use **ra pip install -U rapidnlp-datasets ``` -> If you work with PyTorch, you should install [PyTorch](https://pytorch.org/get-started/locally/) first. - -> If you work with TensorFlow, you should install [TensorFlow](https://github.com/tensorflow/tensorflow) first. - ## Usage Here are few examples to show you how to use this library. @@ -36,102 +32,194 @@ Here are few examples to show you how to use this library. ### sequence-classification-quickstart -In PyTorch, - -```bash ->>> import torch ->>> from rapidnlp_datasets.pt import DatasetForSequenceClassification ->>> dataset = DatasetForSequenceClassification.from_jsonl_files( - input_files=["testdata/sequence_classification.jsonl"], - vocab_file="testdata/vocab.txt", - ) ->>> dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=32, collate_fn=dataset.batch_padding_collate) ->>> for idx, batch in enumerate(dataloader): -... print("No.{} batch: \n{}".format(idx, batch)) -... -``` - -In TensorFlow, - -```bash ->>> from rapidnlp_datasets.tf import TFDatasetForSequenceClassifiation ->>> dataset, d = TFDatasetForSequenceClassifiation.from_jsonl_files( - input_files=["testdata/sequence_classification.jsonl"], - vocab_file="testdata/vocab.txt", - return_self=True, - ) ->>> for idx, batch in enumerate(iter(dataset)): -... print("No.{} batch: \n{}".format(idx, batch)) -... -``` - -Especially, you can save dataset to `tfrecord` format when working with TensorFlow, and then build dataset from tfrecord files directly! 
- -```bash ->>> d.save_tfrecord("testdata/sequence_classification.tfrecord") -2021-12-08 14:52:41,295 INFO utils.py 128] Finished to write 2 examples to tfrecords. ->>> dataset = TFDatasetForSequenceClassifiation.from_tfrecord_files("testdata/sequence_classification.tfrecord") ->>> for idx, batch in enumerate(iter(dataset)): -... print("No.{} batch: \n{}".format(idx, batch)) -... +```python +import torch +from tokenizers import BertWordPieceTokenizer +from rapidnlp_datasets import DatasetForSequenceClassification +from rapidnlp_datasets.tf import TFDatasetForSequenceClassifiation + +# build dataset +tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt") +dataset = DatasetForSequenceClassification(tokenizer) +dataset.add_jsonl_files(input_files=["testdata/sequence_classification.jsonl"]) + +# convert to tf.data.Dataset +tf_dataset = dataset.to_tf_dataset(batch_size=32) +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch: \n{}".format(idx, batch)) + +# save tfrecord +dataset.save_tfrecord("testdata/sequence_classification.tfrecord") +# build dataset from tfrecord files +tf_dataset = TFDatasetForSequenceClassifiation.from_tfrecord_files("testdata/sequence_classification.tfrecord") +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch: \n{}".format(idx, batch)) + +# convert to torch.utils.data.Dataset +pt_dataset = dataset.to_pt_dataset() +dataloader = torch.utils.data.DataLoader( + pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator +) +for idx, batch in enumerate(dataloader): + print("No.{} batch: \n{}".format(idx, batch)) ``` ### question-answering-quickstart -In PyTorch: -```bash ->>> import torch ->>> from rapidnlp_datasets.pt import DatasetForQuestionAnswering ->>> ->>> dataset = DatasetForQuestionAnswering.from_jsonl_files( - input_files="testdata/qa.jsonl", - vocab_file="testdata/vocab.txt", - ) ->>> dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, 
batch_size=32, collate_fn=dataset.batch_padding_collate) ->>> for idx, batch in enumerate(dataloader): -... print("No.{} batch: \n{}".format(idx, batch)) -... -``` - -In TensorFlow, - -```bash ->>> from rapidnlp_datasets.tf import TFDatasetForQuestionAnswering ->>> dataset, d = TFDatasetForQuestionAnswering.from_jsonl_files( - input_files="testdata/qa.jsonl", - vocab_file="testdata/vocab.txt", - return_self=True, - ) -2021-12-08 15:09:06,747 INFO question_answering_dataset.py 101] Read 3 examples in total. ->>> for idx, batch in enumerate(iter(dataset)): - print() - print("NO.{} batch: \n{}".format(idx, batch)) -... -``` - -Especially, you can save dataset to `tfrecord` format when working with TensorFlow, and then build dataset from tfrecord files directly! - -```bash ->>> d.save_tfrecord("testdata/qa.tfrecord") -2021-12-08 15:09:31,329 INFO utils.py 128] Finished to write 3 examples to tfrecords. ->>> dataset = TFDatasetForQuestionAnswering.from_tfrecord_files( - "testdata/qa.tfrecord", - batch_size=32, - padding="batch", - ) ->>> for idx, batch in enumerate(iter(dataset)): - print() - print("NO.{} batch: \n{}".format(idx, batch)) -... 
- +```python +import torch +from tokenizers import BertWordPieceTokenizer +from rapidnlp_datasets import DatasetForQuestionAnswering +from rapidnlp_datasets.tf import TFDatasetForQuestionAnswering + +# build dataset +tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt") +dataset = DatasetForQuestionAnswering(tokenizer) +dataset.add_jsonl_files(input_files="testdata/qa.jsonl") + +# convert to tf.data.Dataset +tf_dataset = dataset.to_tf_dataset() +for idx, batch in enumerate(iter(tf_dataset)): + print("NO.{} batch: \n{}".format(idx, batch)) + +# save to tfrecord +dataset.save_tfrecord("testdata/qa.tfrecord") + +# build dataset from tfrecord files +tf_dataset = TFDatasetForQuestionAnswering.from_tfrecord_files( + "testdata/qa.tfrecord", + batch_size=32, + padding="batch" +) +for idx, batch in enumerate(iter(tf_dataset)): + print() + print("No.{} batch: \n{}".format(idx, batch)) + +# convert to torch.utils.data.Dataset +pt_dataset = dataset.to_pt_dataset() +dataloader = torch.utils.data.DataLoader( + pt_dataset, + batch_size=32, + collate_fn=pt_dataset.batch_padding_collator, +) +for idx, batch in enumerate(dataloader): + print("No.{} batch: \n{}".format(idx, batch)) ``` ### token-classification-quickstart +```python +import torch +from tokenizers import BertWordPieceTokenizer +from rapidnlp_datasets import DatasetForTokenClassification +from rapidnlp_datasets.tf import TFDatasetForTokenClassification + +# build dataset +tokenizer = BertCharLevelTokenizer.from_file("testdata/vocab.txt") +dataset = DatasetForTokenClassification(tokenizer) +dataset.add_jsonl_files("testdata/token_classification.jsonl", label2id=_label_to_id) + +# convert to tf.data.Dataset +tf_dataset = dataset.to_tf_dataset() +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch:\n{}".format(idx, batch)) + +# save dataset to tfrecord +dataset.save_tfrecord("testdata/token_classification.tfrecord") +# build dataset from tfrecord files +tf_dataset = 
TFDatasetForTokenClassification.from_tfrecord_files( + input_files="testdata/token_classification.tfrecord", + batch_size=4, +) +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch:\n{}".format(idx, batch)) + +# convert to torch.utils.data.Dataset +pt_dataset = dataset.to_pt_dataset() +dataloader = torch.utils.data.DataLoader( + pt_dataset, num_workers=1, batch_size=4, collate_fn=pt_dataset.batch_padding_collator +) +for idx, batch in enumerate(dataloader): + print("No.{} batch:\n{}".format(idx, batch)) +``` ### masked-language-models-quickstart +```python +import torch +from tokenizers import BertWordPieceTokenizer +from rapidnlp_datasets import DatasetForMaskedLanguageModel +from rapidnlp_datasets.tf import TFDatasetForMaksedLanguageModel + +# build dataset +tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt") +dataset = DatasetForMaskedLanguageModel(tokenizer) +dataset.add_jsonl_files(input_files=["testdata/mlm.jsonl"]) +dataset.add_text_files(input_files=["/path/to/text/files"]) + +# convert to tf.data.Dataset +tf_dataset = dataset.to_tf_dataset(batch_size=4) +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch:\n{}".format(idx, batch)) + +# save dataset as tfrecord +dataset.save_tfrecord("testdata/mlm.tfrecord") +# load tf.data.Dataset from tfrecord files +tf_dataset = TFDatasetForMaksedLanguageModel.from_tfrecord_files(input_files="testdata/mlm.tfrecord", batch_size=4) +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch:\n{}".format(idx, batch)) + +# convert to torch.utils.data.Dataset +pt_dataset = dataset.to_pt_dataset() +# build dataloader +dataloader = torch.utils.data.DataLoader( + pt_dataset, batch_size=4, num_workers=1, collate_fn=pt_dataset.batch_padding_collator +) +for idx, batch in enumerate(dataloader): + print("No.{} batch:\n{}".format(idx, batch)) + +``` ### simcse-quickstart +```python +import torch +from tokenizers import BertWordPieceTokenizer +from rapidnlp_datasets import 
DatasetForSimCSE +from rapidnlp_datasets.tf import TFDatasetForSimCSE + +# build dataset +dataset = DatasetForSimCSE( + tokenizer=BertWordPieceTokenizer.from_file("testdata/vocab.txt"), + with_positive_sequence=False, + with_negative_sequence=False, +) +dataset.add_jsonl_files("testdata/simcse.jsonl") + +# convert to tf.data.Dataset +tf_dataset = dataset.to_tf_dataset() +for idx, batch in enumerate(iter(tf_dataset)): + print() + print("No.{} batch: \n{}".format(idx, batch)) + +# save to tfrecord +dataset.save_tfrecord("testdata/simcse.tfrecord") +# build dataset from tfrecord files +tf_dataset = TFDatasetForSimCSE.from_tfrecord_files( + "testdata/simcse.tfrecord", + with_positive_sequence=False, + with_negative_sequence=False, +) +for idx, batch in enumerate(iter(tf_dataset)): + print("No.{} batch: \n{}".format(idx, batch)) + +# convert to torch.utils.data.Dataset +pt_dataset = dataset.to_pt_dataset() +dataloader = torch.utils.data.DataLoader( + pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator +) +for idx, batch in enumerate(dataloader): + print("No.{} batch: \n{}".format(idx, batch)) +``` +