If you want to load local, personal datasets with minimal boilerplate, use **rapidnlp-datasets**.

## Installation

```bash
pip install -U rapidnlp-datasets
```

> If you work with PyTorch, you should install [PyTorch](https://pytorch.org/get-started/locally/) first.
> If you work with TensorFlow, you should install [TensorFlow](https://github.com/tensorflow/tensorflow) first.

## Usage

Here are a few examples that show how to use this library.

### sequence-classification-quickstart

```python
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForSequenceClassification
from rapidnlp_datasets.tf import TFDatasetForSequenceClassifiation

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForSequenceClassification(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/sequence_classification.jsonl"])

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=32)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/sequence_classification.tfrecord")
# build dataset from tfrecord files directly
tf_dataset = TFDatasetForSequenceClassifiation.from_tfrecord_files("testdata/sequence_classification.tfrecord")
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))
```
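
Each line of the input file is one JSON object. The field names used in the sketch below (`sequence`, `label`) are assumptions for illustration only; check `testdata/sequence_classification.jsonl` in the repository for the schema the library actually expects.

```python
import json

# Write a hypothetical sequence-classification JSONL file. The field
# names "sequence" and "label" are illustrative assumptions.
examples = [
    {"sequence": "I really enjoyed this movie.", "label": 1},
    {"sequence": "The plot made no sense at all.", "label": 0},
]
with open("testdata/my_examples.jsonl", "w", encoding="utf-8") as f:
    for example in examples:
        f.write(json.dumps(example, ensure_ascii=False) + "\n")
```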

### question-answering-quickstart

```python
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForQuestionAnswering
from rapidnlp_datasets.tf import TFDatasetForQuestionAnswering

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForQuestionAnswering(tokenizer)
dataset.add_jsonl_files(input_files="testdata/qa.jsonl")

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/qa.tfrecord")

# build dataset from tfrecord files
tf_dataset = TFDatasetForQuestionAnswering.from_tfrecord_files(
    "testdata/qa.tfrecord",
    batch_size=32,
    padding="batch",
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset,
    batch_size=32,
    collate_fn=pt_dataset.batch_padding_collator,
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))
```
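
The exact tensor names in a batch depend on the task. Before wiring the DataLoader into a model, it can help to inspect one padded batch; the sketch below assumes each batch is a dict of `torch.Tensor`s, which is how the padding collator is used in these examples. Print once to confirm the key names.

```python
# Peek at one batch from the DataLoader above. Assumption: the padding
# collator yields a dict mapping feature names to torch.Tensor.
batch = next(iter(dataloader))
for name, tensor in batch.items():
    print(name, tuple(tensor.shape), tensor.dtype)
```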


### token-classification-quickstart

```python
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForTokenClassification
from rapidnlp_datasets.tf import TFDatasetForTokenClassification

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForTokenClassification(tokenizer)
# map string labels to ids; the mapping below is an illustrative example
label2id = {"O": 0, "B-LOC": 1, "I-LOC": 2}
dataset.add_jsonl_files("testdata/token_classification.jsonl", label2id=label2id)

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# save dataset to tfrecord
dataset.save_tfrecord("testdata/token_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForTokenClassification.from_tfrecord_files(
    input_files="testdata/token_classification.tfrecord",
    batch_size=4,
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=1, batch_size=4, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch:\n{}".format(idx, batch))
```
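
To map predicted ids back to tag strings later, invert the mapping; a one-line sketch using the illustrative `label2id` above:

```python
# Invert the illustrative label2id mapping so predicted ids can be
# decoded back to tag strings.
id2label = {idx: label for label, idx in label2id.items()}
print(id2label)  # -> {0: 'O', 1: 'B-LOC', 2: 'I-LOC'}
```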

### masked-language-models-quickstart

```python
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForMaskedLanguageModel
from rapidnlp_datasets.tf import TFDatasetForMaksedLanguageModel

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForMaskedLanguageModel(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/mlm.jsonl"])
dataset.add_text_files(input_files=["/path/to/text/files"])

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# save dataset as tfrecord
dataset.save_tfrecord("testdata/mlm.tfrecord")
# load tf.data.Dataset from tfrecord files
tf_dataset = TFDatasetForMaksedLanguageModel.from_tfrecord_files(input_files="testdata/mlm.tfrecord", batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
# build dataloader
dataloader = torch.utils.data.DataLoader(
    pt_dataset, batch_size=4, num_workers=1, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch:\n{}".format(idx, batch))
```
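
`add_text_files` consumes raw text files in addition to JSONL. The snippet below fabricates a tiny corpus file to stand in for the `/path/to/text/files` placeholder; treating each line as one training sequence is an assumption here, so verify the expected layout against the library's docs.

```python
# Hypothetical corpus file for add_text_files. Assumption: one training
# sequence per line; confirm the expected format in the library docs.
lines = [
    "rapidnlp-datasets builds datasets for both TensorFlow and PyTorch.",
    "Masked language models learn to recover randomly masked tokens.",
]
with open("testdata/corpus.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

# dataset.add_text_files(input_files=["testdata/corpus.txt"])
```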

### simcse-quickstart

```python
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForSimCSE
from rapidnlp_datasets.tf import TFDatasetForSimCSE

# build dataset
dataset = DatasetForSimCSE(
    tokenizer=BertWordPieceTokenizer.from_file("testdata/vocab.txt"),
    with_positive_sequence=False,
    with_negative_sequence=False,
)
dataset.add_jsonl_files("testdata/simcse.jsonl")

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/simcse.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSimCSE.from_tfrecord_files(
    "testdata/simcse.tfrecord",
    with_positive_sequence=False,
    with_negative_sequence=False,
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))
```
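
The `with_positive_sequence` and `with_negative_sequence` flags imply that each JSONL record can carry optional positive and negative sentences alongside the anchor. The field names in the sketch below are illustrative assumptions, not the library's documented schema; check `testdata/simcse.jsonl` for the real one.

```python
import json

# Hypothetical SimCSE record. With both flags set to False, only the
# anchor sentence would be read; the field names below are assumptions.
record = {
    "sequence": "An anchor sentence for contrastive learning.",
    "positive_sequence": "A paraphrase of the anchor sentence.",
    "negative_sequence": "A completely unrelated sentence.",
}
with open("testdata/simcse_example.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```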
