
Commit f14fa55

Resolve asyml#217: Change feature type names in RecordData (asyml#219)
- Naming changes as mentioned in asyml#217 (a sketch of the new format follows the file summary below).
- The third parameter (previously `len`) is now `shape`, and is enforced if specified (previously it was ignored).
- The first parameter `dtype` can be `None` when using `list` as the collate method.
- A UserWarning is shown if the user still uses `feature_original_types` or the old feature type names.
1 parent 43967ee commit f14fa55

21 files changed: 302 additions & 213 deletions
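For orientation, here is a minimal sketch of the new-style specification described above. It is illustrative only: the feature names, dtype, and shape are made up, and the two-element form (omitting `shape`) is an assumption based on "enforced if specified".

import texar.torch as tx

# Hypothetical new-style feature types (a sketch, not taken from this commit):
feature_types = {
    # [dtype, collate method, shape] -- the third entry (previously
    # `len`) is now a shape and is enforced when specified.
    "input_ids": ["int64", "stacked_tensor", 128],
    # dtype may be None when the collate method is "list".
    "raw_tokens": [None, "list"],
}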

.gitignore

Lines changed: 0 additions & 38 deletions
@@ -219,41 +219,3 @@ docs/_build
 
 ### mypy ###
 /.mypy_cache/
-
-### Project ###
-/data/
-texar_download/
-checkpoints/
-/language_models/
-/examples/language_model_ptb/simple-examples/
-simple-examples.tgz
-/examples/hierarchical_dialog/data/
-/examples/sequence_tagging/data/
-/examples/sequence_tagging/tmp/
-/examples/sentence_classifier/data/
-/examples/seq2seq_attn/data/
-/examples/seq2seq_attn/data.zip
-/examples/seq2seq_attn/iwslt14.zip
-/examples/seq2seq_attn/toy_copy.zip
-/examples/seq2seq_rl/data/
-/examples/seq2seq_rl/data.zip
-/examples/seq2seq_rl/iwslt14.zip
-/examples/seq2seq_rl/toy_copy.zip
-/examples/seq2seq_configs/data/
-/examples/seq2seq_configs/data.zip
-/examples/seq2seq_config/iwslt14.zip
-/examples/seq2seq_config/toy_copy.zip
-/examples/seq2seq_exposure_bias/data/
-/examples/text_style_transfer/checkpoints/
-/examples/text_style_transfer/samples/
-/examples/text_style_transfer/data/
-/examples/text_style_transfer/yelp.zip
-/examples/vae_text/simple-examples/
-/examples/vae_text/data/
-/examples/transformer/data/
-/examples/transformer/temp/
-/examples/transformer/outputs/
-/examples/bert/data/*
-!/examples/bert/data/download_glue_data.py
-!/examples/bert/data/README.md
-/examples/bert/output

docs/spelling_wordlist.txt

Lines changed: 1 addition & 0 deletions
@@ -71,3 +71,4 @@ tensorboard
 tokenizer
 wordpiece
 unigram
+TF

examples/bert/.gitignore

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+/data/*
+!/data/download_glue_data.py
+!/data/README.md
+/output
+/runs

examples/bert/bert_classifier_using_executor_main.py

Lines changed: 18 additions & 30 deletions
@@ -18,16 +18,16 @@
 import argparse
 import functools
 import importlib
-import logging
 import os
 import sys
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Union, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
 from torch.nn import functional as F
+
 import texar.torch as tx
 from texar.torch.run import *
 
@@ -67,48 +67,35 @@
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-logging.root.setLevel(logging.INFO)
-
 
 class ModelWrapper(nn.Module):
     def __init__(self, model: tx.modules.BERTClassifier):
         super().__init__()
         self.model = model
 
-    def _compute_loss(self, logits, labels):
-        r"""Compute loss.
-        """
+    def _get_outputs(self, batch: tx.data.Batch) \
+            -> Tuple[torch.Tensor, torch.LongTensor]:
+        input_ids = batch["input_ids"]
+        segment_ids = batch["segment_ids"]
+        input_length = (1 - (input_ids == 0).int()).sum(dim=1)
+        logits, preds = self.model(input_ids, input_length, segment_ids)
+        return logits, preds
+
+    def forward(self,  # type: ignore
+                batch: tx.data.Batch) -> Dict[str, torch.Tensor]:
+        logits, preds = self._get_outputs(batch)
+        labels = batch["label_ids"]
         if self.model.is_binary:
             loss = F.binary_cross_entropy(
                 logits.view(-1), labels.view(-1), reduction='mean')
         else:
             loss = F.cross_entropy(
                 logits.view(-1, self.model.num_classes),
                 labels.view(-1), reduction='mean')
-        return loss
-
-    def forward(self,  # type: ignore
-                batch: tx.data.Batch) -> Dict[str, torch.Tensor]:
-        input_ids = batch["input_ids"]
-        segment_ids = batch["segment_ids"]
-        labels = batch["label_ids"]
-
-        input_length = (1 - (input_ids == 0).int()).sum(dim=1)
-
-        logits, preds = self.model(input_ids, input_length, segment_ids)
-
-        loss = self._compute_loss(logits, labels)
-
         return {"loss": loss, "preds": preds}
 
     def predict(self, batch: tx.data.Batch) -> Dict[str, torch.Tensor]:
-        input_ids = batch["input_ids"]
-        segment_ids = batch["segment_ids"]
-
-        input_length = (1 - (input_ids == 0).int()).sum(dim=1)
-
-        _, preds = self.model(input_ids, input_length, segment_ids)
-
+        _, preds = self._get_outputs(batch)
         return {"preds": preds}
 
 
@@ -117,7 +104,8 @@ def __init__(self, file_path: Optional[Union[str, Path]] = None):
         super().__init__(pred_name="preds", label_name="input_ids")
         self.file_path = file_path
 
-    def value(self) -> float:
+    def _value(self) -> float:
        path = self.file_path or tempfile.mktemp()
        with open(path, "w+") as writer:
            writer.write("\n".join(str(p) for p in self.predicted))
@@ -217,7 +204,8 @@ def main() -> None:
             ("loss", metric.RunningAverage(1)),  # only show current loss
             ("lr", metric.LR(optim))],
         valid_metrics=[valid_metric, ("loss", metric.Average())],
-        test_metrics=[FileWriterMetric(output_dir / "test.output")],
+        test_metrics=[
+            valid_metric, FileWriterMetric(output_dir / "test.output")],
         # freq of validation
         validate_every=[cond.iteration(config_data.eval_steps)],
         # checkpoint saving location

examples/bert/config_data.py

Lines changed: 8 additions & 8 deletions
@@ -22,23 +22,23 @@
 eval_batch_size = 8
 test_batch_size = 8
 
-feature_original_types = {
+feature_types = {
     # Reading features from pickled data file.
     # E.g., Reading feature "input_ids" as dtype `int64`;
     # "FixedLenFeature" indicates its length is fixed for all data instances;
     # and the sequence length is limited by `max_seq_length`.
-    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
-    "input_mask": ["int64", "FixedLenFeature", max_seq_length],
-    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
-    "label_ids": ["int64", "FixedLenFeature"]
+    "input_ids": ["int64", "stacked_tensor", max_seq_length],
+    "input_mask": ["int64", "stacked_tensor", max_seq_length],
+    "segment_ids": ["int64", "stacked_tensor", max_seq_length],
+    "label_ids": ["int64", "stacked_tensor"]
 }
 
 train_hparam = {
     "allow_smaller_final_batch": False,
     "batch_size": train_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/train.pkl".format(pickle_data_dir)
     },
     "shuffle": True,
@@ -50,7 +50,7 @@
     "batch_size": eval_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/eval.pkl".format(pickle_data_dir)
     },
     "shuffle": False
@@ -61,7 +61,7 @@
     "batch_size": test_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/predict.pkl".format(pickle_data_dir)
     },
     "shuffle": False

examples/bert/prepare_data.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def main() -> None:
         data_dir=data_dir,
         max_seq_length=args.max_seq_length,
         output_dir=output_dir,
-        feature_original_types=config_data.feature_original_types)
+        feature_types=config_data.feature_types)
     modify_config_data(args.max_seq_length, num_train_data, num_classes)
 
 
examples/bert/utils/data_utils.py

Lines changed: 7 additions & 8 deletions
@@ -348,11 +348,10 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
 
 def convert_examples_to_features_and_output_to_files(
         examples, label_list, max_seq_length, tokenizer, output_file,
-        feature_original_types):
+        feature_types):
     r"""Convert a set of `InputExample`s to a pickled file."""
 
-    with tx.data.RecordData.writer(
-            output_file, feature_original_types) as writer:
+    with tx.data.RecordData.writer(output_file, feature_types) as writer:
         for (ex_index, example) in enumerate(examples):
             feature = convert_single_example(ex_index, example, label_list,
                                              max_seq_length, tokenizer)
@@ -368,7 +367,7 @@ def convert_examples_to_features_and_output_to_files(
 
 def prepare_record_data(processor, tokenizer,
                         data_dir, max_seq_length, output_dir,
-                        feature_original_types):
+                        feature_types):
     r"""Prepare record data.
     Args:
         processor: Data Preprocessor, which must have get_labels,
@@ -378,24 +377,24 @@ def prepare_record_data(processor, tokenizer,
         data_dir: The input data directory.
         max_seq_length: Max sequence length.
         output_dir: The directory to save the pickled file in.
-        feature_original_types: The original type of the feature.
+        feature_types: The original type of the feature.
     """
     label_list = processor.get_labels()
 
     train_examples = processor.get_train_examples(data_dir)
     train_file = os.path.join(output_dir, "train.pkl")
     convert_examples_to_features_and_output_to_files(
         train_examples, label_list, max_seq_length,
-        tokenizer, train_file, feature_original_types)
+        tokenizer, train_file, feature_types)
 
     eval_examples = processor.get_dev_examples(data_dir)
     eval_file = os.path.join(output_dir, "eval.pkl")
     convert_examples_to_features_and_output_to_files(
         eval_examples, label_list,
-        max_seq_length, tokenizer, eval_file, feature_original_types)
+        max_seq_length, tokenizer, eval_file, feature_types)
 
     test_examples = processor.get_test_examples(data_dir)
     test_file = os.path.join(output_dir, "predict.pkl")
     convert_examples_to_features_and_output_to_files(
         test_examples, label_list,
-        max_seq_length, tokenizer, test_file, feature_original_types)
+        max_seq_length, tokenizer, test_file, feature_types)
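A minimal usage sketch of the renamed writer call above, with a toy file name and values, assuming `writer.write` takes a dict keyed by feature name as in the example code:

import texar.torch as tx

feature_types = {"input_ids": ["int64", "stacked_tensor", 4],
                 "label_ids": ["int64", "stacked_tensor"]}
with tx.data.RecordData.writer("train.pkl", feature_types) as writer:
    # One record; keys match the feature_types spec above.
    writer.write({"input_ids": [1, 2, 3, 0], "label_ids": 1})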

examples/gpt-2/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+/data/toy/*.pkl
+/output/

examples/gpt-2/config_train.py

Lines changed: 7 additions & 7 deletions
@@ -26,21 +26,21 @@
 
 # Data configs
 
-feature_original_types = {
+feature_types = {
     # Reading features from pickle data file.
     # E.g., Reading feature "text_ids" as dtype `int64`;
-    # "FixedLenFeature" indicates its length is fixed for all data instances;
+    # "stacked_tensor" indicates its length is fixed for all data instances;
     # and the sequence length is limited by `max_seq_length`.
-    "text_ids": ["int64", "FixedLenFeature", max_seq_length],
-    "length": ["int64", "FixedLenFeature"]
+    "text_ids": ["int64", "stacked_tensor", max_seq_length],
+    "length": ["int64", "stacked_tensor"]
 }
 
 train_hparam = {
     "allow_smaller_final_batch": False,
     "batch_size": train_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/train.pkl".format(pickle_data_dir)
     },
     "shuffle": True,
@@ -52,7 +52,7 @@
     "batch_size": eval_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/dev.pkl".format(pickle_data_dir)
     },
     "shuffle": False
@@ -65,7 +65,7 @@
     "batch_size": test_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/test.pkl".format(pickle_data_dir)
     },
     "shuffle": False

examples/gpt-2/prepare_data.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def main() -> None:
         max_seq_length=args.max_seq_length,
         tokenizer=tokenizer,
         output_dir=pickle_output_dir,
-        feature_original_types=config_train.feature_original_types)
+        feature_types=config_train.feature_types)
 
 
 if __name__ == "__main__":

examples/gpt-2/utils/data_utils.py

Lines changed: 7 additions & 8 deletions
@@ -38,12 +38,11 @@ def convert_examples_to_features_and_output_to_files(
         max_seq_length: int,
         tokenizer: tx.data.GPT2Tokenizer,
         output_file: str,
-        feature_original_types: Dict[str, Any],
+        feature_types: Dict[str, Any],
         append_eos_token: bool = True):
     r"""Converts a set of examples to a `pickle` file."""
 
-    with tx.data.RecordData.writer(
-            output_file, feature_original_types) as writer:
+    with tx.data.RecordData.writer(output_file, feature_types) as writer:
 
         for (_, example) in enumerate(examples):
 
@@ -62,14 +61,14 @@ def prepare_pickle_data(data_dir: str,
                         max_seq_length: int,
                         tokenizer: tx.data.GPT2Tokenizer,
                         output_dir: str,
-                        feature_original_types: Dict[str, Any]):
+                        feature_types: Dict[str, Any]):
     r"""Prepare the `pickle` dataset.
     Args:
         data_dir: The input data directory.
         max_seq_length: Max sequence length.
         tokenizer: The GPT-2 tokenizer.
         output_dir: The directory to save the pickled files in.
-        feature_original_types: The original type of the feature.
+        feature_types: The original type of the feature.
     """
     train_fn = os.path.join(data_dir, "train.txt")
     if os.path.isfile(train_fn):
@@ -78,7 +77,7 @@ def prepare_pickle_data(data_dir: str,
         train_file = os.path.join(output_dir, "train.pkl")
         convert_examples_to_features_and_output_to_files(
             train_examples, max_seq_length, tokenizer, train_file,
-            feature_original_types)
+            feature_types)
 
     dev_fn = os.path.join(data_dir, "dev.txt")
     if os.path.isfile(dev_fn):
@@ -87,7 +86,7 @@ def prepare_pickle_data(data_dir: str,
         eval_file = os.path.join(output_dir, "dev.pkl")
         convert_examples_to_features_and_output_to_files(
             eval_examples, max_seq_length, tokenizer, eval_file,
-            feature_original_types)
+            feature_types)
 
     test_fn = os.path.join(data_dir, "test.txt")
     if os.path.isfile(test_fn):
@@ -96,4 +95,4 @@ def prepare_pickle_data(data_dir: str,
         test_file = os.path.join(output_dir, "test.pkl")
         convert_examples_to_features_and_output_to_files(
             test_examples, max_seq_length, tokenizer, test_file,
-            feature_original_types, append_eos_token=False)
+            feature_types, append_eos_token=False)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+/data

examples/seq2seq_attn/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+/data/
+/data.zip
+/iwslt14.zip
+/toy_copy.zip

examples/transformer/.gitignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+/data/
+/temp/
+/outputs/

examples/vae_text/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+/simple-examples/
+/data/
+/models/
+/simple-examples.tgz
