
Commit f14fa55

Resolve asyml#217: Change feature type names in RecordData (asyml#219)
- Naming changes as mentioned in asyml#217 (a sketch of the new format follows the file summary below).
- The third parameter (previously `len`) is now `shape`, and is enforced if specified (previously it was ignored).
- The first parameter `dtype` can be `None` when using `list` as the collate method.
- A UserWarning is shown if the user still uses `feature_original_types` or the old feature type names.
1 parent 43967ee commit f14fa55

21 files changed: 302 additions & 213 deletions
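For orientation, here is a minimal sketch of the new-style specification described above. It is illustrative only: the feature names, dtype, and shape are made up, and the two-element form (omitting `shape`) is an assumption based on "enforced if specified".

import texar.torch as tx

# Hypothetical new-style feature types (a sketch, not taken from this commit):
feature_types = {
    # [dtype, collate method, shape] -- the third entry (previously
    # `len`) is now a shape and is enforced when specified.
    "input_ids": ["int64", "stacked_tensor", 128],
    # dtype may be None when the collate method is "list".
    "raw_tokens": [None, "list"],
}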

.gitignore

Lines changed: 0 additions & 38 deletions
@@ -219,41 +219,3 @@ docs/_build
 
 ### mypy ###
 /.mypy_cache/
-
-### Project ###
-/data/
-texar_download/
-checkpoints/
-/language_models/
-/examples/language_model_ptb/simple-examples/
-simple-examples.tgz
-/examples/hierarchical_dialog/data/
-/examples/sequence_tagging/data/
-/examples/sequence_tagging/tmp/
-/examples/sentence_classifier/data/
-/examples/seq2seq_attn/data/
-/examples/seq2seq_attn/data.zip
-/examples/seq2seq_attn/iwslt14.zip
-/examples/seq2seq_attn/toy_copy.zip
-/examples/seq2seq_rl/data/
-/examples/seq2seq_rl/data.zip
-/examples/seq2seq_rl/iwslt14.zip
-/examples/seq2seq_rl/toy_copy.zip
-/examples/seq2seq_configs/data/
-/examples/seq2seq_configs/data.zip
-/examples/seq2seq_config/iwslt14.zip
-/examples/seq2seq_config/toy_copy.zip
-/examples/seq2seq_exposure_bias/data/
-/examples/text_style_transfer/checkpoints/
-/examples/text_style_transfer/samples/
-/examples/text_style_transfer/data/
-/examples/text_style_transfer/yelp.zip
-/examples/vae_text/simple-examples/
-/examples/vae_text/data/
-/examples/transformer/data/
-/examples/transformer/temp/
-/examples/transformer/outputs/
-/examples/bert/data/*
-!/examples/bert/data/download_glue_data.py
-!/examples/bert/data/README.md
-/examples/bert/output

docs/spelling_wordlist.txt

Lines changed: 1 addition & 0 deletions
@@ -71,3 +71,4 @@ tensorboard
 tokenizer
 wordpiece
 unigram
+TF

examples/bert/.gitignore

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+/data/*
+!/data/download_glue_data.py
+!/data/README.md
+/output
+/runs

examples/bert/bert_classifier_using_executor_main.py

Lines changed: 18 additions & 30 deletions
@@ -18,16 +18,16 @@
 import argparse
 import functools
 import importlib
-import logging
 import os
 import sys
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Union, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
 from torch.nn import functional as F
+
 import texar.torch as tx
 from texar.torch.run import *
 
@@ -67,48 +67,35 @@
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-logging.root.setLevel(logging.INFO)
-
 
 class ModelWrapper(nn.Module):
     def __init__(self, model: tx.modules.BERTClassifier):
         super().__init__()
         self.model = model
 
-    def _compute_loss(self, logits, labels):
-        r"""Compute loss.
-        """
+    def _get_outputs(self, batch: tx.data.Batch) \
+            -> Tuple[torch.Tensor, torch.LongTensor]:
+        input_ids = batch["input_ids"]
+        segment_ids = batch["segment_ids"]
+        input_length = (1 - (input_ids == 0).int()).sum(dim=1)
+        logits, preds = self.model(input_ids, input_length, segment_ids)
+        return logits, preds
+
+    def forward(self,  # type: ignore
+                batch: tx.data.Batch) -> Dict[str, torch.Tensor]:
+        logits, preds = self._get_outputs(batch)
+        labels = batch["label_ids"]
         if self.model.is_binary:
             loss = F.binary_cross_entropy(
                 logits.view(-1), labels.view(-1), reduction='mean')
         else:
             loss = F.cross_entropy(
                 logits.view(-1, self.model.num_classes),
                 labels.view(-1), reduction='mean')
-        return loss
-
-    def forward(self,  # type: ignore
-                batch: tx.data.Batch) -> Dict[str, torch.Tensor]:
-        input_ids = batch["input_ids"]
-        segment_ids = batch["segment_ids"]
-        labels = batch["label_ids"]
-
-        input_length = (1 - (input_ids == 0).int()).sum(dim=1)
-
-        logits, preds = self.model(input_ids, input_length, segment_ids)
-
-        loss = self._compute_loss(logits, labels)
-
         return {"loss": loss, "preds": preds}
 
     def predict(self, batch: tx.data.Batch) -> Dict[str, torch.Tensor]:
-        input_ids = batch["input_ids"]
-        segment_ids = batch["segment_ids"]
-
-        input_length = (1 - (input_ids == 0).int()).sum(dim=1)
-
-        _, preds = self.model(input_ids, input_length, segment_ids)
-
+        _, preds = self._get_outputs(batch)
         return {"preds": preds}
 
 
@@ -117,7 +104,8 @@ def __init__(self, file_path: Optional[Union[str, Path]] = None):
         super().__init__(pred_name="preds", label_name="input_ids")
         self.file_path = file_path
 
-    def value(self) -> float:
+    def _value(self) -> float:
        path = self.file_path or tempfile.mktemp()
        with open(path, "w+") as writer:
            writer.write("\n".join(str(p) for p in self.predicted))
@@ -217,7 +204,8 @@ def main() -> None:
             ("loss", metric.RunningAverage(1)),  # only show current loss
             ("lr", metric.LR(optim))],
         valid_metrics=[valid_metric, ("loss", metric.Average())],
-        test_metrics=[FileWriterMetric(output_dir / "test.output")],
+        test_metrics=[
+            valid_metric, FileWriterMetric(output_dir / "test.output")],
         # freq of validation
         validate_every=[cond.iteration(config_data.eval_steps)],
         # checkpoint saving location

examples/bert/config_data.py

Lines changed: 8 additions & 8 deletions
@@ -22,23 +22,23 @@
 eval_batch_size = 8
 test_batch_size = 8
 
-feature_original_types = {
+feature_types = {
     # Reading features from pickled data file.
     # E.g., Reading feature "input_ids" as dtype `int64`;
     # "FixedLenFeature" indicates its length is fixed for all data instances;
     # and the sequence length is limited by `max_seq_length`.
-    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
-    "input_mask": ["int64", "FixedLenFeature", max_seq_length],
-    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
-    "label_ids": ["int64", "FixedLenFeature"]
+    "input_ids": ["int64", "stacked_tensor", max_seq_length],
+    "input_mask": ["int64", "stacked_tensor", max_seq_length],
+    "segment_ids": ["int64", "stacked_tensor", max_seq_length],
+    "label_ids": ["int64", "stacked_tensor"]
 }
 
 train_hparam = {
     "allow_smaller_final_batch": False,
     "batch_size": train_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/train.pkl".format(pickle_data_dir)
     },
     "shuffle": True,
@@ -50,7 +50,7 @@
     "batch_size": eval_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/eval.pkl".format(pickle_data_dir)
     },
     "shuffle": False
@@ -61,7 +61,7 @@
     "batch_size": test_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/predict.pkl".format(pickle_data_dir)
     },
     "shuffle": False

examples/bert/prepare_data.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def main() -> None:
         data_dir=data_dir,
         max_seq_length=args.max_seq_length,
         output_dir=output_dir,
-        feature_original_types=config_data.feature_original_types)
+        feature_types=config_data.feature_types)
     modify_config_data(args.max_seq_length, num_train_data, num_classes)
 
 
examples/bert/utils/data_utils.py

Lines changed: 7 additions & 8 deletions
@@ -348,11 +348,10 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
 
 def convert_examples_to_features_and_output_to_files(
         examples, label_list, max_seq_length, tokenizer, output_file,
-        feature_original_types):
+        feature_types):
     r"""Convert a set of `InputExample`s to a pickled file."""
 
-    with tx.data.RecordData.writer(
-            output_file, feature_original_types) as writer:
+    with tx.data.RecordData.writer(output_file, feature_types) as writer:
         for (ex_index, example) in enumerate(examples):
             feature = convert_single_example(ex_index, example, label_list,
                                              max_seq_length, tokenizer)
@@ -368,7 +367,7 @@ def convert_examples_to_features_and_output_to_files(
 
 def prepare_record_data(processor, tokenizer,
                         data_dir, max_seq_length, output_dir,
-                        feature_original_types):
+                        feature_types):
     r"""Prepare record data.
     Args:
         processor: Data Preprocessor, which must have get_labels,
@@ -378,24 +377,24 @@ def prepare_record_data(processor, tokenizer,
         data_dir: The input data directory.
         max_seq_length: Max sequence length.
         output_dir: The directory to save the pickled file in.
-        feature_original_types: The original type of the feature.
+        feature_types: The original type of the feature.
     """
     label_list = processor.get_labels()
 
     train_examples = processor.get_train_examples(data_dir)
     train_file = os.path.join(output_dir, "train.pkl")
     convert_examples_to_features_and_output_to_files(
         train_examples, label_list, max_seq_length,
-        tokenizer, train_file, feature_original_types)
+        tokenizer, train_file, feature_types)
 
     eval_examples = processor.get_dev_examples(data_dir)
     eval_file = os.path.join(output_dir, "eval.pkl")
     convert_examples_to_features_and_output_to_files(
         eval_examples, label_list,
-        max_seq_length, tokenizer, eval_file, feature_original_types)
+        max_seq_length, tokenizer, eval_file, feature_types)
 
     test_examples = processor.get_test_examples(data_dir)
     test_file = os.path.join(output_dir, "predict.pkl")
     convert_examples_to_features_and_output_to_files(
         test_examples, label_list,
-        max_seq_length, tokenizer, test_file, feature_original_types)
+        max_seq_length, tokenizer, test_file, feature_types)
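A minimal usage sketch of the renamed writer call above, with a toy file name and values, assuming `writer.write` takes a dict keyed by feature name as in the example code:

import texar.torch as tx

feature_types = {"input_ids": ["int64", "stacked_tensor", 4],
                 "label_ids": ["int64", "stacked_tensor"]}
with tx.data.RecordData.writer("train.pkl", feature_types) as writer:
    # One record; keys match the feature_types spec above.
    writer.write({"input_ids": [1, 2, 3, 0], "label_ids": 1})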

examples/gpt-2/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+/data/toy/*.pkl
+/output/

examples/gpt-2/config_train.py

Lines changed: 7 additions & 7 deletions
@@ -26,21 +26,21 @@
 
 # Data configs
 
-feature_original_types = {
+feature_types = {
     # Reading features from pickle data file.
     # E.g., Reading feature "text_ids" as dtype `int64`;
-    # "FixedLenFeature" indicates its length is fixed for all data instances;
+    # "stacked_tensor" indicates its length is fixed for all data instances;
     # and the sequence length is limited by `max_seq_length`.
-    "text_ids": ["int64", "FixedLenFeature", max_seq_length],
-    "length": ["int64", "FixedLenFeature"]
+    "text_ids": ["int64", "stacked_tensor", max_seq_length],
+    "length": ["int64", "stacked_tensor"]
 }
 
 train_hparam = {
     "allow_smaller_final_batch": False,
     "batch_size": train_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/train.pkl".format(pickle_data_dir)
     },
     "shuffle": True,
@@ -52,7 +52,7 @@
     "batch_size": eval_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/dev.pkl".format(pickle_data_dir)
     },
     "shuffle": False
@@ -65,7 +65,7 @@
     "batch_size": test_batch_size,
     "dataset": {
         "data_name": "data",
-        "feature_original_types": feature_original_types,
+        "feature_types": feature_types,
         "files": "{}/test.pkl".format(pickle_data_dir)
     },
     "shuffle": False

examples/gpt-2/prepare_data.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def main() -> None:
         max_seq_length=args.max_seq_length,
         tokenizer=tokenizer,
         output_dir=pickle_output_dir,
-        feature_original_types=config_train.feature_original_types)
+        feature_types=config_train.feature_types)
 
 
 if __name__ == "__main__":

examples/gpt-2/utils/data_utils.py

Lines changed: 7 additions & 8 deletions
@@ -38,12 +38,11 @@ def convert_examples_to_features_and_output_to_files(
         max_seq_length: int,
         tokenizer: tx.data.GPT2Tokenizer,
         output_file: str,
-        feature_original_types: Dict[str, Any],
+        feature_types: Dict[str, Any],
         append_eos_token: bool = True):
     r"""Converts a set of examples to a `pickle` file."""
 
-    with tx.data.RecordData.writer(
-            output_file, feature_original_types) as writer:
+    with tx.data.RecordData.writer(output_file, feature_types) as writer:
 
         for (_, example) in enumerate(examples):
 
@@ -62,14 +61,14 @@ def prepare_pickle_data(data_dir: str,
                         max_seq_length: int,
                         tokenizer: tx.data.GPT2Tokenizer,
                         output_dir: str,
-                        feature_original_types: Dict[str, Any]):
+                        feature_types: Dict[str, Any]):
     r"""Prepare the `pickle` dataset.
     Args:
         data_dir: The input data directory.
         max_seq_length: Max sequence length.
         tokenizer: The GPT-2 tokenizer.
         output_dir: The directory to save the pickled files in.
-        feature_original_types: The original type of the feature.
+        feature_types: The original type of the feature.
     """
     train_fn = os.path.join(data_dir, "train.txt")
     if os.path.isfile(train_fn):
@@ -78,7 +77,7 @@ def prepare_pickle_data(data_dir: str,
         train_file = os.path.join(output_dir, "train.pkl")
         convert_examples_to_features_and_output_to_files(
             train_examples, max_seq_length, tokenizer, train_file,
-            feature_original_types)
+            feature_types)
 
     dev_fn = os.path.join(data_dir, "dev.txt")
     if os.path.isfile(dev_fn):
@@ -87,7 +86,7 @@ def prepare_pickle_data(data_dir: str,
         eval_file = os.path.join(output_dir, "dev.pkl")
         convert_examples_to_features_and_output_to_files(
             eval_examples, max_seq_length, tokenizer, eval_file,
-            feature_original_types)
+            feature_types)
 
     test_fn = os.path.join(data_dir, "test.txt")
     if os.path.isfile(test_fn):
@@ -96,4 +95,4 @@ def prepare_pickle_data(data_dir: str,
         test_file = os.path.join(output_dir, "test.pkl")
         convert_examples_to_features_and_output_to_files(
             test_examples, max_seq_length, tokenizer, test_file,
-            feature_original_types, append_eos_token=False)
+            feature_types, append_eos_token=False)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+/data

examples/seq2seq_attn/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+/data/
+/data.zip
+/iwslt14.zip
+/toy_copy.zip

examples/transformer/.gitignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+/data/
+/temp/
+/outputs/

examples/vae_text/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+/simple-examples/
+/data/
+/models/
+/simple-examples.tgz
