Update for GNMT reference (mlcommons#222)
* Update for GNMT reference

* Fixed weight init for non-shared embeddings (this doesn't affect the reference config)

* Better description of vocabulary and text datasets

* Added example logfile from GNMT training on reference config
szmigacz authored and nvpaulius committed Apr 9, 2019
1 parent d99c36d commit 69dbe8b
Showing 37 changed files with 2,854 additions and 1,461 deletions.
74 changes: 74 additions & 0 deletions compliance/example_logs/gnmt_example.log
@@ -0,0 +1,74 @@
:::MLPv0.5.0 gnmt 1550049265.393176317 (train.py:268) run_start
:::MLPv0.5.0 gnmt 1550049265.394439697 (seq2seq/utils.py:113) run_set_random_seed: 1
:::MLPv0.5.0 gnmt 1550049265.412598848 (train.py:312) preproc_tokenize_training
:::MLPv0.5.0 gnmt 1550049265.413046360 (train.py:314) train_hp_max_sequence_length: 50
:::MLPv0.5.0 gnmt 1550049274.456599236 (train.py:326) preproc_num_train_examples: 3498161
:::MLPv0.5.0 gnmt 1550049275.559447765 (train.py:336) preproc_tokenize_eval
:::MLPv0.5.0 gnmt 1550049275.597178459 (train.py:346) preproc_num_eval_examples: 3003
:::MLPv0.5.0 gnmt 1550049275.597592831 (train.py:350) preproc_vocab_size: 32317
:::MLPv0.5.0 gnmt 1550049275.598169804 (seq2seq/models/gnmt.py:34) model_hp_num_layers: 4
:::MLPv0.5.0 gnmt 1550049275.598643303 (seq2seq/models/gnmt.py:36) model_hp_hidden_size: 1024
:::MLPv0.5.0 gnmt 1550049275.599111080 (seq2seq/models/gnmt.py:38) model_hp_dropout: 0.2
:::MLPv0.5.0 gnmt 1550049279.603173733 (train.py:251) model_hp_loss_fn: "Cross Entropy with label smoothing"
:::MLPv0.5.0 gnmt 1550049279.603722811 (train.py:253) model_hp_loss_smoothing: 0.1
:::MLPv0.5.0 gnmt 1550049280.493244410 (train.py:393) input_batch_size: 128
:::MLPv0.5.0 gnmt 1550049280.493689060 (train.py:395) input_size: 3497728
:::MLPv0.5.0 gnmt 1550049280.494647741 (seq2seq/data/sampler.py:254) input_order
:::MLPv0.5.0 gnmt 1550049280.496007442 (seq2seq/data/sampler.py:254) input_order
:::MLPv0.5.0 gnmt 1550049280.496948242 (train.py:409) eval_size: 3003
:::MLPv0.5.0 gnmt 1550049280.497889280 (seq2seq/inference/beam_search.py:43) eval_hp_beam_size: 5
:::MLPv0.5.0 gnmt 1550049280.498333216 (seq2seq/inference/beam_search.py:45) eval_hp_max_sequence_length: 150
:::MLPv0.5.0 gnmt 1550049280.498714685 (seq2seq/inference/beam_search.py:47) eval_hp_length_normalization_constant: 5.0
:::MLPv0.5.0 gnmt 1550049280.499088049 (seq2seq/inference/beam_search.py:49) eval_hp_length_normalization_factor: 0.6
:::MLPv0.5.0 gnmt 1550049280.499464273 (seq2seq/inference/beam_search.py:51) eval_hp_coverage_penalty_factor: 0.1
:::MLPv0.5.0 gnmt 1550049283.242700577 (seq2seq/train/trainer.py:115) opt_name: "adam"
:::MLPv0.5.0 gnmt 1550049283.243218422 (seq2seq/train/trainer.py:117) opt_learning_rate: 0.001
:::MLPv0.5.0 gnmt 1550049283.243656635 (seq2seq/train/trainer.py:119) opt_hp_Adam_beta1: 0.9
:::MLPv0.5.0 gnmt 1550049283.244005442 (seq2seq/train/trainer.py:121) opt_hp_Adam_beta2: 0.999
:::MLPv0.5.0 gnmt 1550049283.244350195 (seq2seq/train/trainer.py:123) opt_hp_Adam_epsilon: 1e-08
:::MLPv0.5.0 gnmt 1550049283.245311022 (seq2seq/train/lr_scheduler.py:78) opt_learning_rate_warmup_steps: 200
:::MLPv0.5.0 gnmt 1550049283.245801687 (train.py:466) train_loop
:::MLPv0.5.0 gnmt 1550049283.246350050 (train.py:470) train_epoch: 0
:::MLPv0.5.0 gnmt 1550049284.053482771 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550061484.477649927 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550061486.181115627 (train.py:490) eval_start: 0
:::MLPv0.5.0 gnmt 1550061514.467880964 (train.py:495) eval_accuracy: {"epoch": 0, "value": 20.49}
:::MLPv0.5.0 gnmt 1550061514.468363047 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550061514.468725204 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550061514.469318390 (train.py:470) train_epoch: 1
:::MLPv0.5.0 gnmt 1550061515.311945915 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550073699.708187580 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550073705.950762510 (train.py:490) eval_start: 1
:::MLPv0.5.0 gnmt 1550073733.111917734 (train.py:495) eval_accuracy: {"epoch": 1, "value": 21.72}
:::MLPv0.5.0 gnmt 1550073733.112442732 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550073733.112777948 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550073733.113427401 (train.py:470) train_epoch: 2
:::MLPv0.5.0 gnmt 1550073733.976921797 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550085915.256659985 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550085921.003725052 (train.py:490) eval_start: 2
:::MLPv0.5.0 gnmt 1550085948.763688564 (train.py:495) eval_accuracy: {"epoch": 2, "value": 22.52}
:::MLPv0.5.0 gnmt 1550085948.764161110 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550085948.764480591 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550085948.765091658 (train.py:470) train_epoch: 3
:::MLPv0.5.0 gnmt 1550085949.625080585 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550098119.481947660 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550098125.219326258 (train.py:490) eval_start: 3
:::MLPv0.5.0 gnmt 1550098154.012802124 (train.py:495) eval_accuracy: {"epoch": 3, "value": 23.09}
:::MLPv0.5.0 gnmt 1550098154.013273716 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550098154.013695478 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550098154.014324665 (train.py:470) train_epoch: 4
:::MLPv0.5.0 gnmt 1550098154.874188900 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550110332.296045065 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550110338.542354584 (train.py:490) eval_start: 4
:::MLPv0.5.0 gnmt 1550110366.501193285 (train.py:495) eval_accuracy: {"epoch": 4, "value": 23.22}
:::MLPv0.5.0 gnmt 1550110366.501719475 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550110366.502229214 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550110366.502951622 (train.py:470) train_epoch: 5
:::MLPv0.5.0 gnmt 1550110367.328888416 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550122538.255764246 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550122544.367511034 (train.py:490) eval_start: 5
:::MLPv0.5.0 gnmt 1550122571.801926613 (train.py:495) eval_accuracy: {"epoch": 5, "value": 24.11}
:::MLPv0.5.0 gnmt 1550122571.802411556 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550122571.802731991 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550122571.803371668 (train.py:522) run_stop: {"success": true}
:::MLPv0.5.0 gnmt 1550122571.803844213 (train.py:523) run_final
192 changes: 174 additions & 18 deletions rnn_translator/README.md
@@ -2,8 +2,15 @@

This problem uses a recurrent neural network to perform language translation.

## Requirements
* [Python 3.6](https://www.python.org)
* [CUDA 9.0](https://developer.nvidia.com/cuda-90-download-archive)
* [PyTorch 1.0.1](https://pytorch.org)
* [sacrebleu](https://pypi.org/project/sacrebleu/)

### Recommended setup
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime container](https://hub.docker.com/r/pytorch/pytorch/tags/)

# 2. Directions
### Steps to configure machine
@@ -63,6 +70,21 @@ Verify data with:

bash verify_dataset.sh

### Steps specific to the pytorch version to run and time

cd pytorch
sudo docker build . --rm -t gnmt:latest
SEED=1
NOW=`date "+%F-%T"`
sudo nvidia-docker run -it --rm --ipc=host \
-v $(pwd)/../data:/data \
gnmt:latest "./run_and_time.sh" $SEED |tee benchmark-$NOW.log

### One can control which GPUs are used with the `NV_GPU` variable
sudo NV_GPU=0 nvidia-docker run -it --rm --ipc=host \
-v $(pwd)/../data:/data \
gnmt:latest "./run_and_time.sh" $SEED |tee benchmark-$NOW.log

# 3. Dataset/Environment
### Publication/Attribution
We use [WMT16 English-German](http://www.statmt.org/wmt16/translation-task.html)
@@ -75,15 +97,105 @@ segment text into subword units (BPE), by default it builds shared vocabulary of
Preprocessing removes all pairs of sentences that can't be decoded with the
Latin-1 encoding.

### Vocabulary
The vocabulary is generated by the following lines from the `download_dataset.sh`
script:

```
# Create vocabulary file for BPE
cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" | \
${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
```

The vocabulary is stored in the `rnn_translator/data/vocab.bpe.32000` plain text
file, one token per line (tokens are separated with a newline character). The
vocabulary file doesn't contain special tokens such as BOS (begin-of-sentence)
or EOS (end-of-sentence); a short loading sketch follows the listing below.

Here are the first 10 lines from the `rnn_translator/data/vocab.bpe.32000` file:
```
,
.
the
in
of
and
die
der
to
und
```
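
Since the vocabulary file contains no special tokens, an implementation
typically prepends them when building the token-to-index mapping. A minimal
loading sketch is shown below; the special-token names and their ordering are
illustrative assumptions, not taken from the reference code:

```
# Sketch: build a token-to-index mapping from vocab.bpe.32000.
# The special tokens and their positions are illustrative assumptions.
SPECIAL_TOKENS = ["<pad>", "<unk>", "<s>", "</s>"]  # PAD, UNK, BOS, EOS


def load_vocab(path):
    with open(path, encoding="utf-8") as f:
        tokens = [line.rstrip("\n") for line in f]
    itos = SPECIAL_TOKENS + tokens                 # index -> token
    stoi = {tok: i for i, tok in enumerate(itos)}  # token -> index
    return itos, stoi


itos, stoi = load_vocab("rnn_translator/data/vocab.bpe.32000")
print(len(itos), stoi["the"])
```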

### Text datasets
The `download_dataset.sh` script automatically creates the training, validation
and test datasets. Datasets are stored as plain text files. Sentences are
separated with a newline character, and tokens within each sentence are
separated with a single space character. A small sketch of reading the paired
files appears after the examples below.

Training data:
* source language (English): `rnn_translator/data/train.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/train.tok.clean.bpe.32000.de`

Validation data:
* source language (English): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.de`

Test data:
* source language (English): `rnn_translator/data/newstest2014.tok.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest2014.de`
* note that the `newstest2014.de` file isn't tokenized; BLEU evaluation is
performed by the sacrebleu package, which expects raw plain-text data
(tokenization is performed internally by sacrebleu)

Here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.en` file:
```
Res@@ um@@ ption of the session
I declare resumed the session of the European Parliament ad@@ jour@@ ned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fes@@ tive period .
Although , as you will have seen , the d@@ read@@ ed ' millenn@@ ium bug ' failed to materi@@ alise , still the people in a number of countries suffered a series of natural disasters that truly were d@@ read@@ ful .
You have requested a debate on this subject in the course of the next few days , during this part-session .
In the meantime , I should like to observe a minute ' s silence , as a number of Members have requested , on behalf of all the victims concerned , particularly those of the terrible stor@@ ms , in the various countries of the European Union .
```

And here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.de` file:
```
Wiederaufnahme der Sitzungsperiode
Ich erkläre die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene Sitzungsperiode des Europäischen Parlaments für wieder@@ aufgenommen , wünsche Ihnen nochmals alles Gute zum Jahres@@ wechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der ge@@ für@@ ch@@ tete " Mill@@ en@@ i@@ um-@@ Bu@@ g " nicht eingetreten . Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der St@@ ür@@ me , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schwei@@ ge@@ minute zu ge@@ denken .
```
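
Because the source and target files are aligned line by line, sentence pairs
can be recovered by reading the two files in lockstep. A minimal reading sketch
(file paths taken from the listings above):

```
# Sketch: iterate over aligned (English, German) sentence pairs.
# Tokens are already space-separated, so .split() recovers them.
def read_parallel(src_path, tgt_path):
    with open(src_path, encoding="utf-8") as fs, \
         open(tgt_path, encoding="utf-8") as ft:
        for src_line, tgt_line in zip(fs, ft):
            yield src_line.split(), tgt_line.split()


pairs = read_parallel(
    "rnn_translator/data/train.tok.clean.bpe.32000.en",
    "rnn_translator/data/train.tok.clean.bpe.32000.de",
)
src_tokens, tgt_tokens = next(pairs)
```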

### Training and test data separation
Training uses the WMT16 English-German dataset, validation is performed on the
concatenation of newstest2015 and newstest2016, and BLEU evaluation is done on
newstest2014.


### Data filtering
Training is executed only on pairs of sentences which satisfy the following condition:
```
min_len <= src sentence sequence length <= max_len AND
min_len <= tgt sentence sequence length <= max_len
```
`min_len` is set to 0, `max_len` is set to 50. Source and target sequence
lengths include special BOS (begin-of-sentence) and EOS (end-of-sentence)
tokens.

Filtering is implemented in `pytorch/seq2seq/data/dataset.py`, class
`LazyParallelDataset`.
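
The filtering condition amounts to a simple predicate over the two token counts
after BOS/EOS are added. The following is a sketch of the rule described above,
not the `LazyParallelDataset` code itself:

```
MIN_LEN, MAX_LEN = 0, 50  # values used by the reference config


def keep_pair(src_tokens, tgt_tokens):
    # Sequence lengths include the BOS and EOS tokens added to each side.
    src_len = len(src_tokens) + 2
    tgt_len = len(tgt_tokens) + 2
    return (MIN_LEN <= src_len <= MAX_LEN) and (MIN_LEN <= tgt_len <= MAX_LEN)


print(keep_pair("Res@@ um@@ ption of the session".split(),
                "Wiederaufnahme der Sitzungsperiode".split()))  # True
```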

### Training data order
By default training script does bucketing by sequence length. Before each epoch
dataset is randomly shuffled and split into chunks of 80 batches each. Within
each chunk it's sorted by (src + tgt) sequence length and then batches are
reshuffled within each chunk.
The training script does bucketing by sequence length. The bucketing algorithm
uses 5 equal-width buckets (`num_buckets = 5`). Pairs of training sentences are
assigned to buckets by the value of
`max(src_sentence_len // bucket_width, tgt_sentence_len // bucket_width)`, where
`bucket_width = (max_len + num_buckets - 1) // num_buckets`.
Before each training epoch, batches are randomly sampled from the buckets (the
last incomplete batch of each bucket is dropped), then all batches are
reshuffled.


Bucketing is implemented in `pytorch/seq2seq/data/sampler.py`, class
`BucketingSampler`.
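
The assignment rule and sampling procedure can be sketched as follows; this
illustrates the algorithm described above rather than reproducing
`BucketingSampler` (clamping of sentences whose length equals `max_len` is an
assumption):

```
import random


def assign_buckets(lengths, max_len=50, num_buckets=5):
    """lengths: list of (src_len, tgt_len) pairs, BOS/EOS included."""
    bucket_width = (max_len + num_buckets - 1) // num_buckets  # 10 here
    buckets = [[] for _ in range(num_buckets)]
    for idx, (src_len, tgt_len) in enumerate(lengths):
        b = max(src_len // bucket_width, tgt_len // bucket_width)
        b = min(b, num_buckets - 1)  # clamp lengths equal to max_len
        buckets[b].append(idx)
    return buckets


def sample_batches(buckets, batch_size=128):
    batches = []
    for bucket in buckets:
        random.shuffle(bucket)
        # drop the last incomplete batch of each bucket
        usable = len(bucket) - len(bucket) % batch_size
        for i in range(0, usable, batch_size):
            batches.append(bucket[i:i + batch_size])
    random.shuffle(batches)  # reshuffle all batches across buckets
    return batches
```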

# 4. Model
### Publication/Attribution
@@ -106,47 +218,91 @@ GNMT-like models from [tensorflow/nmt](https://github.com/tensorflow/nmt) and
* general:
* encoder and decoder are using shared embeddings
* data-parallel multi-gpu training
* dynamic loss scaling with backoff for Tensor Cores (mixed precision) training
* trained with label smoothing loss (smoothing factor 0.1)
* encoder:
* 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest are
unidirectional
* with residual connections starting from 3rd layer
* uses standard LSTM layer (accelerated by cudnn)
* 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest of the
layers are unidirectional
* with residual connections starting from the 3rd LSTM layer
* uses the standard pytorch nn.LSTM layer
* dropout is applied to the input of all LSTM layers, with dropout probability
set to 0.2
* hidden state of LSTM layers is initialized with zeros
* weights and biases of LSTM layers are initialized with the uniform(-0.1, 0.1)
distribution
* decoder:
* 4-layer unidirectional LSTM with hidden size 1024 and fully-connected
classifier
* with residual connections starting from 3rd layer
* uses standard LSTM layer (accelerated by cudnn)
* with residual connections starting from the 3rd LSTM layer
* uses the standard pytorch nn.LSTM layer
* dropout is applied to the input of all LSTM layers, with dropout probability
set to 0.2
* hidden state of LSTM layers is initialized with zeros
* weights and biases of LSTM layers are initialized with the uniform(-0.1, 0.1)
distribution
* weights and biases of the fully-connected classifier are initialized with the
uniform(-0.1, 0.1) distribution (a short initialization sketch follows the
implementation list below)
* attention:
* normalized Bahdanau attention
* model uses `gnmt_v2` attention mechanism
* output from the first LSTM layer of the decoder goes into the attention
module, then the re-weighted context is concatenated with the input to all
subsequent LSTM layers of the decoder at the current timestep
* the linear transform of keys and queries is initialized with
uniform(-0.1, 0.1), the normalization scalar is initialized to
1.0 / sqrt(1024), and the normalization bias is initialized to zero
* inference:
* beam search with default beam size 5
* with coverage penalty and length normalization
* beam search with beam size of 5
* with coverage penalty and length normalization: the coverage penalty factor
is set to 0.1, the length normalization factor to 0.6, and the length
normalization constant to 5.0 (a scoring sketch follows this list)
* BLEU computed by [sacrebleu](https://pypi.org/project/sacrebleu/)
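
A sketch of how a candidate score with length normalization and coverage
penalty can be computed from the hyperparameters above, following the
formulation of the GNMT paper (whether `SequenceGenerator` uses exactly this
form is not shown here):

```
import math

LEN_NORM_CONST = 5.0       # length normalization constant
LEN_NORM_FACTOR = 0.6      # length normalization factor (alpha)
COV_PENALTY_FACTOR = 0.1   # coverage penalty factor (beta)


def length_penalty(seq_len):
    # lp(Y) = ((c + |Y|) / (c + 1)) ** alpha
    return ((LEN_NORM_CONST + seq_len) / (LEN_NORM_CONST + 1.0)) ** LEN_NORM_FACTOR


def coverage_penalty(attn_sums):
    # attn_sums: per source position, attention mass summed over decoder steps
    return COV_PENALTY_FACTOR * sum(math.log(min(p, 1.0)) for p in attn_sums if p > 0)


def candidate_score(log_prob, seq_len, attn_sums):
    return log_prob / length_penalty(seq_len) + coverage_penalty(attn_sums)
```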


Implementation:
* base Seq2Seq model: `pytorch/seq2seq/models/seq2seq_base.py`, class `Seq2Seq`
* GNMT model: `pytorch/seq2seq/models/gnmt.py`, class `GNMT`
* encoder: `pytorch/seq2seq/models/encoder.py`, class `ResidualRecurrentEncoder`
* decoder: `pytorch/seq2seq/models/decoder.py`, class `ResidualRecurrentDecoder`
* attention: `pytorch/seq2seq/models/attention.py`, class `BahdanauAttention`
* inference (including BLEU evaluation and detokenization): `pytorch/seq2seq/inference/inference.py`, class `Translator`
* beam search: `pytorch/seq2seq/inference/beam_search.py`, class `SequenceGenerator`
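
The initialization scheme listed above (uniform(-0.1, 0.1) for LSTM and
classifier weights and biases) can be expressed in plain PyTorch roughly as
follows; this is a sketch of the described scheme, not the repository code:

```
import torch.nn as nn


def init_uniform(module, init_range=0.1):
    # Uniform(-0.1, 0.1) init for LSTM and Linear weights and biases.
    for m in module.modules():
        if isinstance(m, (nn.LSTM, nn.Linear)):
            for p in m.parameters():
                nn.init.uniform_(p, -init_range, init_range)


encoder_lstm = nn.LSTM(input_size=1024, hidden_size=1024, num_layers=1,
                       bidirectional=True)
classifier = nn.Linear(1024, 32317)  # vocab size taken from the example log
init_uniform(encoder_lstm)
init_uniform(classifier)
```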

### Loss function
Cross-entropy loss with label smoothing (smoothing factor = 0.1); padding is
not considered part of the loss.

Loss function is implemented in `pytorch/seq2seq/train/smoothing.py`, class
`LabelSmoothing`.
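
A common way to implement label-smoothed cross entropy that masks out padding
is sketched below; it illustrates the loss described above, but the
normalization details are assumptions and it is not the repository's
`LabelSmoothing` class:

```
import torch
import torch.nn as nn
import torch.nn.functional as F


class SmoothedCrossEntropy(nn.Module):
    """Cross entropy with label smoothing; padding positions are masked out."""

    def __init__(self, vocab_size, padding_idx, smoothing=0.1):
        super().__init__()
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        # smoothing mass spread over non-target, non-padding classes (a common choice)
        self.other = smoothing / (vocab_size - 2)

    def forward(self, logits, target):
        # logits: (N, vocab_size), target: (N,)
        log_probs = F.log_softmax(logits, dim=-1)
        true_dist = torch.full_like(log_probs, self.other)
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0.0
        non_pad = (target != self.padding_idx).float()
        loss = -(true_dist * log_probs).sum(dim=-1) * non_pad
        return loss.sum() / non_pad.sum().clamp(min=1.0)
```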

### Optimizer
Adam optimizer with learning rate 5e-4.
Adam optimizer with learning rate 1e-3, beta1 = 0.9, beta2 = 0.999,
epsilon = 1e-8 and no weight decay.
The network is trained with gradient clipping; the maximum L2 norm of the
gradients is set to 5.0.

Optimizer is implemented in `pytorch/seq2seq/train/fp_optimizers.py`, class
`Fp32Optimizer`.
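
In plain PyTorch the optimizer settings together with gradient clipping look
roughly like the sketch below (the model here is just a placeholder):

```
import torch

model = torch.nn.LSTM(1024, 1024)  # placeholder model for illustration
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3,
                             betas=(0.9, 0.999), eps=1e-8, weight_decay=0)
GRAD_CLIP = 5.0


def training_step(loss):
    optimizer.zero_grad()
    loss.backward()
    # clip the global L2 norm of all gradients to 5.0
    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
    optimizer.step()
```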

### Learning rate schedule
The model is trained with an exponential learning rate warmup for 200 steps,
followed by step learning rate decay. Decay starts after 2/3 of the training
steps and is applied 4 times, at regularly spaced intervals, with a decay
factor of 0.5.

Learning rate scheduler is implemented in
`pytorch/seq2seq/train/lr_scheduler.py`, class `WarmupMultiStepLR`.
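
The schedule can be viewed as a per-step multiplier on the base learning rate
of 1e-3. The sketch below illustrates the rule described above, not the
`WarmupMultiStepLR` code; the exact warmup curve is an assumption (the
reference only states that warmup is exponential):

```
import math


def lr_multiplier(step, total_steps, warmup_steps=200,
                  decay_start_frac=2.0 / 3.0, num_decays=4, decay_factor=0.5):
    """Multiplier applied to the base learning rate at a given step."""
    if step < warmup_steps:
        # exponential warmup from ~0.01x up to 1x (curve shape is assumed)
        start = 0.01
        return start * math.exp(step / warmup_steps * math.log(1.0 / start))
    decay_start = int(total_steps * decay_start_frac)
    if step < decay_start:
        return 1.0
    # 4 decays by 0.5 at regularly spaced intervals over the last third
    interval = max((total_steps - decay_start) // num_decays, 1)
    decays_done = min((step - decay_start) // interval + 1, num_decays)
    return decay_factor ** decays_done


# example: multipliers during warmup and near the end of training
print(lr_multiplier(100, total_steps=30000), lr_multiplier(29999, total_steps=30000))
```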

# 5. Quality

### Quality metric
BLEU score on newstest2014 dataset.
BLEU scores reported by [sacrebleu](https://pypi.org/project/sacrebleu/) package
Uncased BLEU score on the newstest2014 en-de dataset. BLEU scores are reported
by the [sacrebleu](https://pypi.org/project/sacrebleu/) package
(version 1.2.10). Sacrebleu is executed with the following flags:
`--score-only -lc --tokenize intl`.
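
Scoring can be reproduced by piping detokenized translations into the sacrebleu
CLI with exactly these flags. A small wrapper sketch (the hypotheses file path
is a placeholder; sacrebleu reads hypotheses from stdin and takes the raw
reference as a positional argument):

```
import subprocess


def uncased_bleu(hyp_path, ref_path="rnn_translator/data/newstest2014.de"):
    # equivalent of: cat hyp_path | sacrebleu ref_path --score-only -lc --tokenize intl
    with open(hyp_path, encoding="utf-8") as hyp:
        result = subprocess.run(
            ["sacrebleu", ref_path, "--score-only", "-lc", "--tokenize", "intl"],
            stdin=hyp, stdout=subprocess.PIPE, universal_newlines=True, check=True)
    return float(result.stdout.strip())
```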

### Quality target
Uncased BLEU score of 21.80.
Uncased BLEU score of 24.00.

### Evaluation frequency
Evaluation of BLEU score is done after every epoch.

### Evaluation thoroughness
Evaluation uses all of `newstest2014.en`.
Evaluation uses all of `newstest2014.en` (3003 sentences).
6 changes: 4 additions & 2 deletions rnn_translator/download_dataset.sh
@@ -16,6 +16,8 @@

set -e

export LANG=C.UTF-8
export LC_ALL=C.UTF-8

OUTPUT_DIR=${1:-"data"}
echo "Writing to ${OUTPUT_DIR}. To change this, set the OUTPUT_DIR environment variable."
@@ -136,8 +138,8 @@ cat "${OUTPUT_DIR}/newstest2015.tok.clean.de" \
> "${OUTPUT_DIR}/newstest_dev.tok.clean.de"

# Filter datasets
python3 pytorch/scripts/downloaders/filter_dataset.py -f1 ${OUTPUT_DIR}/train.tok.clean.en -f2 ${OUTPUT_DIR}/train.tok.clean.de
python3 pytorch/scripts/downloaders/filter_dataset.py -f1 ${OUTPUT_DIR}/newstest_dev.tok.clean.en -f2 ${OUTPUT_DIR}/newstest_dev.tok.clean.de
python3 pytorch/scripts/filter_dataset.py -f1 ${OUTPUT_DIR}/train.tok.clean.en -f2 ${OUTPUT_DIR}/train.tok.clean.de
python3 pytorch/scripts/filter_dataset.py -f1 ${OUTPUT_DIR}/newstest_dev.tok.clean.en -f2 ${OUTPUT_DIR}/newstest_dev.tok.clean.de

# Generate Subword Units (BPE)
# Clone Subword NMT
6 changes: 3 additions & 3 deletions rnn_translator/pytorch/Dockerfile
@@ -1,9 +1,9 @@
FROM pytorch/pytorch:0.4_cuda9_cudnn7
FROM pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime

ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8

ADD . /workspace/pytorch
RUN pip install -r /workspace/pytorch/requirements.txt

WORKDIR /workspace/pytorch

RUN pip install -r requirements.txt
