Update for GNMT reference (mlcommons#222)
* Update for GNMT reference

* Fixed weight init for non-shared embeddings (this doesn't affect the reference config)

* Better description of vocabulary and text datasets

* Added example logfile from GNMT training on reference config
szmigacz authored and nvpaulius committed Apr 9, 2019
1 parent d99c36d commit 69dbe8b
Showing 37 changed files with 2,854 additions and 1,461 deletions.
74 changes: 74 additions & 0 deletions compliance/example_logs/gnmt_example.log
@@ -0,0 +1,74 @@
:::MLPv0.5.0 gnmt 1550049265.393176317 (train.py:268) run_start
:::MLPv0.5.0 gnmt 1550049265.394439697 (seq2seq/utils.py:113) run_set_random_seed: 1
:::MLPv0.5.0 gnmt 1550049265.412598848 (train.py:312) preproc_tokenize_training
:::MLPv0.5.0 gnmt 1550049265.413046360 (train.py:314) train_hp_max_sequence_length: 50
:::MLPv0.5.0 gnmt 1550049274.456599236 (train.py:326) preproc_num_train_examples: 3498161
:::MLPv0.5.0 gnmt 1550049275.559447765 (train.py:336) preproc_tokenize_eval
:::MLPv0.5.0 gnmt 1550049275.597178459 (train.py:346) preproc_num_eval_examples: 3003
:::MLPv0.5.0 gnmt 1550049275.597592831 (train.py:350) preproc_vocab_size: 32317
:::MLPv0.5.0 gnmt 1550049275.598169804 (seq2seq/models/gnmt.py:34) model_hp_num_layers: 4
:::MLPv0.5.0 gnmt 1550049275.598643303 (seq2seq/models/gnmt.py:36) model_hp_hidden_size: 1024
:::MLPv0.5.0 gnmt 1550049275.599111080 (seq2seq/models/gnmt.py:38) model_hp_dropout: 0.2
:::MLPv0.5.0 gnmt 1550049279.603173733 (train.py:251) model_hp_loss_fn: "Cross Entropy with label smoothing"
:::MLPv0.5.0 gnmt 1550049279.603722811 (train.py:253) model_hp_loss_smoothing: 0.1
:::MLPv0.5.0 gnmt 1550049280.493244410 (train.py:393) input_batch_size: 128
:::MLPv0.5.0 gnmt 1550049280.493689060 (train.py:395) input_size: 3497728
:::MLPv0.5.0 gnmt 1550049280.494647741 (seq2seq/data/sampler.py:254) input_order
:::MLPv0.5.0 gnmt 1550049280.496007442 (seq2seq/data/sampler.py:254) input_order
:::MLPv0.5.0 gnmt 1550049280.496948242 (train.py:409) eval_size: 3003
:::MLPv0.5.0 gnmt 1550049280.497889280 (seq2seq/inference/beam_search.py:43) eval_hp_beam_size: 5
:::MLPv0.5.0 gnmt 1550049280.498333216 (seq2seq/inference/beam_search.py:45) eval_hp_max_sequence_length: 150
:::MLPv0.5.0 gnmt 1550049280.498714685 (seq2seq/inference/beam_search.py:47) eval_hp_length_normalization_constant: 5.0
:::MLPv0.5.0 gnmt 1550049280.499088049 (seq2seq/inference/beam_search.py:49) eval_hp_length_normalization_factor: 0.6
:::MLPv0.5.0 gnmt 1550049280.499464273 (seq2seq/inference/beam_search.py:51) eval_hp_coverage_penalty_factor: 0.1
:::MLPv0.5.0 gnmt 1550049283.242700577 (seq2seq/train/trainer.py:115) opt_name: "adam"
:::MLPv0.5.0 gnmt 1550049283.243218422 (seq2seq/train/trainer.py:117) opt_learning_rate: 0.001
:::MLPv0.5.0 gnmt 1550049283.243656635 (seq2seq/train/trainer.py:119) opt_hp_Adam_beta1: 0.9
:::MLPv0.5.0 gnmt 1550049283.244005442 (seq2seq/train/trainer.py:121) opt_hp_Adam_beta2: 0.999
:::MLPv0.5.0 gnmt 1550049283.244350195 (seq2seq/train/trainer.py:123) opt_hp_Adam_epsilon: 1e-08
:::MLPv0.5.0 gnmt 1550049283.245311022 (seq2seq/train/lr_scheduler.py:78) opt_learning_rate_warmup_steps: 200
:::MLPv0.5.0 gnmt 1550049283.245801687 (train.py:466) train_loop
:::MLPv0.5.0 gnmt 1550049283.246350050 (train.py:470) train_epoch: 0
:::MLPv0.5.0 gnmt 1550049284.053482771 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550061484.477649927 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550061486.181115627 (train.py:490) eval_start: 0
:::MLPv0.5.0 gnmt 1550061514.467880964 (train.py:495) eval_accuracy: {"epoch": 0, "value": 20.49}
:::MLPv0.5.0 gnmt 1550061514.468363047 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550061514.468725204 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550061514.469318390 (train.py:470) train_epoch: 1
:::MLPv0.5.0 gnmt 1550061515.311945915 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550073699.708187580 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550073705.950762510 (train.py:490) eval_start: 1
:::MLPv0.5.0 gnmt 1550073733.111917734 (train.py:495) eval_accuracy: {"epoch": 1, "value": 21.72}
:::MLPv0.5.0 gnmt 1550073733.112442732 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550073733.112777948 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550073733.113427401 (train.py:470) train_epoch: 2
:::MLPv0.5.0 gnmt 1550073733.976921797 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550085915.256659985 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550085921.003725052 (train.py:490) eval_start: 2
:::MLPv0.5.0 gnmt 1550085948.763688564 (train.py:495) eval_accuracy: {"epoch": 2, "value": 22.52}
:::MLPv0.5.0 gnmt 1550085948.764161110 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550085948.764480591 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550085948.765091658 (train.py:470) train_epoch: 3
:::MLPv0.5.0 gnmt 1550085949.625080585 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550098119.481947660 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550098125.219326258 (train.py:490) eval_start: 3
:::MLPv0.5.0 gnmt 1550098154.012802124 (train.py:495) eval_accuracy: {"epoch": 3, "value": 23.09}
:::MLPv0.5.0 gnmt 1550098154.013273716 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550098154.013695478 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550098154.014324665 (train.py:470) train_epoch: 4
:::MLPv0.5.0 gnmt 1550098154.874188900 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550110332.296045065 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550110338.542354584 (train.py:490) eval_start: 4
:::MLPv0.5.0 gnmt 1550110366.501193285 (train.py:495) eval_accuracy: {"epoch": 4, "value": 23.22}
:::MLPv0.5.0 gnmt 1550110366.501719475 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550110366.502229214 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550110366.502951622 (train.py:470) train_epoch: 5
:::MLPv0.5.0 gnmt 1550110367.328888416 (seq2seq/data/sampler.py:205) input_order
:::MLPv0.5.0 gnmt 1550122538.255764246 (train.py:483) train_checkpoint
:::MLPv0.5.0 gnmt 1550122544.367511034 (train.py:490) eval_start: 5
:::MLPv0.5.0 gnmt 1550122571.801926613 (train.py:495) eval_accuracy: {"epoch": 5, "value": 24.11}
:::MLPv0.5.0 gnmt 1550122571.802411556 (train.py:497) eval_target: 24.0
:::MLPv0.5.0 gnmt 1550122571.802731991 (train.py:498) eval_stop
:::MLPv0.5.0 gnmt 1550122571.803371668 (train.py:522) run_stop: {"success": true}
:::MLPv0.5.0 gnmt 1550122571.803844213 (train.py:523) run_final
192 changes: 174 additions & 18 deletions rnn_translator/README.md
@@ -2,8 +2,15 @@

This problem uses a recurrent neural network to perform language translation.

## Requirements
* [Python 3.6](https://www.python.org)
* [CUDA 9.0](https://developer.nvidia.com/cuda-90-download-archive)
* [PyTorch 1.0.1](https://pytorch.org)
* [sacrebleu](https://pypi.org/project/sacrebleu/)

### Recommended setup
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime container](https://hub.docker.com/r/pytorch/pytorch/tags/)

# 2. Directions
### Steps to configure machine
@@ -63,6 +70,21 @@ Verify data with:

bash verify_dataset.sh

### Steps specific to the pytorch version to run and time

cd pytorch
sudo docker build . --rm -t gnmt:latest
SEED=1
NOW=`date "+%F-%T"`
sudo nvidia-docker run -it --rm --ipc=host \
-v $(pwd)/../data:/data \
gnmt:latest "./run_and_time.sh" $SEED |tee benchmark-$NOW.log

### One can control which GPUs are used with the `NV_GPU` variable
sudo NV_GPU=0 nvidia-docker run -it --rm --ipc=host \
-v $(pwd)/../data:/data \
gnmt:latest "./run_and_time.sh" $SEED |tee benchmark-$NOW.log

# 3. Dataset/Environment
### Publication/Attribution
We use [WMT16 English-German](http://www.statmt.org/wmt16/translation-task.html)
@@ -75,15 +97,105 @@ segment text into subword units (BPE), by default it builds shared vocabulary of
Preprocessing removes all pairs of sentences that can't be decoded with the
Latin-1 encoding.

### Vocabulary
The vocabulary is generated by the following lines from the `download_dataset.sh`
script:

```
# Create vocabulary file for BPE
cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" | \
${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
```

The vocabulary is stored in the `rnn_translator/data/vocab.bpe.32000` plain text
file, one token per line (tokens are separated with a newline character). The
vocabulary file doesn't contain special tokens such as BOS (begin-of-sentence)
or EOS (end-of-sentence); a short loading sketch follows the listing below.

Here are the first 10 lines from the `rnn_translator/data/vocab.bpe.32000` file:
```
,
.
the
in
of
and
die
der
to
und
```
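
Since the vocabulary file contains no special tokens, an implementation
typically prepends them when building the token-to-index mapping. A minimal
loading sketch is shown below; the special-token names and their ordering are
illustrative assumptions, not taken from the reference code:

```
# Sketch: build a token-to-index mapping from vocab.bpe.32000.
# The special tokens and their positions are illustrative assumptions.
SPECIAL_TOKENS = ["<pad>", "<unk>", "<s>", "</s>"]  # PAD, UNK, BOS, EOS


def load_vocab(path):
    with open(path, encoding="utf-8") as f:
        tokens = [line.rstrip("\n") for line in f]
    itos = SPECIAL_TOKENS + tokens                 # index -> token
    stoi = {tok: i for i, tok in enumerate(itos)}  # token -> index
    return itos, stoi


itos, stoi = load_vocab("rnn_translator/data/vocab.bpe.32000")
print(len(itos), stoi["the"])
```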

### Text datasets
The `download_dataset.sh` script automatically creates the training, validation
and test datasets. Datasets are stored as plain text files. Sentences are
separated with a newline character, and tokens within each sentence are
separated with a single space character. A small sketch of reading the paired
files appears after the examples below.

Training data:
* source language (English): `rnn_translator/data/train.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/train.tok.clean.bpe.32000.de`

Validation data:
* source language (English): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.de`

Test data:
* source language (English): `rnn_translator/data/newstest2014.tok.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest2014.de`
* note that the `newstest2014.de` file isn't tokenized; BLEU evaluation is
performed by the sacrebleu package, which expects raw plain-text data
(tokenization is performed internally by sacrebleu)

Here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.en` file:
```
Res@@ um@@ ption of the session
I declare resumed the session of the European Parliament ad@@ jour@@ ned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fes@@ tive period .
Although , as you will have seen , the d@@ read@@ ed ' millenn@@ ium bug ' failed to materi@@ alise , still the people in a number of countries suffered a series of natural disasters that truly were d@@ read@@ ful .
You have requested a debate on this subject in the course of the next few days , during this part-session .
In the meantime , I should like to observe a minute ' s silence , as a number of Members have requested , on behalf of all the victims concerned , particularly those of the terrible stor@@ ms , in the various countries of the European Union .
```

And here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.de` file:
```
Wiederaufnahme der Sitzungsperiode
Ich erkläre die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene Sitzungsperiode des Europäischen Parlaments für wieder@@ aufgenommen , wünsche Ihnen nochmals alles Gute zum Jahres@@ wechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der ge@@ für@@ ch@@ tete " Mill@@ en@@ i@@ um-@@ Bu@@ g " nicht eingetreten . Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der St@@ ür@@ me , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schwei@@ ge@@ minute zu ge@@ denken .
```
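
Because the source and target files are aligned line by line, sentence pairs
can be recovered by reading the two files in lockstep. A minimal reading sketch
(file paths taken from the listings above):

```
# Sketch: iterate over aligned (English, German) sentence pairs.
# Tokens are already space-separated, so .split() recovers them.
def read_parallel(src_path, tgt_path):
    with open(src_path, encoding="utf-8") as fs, \
         open(tgt_path, encoding="utf-8") as ft:
        for src_line, tgt_line in zip(fs, ft):
            yield src_line.split(), tgt_line.split()


pairs = read_parallel(
    "rnn_translator/data/train.tok.clean.bpe.32000.en",
    "rnn_translator/data/train.tok.clean.bpe.32000.de",
)
src_tokens, tgt_tokens = next(pairs)
```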

### Training and test data separation
Training uses the WMT16 English-German dataset, validation is performed on the
concatenation of newstest2015 and newstest2016, and BLEU evaluation is done on
newstest2014.


### Data filtering
Training is executed only on pairs of sentences which satisfy the following condition:
```
min_len <= src sentence sequence length <= max_len AND
min_len <= tgt sentence sequence length <= max_len
```
`min_len` is set to 0, `max_len` is set to 50. Source and target sequence
lengths include special BOS (begin-of-sentence) and EOS (end-of-sentence)
tokens.

Filtering is implemented in `pytorch/seq2seq/data/dataset.py`, class
`LazyParallelDataset`.
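
The filtering condition amounts to a simple predicate over the two token counts
after BOS/EOS are added. The following is a sketch of the rule described above,
not the `LazyParallelDataset` code itself:

```
MIN_LEN, MAX_LEN = 0, 50  # values used by the reference config


def keep_pair(src_tokens, tgt_tokens):
    # Sequence lengths include the BOS and EOS tokens added to each side.
    src_len = len(src_tokens) + 2
    tgt_len = len(tgt_tokens) + 2
    return (MIN_LEN <= src_len <= MAX_LEN) and (MIN_LEN <= tgt_len <= MAX_LEN)


print(keep_pair("Res@@ um@@ ption of the session".split(),
                "Wiederaufnahme der Sitzungsperiode".split()))  # True
```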

### Training data order
By default training script does bucketing by sequence length. Before each epoch
dataset is randomly shuffled and split into chunks of 80 batches each. Within
each chunk it's sorted by (src + tgt) sequence length and then batches are
reshuffled within each chunk.
The training script does bucketing by sequence length. The bucketing algorithm
uses 5 equal-width buckets (`num_buckets = 5`). Pairs of training sentences are
assigned to buckets by the value of
`max(src_sentence_len // bucket_width, tgt_sentence_len // bucket_width)`, where
`bucket_width = (max_len + num_buckets - 1) // num_buckets`.
Before each training epoch, batches are randomly sampled from the buckets (the
last incomplete batch of each bucket is dropped), then all batches are
reshuffled.


Bucketing is implemented in `pytorch/seq2seq/data/sampler.py`, class
`BucketingSampler`.
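
The assignment rule and sampling procedure can be sketched as follows; this
illustrates the algorithm described above rather than reproducing
`BucketingSampler` (clamping of sentences whose length equals `max_len` is an
assumption):

```
import random


def assign_buckets(lengths, max_len=50, num_buckets=5):
    """lengths: list of (src_len, tgt_len) pairs, BOS/EOS included."""
    bucket_width = (max_len + num_buckets - 1) // num_buckets  # 10 here
    buckets = [[] for _ in range(num_buckets)]
    for idx, (src_len, tgt_len) in enumerate(lengths):
        b = max(src_len // bucket_width, tgt_len // bucket_width)
        b = min(b, num_buckets - 1)  # clamp lengths equal to max_len
        buckets[b].append(idx)
    return buckets


def sample_batches(buckets, batch_size=128):
    batches = []
    for bucket in buckets:
        random.shuffle(bucket)
        # drop the last incomplete batch of each bucket
        usable = len(bucket) - len(bucket) % batch_size
        for i in range(0, usable, batch_size):
            batches.append(bucket[i:i + batch_size])
    random.shuffle(batches)  # reshuffle all batches across buckets
    return batches
```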

# 4. Model
### Publication/Attribution
@@ -106,47 +218,91 @@ GNMT-like models from [tensorflow/nmt](https://github.com/tensorflow/nmt) and
* general:
* encoder and decoder are using shared embeddings
* data-parallel multi-gpu training
* dynamic loss scaling with backoff for Tensor Cores (mixed precision) training
* trained with label smoothing loss (smoothing factor 0.1)
* encoder:
* 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest are
unidirectional
* with residual connections starting from 3rd layer
* uses standard LSTM layer (accelerated by cudnn)
* 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest of the
layers are unidirectional
* with residual connections starting from the 3rd LSTM layer
* uses the standard pytorch nn.LSTM layer
* dropout is applied to the input of all LSTM layers, with dropout probability
set to 0.2
* hidden state of LSTM layers is initialized with zeros
* weights and biases of LSTM layers are initialized with the uniform(-0.1, 0.1)
distribution
* decoder:
* 4-layer unidirectional LSTM with hidden size 1024 and fully-connected
classifier
* with residual connections starting from 3rd layer
* uses standard LSTM layer (accelerated by cudnn)
* with residual connections starting from the 3rd LSTM layer
* uses the standard pytorch nn.LSTM layer
* dropout is applied to the input of all LSTM layers, with dropout probability
set to 0.2
* hidden state of LSTM layers is initialized with zeros
* weights and biases of LSTM layers are initialized with the uniform(-0.1, 0.1)
distribution
* weights and biases of the fully-connected classifier are initialized with the
uniform(-0.1, 0.1) distribution (a short initialization sketch follows the
implementation list below)
* attention:
* normalized Bahdanau attention
* model uses `gnmt_v2` attention mechanism
* output from the first LSTM layer of the decoder goes into the attention
module, then the re-weighted context is concatenated with the input to all
subsequent LSTM layers of the decoder at the current timestep
* the linear transform of keys and queries is initialized with
uniform(-0.1, 0.1), the normalization scalar is initialized to
1.0 / sqrt(1024), and the normalization bias is initialized to zero
* inference:
* beam search with default beam size 5
* with coverage penalty and length normalization
* beam search with beam size of 5
* with coverage penalty and length normalization: the coverage penalty factor
is set to 0.1, the length normalization factor to 0.6, and the length
normalization constant to 5.0 (a scoring sketch follows this list)
* BLEU computed by [sacrebleu](https://pypi.org/project/sacrebleu/)
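
A sketch of how a candidate score with length normalization and coverage
penalty can be computed from the hyperparameters above, following the
formulation of the GNMT paper (whether `SequenceGenerator` uses exactly this
form is not shown here):

```
import math

LEN_NORM_CONST = 5.0       # length normalization constant
LEN_NORM_FACTOR = 0.6      # length normalization factor (alpha)
COV_PENALTY_FACTOR = 0.1   # coverage penalty factor (beta)


def length_penalty(seq_len):
    # lp(Y) = ((c + |Y|) / (c + 1)) ** alpha
    return ((LEN_NORM_CONST + seq_len) / (LEN_NORM_CONST + 1.0)) ** LEN_NORM_FACTOR


def coverage_penalty(attn_sums):
    # attn_sums: per source position, attention mass summed over decoder steps
    return COV_PENALTY_FACTOR * sum(math.log(min(p, 1.0)) for p in attn_sums if p > 0)


def candidate_score(log_prob, seq_len, attn_sums):
    return log_prob / length_penalty(seq_len) + coverage_penalty(attn_sums)
```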


Implementation:
* base Seq2Seq model: `pytorch/seq2seq/models/seq2seq_base.py`, class `Seq2Seq`
* GNMT model: `pytorch/seq2seq/models/gnmt.py`, class `GNMT`
* encoder: `pytorch/seq2seq/models/encoder.py`, class `ResidualRecurrentEncoder`
* decoder: `pytorch/seq2seq/models/decoder.py`, class `ResidualRecurrentDecoder`
* attention: `pytorch/seq2seq/models/attention.py`, class `BahdanauAttention`
* inference (including BLEU evaluation and detokenization): `pytorch/seq2seq/inference/inference.py`, class `Translator`
* beam search: `pytorch/seq2seq/inference/beam_search.py`, class `SequenceGenerator`
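
The initialization scheme listed above (uniform(-0.1, 0.1) for LSTM and
classifier weights and biases) can be expressed in plain PyTorch roughly as
follows; this is a sketch of the described scheme, not the repository code:

```
import torch.nn as nn


def init_uniform(module, init_range=0.1):
    # Uniform(-0.1, 0.1) init for LSTM and Linear weights and biases.
    for m in module.modules():
        if isinstance(m, (nn.LSTM, nn.Linear)):
            for p in m.parameters():
                nn.init.uniform_(p, -init_range, init_range)


encoder_lstm = nn.LSTM(input_size=1024, hidden_size=1024, num_layers=1,
                       bidirectional=True)
classifier = nn.Linear(1024, 32317)  # vocab size taken from the example log
init_uniform(encoder_lstm)
init_uniform(classifier)
```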

### Loss function
Cross-entropy loss with label smoothing (smoothing factor = 0.1); padding is
not considered part of the loss.

Loss function is implemented in `pytorch/seq2seq/train/smoothing.py`, class
`LabelSmoothing`.
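
A common way to implement label-smoothed cross entropy that masks out padding
is sketched below; it illustrates the loss described above, but the
normalization details are assumptions and it is not the repository's
`LabelSmoothing` class:

```
import torch
import torch.nn as nn
import torch.nn.functional as F


class SmoothedCrossEntropy(nn.Module):
    """Cross entropy with label smoothing; padding positions are masked out."""

    def __init__(self, vocab_size, padding_idx, smoothing=0.1):
        super().__init__()
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        # smoothing mass spread over non-target, non-padding classes (a common choice)
        self.other = smoothing / (vocab_size - 2)

    def forward(self, logits, target):
        # logits: (N, vocab_size), target: (N,)
        log_probs = F.log_softmax(logits, dim=-1)
        true_dist = torch.full_like(log_probs, self.other)
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0.0
        non_pad = (target != self.padding_idx).float()
        loss = -(true_dist * log_probs).sum(dim=-1) * non_pad
        return loss.sum() / non_pad.sum().clamp(min=1.0)
```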

### Optimizer
Adam optimizer with learning rate 5e-4.
Adam optimizer with learning rate 1e-3, beta1 = 0.9, beta2 = 0.999,
epsilon = 1e-8 and no weight decay.
The network is trained with gradient clipping; the maximum L2 norm of the
gradients is set to 5.0.

Optimizer is implemented in `pytorch/seq2seq/train/fp_optimizers.py`, class
`Fp32Optimizer`.
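
In plain PyTorch the optimizer settings together with gradient clipping look
roughly like the sketch below (the model here is just a placeholder):

```
import torch

model = torch.nn.LSTM(1024, 1024)  # placeholder model for illustration
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3,
                             betas=(0.9, 0.999), eps=1e-8, weight_decay=0)
GRAD_CLIP = 5.0


def training_step(loss):
    optimizer.zero_grad()
    loss.backward()
    # clip the global L2 norm of all gradients to 5.0
    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
    optimizer.step()
```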

### Learning rate schedule
The model is trained with an exponential learning rate warmup for 200 steps,
followed by step learning rate decay. Decay starts after 2/3 of the training
steps and is applied 4 times, at regularly spaced intervals, with a decay
factor of 0.5.

Learning rate scheduler is implemented in
`pytorch/seq2seq/train/lr_scheduler.py`, class `WarmupMultiStepLR`.
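
The schedule can be viewed as a per-step multiplier on the base learning rate
of 1e-3. The sketch below illustrates the rule described above, not the
`WarmupMultiStepLR` code; the exact warmup curve is an assumption (the
reference only states that warmup is exponential):

```
import math


def lr_multiplier(step, total_steps, warmup_steps=200,
                  decay_start_frac=2.0 / 3.0, num_decays=4, decay_factor=0.5):
    """Multiplier applied to the base learning rate at a given step."""
    if step < warmup_steps:
        # exponential warmup from ~0.01x up to 1x (curve shape is assumed)
        start = 0.01
        return start * math.exp(step / warmup_steps * math.log(1.0 / start))
    decay_start = int(total_steps * decay_start_frac)
    if step < decay_start:
        return 1.0
    # 4 decays by 0.5 at regularly spaced intervals over the last third
    interval = max((total_steps - decay_start) // num_decays, 1)
    decays_done = min((step - decay_start) // interval + 1, num_decays)
    return decay_factor ** decays_done


# example: multipliers during warmup and near the end of training
print(lr_multiplier(100, total_steps=30000), lr_multiplier(29999, total_steps=30000))
```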

# 5. Quality

### Quality metric
BLEU score on newstest2014 dataset.
BLEU scores reported by [sacrebleu](https://pypi.org/project/sacrebleu/) package
Uncased BLEU score on the newstest2014 en-de dataset. BLEU scores are reported
by the [sacrebleu](https://pypi.org/project/sacrebleu/) package
(version 1.2.10). Sacrebleu is executed with the following flags:
`--score-only -lc --tokenize intl`.
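
Scoring can be reproduced by piping detokenized translations into the sacrebleu
CLI with exactly these flags. A small wrapper sketch (the hypotheses file path
is a placeholder; sacrebleu reads hypotheses from stdin and takes the raw
reference as a positional argument):

```
import subprocess


def uncased_bleu(hyp_path, ref_path="rnn_translator/data/newstest2014.de"):
    # equivalent of: cat hyp_path | sacrebleu ref_path --score-only -lc --tokenize intl
    with open(hyp_path, encoding="utf-8") as hyp:
        result = subprocess.run(
            ["sacrebleu", ref_path, "--score-only", "-lc", "--tokenize", "intl"],
            stdin=hyp, stdout=subprocess.PIPE, universal_newlines=True, check=True)
    return float(result.stdout.strip())
```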

### Quality target
Uncased BLEU score of 21.80.
Uncased BLEU score of 24.00.

### Evaluation frequency
Evaluation of BLEU score is done after every epoch.

### Evaluation thoroughness
Evaluation uses all of `newstest2014.en`.
Evaluation uses all of `newstest2014.en` (3003 sentences).
6 changes: 4 additions & 2 deletions rnn_translator/download_dataset.sh
@@ -16,6 +16,8 @@

set -e

export LANG=C.UTF-8
export LC_ALL=C.UTF-8

OUTPUT_DIR=${1:-"data"}
echo "Writing to ${OUTPUT_DIR}. To change this, set the OUTPUT_DIR environment variable."
@@ -136,8 +138,8 @@ cat "${OUTPUT_DIR}/newstest2015.tok.clean.de" \
> "${OUTPUT_DIR}/newstest_dev.tok.clean.de"

# Filter datasets
python3 pytorch/scripts/downloaders/filter_dataset.py -f1 ${OUTPUT_DIR}/train.tok.clean.en -f2 ${OUTPUT_DIR}/train.tok.clean.de
python3 pytorch/scripts/downloaders/filter_dataset.py -f1 ${OUTPUT_DIR}/newstest_dev.tok.clean.en -f2 ${OUTPUT_DIR}/newstest_dev.tok.clean.de
python3 pytorch/scripts/filter_dataset.py -f1 ${OUTPUT_DIR}/train.tok.clean.en -f2 ${OUTPUT_DIR}/train.tok.clean.de
python3 pytorch/scripts/filter_dataset.py -f1 ${OUTPUT_DIR}/newstest_dev.tok.clean.en -f2 ${OUTPUT_DIR}/newstest_dev.tok.clean.de

# Generate Subword Units (BPE)
# Clone Subword NMT
6 changes: 3 additions & 3 deletions rnn_translator/pytorch/Dockerfile
@@ -1,9 +1,9 @@
FROM pytorch/pytorch:0.4_cuda9_cudnn7
FROM pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime

ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8

ADD . /workspace/pytorch
RUN pip install -r /workspace/pytorch/requirements.txt

WORKDIR /workspace/pytorch

RUN pip install -r requirements.txt
