seq2seq model

wb14123 · Aug 1, 2017 · e281afb · e281afb
commit e281afb
Show file tree

Hide file tree

Showing 6 changed files with 549 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+*.pyc
+data/
+output/
+output_*/
+tmp/
diff --git a/bleu.py b/bleu.py
@@ -0,0 +1,112 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Python implementation of BLEU and smooth-BLEU.
+
+This module provides a Python implementation of BLEU and smooth-BLEU.
+Smooth BLEU is computed following the method outlined in the paper:
+Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+evaluation metrics for machine translation. COLING 2004.
+"""
+
+import collections
+import math
+
+
def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: sequence of tokens from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        function.

  Returns:
    A Counter keyed by n-gram (as a tuple of tokens) containing all n-grams
    of length 1..max_order in segment, with a count of how many times each
    n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    # A segment shorter than `order` yields an empty range, i.e. no n-grams.
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i+order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    6-tuple `(bleu, precisions, bp, ratio, translation_length,
    reference_length)` where `precisions` is the list of per-order n-gram
    precisions, `bp` is the brevity penalty, `ratio` is
    translation_length / reference_length, and the two lengths are the total
    token counts accumulated over the corpus (using the shortest reference
    of each segment).

    Degenerate inputs no longer raise ZeroDivisionError: an empty set of
    translations gives `ratio == 0.0` and `bp == 0.0` (the limit of the
    brevity penalty), and a zero reference length with non-empty
    translations gives `ratio == inf` and `bp == 1.0`.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    # Union of Counters keeps, per n-gram, the max count over all references,
    # which is the clipping bound for the translation's n-gram counts.
    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    # Intersection of Counters takes the min count: the clipped matches.
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  # The geometric mean is only defined when every precision is positive;
  # max_order <= 0 leaves `precisions` empty and also scores 0.
  if precisions and min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  # Guard the length ratio against a zero reference length.
  if reference_length > 0:
    ratio = float(translation_length) / reference_length
  elif translation_length > 0:
    ratio = float('inf')
  else:
    ratio = 0.0

  if ratio > 1.0:
    bp = 1.
  elif ratio > 0:
    bp = math.exp(1 - 1. / ratio)
  else:
    # exp(1 - 1/ratio) -> 0 as ratio -> 0+; avoids dividing by zero when
    # every translation is empty.
    bp = 0.

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)
diff --git a/model.py b/model.py
@@ -0,0 +1,170 @@
+
+import tensorflow as tf
+import seq2seq
+import bleu
+import reader
+from os import path
+import random
+
+
class Model():
    """Seq2seq training/evaluation driver built on two separate TF graphs.

    One graph/session pair is used for training and another for evaluation;
    the evaluation session is refreshed from the checkpoint file written by
    the training session. Data is pulled from `reader.SeqReader` instances
    and scalar/histogram summaries are written to `output_dir`.
    """

    def __init__(self, train_input_file, train_target_file,
            test_input_file, test_target_file, vocab_file,
            num_units, layers, dropout,
            batch_size, learning_rate, output_dir,
            save_step=100, eval_step=1000,
            param_histogram=False, restore_model=False):
        """Creates the data readers and builds the train and eval graphs.

        Args:
            train_input_file: path handed to the training SeqReader as input.
            train_target_file: path handed to the training SeqReader as target.
            test_input_file: path handed to the eval SeqReader as input.
            test_target_file: path handed to the eval SeqReader as target.
            vocab_file: vocabulary path shared by both readers.
            num_units: forwarded to `seq2seq.seq2seq`.
            layers: forwarded to `seq2seq.seq2seq`.
            dropout: forwarded to `seq2seq.seq2seq` for both graphs.
            batch_size: fixed batch dimension of all placeholders.
            learning_rate: learning rate of the Adam optimizer.
            output_dir: directory for the checkpoint and summary files.
            save_step: a checkpoint is written every `save_step` steps.
            eval_step: evaluation runs every `eval_step` steps.
            param_histogram: if true, add histogram summaries for every
                trainable variable.
            restore_model: if true, `train` resumes from an existing
                checkpoint instead of starting from fresh variables.
        """
        self.num_units = num_units
        self.layers = layers
        self.dropout = dropout
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.save_step = save_step
        self.eval_step = eval_step
        self.param_histogram = param_histogram
        self.restore_model = restore_model

        self.train_reader = reader.SeqReader(train_input_file,
                train_target_file, vocab_file, batch_size)
        self.eval_reader = reader.SeqReader(test_input_file, test_target_file,
                vocab_file, batch_size)
        self.train_reader.start()
        self.eval_reader.start()
        # `read()` results are consumed with next(); presumably endless
        # generators of batch dicts -- TODO confirm against reader.SeqReader.
        self.train_data = self.train_reader.read()
        self.eval_data = self.eval_reader.read()

        # NOTE: the '.ckpl' file name is kept as-is so checkpoints written by
        # earlier runs remain loadable.
        self.model_file = path.join(output_dir, 'model.ckpl')
        self.log_writter = tf.summary.FileWriter(output_dir)

        self._init_train()
        self._init_eval()


    def gpu_session_config(self):
        """Returns a session config that lets GPU memory grow on demand."""
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        return config


    def _init_train(self):
        """Builds the training graph, its ops, saver and session."""
        self.train_graph = tf.Graph()
        with self.train_graph.as_default():
            self.train_in_seq = tf.placeholder(tf.int32, shape=[self.batch_size, None])
            self.train_in_seq_len = tf.placeholder(tf.int32, shape=[self.batch_size])
            self.train_target_seq = tf.placeholder(tf.int32, shape=[self.batch_size, None])
            self.train_target_seq_len = tf.placeholder(tf.int32, shape=[self.batch_size])
            output = seq2seq.seq2seq(self.train_in_seq, self.train_in_seq_len,
                    self.train_target_seq, self.train_target_seq_len,
                    len(self.train_reader.vocabs),
                    self.num_units, self.layers, self.dropout)
            # Most likely token id along axis 2 (presumably the vocab axis of
            # per-step logits -- TODO confirm against seq2seq.seq2seq).
            self.train_output = tf.argmax(tf.nn.softmax(output), 2)
            self.loss = seq2seq.seq_loss(output, self.train_target_seq,
                    self.train_target_seq_len)
            self.train_op = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)
            if self.param_histogram:
                for v in tf.trainable_variables():
                    tf.summary.histogram('train_' + v.name, v)
            tf.summary.scalar('loss', self.loss)
            self.train_summary = tf.summary.merge_all()
            self.train_init = tf.global_variables_initializer()
            # Created after minimize() so the optimizer's variables are saved
            # and restored together with the model weights.
            self.train_saver = tf.train.Saver()
        self.train_session = tf.Session(graph=self.train_graph,
                config=self.gpu_session_config())


    def _init_eval(self):
        """Builds the evaluation (inference) graph, its saver and session."""
        self.eval_graph = tf.Graph()
        with self.eval_graph.as_default():
            self.eval_in_seq = tf.placeholder(tf.int32, shape=[self.batch_size, None])
            self.eval_in_seq_len = tf.placeholder(tf.int32, shape=[self.batch_size])
            # No target is passed here; the result is decoded directly with
            # reader.decode_text in eval(), so it is presumably token ids.
            # NOTE(review): the training dropout value is also passed to the
            # eval graph -- confirm seq2seq.seq2seq disables it for inference.
            self.eval_output = seq2seq.seq2seq(self.eval_in_seq,
                    self.eval_in_seq_len, None, None,
                    len(self.eval_reader.vocabs),
                    self.num_units, self.layers, self.dropout)
            if self.param_histogram:
                for v in tf.trainable_variables():
                    tf.summary.histogram('eval_' + v.name, v)
            # merge_all() returns None when no summary was registered, which
            # is the case whenever param_histogram is false.
            self.eval_summary = tf.summary.merge_all()
            self.eval_saver = tf.train.Saver()
        self.eval_session = tf.Session(graph=self.eval_graph,
                config=self.gpu_session_config())


    def train(self, epochs):
        """Runs the training loop.

        Args:
            epochs: number of training iterations; each iteration consumes
                one batch from the training reader (so this counts steps,
                not passes over the data set).
        """
        with self.train_graph.as_default():
            if self.restore_model and path.isfile(self.model_file + '.meta'):
                print("Reloading model file before training.")
                self.train_saver.restore(self.train_session, self.model_file)
            else:
                # Only initialise when nothing was restored: running the
                # initializer after restore() would overwrite the restored
                # weights with fresh values.
                self.train_session.run(self.train_init)
            total_loss = 0
            loss_steps = 0  # steps accumulated in total_loss since last reset
            for step in range(0, epochs):
                data = next(self.train_data)
                in_seq = data['in_seq']
                in_seq_len = data['in_seq_len']
                target_seq = data['target_seq']
                target_seq_len = data['target_seq_len']
                output, loss, _, summary = self.train_session.run(
                        [self.train_output, self.loss, self.train_op, self.train_summary],
                        feed_dict={
                            self.train_in_seq: in_seq,
                            self.train_in_seq_len: in_seq_len,
                            self.train_target_seq: target_seq,
                            self.train_target_seq_len: target_seq_len})
                total_loss += loss
                loss_steps += 1
                self.log_writter.add_summary(summary, step)
                if step % self.save_step == 0:
                    self.train_saver.save(self.train_session, self.model_file)
                    # Average over the steps actually accumulated, so the
                    # value printed at step 0 is not divided by save_step.
                    print("Saving model. Step: %d, loss: %f" % (step,
                        total_loss / loss_steps))
                    # print sample output
                    sid = random.randint(0, self.batch_size-1)
                    output_text = reader.decode_text(output[sid],
                            self.train_reader.vocabs)
                    target_text = reader.decode_text(target_seq[sid],
                            self.train_reader.vocabs)
                    print('******************************')
                    print(output_text)
                    print(target_text)
                    total_loss = 0
                    loss_steps = 0
                if step % self.eval_step == 0:
                    bleu_score = self.eval(step)
                    # Arguments follow the order of the format string: the
                    # current batch loss first, then the BLEU score.
                    print("Evaluate model. Step: %d, loss: %f, score: %f" % (
                        step, loss, bleu_score))
                    eval_summary = tf.Summary(value=[tf.Summary.Value(
                        tag='bleu', simple_value=bleu_score)])
                    self.log_writter.add_summary(eval_summary, step)


    def eval(self, train_step):
        """Evaluates the latest checkpoint and returns the mean BLEU score.

        The evaluation session is restored from `self.model_file`, so a
        checkpoint must already have been written by `train`.

        Args:
            train_step: training step used to tag the histogram summary.

        Returns:
            Sum of per-sentence BLEU scores (scaled by 100) divided by
            `eval_reader.data_size * batch_size`.
        """
        with self.eval_graph.as_default():
            self.eval_saver.restore(self.eval_session, self.model_file)
            bleu_score = 0
            has_summary = self.eval_summary is not None
            fetches = [self.eval_output]
            if has_summary:
                # Fetching None raises in Session.run, so the summary op is
                # only requested when the graph actually has one.
                fetches.append(self.eval_summary)
            # NOTE(review): one batch is consumed per iteration, so data_size
            # is presumably a batch count -- confirm against SeqReader.
            for step in range(0, self.eval_reader.data_size):
                data = next(self.eval_data)
                in_seq = data['in_seq']
                in_seq_len = data['in_seq_len']
                target_seq = data['target_seq']
                results = self.eval_session.run(
                        fetches,
                        feed_dict={
                            self.eval_in_seq: in_seq,
                            self.eval_in_seq_len: in_seq_len})
                outputs = results[0]
                if has_summary and step == 0: # draw histogram summary once only
                    self.log_writter.add_summary(results[1], train_step)
                for output, target in zip(outputs, target_seq):
                    output_text = reader.decode_text(output,
                            self.eval_reader.vocabs).split(' ')
                    target_text = reader.decode_text(target,
                            self.eval_reader.vocabs).split(' ')
                    # Print roughly one in twenty sentence pairs as a sample.
                    if random.randint(1, 20) == 1:
                        print('====================')
                        print(output_text, target_text)
                    # compute_bleu(reference_corpus, translation_corpus): the
                    # ground-truth target is the reference and the model
                    # output is the translation being scored.
                    bleu_score += bleu.compute_bleu([[target_text]], [output_text])[0] * 100
            return bleu_score / self.eval_reader.data_size / self.batch_size
+
diff --git a/nmt.py b/nmt.py
@@ -0,0 +1,14 @@
+
+from model import Model
+
# Build the model from the WMT-2016 files: the `.en` files are passed as the
# input side, the `.de` files as the target side, plus the shared vocabulary.
m = Model(
    train_input_file='/data/dl-data/wmt-2016/train.tok.clean.bpe.32000.en',
    train_target_file='/data/dl-data/wmt-2016/train.tok.clean.bpe.32000.de',
    test_input_file='/data/dl-data/wmt-2016/newstest2016.tok.en',
    test_target_file='/data/dl-data/wmt-2016/newstest2016.tok.de',
    vocab_file='/data/dl-data/wmt-2016/vocabs.de-en',
    num_units=1024,
    layers=4,
    dropout=0.2,
    batch_size=32,
    learning_rate=0.001,
    output_dir='./output',
    param_histogram=True,
    restore_model=True,
)

# The argument is the number of training iterations to run.
m.train(5000000)