From 471ee9bf08caa11efc6fe21fbd251bb6325c5633 Mon Sep 17 00:00:00 2001
From: Chaitanya Joshi
Date: Wed, 22 May 2019 12:53:37 +0530
Subject: [PATCH] Removed redundant files

---
 Chapter09/chatbots_code/chatbot.py           | 242 -------------------
 Chapter09/chatbots_code/data_utils.py        |  53 ----
 Chapter09/chatbots_code/memory_network.py    | 122 ----------
 Chapter09/chatbots_code/requirements.txt.txt |   5 -
 4 files changed, 422 deletions(-)
 delete mode 100644 Chapter09/chatbots_code/chatbot.py
 delete mode 100644 Chapter09/chatbots_code/data_utils.py
 delete mode 100644 Chapter09/chatbots_code/memory_network.py
 delete mode 100644 Chapter09/chatbots_code/requirements.txt.txt

diff --git a/Chapter09/chatbots_code/chatbot.py b/Chapter09/chatbots_code/chatbot.py
deleted file mode 100644
index 648f3af..0000000
--- a/Chapter09/chatbots_code/chatbot.py
+++ /dev/null
@@ -1,242 +0,0 @@
-from sklearn import metrics
-from itertools import chain
-from six.moves import range, reduce
-import numpy as np
-import tensorflow as tf
-
-from data_utils import tokenize, parse_dialogs_per_response
-from memory_network import MemoryNetwork
-
-def vectorize_candidates(candidates, word_idx, sentence_size):
-    # Determine shape of final vector
-    shape = (len(candidates), sentence_size)
-    candidates_vector = []
-    for i, candidate in enumerate(candidates):
-        # Determine zero padding
-        zero_padding = max(0, sentence_size - len(candidate))
-        # Append to final vector
-        candidates_vector.append(
-            [word_idx[w] if w in word_idx else 0 for w in candidate]
-            + [0] * zero_padding)
-    # Return as TensorFlow constant
-    return tf.constant(candidates_vector, shape=shape)
-
-def vectorize_data(data, word_idx, sentence_size, batch_size, max_memory_size):
-    facts_vector = []
-    questions_vector = []
-    answers_vector = []
-    # Sort data in descending order by number of facts
-    data.sort(key=lambda x: len(x[0]), reverse=True)
-    for i, (fact, question, answer) in enumerate(data):
-        # Find memory size
-        if i % batch_size == 0:
-            memory_size = max(1, min(max_memory_size, len(fact)))
-        # Build fact vector
-        fact_vector = []
-        for j, sentence in enumerate(fact, 1):
-            fact_padding = max(0, sentence_size - len(sentence))
-            fact_vector.append(
-                [word_idx[w] if w in word_idx else 0 for w in sentence]
-                + [0] * fact_padding)
-        # Keep the most recent sentences that fit in memory
-        fact_vector = fact_vector[::-1][:memory_size][::-1]
-        # Pad to memory_size
-        memory_padding = max(0, memory_size - len(fact_vector))
-        for _ in range(memory_padding):
-            fact_vector.append([0] * sentence_size)
-        # Build question vector
-        question_padding = max(0, sentence_size - len(question))
-        question_vector = [word_idx[w] if w in word_idx else 0
-                           for w in question] \
-            + [0] * question_padding
-        # Append to final vectors
-        facts_vector.append(np.array(fact_vector))
-        questions_vector.append(np.array(question_vector))
-        # Answer is already an integer corresponding to a candidate
-        answers_vector.append(np.array(answer))
-    return facts_vector, questions_vector, answers_vector
-
-class ChatBotWrapper(object):
-    def __init__(self, train_data, test_data, val_data,
-                 candidates, candidates_to_idx,
-                 memory_size, batch_size, learning_rate,
-                 evaluation_interval, hops,
-                 epochs, embedding_size):
-        self.memory_size = memory_size
-        self.batch_size = batch_size
-        self.evaluation_interval = evaluation_interval
-        self.epochs = epochs
-
-        self.candidates = candidates
-        self.candidates_to_idx = candidates_to_idx
-        self.candidates_size = len(candidates)
-        self.idx_to_candidates = dict((self.candidates_to_idx[key], key)
-                                      for key in self.candidates_to_idx)
-        # Initialize data and build vocabulary
-        self.train_data = train_data
-        self.test_data = test_data
-        self.val_data = val_data
-        self.build_vocab(train_data + test_data + val_data, candidates)
-        # Vectorize candidates
-        self.candidates_vec = vectorize_candidates(
-            candidates, self.word_idx, self.candidate_sentence_size)
-        # Initialize optimizer
-        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
-        # Initialize TensorFlow session and Memory Network model
-        self.sess = tf.Session()
-        self.model = MemoryNetwork(
-            self.sentence_size, self.vocab_size,
-            self.candidates_size, self.candidates_vec,
-            embedding_size, hops,
-            optimizer=optimizer, session=self.sess)
-
-    def build_vocab(self, data, candidates):
-        # Build word vocabulary set from all data and candidate words
-        vocab = reduce(lambda x1, x2: x1 | x2,
-                       (set(list(chain.from_iterable(facts)) + questions)
-                        for facts, questions, answers in data))
-        vocab |= reduce(lambda x1, x2: x1 | x2,
-                        (set(candidate) for candidate in candidates))
-        vocab = sorted(vocab)
-        # Assign integer indices to each word
-        self.word_idx = dict((word, idx + 1) for idx, word in enumerate(vocab))
-        # Compute various data size numbers
-        max_facts_size = max(map(len, (facts for facts, _, _ in data)))
-        self.sentence_size = max(
-            map(len, chain.from_iterable(facts for facts, _, _ in data)))
-        self.candidate_sentence_size = max(map(len, candidates))
-        question_size = max(map(len, (questions for _, questions, _ in data)))
-        self.memory_size = min(self.memory_size, max_facts_size)
-        self.vocab_size = len(self.word_idx) + 1  # +1 for null word
-        self.sentence_size = max(question_size, self.sentence_size)
-
-    def predict_for_batch(self, facts, questions):
-        preds = []
-        # Iterate over mini-batches
-        for start in range(0, len(facts), self.batch_size):
-            end = start + self.batch_size
-            facts_batch = facts[start:end]
-            questions_batch = questions[start:end]
-            # Predict per batch
-            pred = self.model.predict(facts_batch, questions_batch)
-            preds += list(pred)
-        return preds
-
-    def train(self):
-        # Vectorize training and validation data
-        train_facts, train_questions, train_answers = vectorize_data(
-            self.train_data, self.word_idx, self.sentence_size,
-            self.batch_size, self.memory_size)
-        val_facts, val_questions, val_answers = vectorize_data(
-            self.val_data, self.word_idx, self.sentence_size,
-            self.batch_size, self.memory_size)
-        # Chunk training data into batches
-        batches = zip(range(0, len(train_facts) - self.batch_size,
-                            self.batch_size),
-                      range(self.batch_size, len(train_facts),
-                            self.batch_size))
-        batches = [(start, end) for start, end in batches]
-        # Start training loop
-        for epoch in range(1, self.epochs + 1):
-            np.random.shuffle(batches)
-            total_cost = 0.0
-            for start, end in batches:
-                facts = train_facts[start:end]
-                questions = train_questions[start:end]
-                answers = train_answers[start:end]
-                # Train on batch
-                batch_cost = self.model.fit(facts, questions, answers)
-                total_cost += batch_cost
-            if epoch % self.evaluation_interval == 0:
-                # Compute accuracy over training and validation set
-                train_preds = self.predict_for_batch(
-                    train_facts, train_questions)
-                val_preds = self.predict_for_batch(
-                    val_facts, val_questions)
-                train_acc = metrics.accuracy_score(
-                    train_preds, train_answers)
-                val_acc = metrics.accuracy_score(
-                    val_preds, val_answers)
-                print("Epoch: ", epoch)
-                print("Total Cost: ", total_cost)
-                print("Training Accuracy: ", train_acc)
-                print("Validation Accuracy: ", val_acc)
-                print("---")
-
-    def test(self):
-        # Compute accuracy over test set
-        test_facts, test_questions, test_answers = vectorize_data(
-            self.test_data, self.word_idx, self.sentence_size,
-            self.batch_size, self.memory_size)
-        test_preds = self.predict_for_batch(test_facts, test_questions)
-        test_acc = metrics.accuracy_score(test_preds, test_answers)
-        print("Testing Accuracy: ", test_acc)
-
-    def interactive_mode(self):
-        facts = []
-        utterance = None
-        response = None
-        turn_count = 1
-        while True:
-            line = input("==> ").strip().lower()
-            if line == "exit":
-                break
-            if line == "restart":
-                facts = []
-                turn_count = 1
-                print("Restarting dialog...\n")
-                continue
-            utterance = tokenize(line)
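-            # -1 is a dummy answer index; only the model's prediction is used here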
-            data = [(facts, utterance, -1)]
-            # Vectorize data and make prediction
-            f, q, a = vectorize_data(data, self.word_idx,
-                self.sentence_size, self.batch_size, self.memory_size)
-            preds = self.model.predict(f, q)
-            response = self.idx_to_candidates[preds[0]]
-            # Print predicted response
-            print(response)
-            response = tokenize(response)
-            # Add utterance/response encoding
-            utterance.append("$u")
-            response.append("$r")
-            # Add turn count temporal encoding
-            utterance.append("#" + str(turn_count))
-            response.append("#" + str(turn_count))
-            # Update facts memory
-            facts.append(utterance)
-            facts.append(response)
-            turn_count += 1
-
-if __name__ == "__main__":
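-    # Load candidate responses and build candidate/index lookup tables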
print("Validation Accuracy: ", val_acc) - print("---") - - def test(self): - # Compute accuracy over test set - test_facts, test_questions, test_answers = vectorize_data( - self.test_data, self.word_idx, self.sentence_size, - self.batch_size, self.memory_size) - test_preds = self.predict_for_batch(test_facts, test_questions) - test_acc = metrics.accuracy_score(test_preds, test_answers) - print("Testing Accuracy: ", test_acc) - - def interactive_mode(self): - facts = [] - utterance = None - response = None - turn_count = 1 - while True: - line = input("==> ").strip().lower() - if line == "exit": - break - if line == "restart": - facts = [] - turn_count = 1 - print("Restarting dialog...\n") - continue - utterance = tokenize(line) - data = [(facts, utterance, -1)] - # Vectorize data and make prediction - f, q, a = vectorize_data(data, self.word_idx, - self.sentence_size, self.batch_size, self.memory_size) - preds = self.model.predict(f, q) - response = self.idx_to_candidates[preds[0]] - # Print predicted response - print(response) - response = tokenize(response) - # Add turn count temporal encoding - utterance.append("$u") - response.append("$r") - # Add utterance/response encoding - utterance.append("#" + str(turn_count)) - response.append("#" + str(turn_count)) - # Update facts memory - facts.append(utterance) - facts.append(response) - turn_count += 1 - -if __name__ == "__main__": - candidates = [] - candidates_to_idx = {} - with open('dialog-babi/dialog-babi-candidates.txt') as f: - for i, line in enumerate(f): - candidates_to_idx[line.strip().split(' ', 1)[1]] = i - line = tokenize(line.strip())[1:] - candidates.append(line) - - train_data = [] - with open('dialog-babi/dialog-babi-task5-full-dialogs-trn.txt') as f: - train_data = parse_dialogs_per_response(f.readlines(), candidates_to_idx) - - test_data = [] - with open('dialog-babi/dialog-babi-task5-full-dialogs-tst.txt') as f: - test_data = parse_dialogs_per_response(f.readlines(), candidates_to_idx) - - val_data = [] - with open('dialog-babi/dialog-babi-task5-full-dialogs-dev.txt') as f: - val_data = parse_dialogs_per_response(f.readlines(), candidates_to_idx) - - chatbot = ChatBotWrapper(train_data, test_data, val_data, - candidates, candidates_to_idx, - memory_size=50, - batch_size=32, - learning_rate=0.001, - evaluation_interval=10, - hops=3, - epochs=200, - embedding_size=50) - chatbot.train() - chatbot.test() - chatbot.interactive_mode() diff --git a/Chapter09/chatbots_code/data_utils.py b/Chapter09/chatbots_code/data_utils.py deleted file mode 100644 index 59c842b..0000000 --- a/Chapter09/chatbots_code/data_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -import re - -def tokenize(sent): - stop_words = {"a", "an", "the"} - sent = sent.lower() - if sent == '': - return [sent] - # Convert sentence to tokens - result = [word.strip() for word in re.split('(\W+)?', sent) - if word.strip() and word.strip() not in stop_words] - # Cleanup - if not result: - result = [''] - if result[-1]=='.' or result[-1]=='?' 
diff --git a/Chapter09/chatbots_code/memory_network.py b/Chapter09/chatbots_code/memory_network.py
deleted file mode 100644
index 6c317a6..0000000
--- a/Chapter09/chatbots_code/memory_network.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import numpy as np
-import tensorflow as tf
-
-class MemoryNetwork(object):
-    def __init__(self, sentence_size, vocab_size, candidates_size,
-                 candidates_vec, embedding_size, hops,
-                 initializer=tf.random_normal_initializer(stddev=0.1),
-                 optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
-                 session=tf.Session()):
-        self._hops = hops
-        self._candidates_vec = candidates_vec
-
-        # Define placeholders for inputs to the model
-        self._facts = tf.placeholder(
-            tf.int32, [None, None, sentence_size], name="facts")
-        self._questions = tf.placeholder(
-            tf.int32, [None, sentence_size], name="questions")
-        self._answers = tf.placeholder(
-            tf.int32, [None], name="answers")
-
-        # Define trainable variables used for inference
-        with tf.variable_scope("MemoryNetwork"):
-            # Word embedding lookup matrix for input facts and questions
-            self.word_emb_matrix = tf.Variable(initializer(
-                [vocab_size, embedding_size]), name="A")
-            # Matrix used for linear transformations during inference
-            self.transformation_matrix = tf.Variable(initializer(
-                [embedding_size, embedding_size]), name="H")
-            # Word embedding lookup matrix for output responses
-            self.output_word_emb_matrix = tf.Variable(initializer(
-                [vocab_size, embedding_size]), name="W")
-
-        # Compute cross entropy error on inference predictions
-        logits = self._inference(self._facts, self._questions)
-        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-            logits=logits, labels=self._answers, name="cross_entropy")
-        cross_entropy_sum = tf.reduce_sum(
-            cross_entropy, name="cross_entropy_sum")
-
-        # Define loss operation
-        self.loss_op = cross_entropy_sum
-
-        # Define gradient pipeline
-        grads_and_vars = optimizer.compute_gradients(self.loss_op)
-
-        # Define training operation
-        self.train_op = optimizer.apply_gradients(
-            grads_and_vars, name="train_op")
-
-        # Define prediction operation
-        self.predict_op = tf.argmax(logits, 1, name="predict_op")
-
-        # Load session and initialize all variables
-        self._session = session
-        self._session.run(tf.global_variables_initializer())
-
-    def _input_module(self, facts):
-        with tf.variable_scope("InputModule"):
-            facts_emb = tf.nn.embedding_lookup(self.word_emb_matrix,
-                                               facts)
-            return tf.reduce_sum(facts_emb, 2)
-
-    def _question_module(self, questions):
-        with tf.variable_scope("QuestionModule"):
-            questions_emb = tf.nn.embedding_lookup(
-                self.word_emb_matrix, questions)
-            return tf.reduce_sum(questions_emb, 1)
-
-    def _memory_module(self, questions_emb, facts_emb):
-        with tf.variable_scope("MemoryModule"):
-            initial_context_vector = questions_emb
-            context_vectors = [initial_context_vector]
-            # Multi-hop attention over facts to update context vector
-            for hop in range(self._hops):
-                # Compute dot-product similarity between facts and context
-                context_temp = tf.transpose(
-                    tf.expand_dims(context_vectors[-1], -1), [0, 2, 1])
-                similarity_scores = tf.reduce_sum(
-                    facts_emb * context_temp, 2)
-                # Calculate similarity probabilities
-                probs = tf.nn.softmax(similarity_scores)
-                # Perform attention multiplication
-                probs_temp = tf.transpose(tf.expand_dims(probs, -1),
-                                          [0, 2, 1])
-                facts_temp = tf.transpose(facts_emb, [0, 2, 1])
-                context_rep = tf.reduce_sum(facts_temp * probs_temp, 2)
-                # Update context vector
-                context_vector = tf.matmul(context_vectors[-1],
-                                           self.transformation_matrix) \
-                    + context_rep
-                # Append to context vector list to use in next hop
-                context_vectors.append(context_vector)
-            # Return context vector for last hop
-            return context_vector
-
-    def _output_module(self, context_vector):
-        with tf.variable_scope("OutputModule"):
-            candidates_emb = tf.nn.embedding_lookup(self.output_word_emb_matrix,
-                                                    self._candidates_vec)
-            candidates_emb_sum = tf.reduce_sum(candidates_emb, 1)
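-            # Rank candidates: dot product of context vector with each candidate embedding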
tf.variable_scope("InputModule"): - facts_emb = tf.nn.embedding_lookup(self.word_emb_matrix, - facts) - return tf.reduce_sum(facts_emb, 2) - - def _question_module(self, questions): - with tf.variable_scope("QuestionModule"): - questions_emb = tf.nn.embedding_lookup( - self.word_emb_matrix, questions) - return tf.reduce_sum(questions_emb, 1) - - def _memory_module(self, questions_emb, facts_emb): - with tf.variable_scope("MemoryModule"): - initial_context_vector = questions_emb - context_vectors = [initial_context_vector] - # Multi-hop attention over facts to update context vector - for hop in range(self._hops): - # Perform reduce_dot - context_temp = tf.transpose( - tf.expand_dims(context_vectors[-1], -1), [0, 2, 1]) - similarity_scores = tf.reduce_sum( - facts_emb * context_temp, 2) - # Calculate similarity probabilities - probs = tf.nn.softmax(similarity_scores) - # Perform attention multiplication - probs_temp = tf.transpose(tf.expand_dims(probs, -1), - [0, 2, 1]) - facts_temp = tf.transpose(facts_emb, [0, 2, 1]) - context_rep = tf.reduce_sum(facts_temp*probs_temp, 2) - # Update context vector - context_vector = tf.matmul(context_vectors[-1], - self.transformation_matrix) \ - + context_rep - # Append to context vector list to use in next hop - context_vectors.append(context_vector) - # Return context vector for last hop - return context_vector - - def _output_module(self, context_vector): - with tf.variable_scope("OuptutModule"): - candidates_emb = tf.nn.embedding_lookup(self.output_word_emb_matrix, - self._candidates_vec) - candidates_emb_sum = tf.reduce_sum(candidates_emb, 1) - return tf.matmul(context_vector, tf.transpose(candidates_emb_sum)) - - def _inference(self, facts, questions): - with tf.variable_scope("MemoryNetwork"): - input_vectors = self._input_module(facts) - question_vectors = self._question_module(questions) - context_vectors = self._memory_module(question_vectors, - input_vectors) - output = self._output_module(context_vectors) - return output - - def fit(self, facts, questions, answers): - feed_dict = {self._facts: facts, - self._questions: questions, - self._answers: answers} - loss, _ = self._session.run([self.loss_op, self.train_op], - feed_dict=feed_dict) - return loss - - def predict(self, facts, questions): - feed_dict = {self._facts: facts, self._questions: questions} - return self._session.run(self.predict_op, feed_dict=feed_dict) diff --git a/Chapter09/chatbots_code/requirements.txt.txt b/Chapter09/chatbots_code/requirements.txt.txt deleted file mode 100644 index 0b9bc17..0000000 --- a/Chapter09/chatbots_code/requirements.txt.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.14.0 -scikit-learn==0.19.1 -scipy==1.0.0 -six==1.11.0 -tensorflow==1.3.0 \ No newline at end of file