First embeddings commit

AlexFridman · Mar 8, 2017 · 02ba616 · 02ba616
1 parent f09ee4a
commit 02ba616
Show file tree

Hide file tree

Showing 8 changed files with 2,509 additions and 0 deletions.
diff --git a/embeddings/.gitignore b/embeddings/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/embeddings/Skip-Gram word2vec.ipynb b/embeddings/Skip-Gram word2vec.ipynb
diff --git a/embeddings/Skip-Grams-Solution.ipynb b/embeddings/Skip-Grams-Solution.ipynb
diff --git a/embeddings/assets/matrix_mult_w_one_hot.png b/embeddings/assets/matrix_mult_w_one_hot.png
diff --git a/embeddings/assets/skip_gram_net_arch.png b/embeddings/assets/skip_gram_net_arch.png
diff --git a/embeddings/assets/word2vec_architectures.png b/embeddings/assets/word2vec_architectures.png
diff --git a/embeddings/assets/word2vec_weight_matrix_lookup_table.png b/embeddings/assets/word2vec_weight_matrix_lookup_table.png
diff --git a/embeddings/utils.py b/embeddings/utils.py
@@ -0,0 +1,59 @@
+import re
+from collections import Counter
+
# Punctuation -> placeholder token, applied in this order after lower-casing.
# None of the placeholders contains a character that a later entry replaces,
# so one pass over the table is enough.
_PUNCTUATION_TOKENS = (
    ('.', ' <PERIOD> '),
    (',', ' <COMMA> '),
    ('"', ' <QUOTATION_MARK> '),
    (';', ' <SEMICOLON> '),
    ('!', ' <EXCLAMATION_MARK> '),
    ('?', ' <QUESTION_MARK> '),
    ('(', ' <LEFT_PAREN> '),
    (')', ' <RIGHT_PAREN> '),
    ('--', ' <HYPHENS> '),
    (':', ' <COLON> '),
)


def preprocess(text, rare_threshold=5):
    """
    Tokenize raw text into a list of lower-cased words and punctuation tokens.

    Punctuation is replaced by placeholder tokens such as ``<PERIOD>`` so it
    can be used by the model like any other word. Newlines are not tokenized;
    they are consumed as whitespace when the text is split.
    :param text: The raw text to tokenize
    :param rare_threshold: Words occurring this many times or fewer are dropped
    :return: The list of remaining words, in their original order
    """
    text = text.lower()
    for symbol, token in _PUNCTUATION_TOKENS:
        text = text.replace(symbol, token)
    words = text.split()

    # Remove all words with `rare_threshold` or fewer occurrences
    word_counts = Counter(words)
    return [word for word in words if word_counts[word] > rare_threshold]
+
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target, where each target is the input
    shifted one position ahead in the text.
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: A list where each item is a tuple of (batch of input, batch of target),
        both arrays of shape (batch_size, seq_length). The list is empty when the
        text is too short to fill a single batch.
    :raises ValueError: If batch_size or seq_length is not positive
    """
    # Imported here because the module itself does not import NumPy.
    import numpy as np

    if batch_size <= 0 or seq_length <= 0:
        raise ValueError("batch_size and seq_length must be positive")

    ids_per_batch = batch_size * seq_length

    # Every input id needs a successor to act as its target, so only
    # len(int_text) - 1 ids can be used as inputs. Counting batches from that
    # keeps the target slice full even when the text length is an exact
    # multiple of the batch footprint.
    n_batches = (len(int_text) - 1) // ids_per_batch
    if n_batches <= 0:
        return []

    # Drop the trailing ids that do not fill a complete batch
    n_used = n_batches * ids_per_batch
    xdata = np.array(int_text[:n_used])
    ydata = np.array(int_text[1:n_used + 1])

    # One row per batch member; cut the rows column-wise into n_batches pieces
    x_batches = np.split(xdata.reshape(batch_size, -1), n_batches, 1)
    y_batches = np.split(ydata.reshape(batch_size, -1), n_batches, 1)

    return list(zip(x_batches, y_batches))
+
+
def create_lookup_tables(words):
    """
    Create lookup tables for vocabulary

    Ids are handed out in order of descending word frequency, so id 0 belongs
    to the most frequent word.
    :param words: Input list of words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab). The first dict maps
        each distinct word to its integer id; the second maps each id back to
        its word.
    """
    frequencies = Counter(words)

    vocab_to_int = {}
    int_to_vocab = {}
    # most_common() yields (word, count) pairs from most to least frequent
    for idx, (word, _count) in enumerate(frequencies.most_common()):
        int_to_vocab[idx] = word
        vocab_to_int[word] = idx

    return vocab_to_int, int_to_vocab