diff --git a/ann_class2/batch_norm_tf.py b/ann_class2/batch_norm_tf.py
index de25cd3b..31d9a351 100644
--- a/ann_class2/batch_norm_tf.py
+++ b/ann_class2/batch_norm_tf.py
@@ -4,13 +4,15 @@
 # sudo pip install -U future
 import numpy as np
-import pandas as pd
+#import pandas as pd
 import matplotlib.pyplot as plt
 import tensorflow as tf
 from sklearn.utils import shuffle
-from sklearn.model_selection import train_test_split
+#from sklearn.model_selection import train_test_split
 from util import get_normalized_data
 
+if tf.__version__.startswith('2'):
+  tf.compat.v1.disable_eager_execution()
 
 def init_weight(M1, M2):
   return np.random.randn(M1, M2) * np.sqrt(2.0 / M1)
@@ -38,13 +40,11 @@ def forward(self, X, is_training, decay=0.9):
     activation = tf.matmul(X, self.W)
     if is_training:
       batch_mean, batch_var = tf.nn.moments(activation, [0])
-      update_running_mean = tf.assign(
-        self.running_mean,
-        self.running_mean * decay + batch_mean * (1 - decay)
+      update_running_mean = self.running_mean.assign(
+        self.running_mean * decay + batch_mean * (1 - decay)
       )
-      update_running_var = tf.assign(
-        self.running_var,
-        self.running_var * decay + batch_var * (1 - decay)
+      update_running_var = self.running_var.assign(
+        self.running_var * decay + batch_var * (1 - decay)
       )
 
       with tf.control_dependencies([update_running_mean, update_running_var]):
@@ -115,8 +115,8 @@ def fit(self, X, Y, Xtest, Ytest, activation=tf.nn.relu, learning_rate=1e-2, epo
     # for train and test (prediction)
 
     # set up theano functions and variables
-    tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
-    tfY = tf.placeholder(tf.int32, shape=(None,), name='Y')
+    tfX = tf.compat.v1.placeholder(tf.float32, shape=(None, D), name='X')
+    tfY = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='Y')
 
     # for later use
     self.tfX = tfX
@@ -131,7 +131,7 @@ def fit(self, X, Y, Xtest, Ytest, activation=tf.nn.relu, learning_rate=1e-2, epo
     )
     # train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost)
     # train_op = tf.train.RMSPropOptimizer(learning_rate, decay=0.99, momentum=0.9).minimize(cost)
-    train_op = tf.train.MomentumOptimizer(learning_rate, momentum=0.9, use_nesterov=True).minimize(cost)
+    train_op = tf.compat.v1.train.MomentumOptimizer(learning_rate, momentum=0.9, use_nesterov=True).minimize(cost)
     # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
 
     # for testing
@@ -141,7 +141,7 @@ def fit(self, X, Y, Xtest, Ytest, activation=tf.nn.relu, learning_rate=1e-2, epo
     # accuracy = tf.reduce_mean(1.0*(tfY == tf.argmax(logits, 1)))
 
     # init the variables
-    self.session.run(tf.global_variables_initializer())
+    self.session.run(tf.compat.v1.global_variables_initializer())
 
     n_batches = N // batch_sz
     costs = []
@@ -187,7 +187,7 @@ def main():
 
   ann = ANN([500, 300])
 
-  session = tf.InteractiveSession()
+  session = tf.compat.v1.InteractiveSession()
   ann.set_session(session)
 
   ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)
diff --git a/ann_class2/dropout_tensorflow.py b/ann_class2/dropout_tensorflow.py
index b20c44fb..c2186e57 100644
--- a/ann_class2/dropout_tensorflow.py
+++ b/ann_class2/dropout_tensorflow.py
@@ -13,6 +13,8 @@ from util import get_normalized_data
 from sklearn.utils import shuffle
 
+if tf.__version__.startswith('2'):
+  tf.compat.v1.disable_eager_execution()
 
 class HiddenLayer(object):
   def __init__(self, M1, M2):
@@ -59,8 +61,8 @@ def fit(self, X, Y, Xvalid, Yvalid, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch
       self.params += h.params
 
     # set up theano functions and variables
-    inputs = tf.placeholder(tf.float32, shape=(None, D), name='inputs')
-    labels = tf.placeholder(tf.int64, shape=(None,), name='labels')
+    inputs = tf.compat.v1.placeholder(tf.float32, shape=(None, D), name='inputs')
+    labels = tf.compat.v1.placeholder(tf.int64, shape=(None,), name='labels')
 
     logits = self.forward(inputs)
     cost = tf.reduce_mean(
@@ -69,7 +71,7 @@ def fit(self, X, Y, Xvalid, Yvalid, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch
         labels=labels
       )
     )
-    train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)
+    train_op = tf.compat.v1.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)
     # train_op = tf.train.MomentumOptimizer(lr, momentum=mu).minimize(cost)
     # train_op = tf.train.AdamOptimizer(lr).minimize(cost)
     prediction = self.predict(inputs)
@@ -85,8 +87,8 @@ def fit(self, X, Y, Xvalid, Yvalid, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch
     n_batches = N // batch_sz
 
     costs = []
-    init = tf.global_variables_initializer()
-    with tf.Session() as session:
+    init = tf.compat.v1.global_variables_initializer()
+    with tf.compat.v1.Session() as session:
       session.run(init)
       for i in range(epochs):
         print("epoch:", i, "n_batches:", n_batches)
diff --git a/ann_class2/tensorflow2.py b/ann_class2/tensorflow2.py
index a07f0104..00bd7746 100644
--- a/ann_class2/tensorflow2.py
+++ b/ann_class2/tensorflow2.py
@@ -12,11 +12,12 @@
 import numpy as np
 import tensorflow as tf
-
 import matplotlib.pyplot as plt
-
 from util import get_normalized_data, y2indicator
 
+if tf.__version__.startswith('2'):
+  tf.compat.v1.disable_eager_execution()
+
 
 def error_rate(p, t):
   return np.mean(p != t)
@@ -31,7 +32,7 @@ def main():
   print_period = 50
 
   lr = 0.00004
-  reg = 0.01
+  #reg = 0.01
 
   Ytrain_ind = y2indicator(Ytrain)
   Ytest_ind = y2indicator(Ytest)
@@ -53,8 +54,8 @@ def main():
 
 
   # define variables and expressions
-  X = tf.placeholder(tf.float32, shape=(None, D), name='X')
-  T = tf.placeholder(tf.float32, shape=(None, K), name='T')
+  X = tf.compat.v1.placeholder(tf.float32, shape=(None, D), name='X')
+  T = tf.compat.v1.placeholder(tf.float32, shape=(None, K), name='T')
   W1 = tf.Variable(W1_init.astype(np.float32))
   b1 = tf.Variable(b1_init.astype(np.float32))
   W2 = tf.Variable(W2_init.astype(np.float32))
@@ -70,19 +71,19 @@ def main():
   # softmax_cross_entropy_with_logits take in the "logits"
   # if you wanted to know the actual output of the neural net,
   # you could pass "Yish" into tf.nn.softmax(logits)
-  cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(logits=Yish, labels=T))
+  cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))
 
   # we choose the optimizer but don't implement the algorithm ourselves
   # let's go with RMSprop, since we just learned about it.
   # it includes momentum!
-  train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)
+  train_op = tf.compat.v1.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)
 
   # we'll use this to calculate the error rate
   predict_op = tf.argmax(Yish, 1)
 
   costs = []
-  init = tf.global_variables_initializer()
-  with tf.Session() as session:
+  init = tf.compat.v1.global_variables_initializer()
+  with tf.compat.v1.Session() as session:
     session.run(init)
 
     for i in range(max_iter):
diff --git a/ann_class2/util.py b/ann_class2/util.py
index 5c8ad934..47d2a23a 100644
--- a/ann_class2/util.py
+++ b/ann_class2/util.py
@@ -15,7 +15,7 @@ import pandas as pd
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
-from sklearn.linear_model import LogisticRegression
+#from sklearn.linear_model import LogisticRegression
 
 
 def get_clouds():
diff --git a/keras_examples/ann.py b/keras_examples/ann.py
index 08636b15..857fd95c 100644
--- a/keras_examples/ann.py
+++ b/keras_examples/ann.py
@@ -7,8 +7,8 @@ import matplotlib.pyplot as plt
 from util import getKaggleMNIST
 
-from keras.models import Model
-from keras.layers import Dense, Activation, Input
+from tensorflow.keras.models import Model # type: ignore
+from tensorflow.keras.layers import Dense, Input # type: ignore
 
 
 # get the data
@@ -58,8 +58,8 @@ plt.show()
 
 
 # accuracies
-plt.plot(r.history['acc'], label='acc')
-plt.plot(r.history['val_acc'], label='val_acc')
+plt.plot(r.history['accuracy'], label='acc')
+plt.plot(r.history['val_accuracy'], label='val_acc')
 plt.legend()
 plt.show()
diff --git a/keras_examples/cnn.py b/keras_examples/cnn.py
index 088cc5b2..f0eee61f 100644
--- a/keras_examples/cnn.py
+++ b/keras_examples/cnn.py
@@ -5,14 +5,14 @@
 # Note: you may need to update your version of future
 # sudo pip install -U future
 
-from keras.models import Model
-from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Input
+from tensorflow.keras.models import Model # type: ignore
+from tensorflow.keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Input # type: ignore
 import matplotlib.pyplot as plt
 import pandas as pd
 import numpy as np
 
-from util import getKaggleMNIST3D, getKaggleFashionMNIST3D, getCIFAR10
+from util import getKaggleFashionMNIST3D
 
 
 # get the data
@@ -73,8 +73,8 @@ plt.show()
 
 
 # accuracies
-plt.plot(r.history['acc'], label='acc')
-plt.plot(r.history['val_acc'], label='val_acc')
+plt.plot(r.history['accuracy'], label='acc')
+plt.plot(r.history['val_accuracy'], label='val_acc')
 plt.legend()
 plt.show()
diff --git a/keras_examples/util.py b/keras_examples/util.py
index 2e3af106..22fa6832 100644
--- a/keras_examples/util.py
+++ b/keras_examples/util.py
@@ -12,12 +12,12 @@
 def getKaggleMNIST():
   # https://www.kaggle.com/c/digit-recognizer
-  return getMNISTFormat('../large_files/train.csv')
+  return getMNISTFormat('.\\large_files\\digit-recognizer\\train.csv')
 
 
 def getKaggleFashionMNIST():
   # https://www.kaggle.com/zalando-research/fashionmnist
-  return getMNISTFormat('../large_files/fashionmnist/fashion-mnist_train.csv')
+  return getMNISTFormat('.\\large_files\\fashionmnist\\fashion-mnist_train.csv')
 
 
 def getMNISTFormat(path):
   # MNIST data:
diff --git a/nlp_class2/bow_classifier.py b/nlp_class2/bow_classifier.py
index 25588e3b..70efdde6 100644
--- a/nlp_class2/bow_classifier.py
+++ b/nlp_class2/bow_classifier.py
@@ -32,7 +32,7 @@ def __init__(self):
     word2vec = {}
     embedding = []
     idx2word = []
-    with open('../large_files/glove.6B/glove.6B.50d.txt') as f:
+    with open('../large_files/glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
       # is just a space-separated text file in the format:
       # word vec[0] vec[1] vec[2] ...
       for line in f:
diff --git a/nlp_class2/cc_matrix_50.npy b/nlp_class2/cc_matrix_50.npy
new file mode 100644
index 00000000..508d3a6b
Binary files /dev/null and b/nlp_class2/cc_matrix_50.npy differ
diff --git a/nlp_class2/glove.py b/nlp_class2/glove.py
index b46c13f2..fdd9e6fa 100644
--- a/nlp_class2/glove.py
+++ b/nlp_class2/glove.py
@@ -20,7 +20,7 @@ import sys
 sys.path.append(os.path.abspath('..'))
 from rnn_class.util import get_wikipedia_data
-from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
+from rnn_class.brown import get_sentences_with_word2idx_limit_vocab
 
 # using ALS, what's the least # files to get correct analogies?
 # use this for word2vec training to make it faster
@@ -120,7 +120,7 @@ def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100,
 
     costs = []
-    sentence_indexes = range(len(sentences))
+    #sentence_indexes = range(len(sentences))
     for epoch in range(epochs):
       delta = W.dot(U.T) + b.reshape(V, 1) + c.reshape(1, V) + mu - logX
       cost = ( fX * delta * delta ).sum()
diff --git a/nlp_class2/glove_model_50.npz b/nlp_class2/glove_model_50.npz
new file mode 100644
index 00000000..56e47511
Binary files /dev/null and b/nlp_class2/glove_model_50.npz differ
diff --git a/nlp_class2/glove_svd.py b/nlp_class2/glove_svd.py
index a0fd3c0a..a8db4f92 100644
--- a/nlp_class2/glove_svd.py
+++ b/nlp_class2/glove_svd.py
@@ -14,14 +14,13 @@
 from sklearn.decomposition import TruncatedSVD
 from datetime import datetime
-from sklearn.utils import shuffle
 from util import find_analogies
 
 import sys
 sys.path.append(os.path.abspath('..'))
 from rnn_class.util import get_wikipedia_data
-from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
+from rnn_class.brown import get_sentences_with_word2idx_limit_vocab
 
 
 class Glove:
diff --git a/nlp_class2/glove_tf.py b/nlp_class2/glove_tf.py
index 9db18bb4..8986ad94 100644
--- a/nlp_class2/glove_tf.py
+++ b/nlp_class2/glove_tf.py
@@ -14,13 +14,13 @@
 import matplotlib.pyplot as plt
 from datetime import datetime
-from sklearn.utils import shuffle
+#from sklearn.utils import shuffle
 from util import find_analogies
 
 import sys
 sys.path.append(os.path.abspath('..'))
 from rnn_class.util import get_wikipedia_data
-from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx
+from rnn_class.brown import get_sentences_with_word2idx_limit_vocab
 
 if tf.__version__.startswith('2'):
   tf.compat.v1.disable_eager_execution()
@@ -141,7 +141,7 @@ def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100,
     session.run(init)
 
     costs = []
-    sentence_indexes = range(len(sentences))
+    #sentence_indexes = range(len(sentences))
     for epoch in range(epochs):
       c, _ = session.run((cost, train_op), feed_dict={tfLogX: logX, tffX: fX})
       print("epoch:", epoch, "cost:", c)
@@ -190,7 +190,7 @@ def main(we_file, w2i_file, use_brown=True, n_files=50):
   V = len(word2idx)
   model = Glove(100, V, 10)
-  model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
+  model.fit(sentences, cc_matrix=cc_matrix, epochs=10000)
   model.save(we_file)
diff --git a/nlp_class2/glove_word2idx_50.json b/nlp_class2/glove_word2idx_50.json
new file mode 100644
index 00000000..a3142b0f
--- /dev/null
+++ b/nlp_class2/glove_word2idx_50.json
@@ -0,0 +1 @@
+{"START": 0, "END": 1, "the": 2, "of": 3, "and": 4, "in": 5, "to": 6, "a": 7, "as": 8, "is": 9, "was": 10, "for": 11, "that": 
12, "by": 13, "with": 14, "on": 15, "from": 16, "his": 17, "are": 18, "it": 19, "an": 20, "at": 21, "he": 22, "or": 23, "which": 24, "be": 25, "were": 26, "this": 27, "not": 28, "have": 29, "also": 30, "had": 31, "their": 32, "has": 33, "its": 34, "but": 35, "one": 36, "first": 37, "other": 38, "they": 39, "been": 40, "such": 41, "after": 42, "who": 43, "more": 44, "new": 45, "some": 46, "most": 47, "used": 48, "can": 49, "into": 50, "two": 51, "all": 52, "when": 53, "during": 54, "there": 55, "these": 56, "may": 57, "many": 58, "than": 59, "time": 60, "between": 61, "would": 62, "only": 63, "over": 64, "while": 65, "states": 66, "about": 67, "years": 68, "world": 69, "her": 70, "later": 71, "known": 72, "no": 73, "use": 74, "war": 75, "people": 76, "however": 77, "both": 78, "including": 79, "united": 80, "where": 81, "made": 82, "became": 83, "him": 84, "being": 85, "city": 86, "american": 87, "under": 88, "through": 89, "century": 90, "called": 91, "early": 92, "state": 93, "since": 94, "them": 95, "system": 96, "then": 97, "three": 98, "up": 99, "government": 100, "part": 101, "number": 102, "if": 103, "out": 104, "well": 105, "often": 106, "several": 107, "because": 108, "any": 109, "work": 110, "before": 111, "i": 112, "national": 113, "she": 114, "so": 115, "against": 116, "each": 117, "could": 118, "same": 119, "year": 120, "us": 121, "film": 122, "although": 123, "until": 124, "found": 125, "second": 126, "form": 127, "according": 128, "following": 129, "example": 130, "will": 131, "around": 132, "british": 133, "include": 134, "like": 135, "name": 136, "those": 137, "different": 138, "due": 139, "did": 140, "english": 141, "among": 142, "began": 143, "major": 144, "within": 145, "another": 146, "life": 147, "large": 148, "high": 149, "based": 150, "french": 151, "series": 152, "even": 153, "language": 154, "general": 155, "group": 156, "international": 157, "much": 158, "using": 159, "population": 160, "north": 161, "power": 162, "music": 163, "south": 164, "modern": 165, "set": 166, "four": 167, "end": 168, "country": 169, "period": 170, "common": 171, "political": 172, "public": 173, "area": 174, "university": 175, "military": 176, "million": 177, "own": 178, "led": 179, "german": 180, "members": 181, "now": 182, "death": 183, "what": 184, "\u2013": 185, "1": 186, "church": 187, "history": 188, "very": 189, "party": 190, "de": 191, "still": 192, "john": 193, "great": 194, "considered": 195, "said": 196, "law": 197, "european": 198, "small": 199, "book": 200, "order": 201, "published": 202, "king": 203, "do": 204, "late": 205, "day": 206, "development": 207, "family": 208, "support": 209, "president": 210, "water": 211, "important": 212, "various": 213, "along": 214, "without": 215, "central": 216, "categories": 217, "developed": 218, "though": 219, "school": 220, "countries": 221, "control": 222, "east": 223, "human": 224, "army": 225, "west": 226, "took": 227, "place": 228, "long": 229, "term": 230, "wrote": 231, "home": 232, "included": 233, "become": 234, "times": 235, "game": 236, "established": 237, "main": 238, "given": 239, "way": 240, "local": 241, "island": 242, "theory": 243, "last": 244, "union": 245, "house": 246, "usually": 247, "age": 248, "similar": 249, "europe": 250, "held": 251, "make": 252, "force": 253, "western": 254, "back": 255, "production": 256, "ii": 257, "left": 258, "systems": 259, "less": 260, "company": 261, "air": 262, "released": 263, "popular": 264, "forces": 265, "social": 266, "roman": 267, "having": 268, "old": 269, "others": 270, "named": 
271, "economic": 272, "further": 273, "groups": 274, "empire": 275, "films": 276, "original": 277, "result": 278, "region": 279, "few": 280, "thus": 281, "largest": 282, "point": 283, "role": 284, "court": 285, "case": 286, "former": 287, "described": 288, "team": 289, "march": 290, "works": 291, "written": 292, "land": 293, "five": 294, "process": 295, "service": 296, "languages": 297, "january": 298, "areas": 299, "river": 300, "produced": 301, "per": 302, "single": 303, "games": 304, "research": 305, "june": 306, "july": 307, "sometimes": 308, "december": 309, "came": 310, "2": 311, "rather": 312, "created": 313, "october": 314, "line": 315, "women": 316, "data": 317, "field": 318, "generally": 319, "does": 320, "continued": 321, "down": 322, "york": 323, "september": 324, "islands": 325, "received": 326, "how": 327, "should": 328, "black": 329, "france": 330, "england": 331, "played": 332, "greek": 333, "november": 334, "either": 335, "must": 336, "germany": 337, "show": 338, "april": 339, "god": 340, "best": 341, "season": 342, "total": 343, "species": 344, "see": 345, "third": 346, "kingdom": 347, "council": 348, "soviet": 349, "especially": 350, "just": 351, "science": 352, "art": 353, "study": 354, "word": 355, "10": 356, "member": 357, "august": 358, "every": 359, "free": 360, "days": 361, "light": 362, "instead": 363, "won": 364, "body": 365, "act": 366, "sea": 367, "trade": 368, "son": 369, "space": 370, "throughout": 371, "men": 372, "died": 373, "children": 374, "society": 375, "near": 376, "foreign": 377, "we": 378, "london": 379, "significant": 380, "information": 381, "version": 382, "built": 383, "energy": 384, "take": 385, "possible": 386, "northern": 387, "standard": 388, "ancient": 389, "christian": 390, "white": 391, "next": 392, "final": 393, "natural": 394, "despite": 395, "himself": 396, "addition": 397, "again": 398, "league": 399, "introduced": 400, "bc": 401, "design": 402, "upon": 403, "man": 404, "making": 405, "never": 406, "rights": 407, "movement": 408, "right": 409, "position": 410, "least": 411, "eastern": 412, "february": 413, "india": 414, "seen": 415, "traditional": 416, "battle": 417, "southern": 418, "change": 419, "education": 420, "parts": 421, "religious": 422, "terms": 423, "play": 424, "formed": 425, "followed": 426, "almost": 427, "america": 428, "influence": 429, "china": 430, "republic": 431, "once": 432, "together": 433, "3": 434, "father": 435, "culture": 436, "royal": 437, "across": 438, "evidence": 439, "television": 440, "six": 441, "chinese": 442, "civil": 443, "higher": 444, "forms": 445, "little": 446, "off": 447, "20": 448, "elements": 449, "certain": 450, "middle": 451, "office": 452, "community": 453, "level": 454, "red": 455, "means": 456, "range": 457, "available": 458, "type": 459, "increased": 460, "lost": 461, "album": 462, "official": 463, "side": 464, "15": 465, "program": 466, "particularly": 467, "numbers": 468, "short": 469, "remained": 470, "young": 471, "itself": 472, "born": 473, "above": 474, "band": 475, "computer": 476, "lower": 477, "special": 478, "present": 479, "nations": 480, "record": 481, "model": 482, "associated": 483, "head": 484, "rule": 485, "thought": 486, "2010": 487, "earth": 488, "particular": 489, "eventually": 490, "low": 491, "latin": 492, "center": 493, "rate": 494, "japanese": 495, "jewish": 496, "college": 497, "good": 498, "services": 499, "words": 500, "minister": 501, "capital": 502, "whose": 503, "2011": 504, "2020": 505, "writers": 506, "character": 507, "leading": 508, "you": 509, "story": 
510, "believed": 511, "4": 512, "cities": 513, "5": 514, "announced": 515, "referred": 516, "2021": 517, "allowed": 518, "taken": 519, "located": 520, "building": 521, "went": 522, "typically": 523, "structure": 524, "2022": 525, "moved": 526, "beginning": 527, "africa": 528, "male": 529, "months": 530, "market": 531, "spanish": 532, "meaning": 533, "provided": 534, "source": 535, "food": 536, "12": 537, "writing": 538, "nature": 539, "industry": 540, "st": 541, "living": 542, "project": 543, "2023": 544, "function": 545, "live": 546, "italian": 547, "half": 548, "able": 549, "cases": 550, "effect": 551, "gave": 552, "provide": 553, "style": 554, "current": 555, "appeared": 556, "required": 557, "top": 558, "2008": 559, "served": 560, "radio": 561, "health": 562, "saw": 563, "started": 564, "value": 565, "title": 566, "related": 567, "election": 568, "economy": 569, "2012": 570, "aircraft": 571, "books": 572, "town": 573, "includes": 574, "strong": 575, "william": 576, "companies": 577, "lead": 578, "open": 579, "stated": 580, "average": 581, "2000": 582, "network": 583, "events": 584, "emperor": 585, "today": 586, "c": 587, "players": 588, "majority": 589, "far": 590, "outside": 591, "policy": 592, "full": 593, "view": 594, "2009": 595, "mass": 596, "complex": 597, "30": 598, "return": 599, "returned": 600, "sent": 601, "increase": 602, "2007": 603, "independent": 604, "working": 605, "person": 606, "practice": 607, "limited": 608, "rock": 609, "2019": 610, "legal": 611, "2015": 612, "russian": 613, "brought": 614, "founded": 615, "caused": 616, "features": 617, "close": 618, "individual": 619, "private": 620, "technology": 621, "characters": 622, "earlier": 623, "reported": 624, "size": 625, "indian": 626, "james": 627, "whether": 628, "business": 629, "award": 630, "might": 631, "catholic": 632, "2014": 633, "designed": 634, "class": 635, "cultural": 636, "material": 637, "2016": 638, "commonly": 639, "changes": 640, "primary": 641, "action": 642, "recorded": 643, "proposed": 644, "growth": 645, "2017": 646, "therefore": 647, "studies": 648, "types": 649, "schools": 650, "widely": 651, "japan": 652, "specific": 653, "larger": 654, "too": 655, "2013": 656, "2018": 657, "prime": 658, "cause": 659, "code": 660, "themselves": 661, "subject": 662, "mostly": 663, "african": 664, "2006": 665, "charles": 666, "interest": 667, "historical": 668, "surface": 669, "club": 670, "territory": 671, "run": 672, "11": 673, "video": 674, "base": 675, "25": 676, "media": 677, "seven": 678, "uses": 679, "physical": 680, "software": 681, "effects": 682, "students": 683, "canada": 684, "success": 685, "greater": 686, "parliament": 687, "originally": 688, "performance": 689, "names": 690, "help": 691, "away": 692, "always": 693, "defined": 694, "list": 695, "future": 696, "100": 697, "produce": 698, "likely": 699, "italy": 700, "approximately": 701, "sound": 702, "federal": 703, "county": 704, "replaced": 705, "6": 706, "david": 707, "billion": 708, "song": 709, "recent": 710, "coast": 711, "key": 712, "involved": 713, "added": 714, "george": 715, "release": 716, "mother": 717, "b": 718, "elected": 719, "via": 720, "eg": 721, "personal": 722, "below": 723, "conditions": 724, "regions": 725, "security": 726, "construction": 727, "2005": 728, "sources": 729, "numerous": 730, "concept": 731, "britain": 732, "come": 733, "division": 734, "records": 735, "attack": 736, "soon": 737, "19th": 738, "idea": 739, "killed": 740, "access": 741, "uk": 742, "longer": 743, "largely": 744, "hand": 745, "fact": 746, 
"successful": 747, "supported": 748, "star": 749, "remains": 750, "units": 751, "lines": 752, "real": 753, "park": 754, "love": 755, "site": 756, "my": 757, "directly": 758, "create": 759, "results": 760, "scholars": 761, "multiple": 762, "adopted": 763, "metal": 764, "already": 765, "henry": 766, "classical": 767, "association": 768, "reached": 769, "commercial": 770, "put": 771, "method": 772, "variety": 773, "whom": 774, "20th": 775, "independence": 776, "problems": 777, "towards": 778, "direct": 779, "football": 780, "native": 781, "wife": 782, "sold": 783, "need": 784, "la": 785, "tradition": 786, "points": 787, "initially": 788, "8": 789, "era": 790, "claimed": 791, "organization": 792, "medical": 793, "authority": 794, "18": 795, "shows": 796, "products": 797, "scientific": 798, "16": 799, "asia": 800, "14": 801, "financial": 802, "relationship": 803, "separate": 804, "centre": 805, "smaller": 806, "worked": 807, "laws": 808, "additional": 809, "performed": 810, "oil": 811, "relations": 812, "letter": 813, "estimated": 814, "fire": 815, "global": 816, "lake": 817, "highest": 818, "artists": 819, "leader": 820, "australia": 821, "peoples": 822, "musical": 823, "cells": 824, "problem": 825, "operations": 826, "report": 827, "discovered": 828, "7": 829, "met": 830, "2001": 831, "career": 832, "event": 833, "response": 834, "status": 835, "centuries": 836, "previous": 837, "methods": 838, "knowledge": 839, "active": 840, "compared": 841, "nearly": 842, "primarily": 843, "2024": 844, "levels": 845, "jews": 846, "pressure": 847, "robert": 848, "highly": 849, "complete": 850, "california": 851, "married": 852, "examples": 853, "gas": 854, "rules": 855, "allow": 856, "2004": 857, "gold": 858, "street": 859, "religion": 860, "campaign": 861, "basis": 862, "true": 863, "treaty": 864, "player": 865, "stage": 866, "placed": 867, "13": 868, "novel": 869, "bank": 870, "give": 871, "cell": 872, "ten": 873, "constitution": 874, "mainly": 875, "committee": 876, "joined": 877, "revolution": 878, "changed": 879, "influenced": 880, "parties": 881, "road": 882, "internet": 883, "agreement": 884, "argued": 885, "divided": 886, "museum": 887, "memory": 888, "better": 889, "academy": 890, "cannot": 891, "worlds": 892, "board": 893, "front": 894, "station": 895, "teams": 896, "philosophy": 897, "spain": 898, "heavy": 899, "individuals": 900, "unit": 901, "percent": 902, "basic": 903, "experience": 904, "our": 905, "congress": 906, "police": 907, "entire": 908, "training": 909, "literature": 910, "israel": 911, "rest": 912, "irish": 913, "grand": 914, "chemical": 915, "arts": 916, "suggested": 917, "money": 918, "troops": 919, "50": 920, "ever": 921, "24": 922, "female": 923, "songs": 924, "paul": 925, "text": 926, "yet": 927, "lack": 928, "relatively": 929, "color": 930, "taking": 931, "appointed": 932, "night": 933, "deaths": 934, "x": 935, "past": 936, "letters": 937, "blue": 938, "latter": 939, "eight": 940, "contains": 941, "shown": 942, "previously": 943, "analysis": 944, "2003": 945, "ended": 946, "animals": 947, "fiction": 948, "ground": 949, "account": 950, "turn": 951, "signed": 952, "marriage": 953, "go": 954, "issues": 955, "potential": 956, "resulting": 957, "whole": 958, "enough": 959, "summer": 960, "opened": 961, "appear": 962, "noted": 963, "decided": 964, "rome": 965, "peace": 966, "attempt": 967, "forced": 968, "issue": 969, "director": 970, "imperial": 971, "daughter": 972, "done": 973, "plan": 974, "paris": 975, "department": 976, "completed": 977, "prior": 978, "1980s": 979, "climate": 
980, "temperature": 981, "famous": 982, "test": 983, "ireland": 984, "nuclear": 985, "regional": 986, "accepted": 987, "sense": 988, "thomas": 989, "collection": 990, "1970s": 991, "programs": 992, "powers": 993, "operation": 994, "administration": 995, "russia": 996, "property": 997, "17": 998, "1999": 999, "v": 1000, "green": 1001, "blood": 1002, "origin": 1003, "applied": 1004, "stories": 1005, "treatment": 1006, "ad": 1007, "intended": 1008, "approach": 1009, "length": 1010, "rise": 1011, "passed": 1012, "birth": 1013, "ships": 1014, "move": 1015, "speed": 1016, "functions": 1017, "objects": 1018, "wide": 1019, "hall": 1020, "hours": 1021, "activity": 1022, "difficult": 1023, "probably": 1024, "child": 1025, "date": 1026, "amount": 1027, "2002": 1028, "matter": 1029, "article": 1030, "district": 1031, "brother": 1032, "behind": 1033, "ideas": 1034, "chief": 1035, "earliest": 1036, "navy": 1037, "exist": 1038, "degree": 1039, "machine": 1040, "opposition": 1041, "industrial": 1042, "democratic": 1043, "loss": 1044, "derived": 1045, "product": 1046, "canadian": 1047, "find": 1048, "creation": 1049, "reduced": 1050, "presence": 1051, "failed": 1052, "becoming": 1053, "simple": 1054, "properties": 1055, "makes": 1056, "m": 1057, "dutch": 1058, "get": 1059, "needed": 1060, "finally": 1061, "provides": 1062, "reference": 1063, "lived": 1064, "institute": 1065, "ability": 1066, "assembly": 1067, "annual": 1068, "me": 1069, "carried": 1070, "21": 1071, "humans": 1072, "notable": 1073, "contemporary": 1074, "declared": 1075, "ones": 1076, "represented": 1077, "playing": 1078, "cost": 1079, "composed": 1080, "appears": 1081, "washington": 1082, "1960s": 1083, "extended": 1084, "leaders": 1085, "frequently": 1086, "kings": 1087, "ie": 1088, "9": 1089, "transport": 1090, "elections": 1091, "armed": 1092, "resulted": 1093, "holy": 1094, "port": 1095, "border": 1096, "start": 1097, "increasing": 1098, "dna": 1099, "feature": 1100, "plants": 1101, "disease": 1102, "combined": 1103, "carbon": 1104, "featured": 1105, "element": 1106, "identified": 1107, "iron": 1108, "starting": 1109, "agreed": 1110, "older": 1111, "refer": 1112, "responsible": 1113, "existence": 1114, "stars": 1115, "necessary": 1116, "operating": 1117, "fall": 1118, "1998": 1119, "location": 1120, "commission": 1121, "materials": 1122, "louis": 1123, "1990s": 1124, "news": 1125, "san": 1126, "regular": 1127, "ice": 1128, "dead": 1129, "read": 1130, "except": 1131, "greatest": 1132, "mission": 1133, "internal": 1134, "1997": 1135, "conflict": 1136, "activities": 1137, "22": 1138, "spread": 1139, "e": 1140, "command": 1141, "opposed": 1142, "section": 1143, "offered": 1144, "professional": 1145, "province": 1146, "airport": 1147, "critical": 1148, "initial": 1149, "40": 1150, "reason": 1151, "efforts": 1152, "festival": 1153, "polish": 1154, "iii": 1155, "remain": 1156, "buildings": 1157, "places": 1158, "claims": 1159, "quickly": 1160, "respectively": 1161, "flight": 1162, "soldiers": 1163, "contrast": 1164, "fourth": 1165, "alternative": 1166, "launched": 1167, "occur": 1168, "management": 1169, "models": 1170, "mexico": 1171, "decision": 1172, "big": 1173, "cup": 1174, "woman": 1175, "plant": 1176, "consists": 1177, "prominent": 1178, "remaining": 1179, "definition": 1180, "presented": 1181, "exchange": 1182, "simply": 1183, "image": 1184, "asked": 1185, "applications": 1186, "claim": 1187, "currently": 1188, "clear": 1189, "wars": 1190, "paper": 1191, "workers": 1192, "believe": 1193, "australian": 1194, "risk": 1195, "continue": 
1196, "1990": 1197, "egypt": 1198, "tour": 1199, "environment": 1200, "engineering": 1201, "engine": 1202, "digital": 1203, "channel": 1204, "techniques": 1205, "upper": 1206, "1996": 1207, "entered": 1208, "author": 1209, "poland": 1210, "urban": 1211, "muslim": 1212, "sports": 1213, "wanted": 1214, "values": 1215, "speech": 1216, "directed": 1217, "occurred": 1218, "sexual": 1219, "defeated": 1220, "meeting": 1221, "issued": 1222, "electric": 1223, "1992": 1224, "jesus": 1225, "allowing": 1226, "islamic": 1227, "allows": 1228, "contain": 1229, "mary": 1230, "freedom": 1231, "23": 1232, "1991": 1233, "expressed": 1234, "attacks": 1235, "object": 1236, "course": 1237, "defense": 1238, "acid": 1239, "communities": 1240, "library": 1241, "figure": 1242, "positive": 1243, "expected": 1244, "quality": 1245, "beyond": 1246, "scale": 1247, "alexander": 1248, "poor": 1249, "magazine": 1250, "race": 1251, "governor": 1252, "unlike": 1253, "richard": 1254, "observed": 1255, "here": 1256, "subsequently": 1257, "minor": 1258, "month": 1259, "growing": 1260, "historian": 1261, "edition": 1262, "turned": 1263, "treaties": 1264, "regarded": 1265, "things": 1266, "sun": 1267, "19": 1268, "organizations": 1269, "versions": 1270, "charge": 1271, "fully": 1272, "families": 1273, "spent": 1274, "structures": 1275, "focus": 1276, "moon": 1277, "medieval": 1278, "conference": 1279, "governments": 1280, "churches": 1281, "fields": 1282, "convention": 1283, "ocean": 1284, "lord": 1285, "1995": 1286, "dynasty": 1287, "mark": 1288, "hold": 1289, "effective": 1290, "d": 1291, "20thcentury": 1292, "institutions": 1293, "distance": 1294, "1994": 1295, "reign": 1296, "orthodox": 1297, "win": 1298, "subsequent": 1299, "recognized": 1300, "helped": 1301, "victory": 1302, "inspired": 1303, "ethnic": 1304, "distinct": 1305, "told": 1306, "formation": 1307, "share": 1308, "ways": 1309, "27": 1310, "n": 1311, "28": 1312, "ship": 1313, "standards": 1314, "impact": 1315, "formal": 1316, "expansion": 1317, "labour": 1318, "critics": 1319, "26": 1320, "direction": 1321, "los": 1322, "attempted": 1323, "prevent": 1324, "f": 1325, "figures": 1326, "notes": 1327, "bands": 1328, "address": 1329, "protection": 1330, "press": 1331, "appearance": 1332, "marked": 1333, "weapons": 1334, "officially": 1335, "instance": 1336, "serve": 1337, "resources": 1338, "content": 1339, "leaving": 1340, "gods": 1341, "friend": 1342, "countrys": 1343, "golden": 1344, "develop": 1345, "negative": 1346, "nation": 1347, "j": 1348, "refused": 1349, "valley": 1350, "showed": 1351, "equal": 1352, "motion": 1353, "factors": 1354, "vote": 1355, "decades": 1356, "stone": 1357, "refers": 1358, "acts": 1359, "heart": 1360, "prince": 1361, "citizens": 1362, "reaction": 1363, "call": 1364, "arrived": 1365, "removed": 1366, "literary": 1367, "grew": 1368, "bill": 1369, "s": 1370, "faith": 1371, "unique": 1372, "sector": 1373, "car": 1374, "income": 1375, "square": 1376, "saint": 1377, "winter": 1378, "1993": 1379, "gained": 1380, "animal": 1381, "writer": 1382, "table": 1383, "double": 1384, "friends": 1385, "invasion": 1386, "distribution": 1387, "communist": 1388, "executive": 1389, "sought": 1390, "giving": 1391, "mean": 1392, "ordered": 1393, "territories": 1394, "overall": 1395, "staff": 1396, "completely": 1397, "increasingly": 1398, "nine": 1399, "justice": 1400, "expanded": 1401, "christianity": 1402, "historians": 1403, "powerful": 1404, "awarded": 1405, "1989": 1406, "specifically": 1407, "foundation": 1408, "politics": 1409, "g": 1410, "americans": 
1411, "keep": 1412, "containing": 1413, "hit": 1414, "peter": 1415, "p": 1416, "supreme": 1417, "studio": 1418, "immediately": 1419, "sites": 1420, "advanced": 1421, "inside": 1422, "takes": 1423, "competition": 1424, "notably": 1425, "railway": 1426, "actions": 1427, "actually": 1428, "normal": 1429, "cross": 1430, "theatre": 1431, "h": 1432, "secretary": 1433, "michael": 1434, "pacific": 1435, "r": 1436, "deal": 1437, "rates": 1438, "attention": 1439, "question": 1440, "apollo": 1441, "users": 1442, "significantly": 1443, "understanding": 1444, "student": 1445, "running": 1446, "spoken": 1447, "principle": 1448, "occurs": 1449, "weeks": 1450, "saying": 1451, "application": 1452, "write": 1453, "fish": 1454, "mentioned": 1455, "domestic": 1456, "pope": 1457, "leadership": 1458, "rejected": 1459, "raised": 1460, "cold": 1461, "possibly": 1462, "resistance": 1463, "creating": 1464, "extensive": 1465, "equipment": 1466, "whereas": 1467, "theories": 1468, "face": 1469, "hydrogen": 1470, "liberal": 1471, "worldwide": 1472, "oldest": 1473, "relative": 1474, "awards": 1475, "recently": 1476, "depending": 1477, "formula": 1478, "bay": 1479, "joseph": 1480, "identity": 1481, "planned": 1482, "cut": 1483, "brown": 1484, "tax": 1485, "determined": 1486, "plays": 1487, "branch": 1488, "describes": 1489, "authors": 1490, "von": 1491, "windows": 1492, "generation": 1493, "sets": 1494, "crisis": 1495, "mathematics": 1496, "chicago": 1497, "medicine": 1498, "moving": 1499, "hard": 1500, "situation": 1501, "differences": 1502, "cycle": 1503, "processes": 1504, "queen": 1505, "goal": 1506, "belief": 1507, "arab": 1508, "travel": 1509, "volume": 1510, "studied": 1511, "perhaps": 1512, "ultimately": 1513, "tried": 1514, "follows": 1515, "reduce": 1516, "require": 1517, "plans": 1518, "scotland": 1519, "policies": 1520, "kept": 1521, "difference": 1522, "importance": 1523, "stations": 1524, "scientists": 1525, "destroyed": 1526, "devices": 1527, "cover": 1528, "phase": 1529, "texts": 1530, "greece": 1531, "heat": 1532, "context": 1533, "census": 1534, "closed": 1535, "labor": 1536, "granted": 1537, "purpose": 1538, "shared": 1539, "mountains": 1540, "connected": 1541, "indigenous": 1542, "aid": 1543, "equivalent": 1544, "programming": 1545, "arms": 1546, "fell": 1547, "temple": 1548, "intelligence": 1549, "dance": 1550, "bce": 1551, "martin": 1552, "existing": 1553, "meant": 1554, "settlement": 1555, "gives": 1556, "something": 1557, "conservative": 1558, "christ": 1559, "say": 1560, "shot": 1561, "controlled": 1562, "avoid": 1563, "ruled": 1564, "mind": 1565, "architecture": 1566, "regarding": 1567, "deep": 1568, "instruments": 1569, "attempts": 1570, "causes": 1571, "represent": 1572, "electronic": 1573, "communication": 1574, "reach": 1575, "presidential": 1576, "review": 1577, "core": 1578, "etc": 1579, "tv": 1580, "projects": 1581, "proved": 1582, "behavior": 1583, "prize": 1584, "officers": 1585, "price": 1586, "comes": 1587, "actors": 1588, "care": 1589, "says": 1590, "closely": 1591, "1986": 1592, "achieved": 1593, "week": 1594, "flow": 1595, "shortly": 1596, "describe": 1597, "learning": 1598, "universe": 1599, "solution": 1600, "bodies": 1601, "bridge": 1602, "widespread": 1603, "1984": 1604, "conducted": 1605, "views": 1606, "universal": 1607, "toward": 1608, "parents": 1609, "1945": 1610, "reform": 1611, "felt": 1612, "opening": 1613, "kind": 1614, "1980": 1615, "reasons": 1616, "influential": 1617, "environmental": 1618, "fighting": 1619, "christians": 1620, "going": 1621, "captured": 1622, 
"supply": 1623, "fuel": 1624, "suggests": 1625, "1979": 1626, "31": 1627, "daily": 1628, "29": 1629, "winning": 1630, "1975": 1631, "academic": 1632, "portuguese": 1633, "1985": 1634, "match": 1635, "200": 1636, "crew": 1637, "offer": 1638, "reports": 1639, "nor": 1640, "gdp": 1641, "angeles": 1642, "principles": 1643, "developing": 1644, "capacity": 1645, "providing": 1646, "visited": 1647, "sciences": 1648, "authorities": 1649, "historically": 1650, "constant": 1651, "serious": 1652, "computers": 1653, "episode": 1654, "unknown": 1655, "pass": 1656, "combination": 1657, "van": 1658, "1950s": 1659, "mountain": 1660, "heavily": 1661, "championship": 1662, "weight": 1663, "articles": 1664, "traditionally": 1665, "mathematical": 1666, "pay": 1667, "alongside": 1668, "failure": 1669, "contact": 1670, "smith": 1671, "thousands": 1672, "towns": 1673, "round": 1674, "agricultural": 1675, "leave": 1676, "brothers": 1677, "scottish": 1678, "naval": 1679, "defeat": 1680, "physics": 1681, "1970": 1682, "listed": 1683, "effort": 1684, "discovery": 1685, "know": 1686, "citys": 1687, "technical": 1688, "scene": 1689, "colonial": 1690, "solar": 1691, "eu": 1692, "classes": 1693, "dark": 1694, "introduction": 1695, "suffered": 1696, "secondary": 1697, "fifth": 1698, "births": 1699, "requires": 1700, "alliance": 1701, "similarly": 1702, "finished": 1703, "external": 1704, "practices": 1705, "novels": 1706, "particles": 1707, "organized": 1708, "shape": 1709, "room": 1710, "attended": 1711, "hebrew": 1712, "genetic": 1713, "ages": 1714, "edward": 1715, "residents": 1716, "le": 1717, "route": 1718, "t": 1719, "alone": 1720, "build": 1721, "paid": 1722, "statement": 1723, "artist": 1724, "affected": 1725, "secret": 1726, "1983": 1727, "tree": 1728, "online": 1729, "60": 1730, "owned": 1731, "mixed": 1732, "1988": 1733, "courts": 1734, "ranked": 1735, "1968": 1736, "emerged": 1737, "receive": 1738, "positions": 1739, "arabic": 1740, "logic": 1741, "oxygen": 1742, "mobile": 1743, "professor": 1744, "persons": 1745, "contained": 1746, "maintain": 1747, "components": 1748, "acquired": 1749, "maintained": 1750, "host": 1751, "defence": 1752, "moral": 1753, "traditions": 1754, "guitar": 1755, "compounds": 1756, "consider": 1757, "officials": 1758, "becomes": 1759, "minutes": 1760, "target": 1761, "combat": 1762, "village": 1763, "entirely": 1764, "maximum": 1765, "lands": 1766, "fight": 1767, "rivers": 1768, "rare": 1769, "damage": 1770, "agriculture": 1771, "popularity": 1772, "contributed": 1773, "spirit": 1774, "goods": 1775, "roughly": 1776, "symbol": 1777, "voice": 1778, "choice": 1779, "1987": 1780, "aspects": 1781, "typical": 1782, "meet": 1783, "sequence": 1784, "bring": 1785, "carry": 1786, "dedicated": 1787, "easily": 1788, "perform": 1789, "violence": 1790, "constructed": 1791, "publication": 1792, "1969": 1793, "1982": 1794, "ottoman": 1795, "houses": 1796, "jerusalem": 1797, "atlantic": 1798, "christmas": 1799, "evolution": 1800, "banks": 1801, "cast": 1802, "display": 1803, "operated": 1804, "wall": 1805, "18th": 1806, "broadcast": 1807, "cancer": 1808, "slightly": 1809, "1000": 1810, "investment": 1811, "condition": 1812, "senate": 1813, "am": 1814, "trial": 1815, "zone": 1816, "wave": 1817, "republican": 1818, "1972": 1819, "settled": 1820, "k": 1821, "1971": 1822, "descent": 1823, "concluded": 1824, "bible": 1825, "sales": 1826, "comedy": 1827, "permanent": 1828, "hot": 1829, "employed": 1830, "younger": 1831, "hospital": 1832, "atoms": 1833, "orders": 1834, "track": 1835, "frequency": 1836, 
"confirmed": 1837, "clubs": 1838, "contract": 1839, "ball": 1840, "persian": 1841, "magnetic": 1842, "output": 1843, "device": 1844, "technique": 1845, "causing": 1846, "stable": 1847, "apple": 1848, "forest": 1849, "1974": 1850, "factor": 1851, "bbc": 1852, "electron": 1853, "note": 1854, "signal": 1855, "netherlands": 1856, "asian": 1857, "runs": 1858, "drug": 1859, "measure": 1860, "surrounding": 1861, "sons": 1862, "actual": 1863, "w": 1864, "purposes": 1865, "occupied": 1866, "audience": 1867, "marine": 1868, "otherwise": 1869, "duke": 1870, "spring": 1871, "demand": 1872, "reading": 1873, "post": 1874, "1981": 1875, "sister": 1876, "obtained": 1877, "revealed": 1878, "translation": 1879, "unable": 1880, "improved": 1881, "ibn": 1882, "philosophers": 1883, "rail": 1884, "crime": 1885, "measures": 1886, "recording": 1887, "fleet": 1888, "molecules": 1889, "joint": 1890, "columbia": 1891, "sign": 1892, "affairs": 1893, "1967": 1894, "follow": 1895, "1976": 1896, "wood": 1897, "brain": 1898, "additionally": 1899, "producing": 1900, "decline": 1901, "1973": 1902, "approved": 1903, "jersey": 1904, "safety": 1905, "fundamental": 1906, "movements": 1907, "nazi": 1908, "split": 1909, "crown": 1910, "populations": 1911, "mental": 1912, "coming": 1913, "silver": 1914, "greatly": 1915, "sides": 1916, "lives": 1917, "expression": 1918, "temperatures": 1919, "vehicles": 1920, "radiation": 1921, "strength": 1922, "setting": 1923, "supporting": 1924, "movie": 1925, "debate": 1926, "al": 1927, "covered": 1928, "accounts": 1929, "seats": 1930, "managed": 1931, "painting": 1932, "protect": 1933, "transfer": 1934, "steel": 1935, "succeeded": 1936, "concepts": 1937, "rapid": 1938, "1978": 1939, "writings": 1940, "calendar": 1941, "womens": 1942, "ran": 1943, "composition": 1944, "images": 1945, "connection": 1946, "el": 1947, "ago": 1948, "visit": 1949, "finland": 1950, "hands": 1951, "forward": 1952, "search": 1953, "hill": 1954, "personnel": 1955, "ministry": 1956, "instrument": 1957, "titled": 1958, "quantum": 1959, "advantage": 1960, "dominant": 1961, "tribes": 1962, "establishment": 1963, "establish": 1964, "0": 1965, "teaching": 1966, "your": 1967, "extent": 1968, "broke": 1969, "networks": 1970, "useful": 1971, "peninsula": 1972, "attributed": 1973, "file": 1974, "argues": 1975, "islam": 1976, "why": 1977, "roles": 1978, "constitutional": 1979, "pieces": 1980, "producer": 1981, "experienced": 1982, "cars": 1983, "musicians": 1984, "script": 1985, "chosen": 1986, "electrons": 1987, "drive": 1988, "southeast": 1989, "quite": 1990, "master": 1991, "dates": 1992, "afghanistan": 1993, "principal": 1994, "severe": 1995, "determine": 1996, "sir": 1997, "rose": 1998, "focused": 1999, "UNKNOWN": 2000} \ No newline at end of file diff --git a/nlp_class2/logistic.py b/nlp_class2/logistic.py index 352c2f57..eda2357e 100644 --- a/nlp_class2/logistic.py +++ b/nlp_class2/logistic.py @@ -47,7 +47,7 @@ # train a logistic model - W = np.random.randn(V, V) / np.sqrt(V) + W = np.random.randn(V, V)/np.sqrt(V) losses = [] epochs = 1 @@ -56,7 +56,7 @@ def softmax(a): a = a - a.max() exp_a = np.exp(a) - return exp_a / exp_a.sum(axis=1, keepdims=True) + return exp_a/exp_a.sum(axis=1, keepdims=True) # what is the loss if we set W = log(bigram_probs)? 
   W_bigram = np.log(bigram_probs)
@@ -85,19 +85,19 @@ def softmax(a):
       W = W - lr * inputs.T.dot(predictions - targets)
 
       # keep track of the loss
-      loss = -np.sum(targets * np.log(predictions)) / (n - 1)
+      loss = -np.sum(targets*np.log(predictions))/(n - 1)
       losses.append(loss)
 
       # keep track of the bigram loss
       # only do it for the first epoch to avoid redundancy
       if epoch == 0:
         bigram_predictions = softmax(inputs.dot(W_bigram))
-        bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1)
+        bigram_loss = -np.sum(targets*np.log(bigram_predictions))/(n - 1)
         bigram_losses.append(bigram_loss)
 
-      if j % 10 == 0:
-        print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss)
+      if j%10 == 0:
+        print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}")
       j += 1
 
   print("Elapsed time training:", datetime.now() - t0)
@@ -114,8 +114,8 @@ def smoothed_loss(x, decay=0.99):
     y = np.zeros(len(x))
     last = 0
     for t in range(len(x)):
-      z = decay * last + (1 - decay) * x[t]
-      y[t] = z / (1 - decay ** (t + 1))
+      z = decay*last + (1 - decay)*x[t]
+      y[t] = z/(1 - decay**(t + 1))
       last = z
     return y
diff --git a/nlp_class2/ner_tf.py b/nlp_class2/ner_tf.py
index 7f8fa2c1..6c2e7a53 100644
--- a/nlp_class2/ner_tf.py
+++ b/nlp_class2/ner_tf.py
@@ -13,16 +13,15 @@
 import os
 import sys
 sys.path.append(os.path.abspath('..'))
-from pos_baseline import get_data
+#from pos_baseline import get_data
 from sklearn.utils import shuffle
 from util import init_weight
 from datetime import datetime
-from sklearn.metrics import f1_score
-
-from tensorflow.contrib.rnn import static_rnn as get_rnn_output
-from tensorflow.contrib.rnn import BasicRNNCell, GRUCell
-
+#from sklearn.metrics import f1_score
+from tensorflow.keras.layers import GRUCell, RNN #type: ignore
+if tf.__version__.startswith('2'):
+  tf.compat.v1.disable_eager_execution()
 
 def get_data(split_sequences=False):
   word2idx = {}
@@ -33,7 +32,7 @@ def get_data(split_sequences=False):
   Ytrain = []
   currentX = []
   currentY = []
-  for line in open('ner.txt'):
+  for line in open('ner.txt', encoding='utf-8'):
     line = line.rstrip()
     if line:
       r = line.split()
@@ -95,16 +94,16 @@ def flatten(l):
 
 # pad sequences
 Xtrain = tf.keras.preprocessing.sequence.pad_sequences(Xtrain, maxlen=sequence_length)
 Ytrain = tf.keras.preprocessing.sequence.pad_sequences(Ytrain, maxlen=sequence_length)
-Xtest = tf.keras.preprocessing.sequence.pad_sequences(Xtest, maxlen=sequence_length)
-Ytest = tf.keras.preprocessing.sequence.pad_sequences(Ytest, maxlen=sequence_length)
+Xtest = tf.keras.preprocessing.sequence.pad_sequences(Xtest, maxlen=sequence_length)
+Ytest = tf.keras.preprocessing.sequence.pad_sequences(Ytest, maxlen=sequence_length)
 print("Xtrain.shape:", Xtrain.shape)
 print("Ytrain.shape:", Ytrain.shape)
 
 
 # inputs
-inputs = tf.placeholder(tf.int32, shape=(None, sequence_length))
-targets = tf.placeholder(tf.int32, shape=(None, sequence_length))
+inputs = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length))
+targets = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length))
 num_samples = tf.shape(inputs)[0] # useful for later
 
 # embedding
@@ -119,19 +118,18 @@ def flatten(l):
 tfWo = tf.Variable(Wo)
 tfbo = tf.Variable(bo)
 
-# make the rnn unit
-rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)
-
+rnn_unit = RNN(GRUCell(
+  units=hidden_layer_size, activation=tf.nn.relu), return_sequences=True, return_state=True)
 
 # get the output
 x = tf.nn.embedding_lookup(tfWe, inputs)
 
 # converts x from a tensor of shape N x T x D
 # into a list of length T, where each element is a tensor of shape N x D
-x = tf.unstack(x, sequence_length, 1)
+#x = tf.unstack(x, sequence_length, 1)
 
 # get the rnn output
-outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32)
+outputs, states = rnn_unit(x)
 
 
 # outputs are now of size (T, N, M)
@@ -151,14 +149,14 @@ def flatten(l):
     labels=labels_flat
   )
 )
-train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op)
+train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(cost_op)
 
 
 
 # init stuff
-sess = tf.InteractiveSession()
-init = tf.global_variables_initializer()
+sess = tf.compat.v1.InteractiveSession()
+init = tf.compat.v1.global_variables_initializer()
 sess.run(init)
diff --git a/nlp_class2/neural_network.py b/nlp_class2/neural_network.py
index d44c6f52..4f5148a3 100644
--- a/nlp_class2/neural_network.py
+++ b/nlp_class2/neural_network.py
@@ -48,8 +48,8 @@
   # train a shallow neural network model
   D = 100
-  W1 = np.random.randn(V, D) / np.sqrt(V)
-  W2 = np.random.randn(D, V) / np.sqrt(D)
+  W1 = np.random.randn(V, D)/np.sqrt(V)
+  W2 = np.random.randn(D, V)/np.sqrt(D)
 
   losses = []
   epochs = 1
@@ -58,7 +58,7 @@ def softmax(a):
     a = a - a.max()
     exp_a = np.exp(a)
-    return exp_a / exp_a.sum(axis=1, keepdims=True)
+    return exp_a/exp_a.sum(axis=1, keepdims=True)
 
   # what is the loss if we set W = log(bigram_probs)?
   W_bigram = np.log(bigram_probs)
@@ -84,24 +84,24 @@ def softmax(a):
       predictions = softmax(hidden.dot(W2))
 
       # do a gradient descent step
-      W2 = W2 - lr * hidden.T.dot(predictions - targets)
-      dhidden = (predictions - targets).dot(W2.T) * (1 - hidden * hidden)
-      W1 = W1 - lr * inputs.T.dot(dhidden)
+      W2 = W2 - lr*hidden.T.dot(predictions - targets)
+      dhidden = (predictions - targets).dot(W2.T)*(1 - hidden*hidden)
+      W1 = W1 - lr*inputs.T.dot(dhidden)
 
       # keep track of the loss
-      loss = -np.sum(targets * np.log(predictions)) / (n - 1)
+      loss = -np.sum(targets*np.log(predictions))/(n - 1)
       losses.append(loss)
 
       # keep track of the bigram loss
      # only do it for the first epoch to avoid redundancy
       if epoch == 0:
         bigram_predictions = softmax(inputs.dot(W_bigram))
-        bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1)
+        bigram_loss = -np.sum(targets*np.log(bigram_predictions))/(n - 1)
         bigram_losses.append(bigram_loss)
 
       if j % 10 == 0:
-        print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss)
+        print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}")
       j += 1
 
   print("Elapsed time training:", datetime.now() - t0)
@@ -118,8 +118,8 @@ def smoothed_loss(x, decay=0.99):
     y = np.zeros(len(x))
     last = 0
     for t in range(len(x)):
-      z = decay * last + (1 - decay) * x[t]
-      y[t] = z / (1 - decay ** (t + 1))
+      z = decay*last + (1 - decay)*x[t]
+      y[t] = z/(1 - decay**(t + 1))
       last = z
     return y
diff --git a/nlp_class2/neural_network2.py b/nlp_class2/neural_network2.py
index 159dc571..c9df7f11 100644
--- a/nlp_class2/neural_network2.py
+++ b/nlp_class2/neural_network2.py
@@ -48,8 +48,8 @@
   # train a shallow neural network model
   D = 100
-  W1 = np.random.randn(V, D) / np.sqrt(V)
-  W2 = np.random.randn(D, V) / np.sqrt(D)
+  W1 = np.random.randn(V, D)/np.sqrt(V)
+  W2 = np.random.randn(D, V)/np.sqrt(D)
 
   losses = []
   epochs = 1
@@ -58,7 +58,7 @@ def softmax(a):
     a = a - a.max()
     exp_a = np.exp(a)
-    return exp_a / exp_a.sum(axis=1, keepdims=True)
+    return exp_a/exp_a.sum(axis=1, keepdims=True)
 
   # what is the loss if we set W = log(bigram_probs)?
   W_bigram = np.log(bigram_probs)
@@ -82,7 +82,7 @@ def softmax(a):
       predictions = softmax(hidden.dot(W2))
 
       # keep track of the loss
-      loss = -np.sum(np.log(predictions[np.arange(n - 1), targets])) / (n - 1)
+      loss = -np.sum(np.log(predictions[np.arange(n - 1), targets]))/(n - 1)
       losses.append(loss)
 
       # do a gradient descent step
@@ -90,14 +90,14 @@ def softmax(a):
       # we don't want to make a copy because it would be slow
       doutput = predictions # N x V
       doutput[np.arange(n - 1), targets] -= 1
-      W2 = W2 - lr * hidden.T.dot(doutput) # (D x N) (N x V)
-      dhidden = doutput.dot(W2.T) * (1 - hidden * hidden) # (N x V) (V x D) * (N x D)
+      W2 = W2 - lr*hidden.T.dot(doutput) # (D x N) (N x V)
+      dhidden = doutput.dot(W2.T)*(1 - hidden*hidden) # (N x V) (V x D) * (N x D)
 
       # # for reference:
       # # original: W1 = W1 - lr * inputs.T.dot(dhidden) # VxN NxD --> VxD
 
       # fastest way
       W1_copy = W1.copy()
-      np.subtract.at(W1, inputs, lr * dhidden)
+      np.subtract.at(W1, inputs, lr*dhidden)
 
       # vs this
       # W1_test = W1_copy.copy()
@@ -118,12 +118,12 @@ def softmax(a):
      # only do it for the first epoch to avoid redundancy
       if epoch == 0:
         bigram_predictions = softmax(W_bigram[inputs])
-        bigram_loss = -np.sum(np.log(bigram_predictions[np.arange(n - 1), targets])) / (n - 1)
+        bigram_loss = -np.sum(np.log(bigram_predictions[np.arange(n - 1), targets]))/(n - 1)
         bigram_losses.append(bigram_loss)
 
 
       if j % 100 == 0:
-        print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss)
+        print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}")
       j += 1
@@ -141,8 +141,8 @@ def smoothed_loss(x, decay=0.99):
     y = np.zeros(len(x))
     last = 0
     for t in range(len(x)):
-      z = decay * last + (1 - decay) * x[t]
-      y[t] = z / (1 - decay ** (t + 1))
+      z = decay*last + (1 - decay)*x[t]
+      y[t] = z / (1 - decay**(t + 1))
       last = z
     return y
diff --git a/nlp_class2/pmi.py b/nlp_class2/pmi.py
index b321e91f..941517cb 100644
--- a/nlp_class2/pmi.py
+++ b/nlp_class2/pmi.py
@@ -49,7 +49,7 @@ def remove_punctuation_3(s):
 num_lines = 0
 num_tokens = 0
 for f in files:
-  for line in open(f):
+  for line in open(f, encoding='utf-8'):
     # don't count headers, structured data, lists, etc...
     if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'):
       num_lines += 1
@@ -112,7 +112,7 @@ def remove_punctuation_3(s):
 k = 0
 # for line in open('../large_files/text8'):
 for f in files:
-  for line in open(f):
+  for line in open(f, encoding='utf-8'):
     # don't count headers, structured data, lists, etc...
     if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'):
       line_as_idx = []
@@ -153,7 +153,7 @@ def remove_punctuation_3(s):
 # PMI(w, c) = #(w, c) / #(w) / p(c)
 # pmi = wc_counts / wc_counts.sum(axis=1) / c_probs # works only if numpy arrays
-pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
+pmi = wc_counts.multiply(1.0/wc_counts.sum(axis=1)/c_probs).tocsr()
 # this operation changes it to a coo_matrix
 # which doesn't have functions we need, e.g log1p()
 # so convert it back to a csr
@@ -172,9 +172,9 @@ def remove_punctuation_3(s):
 
 # initialize weights
-W = np.random.randn(V, D) / np.sqrt(V + D)
+W = np.random.randn(V, D)/np.sqrt(V + D)
 b = np.zeros(V)
-U = np.random.randn(V, D) / np.sqrt(V + D)
+U = np.random.randn(V, D)/np.sqrt(V + D)
 c = np.zeros(V)
 
 mu = logX.mean()
@@ -220,7 +220,7 @@ def remove_punctuation_3(s):
   W = np.linalg.solve(matrix, vector).T
 
   # vectorized update b
-  b = (logX - W.dot(U.T) - c.reshape(1, V) - mu).sum(axis=1) / V
+  b = (logX - W.dot(U.T) - c.reshape(1, V) - mu).sum(axis=1)/V
 
   # vectorized update U
   matrix = reg*np.eye(D) + W.T.dot(W)
@@ -228,7 +228,7 @@ def remove_punctuation_3(s):
   U = np.linalg.solve(matrix, vector).T
 
   # vectorized update c
-  c = (logX - W.dot(U.T) - b.reshape(V, 1) - mu).sum(axis=0) / V
+  c = (logX - W.dot(U.T) - b.reshape(V, 1) - mu).sum(axis=0)/V
 
 print("train duration:", datetime.now() - t0)
@@ -259,6 +259,9 @@ def remove_punctuation_3(s):
 # set word embedding matrix
 # W = (W + U) / 2
 
+vec = np.asarray(vec)
+W = np.asarray(W)
+
 distances = pairwise_distances(vec.reshape(1, D), W, metric='cosine').reshape(V)
 idx = distances.argsort()[:10]
@@ -266,7 +269,9 @@ def remove_punctuation_3(s):
 for i in idx:
   print(top_words[i], distances[i])
 
-print("dist to queen:", cos_dist(W[word2idx['queen']], vec))
+queen_vector = np.squeeze(W[word2idx['queen']])
+vec = np.squeeze(vec)
+print("dist to queen:", cos_dist(queen_vector, vec))
diff --git a/nlp_class2/pmi_counts_2000.npz b/nlp_class2/pmi_counts_2000.npz
new file mode 100644
index 00000000..d626d8d6
Binary files /dev/null and b/nlp_class2/pmi_counts_2000.npz differ
diff --git a/nlp_class2/pos_hmm.py b/nlp_class2/pos_hmm.py
index e3065cd2..0e3345ab 100644
--- a/nlp_class2/pos_hmm.py
+++ b/nlp_class2/pos_hmm.py
@@ -15,7 +15,7 @@
 sys.path.append(os.path.abspath('..'))
 from hmm_class.hmmd_scaled import HMM
-from pos_baseline import get_data
+#from pos_baseline import get_data
 from sklearn.utils import shuffle
 from datetime import datetime
 from sklearn.metrics import f1_score
@@ -28,7 +28,7 @@ def accuracy(T, Y):
   for t, y in zip(T, Y):
     n_correct += np.sum(t == y)
     n_total += len(y)
-  return float(n_correct) / n_total
+  return float(n_correct)/n_total
 
 
 def total_f1_score(T, Y):
@@ -41,6 +41,78 @@ def total_f1_score(T, Y):
 # def flatten(l):
 #   return [item for sublist in l for item in sublist]
 
+def get_data(split_sequences=False):
+  if not os.path.exists('chunking'):
+    print("Please create a folder in your local directory called 'chunking'")
+    print("train.txt and test.txt should be stored in there.")
+    print("Please check the comments to get the download link.")
+    exit()
+  elif not os.path.exists('chunking/train.txt'):
+    print("train.txt is not in chunking/train.txt")
+    print("Please check the comments to get the download link.")
+    exit()
+  elif not os.path.exists('chunking/test.txt'):
+    print("test.txt is not in chunking/test.txt")
+    print("Please check the comments to get the download link.")
+    exit()
+
+  word2idx = {}
+  tag2idx = {}
+  word_idx = 0
+  tag_idx = 0
+  Xtrain = []
+  Ytrain = []
+  currentX = []
+  currentY = []
+  for line in open('chunking/train.txt', encoding='utf-8'):
+    line = line.rstrip()
+    if line:
+      r = line.split()
+      word, tag, _ = r
+      if word not in word2idx:
+        word2idx[word] = word_idx
+        word_idx += 1
+      currentX.append(word2idx[word])
+
+      if tag not in tag2idx:
+        tag2idx[tag] = tag_idx
+        tag_idx += 1
+      currentY.append(tag2idx[tag])
+    elif split_sequences:
+      Xtrain.append(currentX)
+      Ytrain.append(currentY)
+      currentX = []
+      currentY = []
+
+  if not split_sequences:
+    Xtrain = currentX
+    Ytrain = currentY
+
+  # load and score test data
+  Xtest = []
+  Ytest = []
+  currentX = []
+  currentY = []
+  for line in open('chunking/test.txt', encoding='utf-8'):
+    line = line.rstrip()
+    if line:
+      r = line.split()
+      word, tag, _ = r
+      if word in word2idx:
+        currentX.append(word2idx[word])
+      else:
+        currentX.append(word_idx) # use this as unknown
+      currentY.append(tag2idx[tag])
+    elif split_sequences:
+      Xtest.append(currentX)
+      Ytest.append(currentY)
+      currentX = []
+      currentY = []
+  if not split_sequences:
+    Xtest = currentX
+    Ytest = currentY
+
+  return Xtrain, Ytrain, Xtest, Ytest, word2idx
 
 def main(smoothing=1e-1):
   # X = words, Y = POS tags
diff --git a/nlp_class2/pos_ner_keras.py b/nlp_class2/pos_ner_keras.py
index 7a1335e1..9c64609e 100644
--- a/nlp_class2/pos_ner_keras.py
+++ b/nlp_class2/pos_ner_keras.py
@@ -9,20 +9,21 @@
 import numpy as np
 import matplotlib.pyplot as plt
+import tensorflow as tf
 import os
 import sys
 sys.path.append(os.path.abspath('..'))
-from pos_baseline import get_data
+#from pos_baseline import get_data
 from sklearn.utils import shuffle
-from util import init_weight
+#from util import init_weight
 from datetime import datetime
-from sklearn.metrics import f1_score
+#from sklearn.metrics import f1_score
 
-from keras.models import Model
-from keras.layers import Input, Dense, Embedding, LSTM, GRU
-from keras.preprocessing.sequence import pad_sequences
-from keras.preprocessing.text import Tokenizer
-from keras.optimizers import Adam
+from tensorflow.keras.models import Model #type: ignore
+from tensorflow.keras.layers import Input, Dense, Embedding, GRU, LSTM, SimpleRNN #type: ignore
+from tensorflow.keras.preprocessing.sequence import pad_sequences #type: ignore
+from tensorflow.keras.preprocessing.text import Tokenizer #type: ignore
+from tensorflow.keras.optimizers import Adam #type: ignore
 
 
 MAX_VOCAB_SIZE = 20000
@@ -30,73 +31,12 @@
 
 
-def get_data_pos(split_sequences=False):
-  if not os.path.exists('chunking'):
-    print("Please create a folder in your local directory called 'chunking'")
-    print("train.txt and test.txt should be stored in there.")
-    print("Please check the comments to get the download link.")
-    exit()
-  elif not os.path.exists('chunking/train.txt'):
-    print("train.txt is not in chunking/train.txt")
-    print("Please check the comments to get the download link.")
-    exit()
-  elif not os.path.exists('chunking/test.txt'):
-    print("test.txt is not in chunking/test.txt")
-    print("Please check the comments to get the download link.")
-    exit()
-
-  Xtrain = []
-  Ytrain = []
-  currentX = []
-  currentY = []
-  for line in open('chunking/train.txt'):
-    line = line.rstrip()
-    if line:
-      r = line.split()
-      word, tag, _ = r
-      currentX.append(word)
-
-      currentY.append(tag)
-    elif split_sequences:
-      Xtrain.append(currentX)
-      Ytrain.append(currentY)
-      currentX = []
-      currentY = []
-
-  if not split_sequences:
-    Xtrain = currentX
-    Ytrain = currentY
-
-  # load and score test data
-  Xtest = []
-  Ytest = []
-  currentX = []
-  currentY = []
-  for line 
in open('chunking/test.txt'): - line = line.rstrip() - if line: - r = line.split() - word, tag, _ = r - currentX.append(word) - currentY.append(tag) - elif split_sequences: - Xtest.append(currentX) - Ytest.append(currentY) - currentX = [] - currentY = [] - if not split_sequences: - Xtest = currentX - Ytest = currentY - - return Xtrain, Ytrain, Xtest, Ytest - - def get_data_ner(split_sequences=False): Xtrain = [] Ytrain = [] currentX = [] currentY = [] - for line in open('ner.txt'): + for line in open('ner.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -138,7 +78,7 @@ def get_data_ner(split_sequences=False): # get word -> integer mapping word2idx = tokenizer.word_index -print('Found %s unique tokens.' % len(word2idx)) +print(f'Found {len(word2idx)} unique tokens.') vocab_size = min(MAX_VOCAB_SIZE, len(word2idx) + 1) @@ -150,7 +90,7 @@ def get_data_ner(split_sequences=False): # get tag -> integer mapping tag2idx = tokenizer2.word_index -print('Found %s unique tags.' % len(tag2idx)) +print(f'Found {len(tag2idx)} unique tags.') num_tags = min(MAX_TAGS, len(tag2idx) + 1) @@ -189,34 +129,32 @@ def get_data_ner(split_sequences=False): # build the model input_ = Input(shape=(sequence_length,)) x = Embedding(vocab_size, embedding_dim)(input_) -x = GRU(hidden_layer_size, return_sequences=True)(x) +x = SimpleRNN(hidden_layer_size, return_sequences=True)(x) output = Dense(num_tags, activation='softmax')(x) model = Model(input_, output) model.compile( loss='categorical_crossentropy', - optimizer=Adam(lr=1e-2), + optimizer=Adam(learning_rate=1e-2), metrics=['accuracy'] ) print('Training model...') -r = model.fit( - Xtrain, - Ytrain_onehot, - batch_size=batch_size, - epochs=epochs, - validation_data=(Xtest, Ytest_onehot) -) +r = model.fit(Xtrain, + Ytrain_onehot, + batch_size=batch_size, + epochs=epochs, + validation_data=(Xtest, Ytest_onehot)) -# plot some data +# plot loss plt.plot(r.history['loss'], label='loss') plt.plot(r.history['val_loss'], label='val_loss') plt.legend() plt.show() -# accuracies +# plot accuracy plt.plot(r.history['accuracy'], label='acc') plt.plot(r.history['val_accuracy'], label='val_acc') plt.legend() diff --git a/nlp_class2/pos_tf.py b/nlp_class2/pos_tf.py index 974453b6..c4d1724c 100644 --- a/nlp_class2/pos_tf.py +++ b/nlp_class2/pos_tf.py @@ -13,15 +13,15 @@ import os import sys sys.path.append(os.path.abspath('..')) -from pos_baseline import get_data +#from pos_baseline import get_data from sklearn.utils import shuffle from util import init_weight from datetime import datetime -from sklearn.metrics import f1_score - -from tensorflow.contrib.rnn import static_rnn as get_rnn_output -from tensorflow.contrib.rnn import BasicRNNCell, GRUCell +#from sklearn.metrics import f1_score +from tensorflow.keras.layers import GRUCell, RNN #type: ignore +if tf.__version__.startswith('2'): + tf.compat.v1.disable_eager_execution() def get_data(split_sequences=False): @@ -47,7 +47,7 @@ def get_data(split_sequences=False): Ytrain = [] currentX = [] currentY = [] - for line in open('chunking/train.txt'): + for line in open('chunking/train.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -76,7 +76,7 @@ def get_data(split_sequences=False): Ytest = [] currentX = [] currentY = [] - for line in open('chunking/test.txt'): + for line in open('chunking/test.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -110,7 +110,7 @@ def flatten(l): # training config -epochs = 20 +epochs = 200 learning_rate = 1e-2 mu = 0.99 batch_size 
= 32 @@ -131,8 +131,8 @@ def flatten(l): # inputs -inputs = tf.placeholder(tf.int32, shape=(None, sequence_length)) -targets = tf.placeholder(tf.int32, shape=(None, sequence_length)) +inputs = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length)) +targets = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length)) num_samples = tf.shape(inputs)[0] # useful for later # embedding @@ -148,7 +148,8 @@ def flatten(l): tfbo = tf.Variable(bo) # make the rnn unit -rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) +rnn_unit = RNN(GRUCell( + units=hidden_layer_size, activation=tf.nn.relu), return_sequences=True, return_state=True) # get the output @@ -156,10 +157,10 @@ def flatten(l): # converts x from a tensor of shape N x T x M # into a list of length T, where each element is a tensor of shape N x M -x = tf.unstack(x, sequence_length, 1) +#x = tf.unstack(x, sequence_length, 1) # get the rnn output -outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) +outputs, states = rnn_unit(x) # outputs are now of size (T, N, M) @@ -179,14 +180,14 @@ def flatten(l): labels=labels_flat ) ) -train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op) +train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(cost_op) # init stuff -sess = tf.InteractiveSession() -init = tf.global_variables_initializer() +sess = tf.compat.v1.InteractiveSession() +init = tf.compat.v1.global_variables_initializer() sess.run(init) @@ -222,8 +223,7 @@ def flatten(l): # print stuff out periodically if j % 10 == 0: sys.stdout.write( - "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % - (j, n_batches, float(n_correct)/n_total, cost) + f"j/N: {j}/{n_batches} correct rate so far: {float(n_correct)/n_total}, cost so far: {cost}\r" ) sys.stdout.flush() @@ -236,13 +236,13 @@ def flatten(l): pii = pi[yi > 0] n_test_correct += np.sum(yii == pii) n_test_total += len(yii) - test_acc = float(n_test_correct) / n_test_total + test_acc = float(n_test_correct)/n_test_total print( - "i:", i, "cost:", "%.4f" % cost, - "train acc:", "%.4f" % (float(n_correct)/n_total), - "test acc:", "%.4f" % test_acc, - "time for epoch:", (datetime.now() - t0) + f'''i: {i}, cost: {cost:.4f}, + train acc: {float(n_correct)/n_total:.4f}, + test acc: {test_acc:.4f}, + time for epoch: {(datetime.now() - t0)}''' ) costs.append(cost) diff --git a/nlp_class2/rntn_tensorflow.py b/nlp_class2/rntn_tensorflow.py index 77b563f2..f3022da1 100644 --- a/nlp_class2/rntn_tensorflow.py +++ b/nlp_class2/rntn_tensorflow.py @@ -47,9 +47,9 @@ def __init__(self, V, D, K, activation): We = init_weight(V, D) # quadratic terms - W11 = np.random.randn(D, D, D) / np.sqrt(3*D) - W22 = np.random.randn(D, D, D) / np.sqrt(3*D) - W12 = np.random.randn(D, D, D) / np.sqrt(3*D) + W11 = np.random.randn(D, D, D)/np.sqrt(3*D) + W22 = np.random.randn(D, D, D)/np.sqrt(3*D) + W12 = np.random.randn(D, D, D)/np.sqrt(3*D) # linear terms W1 = init_weight(D, D) diff --git a/nlp_class2/rntn_tensorflow_rnn.py b/nlp_class2/rntn_tensorflow_rnn.py index 816ff4a2..a47d40aa 100644 --- a/nlp_class2/rntn_tensorflow_rnn.py +++ b/nlp_class2/rntn_tensorflow_rnn.py @@ -13,7 +13,7 @@ import tensorflow as tf from sklearn.utils import shuffle -from util import init_weight, get_ptb_data, display_tree +from util import init_weight, get_ptb_data from datetime import datetime from sklearn.metrics import f1_score @@ -191,7 +191,7 @@ def condition(hiddens, n): it += 1 if it % 10 == 0: sys.stdout.write( - "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % 
+ "j/N: %d/%d correct rate so far: %.4f, cost so far: %.4f\r" % (it, N, float(n_correct)/n_total, cost) ) sys.stdout.flush() @@ -212,10 +212,10 @@ def condition(hiddens, n): print( - "i:", i, "cost:", cost, - "train acc:", float(n_correct)/n_total, - "test acc:", float(n_test_correct)/n_test_total, - "time for epoch:", (datetime.now() - t0) + "i:", i, "cost: %.4f" % cost, + "train acc: %.4f" % (float(n_correct)/n_total), + "test acc: %.4f" % (float(n_test_correct)/n_test_total), + "time for epoch:", (datetime.now() - t0) ) costs.append(cost) diff --git a/nlp_class2/tfidf_tsne.py b/nlp_class2/tfidf_tsne.py index 55bd4ce5..329ef46a 100644 --- a/nlp_class2/tfidf_tsne.py +++ b/nlp_class2/tfidf_tsne.py @@ -20,7 +20,7 @@ import sys sys.path.append(os.path.abspath('..')) from rnn_class.util import get_wikipedia_data -from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx +#from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx from util import find_analogies from sklearn.feature_extraction.text import TfidfTransformer diff --git a/nlp_class2/util.py b/nlp_class2/util.py index f2a79888..620192ba 100644 --- a/nlp_class2/util.py +++ b/nlp_class2/util.py @@ -182,7 +182,7 @@ def get_ptb_data(): test = [] # train set first - for line in open('../large_files/trees/train.txt'): + for line in open('../large_files/trees/train.txt', encoding='utf-8'): line = line.rstrip() if line: t = str2tree(line, word2idx) @@ -194,7 +194,7 @@ def get_ptb_data(): # break # test set - for line in open('../large_files/trees/test.txt'): + for line in open('../large_files/trees/test.txt', encoding='utf-8'): line = line.rstrip() if line: t = str2tree(line, word2idx) diff --git a/nlp_class2/visualize_countries.py b/nlp_class2/visualize_countries.py index 9d0a44e8..456ff0e5 100644 --- a/nlp_class2/visualize_countries.py +++ b/nlp_class2/visualize_countries.py @@ -31,7 +31,7 @@ def main(we_file='glove_model_50.npz', w2i_file='glove_word2idx_50.json'): Z = Z[idx] plt.scatter(Z[:,0], Z[:,1]) for i in range(len(words)): - plt.annotate(s=words[i], xy=(Z[i,0], Z[i,1])) + plt.annotate(text=words[i], xy=(Z[i,0], Z[i,1])) plt.show() diff --git a/nlp_class2/word2vec.py b/nlp_class2/word2vec.py index ba92e68c..e8989fb6 100644 --- a/nlp_class2/word2vec.py +++ b/nlp_class2/word2vec.py @@ -49,7 +49,7 @@ def get_wiki(): files = glob('../large_files/enwiki*.txt') all_word_counts = {} for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -68,7 +68,7 @@ def get_wiki(): sents = [] for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -100,7 +100,7 @@ def train_model(savedir): # learning rate decay - learning_rate_delta = (learning_rate - final_learning_rate) / epochs + learning_rate_delta = (learning_rate - final_learning_rate)/epochs # params @@ -122,7 +122,7 @@ def train_model(savedir): # for subsampling each sentence threshold = 1e-5 - p_drop = 1 - np.sqrt(threshold / p_neg) + p_drop = 1 - np.sqrt(threshold/p_neg) # train the model @@ -137,9 +137,7 @@ def train_model(savedir): t0 = datetime.now() for sentence in sentences: # keep only certain words based on p_neg - sentence = [w for w in sentence \ - if np.random.random() < (1 - p_drop[w]) - ] + sentence = [w for w in sentence if np.random.random()<(1 - p_drop[w])]
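# --- Aside (editor's illustration, not part of word2vec.py) ---
# A standalone sketch of the subsampling rule used just above. Assuming p_neg is
# the unigram distribution raised to the 0.75 power and normalized (as in
# get_negative_sampling_distribution further down), each word w is kept with
# probability 1 - p_drop[w] = sqrt(threshold / p_neg[w]), so very frequent words
# are dropped most of the time and rare words are almost always kept.
# The Zipf-like counts below are made up for the demo.
import numpy as np

ranks = np.arange(1, 10001)
counts = 1e6 / ranks                      # hypothetical Zipf-like word counts
p_neg = counts**0.75
p_neg = p_neg / p_neg.sum()

threshold = 1e-5
p_keep = np.minimum(np.sqrt(threshold / p_neg), 1.0)

for r in (1, 10, 100, 1000, 10000):
    print(f"rank {r:>5}: keep probability {p_keep[r-1]:.3f}")
# --- end aside ---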
if len(sentence) < 2: continue @@ -170,14 +168,14 @@ def train_model(savedir): counter += 1 if counter % 100 == 0: - sys.stdout.write("processed %s / %s\r" % (counter, len(sentences))) + sys.stdout.write(f"processed {counter}/{len(sentences)}\r") sys.stdout.flush() # break # print stuff so we don't stare at a blank screen dt = datetime.now() - t0 - print("epoch complete:", epoch, "cost:", cost, "dt:", dt) + print(f"epoch complete: {epoch}, cost: {cost}, dt: {dt}") # save the cost costs.append(cost) @@ -195,10 +193,10 @@ def train_model(savedir): if not os.path.exists(savedir): os.mkdir(savedir) - with open('%s/word2idx.json' % savedir, 'w') as f: + with open(f'{savedir}/word2idx.json', 'w') as f: json.dump(word2idx, f) - np.savez('%s/weights.npz' % savedir, W, V) + np.savez(f'{savedir}/weights.npz', W, V) # return the model return word2idx, W, V @@ -220,7 +218,7 @@ def get_negative_sampling_distribution(sentences, vocab_size): p_neg = word_freq**0.75 # normalize it - p_neg = p_neg / p_neg.sum() + p_neg = p_neg/p_neg.sum() assert(np.all(p_neg > 0)) return p_neg @@ -259,12 +257,12 @@ def sgd(input_, targets, label, learning_rate, W, V): W[input_] -= learning_rate*gW # D # return cost (binary cross entropy) - cost = label * np.log(prob + 1e-10) + (1 - label) * np.log(1 - prob + 1e-10) + cost = label*np.log(prob + 1e-10) + (1 - label)*np.log(1 - prob + 1e-10) return cost.sum() def load_model(savedir): - with open('%s/word2idx.json' % savedir) as f: + with open(f'{savedir}/word2idx.json') as f: word2idx = json.load(f) npz = np.load('%s/weights.npz' % savedir) W = npz['arr_0'] @@ -277,7 +275,7 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): V, D = W.shape # don't actually use pos2 in calculation, just print what's expected - print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2)) + print(f"testing: {pos1} - {neg1} = {pos2} - {neg2}") for w in (pos1, neg1, pos2, neg2): if w not in word2idx: print("Sorry, %s not in word2idx" % w) @@ -303,12 +301,12 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): break # print("best_idx:", best_idx) - print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[best_idx], neg2)) + print(f"got: {pos1} - {neg1} = {idx2word[best_idx]} - {neg2}") print("closest 10:") for i in idx: print(idx2word[i], distances[i]) - print("dist to %s:" % pos2, cos_dist(p2, vec)) + print(f"dist to {pos2}: {cos_dist(p2, vec)}") def test_model(word2idx, W, V): diff --git a/nlp_class2/word2vec_tf.py b/nlp_class2/word2vec_tf.py index d272b003..2d53ab22 100644 --- a/nlp_class2/word2vec_tf.py +++ b/nlp_class2/word2vec_tf.py @@ -47,23 +47,23 @@ def download_text8(dst): pass -def get_text8(): - # download the data if it is not yet in the right place - path = '../large_files/text8' - if not os.path.exists(path): - download_text8(path) - - words = open(path).read() - word2idx = {} - sents = [[]] - count = 0 - for word in words.split(): - if word not in word2idx: - word2idx[word] = count - count += 1 - sents[0].append(word2idx[word]) - print("count:", count) - return sents, word2idx +# def get_text8(): +# # download the data if it is not yet in the right place +# path = '../large_files/text8' +# if not os.path.exists(path): +# download_text8(path) + +# words = open(path).read() +# word2idx = {} +# sents = [[]] +# count = 0 +# for word in words.split(): +# if word not in word2idx: +# word2idx[word] = count +# count += 1 +# sents[0].append(word2idx[word]) +# print("count:", count) +# return sents, word2idx def get_wiki(): @@ -71,7 +71,7 @@ def get_wiki(): files = 
glob('../large_files/enwiki*.txt') all_word_counts = {} for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -90,7 +90,7 @@ def get_wiki(): sents = [] for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -122,7 +122,7 @@ def train_model(savedir): D = 50 # word embedding size # learning rate decay - learning_rate_delta = (learning_rate - final_learning_rate) / epochs + learning_rate_delta = (learning_rate - final_learning_rate)/epochs # distribution for drawing negative samples p_neg = get_negative_sampling_distribution(sentences) @@ -202,7 +202,7 @@ def dot(A, B): # for subsampling each sentence threshold = 1e-5 - p_drop = 1 - np.sqrt(threshold / p_neg) + p_drop = 1 - np.sqrt(threshold/p_neg) # train the model @@ -221,9 +221,7 @@ def dot(A, B): for sentence in sentences: # keep only certain words based on p_neg - sentence = [w for w in sentence \ - if np.random.random() < (1 - p_drop[w]) - ] + sentence = [w for w in sentence if np.random.random() < (1 - p_drop[w])] if len(sentence) < 2: continue @@ -282,14 +280,14 @@ def dot(A, B): counter += 1 if counter % 100 == 0: - sys.stdout.write("processed %s / %s\r" % (counter, len(sentences))) + sys.stdout.write(f"processed {counter}/{len(sentences)}\r") sys.stdout.flush() # break # print stuff so we don't stare at a blank screen dt = datetime.now() - t0 - print("epoch complete:", epoch, "cost:", cost, "dt:", dt) + print(f"epoch complete: {epoch}, cost: {cost}, dt: {dt}") # save the cost costs.append(cost) @@ -310,10 +308,10 @@ def dot(A, B): if not os.path.exists(savedir): os.mkdir(savedir) - with open('%s/word2idx.json' % savedir, 'w') as f: + with open(f'{savedir}/word2idx.json', 'w') as f: json.dump(word2idx, f) - np.savez('%s/weights.npz' % savedir, W, V) + np.savez(f'{savedir}/weights.npz', W, V) # return the model return word2idx, W, V @@ -341,7 +339,7 @@ def get_negative_sampling_distribution(sentences): p_neg[j] = word_freq[j]**0.75 # normalize it - p_neg = p_neg / p_neg.sum() + p_neg = p_neg/p_neg.sum() assert(np.all(p_neg > 0)) return p_neg @@ -366,9 +364,9 @@ def get_context(pos, sentence, window_size): def load_model(savedir): - with open('%s/word2idx.json' % savedir) as f: + with open(f'{savedir}/word2idx.json') as f: word2idx = json.load(f) - npz = np.load('%s/weights.npz' % savedir) + npz = np.load(f'{savedir}/weights.npz') W = npz['arr_0'] V = npz['arr_1'] return word2idx, W, V @@ -379,10 +377,10 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): V, D = W.shape # don't actually use pos2 in calculation, just print what's expected - print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2)) + print(f"testing: {pos1} - {neg1} = {pos2} - {neg2}") for w in (pos1, neg1, pos2, neg2): if w not in word2idx: - print("Sorry, %s not in word2idx" % w) + print(f"Sorry, {w} not in word2idx") return p1 = W[word2idx[pos1]] @@ -403,12 +401,12 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): best_idx = i break - print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[idx[0]], neg2)) + print(f"got: {pos1} - {neg1} = {idx2word[idx[0]]} - {neg2}" ) print("closest 10:") for i in idx: print(idx2word[i], distances[i]) - print("dist to %s:" % pos2, cos_dist(p2, vec)) + print(f"dist to {pos2}: {cos_dist(p2, vec)}") def test_model(word2idx, W, V): diff --git 
a/recommenders/autorec.py b/recommenders/autorec.py index fa0bd415..9d044099 100644 --- a/recommenders/autorec.py +++ b/recommenders/autorec.py @@ -5,17 +5,17 @@ # Note: you may need to update your version of future # sudo pip install -U future -import numpy as np -import pandas as pd +#import numpy as np +#import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle -from scipy.sparse import save_npz, load_npz +from scipy.sparse import load_npz -import keras.backend as K -from keras.models import Model -from keras.layers import Input, Dropout, Dense -from keras.regularizers import l2 -from keras.optimizers import SGD +import tensorflow.keras.backend as K #type:ignore +from tensorflow.keras.models import Model #type:ignore +from tensorflow.keras.layers import Input, Dropout, Dense #type:ignore +from tensorflow.keras.regularizers import l2 #type:ignore +from tensorflow.keras.optimizers import SGD #type:ignore # config batch_size = 128 @@ -23,8 +23,8 @@ reg = 0.0001 # reg = 0 -A = load_npz("Atrain.npz") -A_test = load_npz("Atest.npz") +A = load_npz(".\\large_files\\movielens-20m-dataset\\Atrain.npz") +A_test = load_npz(".\\large_files\\movielens-20m-dataset\\Atest.npz") mask = (A > 0) * 1.0 mask_test = (A_test > 0) * 1.0 @@ -56,6 +56,8 @@ def custom_loss(y_true, y_pred): mask = K.cast(K.not_equal(y_true, 0), dtype='float32') + y_true = K.cast(y_true, dtype='float32') + y_pred = K.cast(y_pred, dtype='float32') diff = y_pred - y_true sqdiff = diff * diff * mask sse = K.sum(K.sum(sqdiff)) @@ -96,7 +98,7 @@ def test_generator(A, M, A_test, M_test): model = Model(i, x) model.compile( loss=custom_loss, - optimizer=SGD(lr=0.08, momentum=0.9), + optimizer=SGD(learning_rate=0.08, momentum=0.9), # optimizer='adam', metrics=[custom_loss], ) diff --git a/recommenders/itembased.py b/recommenders/itembased.py index f87f9481..ff5619a8 100644 --- a/recommenders/itembased.py +++ b/recommenders/itembased.py @@ -1,37 +1,37 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +from builtins import range#, input # Note: you may need to update your version of future # sudo pip install -U future import pickle import numpy as np -import pandas as pd -import matplotlib.pyplot as plt +#import pandas as pd +#import matplotlib.pyplot as plt from sklearn.utils import shuffle -from datetime import datetime +#from datetime import datetime from sortedcontainers import SortedList # load in the data import os -if not os.path.exists('user2movie.json') or \ - not os.path.exists('movie2user.json') or \ - not os.path.exists('usermovie2rating.json') or \ - not os.path.exists('usermovie2rating_test.json'): +if not os.path.exists('.\\large_files\\movielens-20m-dataset\\user2movie.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\movie2user.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json'): import preprocess2dict -with open('user2movie.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'rb') as f: user2movie = pickle.load(f) -with open('movie2user.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'rb') as f: movie2user = pickle.load(f) -with open('usermovie2rating.json', 'rb') as f: +with 
open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'rb') as f: usermovie2rating = pickle.load(f) -with open('usermovie2rating_test.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'rb') as f: usermovie2rating_test = pickle.load(f) diff --git a/recommenders/mf2.py b/recommenders/mf2.py index 62b599c6..bcf54b54 100644 --- a/recommenders/mf2.py +++ b/recommenders/mf2.py @@ -1,13 +1,13 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +from builtins import range#, input # Note: you may need to update your version of future # sudo pip install -U future import pickle import numpy as np -import pandas as pd +#import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle from datetime import datetime @@ -15,23 +15,23 @@ # load in the data import os -if not os.path.exists('user2movie.json') or \ - not os.path.exists('movie2user.json') or \ - not os.path.exists('usermovie2rating.json') or \ - not os.path.exists('usermovie2rating_test.json'): +if not os.path.exists('.\\large_files\\movielens-20m-dataset\\user2movie.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\movie2user.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json'): import preprocess2dict -with open('user2movie.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'rb') as f: user2movie = pickle.load(f) -with open('movie2user.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'rb') as f: movie2user = pickle.load(f) -with open('usermovie2rating.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'rb') as f: usermovie2rating = pickle.load(f) -with open('usermovie2rating_test.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'rb') as f: usermovie2rating_test = pickle.load(f) diff --git a/recommenders/mf_keras.py b/recommenders/mf_keras.py index efc3315b..5f8ea4ad 100644 --- a/recommenders/mf_keras.py +++ b/recommenders/mf_keras.py @@ -1,23 +1,23 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -import pickle -import numpy as np +#import pickle +#import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle -from keras.models import Model -from keras.layers import Input, Embedding, Dot, Add, Flatten -from keras.regularizers import l2 -from keras.optimizers import SGD, Adam +from tensorflow.keras.models import Model #type:ignore +from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten #type:ignore +from tensorflow.keras.regularizers import l2 #type:ignore +from tensorflow.keras.optimizers import SGD #type:ignore # load in the data -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') N = df.userId.max() + 1 # number of users M = df.movie_idx.max() + 1 # number of movies @@ -71,7 +71,7 @@ loss='mse', # 
optimizer='adam', # optimizer=Adam(lr=0.01), - optimizer=SGD(lr=0.08, momentum=0.9), + optimizer=SGD(learning_rate=0.08, momentum=0.9), metrics=['mse'], ) diff --git a/recommenders/mf_keras_deep.py b/recommenders/mf_keras_deep.py index f3888a7a..b22c4abb 100644 --- a/recommenders/mf_keras_deep.py +++ b/recommenders/mf_keras_deep.py @@ -1,24 +1,24 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -import pickle -import numpy as np +#import pickle +#import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle -from keras.models import Model -from keras.layers import Input, Embedding, Flatten, Dense, Concatenate -from keras.layers import Dropout, BatchNormalization, Activation -from keras.regularizers import l2 -from keras.optimizers import SGD, Adam +from tensorflow.keras.models import Model # type:ignore +from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate # type:ignore +from tensorflow.keras.layers import Dropout, BatchNormalization, Activation # type:ignore +#from tensorflow.keras.regularizers import l2 +from tensorflow.keras.optimizers import SGD#, Adam # type:ignore # load in the data -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') N = df.userId.max() + 1 # number of users M = df.movie_idx.max() + 1 # number of movies @@ -47,12 +47,12 @@ # the neural network x = Dense(400)(x) -# x = BatchNormalization()(x) +x = BatchNormalization()(x) +x = Activation('relu')(x) +x = Dropout(0.5)(x) +x = Dense(100)(x) +x = BatchNormalization()(x) x = Activation('relu')(x) -# x = Dropout(0.5)(x) -# x = Dense(100)(x) -# x = BatchNormalization()(x) -# x = Activation('relu')(x) x = Dense(1)(x) model = Model(inputs=[u, m], outputs=x) @@ -60,7 +60,7 @@ loss='mse', # optimizer='adam', # optimizer=Adam(lr=0.01), - optimizer=SGD(lr=0.08, momentum=0.9), + optimizer=SGD(learning_rate=0.08, momentum=0.9), metrics=['mse'], ) diff --git a/recommenders/preprocess.py b/recommenders/preprocess.py index 72585460..9e8d19ef 100644 --- a/recommenders/preprocess.py +++ b/recommenders/preprocess.py @@ -1,14 +1,14 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future import pandas as pd # https://www.kaggle.com/grouplens/movielens-20m-dataset -df = pd.read_csv('../large_files/movielens-20m-dataset/rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\rating.csv') @@ -34,8 +34,9 @@ # add them to the data frame # takes awhile -df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1) +#df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1) +df['movie_idx'] = df.movieId.map(movie2idx) df = df.drop(columns=['timestamp']) -df.to_csv('../large_files/movielens-20m-dataset/edited_rating.csv', index=False) \ No newline at end of file +df.to_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv', index=False) \ No newline at end of file diff --git a/recommenders/preprocess2dict.py b/recommenders/preprocess2dict.py index 
2ed5d8b7..e019cde4 100644 --- a/recommenders/preprocess2dict.py +++ b/recommenders/preprocess2dict.py @@ -1,19 +1,19 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future import pickle -import numpy as np +#import numpy as np import pandas as pd -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt from sklearn.utils import shuffle # load in the data # https://www.kaggle.com/grouplens/movielens-20m-dataset -df = pd.read_csv('../large_files/movielens-20m-dataset/very_small_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\small_rating.csv') N = df.userId.max() + 1 # number of users M = df.movie_idx.max() + 1 # number of movies @@ -25,58 +25,62 @@ df_test = df.iloc[cutoff:] # a dictionary to tell us which users have rated which movies -user2movie = {} +user2movie = df_train.groupby('userId').movie_idx.agg(list).to_dict() # a dicationary to tell us which movies have been rated by which users -movie2user = {} +movie2user = df_train.groupby('movie_idx').userId.agg(list).to_dict() # a dictionary to look up ratings -usermovie2rating = {} -print("Calling: update_user2movie_and_movie2user") -count = 0 -def update_user2movie_and_movie2user(row): - global count - count += 1 - if count % 100000 == 0: - print("processed: %.3f" % (float(count)/cutoff)) - - i = int(row.userId) - j = int(row.movie_idx) - if i not in user2movie: - user2movie[i] = [j] - else: - user2movie[i].append(j) - - if j not in movie2user: - movie2user[j] = [i] - else: - movie2user[j].append(i) - - usermovie2rating[(i,j)] = row.rating -df_train.apply(update_user2movie_and_movie2user, axis=1) +user_movie_keys = zip(df_train.userId, df_train.movie_idx) +usermovie2rating = pd.Series(df_train.rating.values, index=user_movie_keys).to_dict() + +# print("Calling: update_user2movie_and_movie2user") +# count = 0 +# def update_user2movie_and_movie2user(row): +# global count +# count += 1 +# if count % 100000 == 0: +# print("processed: %.3f" % (float(count)/cutoff)) + +# i = int(row.userId) +# j = int(row.movie_idx) +# if i not in user2movie: +# user2movie[i] = [j] +# else: +# user2movie[i].append(j) + +# if j not in movie2user: +# movie2user[j] = [i] +# else: +# movie2user[j].append(i) + +# usermovie2rating[(i,j)] = row.rating +#df_train.apply(update_user2movie_and_movie2user, axis=1) # test ratings dictionary -usermovie2rating_test = {} -print("Calling: update_usermovie2rating_test") -count = 0 -def update_usermovie2rating_test(row): - global count - count += 1 - if count % 100000 == 0: - print("processed: %.3f" % (float(count)/len(df_test))) - - i = int(row.userId) - j = int(row.movie_idx) - usermovie2rating_test[(i,j)] = row.rating -df_test.apply(update_usermovie2rating_test, axis=1) +user_movie_keys_test = zip(df_test.userId, df_test.movie_idx) +usermovie2rating_test = pd.Series(df_test.rating.values, index=user_movie_keys_test).to_dict() + +# print("Calling: update_usermovie2rating_test") +# count = 0 +# def update_usermovie2rating_test(row): +# global count +# count += 1 +# if count % 100000 == 0: +# print("processed: %.3f" % (float(count)/len(df_test))) + +# i = int(row.userId) +# j = int(row.movie_idx) +# usermovie2rating_test[(i,j)] = row.rating +# df_test.apply(update_usermovie2rating_test, axis=1) # note: these are not really JSONs -with 
open('user2movie.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'wb') as f: pickle.dump(user2movie, f) -with open('movie2user.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'wb') as f: pickle.dump(movie2user, f) -with open('usermovie2rating.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'wb') as f: pickle.dump(usermovie2rating, f) -with open('usermovie2rating_test.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'wb') as f: pickle.dump(usermovie2rating_test, f) diff --git a/recommenders/preprocess2sparse.py b/recommenders/preprocess2sparse.py index 864de56d..fdc525a7 100644 --- a/recommenders/preprocess2sparse.py +++ b/recommenders/preprocess2sparse.py @@ -5,14 +5,14 @@ # Note: you may need to update your version of future # sudo pip install -U future -import numpy as np +#import numpy as np import pandas as pd -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt from sklearn.utils import shuffle -from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz +from scipy.sparse import lil_matrix, save_npz # load in the data -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') # df = pd.read_csv('../large_files/movielens-20m-dataset/small_rating.csv') N = df.userId.max() + 1 # number of users @@ -41,7 +41,7 @@ def update_train(row): # mask, to tell us which entries exist and which do not A = A.tocsr() mask = (A > 0) -save_npz("Atrain.npz", A) +save_npz(".\\large_files\\movielens-20m-dataset\\Atrain.npz", A) # test ratings dictionary A_test = lil_matrix((N, M)) @@ -59,4 +59,4 @@ def update_test(row): df_test.apply(update_test, axis=1) A_test = A_test.tocsr() mask_test = (A_test > 0) -save_npz("Atest.npz", A_test) +save_npz(".\\large_files\\movielens-20m-dataset\\Atest.npz", A_test) diff --git a/recommenders/preprocess_shrink.py b/recommenders/preprocess_shrink.py index 665a80e6..e7aa5b87 100644 --- a/recommenders/preprocess_shrink.py +++ b/recommenders/preprocess_shrink.py @@ -1,18 +1,18 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -import pickle -import numpy as np +#import pickle +#import numpy as np import pandas as pd from collections import Counter # load in the data # https://www.kaggle.com/grouplens/movielens-20m-dataset -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') print("original dataframe size:", len(df)) N = df.userId.max() + 1 # number of users @@ -25,8 +25,8 @@ n = 10000 m = 2000 -user_ids = [u for u, c in user_ids_count.most_common(n)] -movie_ids = [m for m, c in movie_ids_count.most_common(m)] +user_ids = [u for u, _ in user_ids_count.most_common(n)] +movie_ids = [m for m, _ in movie_ids_count.most_common(m)] # make a copy, otherwise ids won't be overwritten df_small = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy() @@ -55,4 +55,4 @@ print("max movie id:", df_small.movie_idx.max()) print("small dataframe size:", len(df_small)) -df_small.to_csv('../large_files/movielens-20m-dataset/small_rating.csv', 
index=False) +df_small.to_csv('.\\large_files\\movielens-20m-dataset\\small_rating.csv', index=False) diff --git a/recommenders/rbm_tf_k_faster.py b/recommenders/rbm_tf_k_faster.py index 9a1a242a..71b9666b 100644 --- a/recommenders/rbm_tf_k_faster.py +++ b/recommenders/rbm_tf_k_faster.py @@ -10,8 +10,8 @@ import matplotlib.pyplot as plt from sklearn.utils import shuffle -import pandas as pd -from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz +#import pandas as pd +from scipy.sparse import load_npz from datetime import datetime if tf.__version__.startswith('2'): @@ -33,6 +33,7 @@ def dot2(H, W): class RBM(object): def __init__(self, D, M, K): + super().__init__() self.D = D # input feature size self.M = M # hidden size self.K = K # number of ratings @@ -115,7 +116,7 @@ def build(self, D, M, K): self.session.run(initop) def fit(self, X, X_test, epochs=10, batch_sz=256, show_fig=True): - N, D = X.shape + N, _ = X.shape n_batches = N // batch_sz @@ -134,7 +135,7 @@ def fit(self, X, X_test, epochs=10, batch_sz=256, show_fig=True): ) if j % 100 == 0: - print("j / n_batches:", j, "/", n_batches, "cost:", c) + print(f"j / n_batches: {j}/{n_batches}", "cost: ",c) print("duration:", datetime.now() - t0) # calculate the true train and test cost @@ -209,10 +210,10 @@ def get_sse(self, X, Xt): def main(): - A = load_npz("Atrain.npz") - A_test = load_npz("Atest.npz") + A = load_npz(".\\large_files\\movielens-20m-dataset\\Atrain.npz") + A_test = load_npz(".\\large_files\\movielens-20m-dataset\\Atest.npz") - N, M = A.shape + _, M = A.shape rbm = RBM(M, 50, 10) rbm.fit(A, A_test) diff --git a/recommenders/spark.py b/recommenders/spark.py index 23ea365a..33154899 100644 --- a/recommenders/spark.py +++ b/recommenders/spark.py @@ -9,11 +9,14 @@ # tmp = p.take(5) # print(tmp) -from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating -import os +from pyspark.mllib.recommendation import ALS, Rating +from pyspark import SparkContext +#import os # load in the data -data = sc.textFile("../large_files/movielens-20m-dataset/small_rating.csv") +sc = SparkContext('local', 'random') +data = sc.textFile(".\\large_files\\movielens-20m-dataset\\small_rating.csv") +#'/mnt/c/Users/Saif/Downloads/personal/Udemy_labs/nlp/machine_learning_examples/large_files/movielens-20m-dataset//small_ratings.csv' # filter out header header = data.first() #extract header diff --git a/recommenders/spark2.py b/recommenders/spark2.py index 5879269d..8310c69b 100644 --- a/recommenders/spark2.py +++ b/recommenders/spark2.py @@ -7,7 +7,7 @@ # tmp = p.take(5) # print(tmp) -from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating +from pyspark.mllib.recommendation import ALS, Rating from pyspark import SparkContext # increase memory @@ -18,8 +18,8 @@ # load in the data -# data = sc.textFile("../large_files/movielens-20m-dataset/small_rating.csv") -data = sc.textFile("../large_files/movielens-20m-dataset/rating.csv.gz") +data = sc.textFile("/mnt/c/Users/Saif/Downloads/personal/Udemy_labs/nlp/machine_learning_examples/large_files/movielens-20m-dataset/rating.csv") +#data = sc.textFile(".\\large_files\\movielens-20m-dataset\\rating.csv.gz") # filter out header header = data.first() #extract header diff --git a/recommenders/tfidf.py b/recommenders/tfidf.py index a6078ec3..7c380205 100644 --- a/recommenders/tfidf.py +++ b/recommenders/tfidf.py @@ -2,12 +2,12 @@ import json from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity, 
euclidean_distances +from sklearn.metrics.pairwise import cosine_similarity # get the data from: https://www.kaggle.com/tmdb/tmdb-movie-metadata # load in the data -df = pd.read_csv('../large_files/tmdb_5000_movies.csv') +df = pd.read_csv('.\\large_files\\tmdb_5000_movies.csv') # convert the relevant data for each movie into a single string @@ -59,7 +59,7 @@ def recommend(title): recommended_idx = (-scores).argsort()[1:6] # return the titles of the recommendations - return df['title'].iloc[recommended_idx] + return df['title'].iloc[recommended_idx].values print("\nRecommendations for 'Scream 3':") diff --git a/recommenders/userbased.py b/recommenders/userbased.py index b512a722..07e84489 100644 --- a/recommenders/userbased.py +++ b/recommenders/userbased.py @@ -1,44 +1,44 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +from builtins import range#, input # Note: you may need to update your version of future # sudo pip install -U future import pickle import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from sklearn.utils import shuffle -from datetime import datetime +#import pandas as pd +#import matplotlib.pyplot as plt +#from sklearn.utils import shuffle +#from datetime import datetime from sortedcontainers import SortedList # load in the data import os -if not os.path.exists('user2movie.json') or \ - not os.path.exists('movie2user.json') or \ - not os.path.exists('usermovie2rating.json') or \ - not os.path.exists('usermovie2rating_test.json'): +if not os.path.exists('.\\large_files\\movielens-20m-dataset\\user2movie.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\movie2user.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json'): import preprocess2dict -with open('user2movie.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'rb') as f: user2movie = pickle.load(f) -with open('movie2user.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'rb') as f: movie2user = pickle.load(f) -with open('usermovie2rating.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'rb') as f: usermovie2rating = pickle.load(f) -with open('usermovie2rating_test.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'rb') as f: usermovie2rating_test = pickle.load(f) N = np.max(list(user2movie.keys())) + 1 # the test set may contain movies the train set doesn't have data on m1 = np.max(list(movie2user.keys())) -m2 = np.max([m for (u, m), r in usermovie2rating_test.items()]) +m2 = np.max([m for (_, m), _ in usermovie2rating_test.items()]) M = max(m1, m2) + 1 print("N:", N, "M:", M) diff --git a/rnn_class/util.py b/rnn_class/util.py index 54801efa..5aa98a66 100644 --- a/rnn_class/util.py +++ b/rnn_class/util.py @@ -118,7 +118,7 @@ def get_wikipedia_data(n_files, n_vocab, by_paragraph=False): for f in input_files: print("reading:", f) - for line in open(prefix + f): + for line in open(prefix + f, encoding='utf-8'): line = line.strip() # don't count headers, structured data, lists, etc... if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'):
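# --- Aside (editor's illustration, not recommenders/tfidf.py itself) ---
# A condensed, self-contained sketch of the pattern that file follows: build one
# TF-IDF vector per movie from a text string, then rank the other movies by
# cosine similarity to a query title. The titles and strings below are made up;
# the real script builds the per-movie string from tmdb_5000_movies.csv metadata.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame({
    'title': ['Scream 3', 'Scream 2', 'Toy Story', 'Alien'],
    'string': ['horror slasher sequel',
               'horror slasher sequel ghostface',
               'animation family toys',
               'scifi horror space'],
})

tfidf = TfidfVectorizer(max_features=2000)
X = tfidf.fit_transform(df['string'])            # one sparse row per movie

movie2idx = pd.Series(df.index, index=df['title'])

def recommend(title, top_n=2):
    idx = movie2idx[title]
    query = X[idx]                                # 1 x V sparse vector
    scores = cosine_similarity(query, X).flatten()
    recommended_idx = (-scores).argsort()[1:top_n + 1]  # position 0 is the movie itself
    return df['title'].iloc[recommended_idx].values

print(recommend('Scream 3'))
# --- end aside ---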