Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inference working #5

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.idea
.vscode

/ml/ml-1m
/ml/weights.h5
/ml/.gitignore
/ml/__pycache__
1 change: 0 additions & 1 deletion ml/.gitignore

This file was deleted.

29 changes: 29 additions & 0 deletions ml/CFModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import numpy as np
from keras.layers import Embedding, Reshape, Merge
from keras.models import Sequential


class CFModel(Sequential):
    """Collaborative-filtering model: rating = dot(user vector, item vector).

    NOTE(review): this relies on ``keras.layers.Merge``, which only exists in
    Keras 1.x (it was removed in Keras 2) -- confirm the pinned Keras version.
    """

    def __init__(self, n_users, m_items, k_factors, **kwargs):
        """Build the two-branch embedding model.

        n_users   -- number of distinct users (embedding vocabulary size)
        m_items   -- number of distinct movies (embedding vocabulary size)
        k_factors -- dimensionality of the latent-factor vectors
        """
        super(CFModel, self).__init__(**kwargs)

        # Left branch: a user_id indexes into a (n_users x k_factors)
        # embedding matrix and comes out as a flat latent vector.
        user_branch = Sequential()
        user_branch.add(Embedding(n_users, k_factors, input_length=1))
        user_branch.add(Reshape((k_factors,)))

        # Right branch: same construction for movie_ids.
        item_branch = Sequential()
        item_branch.add(Embedding(m_items, k_factors, input_length=1))
        item_branch.add(Reshape((k_factors,)))

        # The predicted rating is the dot product of the two latent vectors.
        self.add(Merge([user_branch, item_branch], mode='dot', dot_axes=1))

    def rate(self, user_id, item_id):
        """Predict the rating that ``user_id`` would give ``item_id``."""
        prediction = self.predict([np.array([user_id]), np.array([item_id])])
        return prediction[0][0]
5 changes: 5 additions & 0 deletions ml/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import math
from recommender import *
# Report the epoch with the lowest validation loss, converted to RMSE
# (training minimises MSE, so RMSE = sqrt(val_loss)).
# NOTE(review): `history` comes from recommender via `import *`, but the
# model.fit(...) call there is commented out -- confirm `history` exists.
val_losses = history.history['val_loss']
idx, min_val_loss = min(enumerate(val_losses), key=lambda pair: pair[1])
print('Minimum RMSE at epoch', '{:d}'.format(idx + 1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))
44 changes: 44 additions & 0 deletions ml/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import argparse
from CFModel import CFModel
from recommender import *

# Rebuild the model with the same architecture used at training time, then
# restore the best weights saved by the training callbacks.
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
trained_model.load_weights('weights.h5')

parser = argparse.ArgumentParser()
parser.add_argument('--user_id', type=int, default=2000,
                    help='user to recommend for (default: debug user 2000)')
parser.add_argument('--emo', type=int,
                    help='detected emotion: 0 = negative, 1 = positive')
args = vars(parser.parse_args())

# BUG FIX: the parsed arguments were previously ignored and a hard-coded
# user (2000) was always used. Honour --user_id; the default keeps the old
# debugging behaviour when the flag is omitted.
TEST_USER = args['user_id']


def predict_rating(user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id`.

    IDs are 1-based in the data files but the embedding layers are
    0-indexed, hence the `- 1` offsets.
    """
    return trained_model.rate(user_id - 1, movie_id - 1)


# Ratings the user has already given, with the model's prediction alongside.
user_ratings = ratings[ratings['user_id'] == TEST_USER][['user_id', 'movie_id', 'rating']]
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Candidate recommendations: every movie the user has NOT rated yet.
recommendations = ratings[~ratings['movie_id'].isin(user_ratings['movie_id'])][['movie_id']].drop_duplicates()
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Genre whitelists per detected emotion: negative is 0 and positive is 1.
# TODO(review): args['emo'] is parsed but never applied -- these lists are
# presumably meant to filter `sorted_recs` by genre; wire that up or drop
# the flag. Not implemented here because the intended semantics are unclear.
emo_map = {1: ["Comedy", "Drama", "Fantasy", "Action", "Adventure", "Animation", "Children's", "Crime", "Documentary",
               "Horror", "Mystery", "Sci-Fi"], 0: ["Comedy", "Fantasy", "Thriller", "War", "Western", "Action",
                                                   "Adventure", "Film-Noir", "Musical", "Romance"]}

# Top-20 unseen movies by predicted rating, joined with movie metadata.
sorted_recs = recommendations.sort_values(by='prediction',
                                          ascending=False).merge(movies,
                                                                 on='movie_id',
                                                                 how='inner',
                                                                 suffixes=['_u', '_m']).head(20)

print(type(sorted_recs))
print(sorted_recs)



55 changes: 55 additions & 0 deletions ml/recommender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import math
import numpy as np
import pandas as pd
from CFModel import CFModel
import matplotlib.pyplot as plt
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

# ---- Load the MovieLens data --------------------------------------------
# Ratings: one row per (user, movie) pair; *_emb_id columns are the
# 0-based ids used by the embedding layers.
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# User metadata (demographics).
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Movie metadata (title and genres).
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

# ---- Build the training arrays ------------------------------------------
# Shuffle once with a fixed seed so the train/validation split below is
# reproducible across runs.
shuffled_ratings = ratings.sample(frac=1., random_state=1)

Users = shuffled_ratings['user_emb_id'].values
print('Users:', Users, ', shape =', Users.shape)

Movies = shuffled_ratings['movie_emb_id'].values
print('Movies:', Movies, ', shape =', Movies.shape)

Ratings = shuffled_ratings['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

# Dimensionality of the user/movie latent-factor embeddings.
K_FACTORS = 100

# The model itself; training is disabled below, so this module is used for
# its loaded data and constants (evaluate.py / inference.py star-import it).
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Training recipe (MSE loss, AdaMax optimiser) -- left disabled:
# model.compile(loss='mse', optimizer='adamax')

# Stop after 2 epochs without val_loss improvement and keep only the best
# weights on disk.
callbacks = [EarlyStopping('val_loss', patience=2),
             ModelCheckpoint('weights.h5', save_best_only=True)]

# 30 epochs, 90% training / 10% validation -- left disabled:
# history = model.fit([Users, Movies], Ratings, nb_epoch=30, validation_split=.1, callbacks=callbacks)

# Best validation RMSE -- left disabled (see evaluate.py):
# min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
# print('Minimum RMSE at epoch', '{:d}'.format(idx + 1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))