Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inference working #5

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.idea
.vscode

/ml/ml-1m
/ml/weights.h5
/ml/.gitignore
/ml/__pycache__
1 change: 0 additions & 1 deletion ml/.gitignore

This file was deleted.

29 changes: 29 additions & 0 deletions ml/CFModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import numpy as np
from keras.layers import Embedding, Reshape, Merge
from keras.models import Sequential


class CFModel(Sequential):
    """Collaborative-filtering model: rating = dot(user vector, item vector).

    NOTE(review): this relies on ``keras.layers.Merge``, which only exists in
    Keras 1.x (it was removed in Keras 2) -- confirm the pinned Keras version.
    """

    def __init__(self, n_users, m_items, k_factors, **kwargs):
        """Build the two-branch embedding model.

        n_users   -- number of distinct users (embedding vocabulary size)
        m_items   -- number of distinct movies (embedding vocabulary size)
        k_factors -- dimensionality of the latent-factor vectors
        """
        super(CFModel, self).__init__(**kwargs)

        # Left branch: a user_id indexes into a (n_users x k_factors)
        # embedding matrix and comes out as a flat latent vector.
        user_branch = Sequential()
        user_branch.add(Embedding(n_users, k_factors, input_length=1))
        user_branch.add(Reshape((k_factors,)))

        # Right branch: same construction for movie_ids.
        item_branch = Sequential()
        item_branch.add(Embedding(m_items, k_factors, input_length=1))
        item_branch.add(Reshape((k_factors,)))

        # The predicted rating is the dot product of the two latent vectors.
        self.add(Merge([user_branch, item_branch], mode='dot', dot_axes=1))

    def rate(self, user_id, item_id):
        """Predict the rating that ``user_id`` would give ``item_id``."""
        prediction = self.predict([np.array([user_id]), np.array([item_id])])
        return prediction[0][0]
5 changes: 5 additions & 0 deletions ml/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import math
from recommender import *
# Report the epoch with the lowest validation loss, converted to RMSE
# (training minimises MSE, so RMSE = sqrt(val_loss)).
# NOTE(review): `history` comes from recommender via `import *`, but the
# model.fit(...) call there is commented out -- confirm `history` exists.
val_losses = history.history['val_loss']
idx, min_val_loss = min(enumerate(val_losses), key=lambda pair: pair[1])
print('Minimum RMSE at epoch', '{:d}'.format(idx + 1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))
44 changes: 44 additions & 0 deletions ml/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import argparse
from CFModel import CFModel
from recommender import *

# Rebuild the model with the same architecture used at training time, then
# restore the best weights saved by the training callbacks.
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
trained_model.load_weights('weights.h5')

parser = argparse.ArgumentParser()
parser.add_argument('--user_id', type=int, default=2000,
                    help='user to recommend for (default: debug user 2000)')
parser.add_argument('--emo', type=int,
                    help='detected emotion: 0 = negative, 1 = positive')
args = vars(parser.parse_args())

# BUG FIX: the parsed arguments were previously ignored and a hard-coded
# user (2000) was always used. Honour --user_id; the default keeps the old
# debugging behaviour when the flag is omitted.
TEST_USER = args['user_id']


def predict_rating(user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id`.

    IDs are 1-based in the data files but the embedding layers are
    0-indexed, hence the `- 1` offsets.
    """
    return trained_model.rate(user_id - 1, movie_id - 1)


# Ratings the user has already given, with the model's prediction alongside.
user_ratings = ratings[ratings['user_id'] == TEST_USER][['user_id', 'movie_id', 'rating']]
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Candidate recommendations: every movie the user has NOT rated yet.
recommendations = ratings[~ratings['movie_id'].isin(user_ratings['movie_id'])][['movie_id']].drop_duplicates()
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Genre whitelists per detected emotion: negative is 0 and positive is 1.
# TODO(review): args['emo'] is parsed but never applied -- these lists are
# presumably meant to filter `sorted_recs` by genre; wire that up or drop
# the flag. Not implemented here because the intended semantics are unclear.
emo_map = {1: ["Comedy", "Drama", "Fantasy", "Action", "Adventure", "Animation", "Children's", "Crime", "Documentary",
               "Horror", "Mystery", "Sci-Fi"], 0: ["Comedy", "Fantasy", "Thriller", "War", "Western", "Action",
                                                   "Adventure", "Film-Noir", "Musical", "Romance"]}

# Top-20 unseen movies by predicted rating, joined with movie metadata.
sorted_recs = recommendations.sort_values(by='prediction',
                                          ascending=False).merge(movies,
                                                                 on='movie_id',
                                                                 how='inner',
                                                                 suffixes=['_u', '_m']).head(20)

print(type(sorted_recs))
print(sorted_recs)



55 changes: 55 additions & 0 deletions ml/recommender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import math
import numpy as np
import pandas as pd
from CFModel import CFModel
import matplotlib.pyplot as plt
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

# ---- Load the MovieLens data --------------------------------------------
# Ratings: one row per (user, movie) pair; *_emb_id columns are the
# 0-based ids used by the embedding layers.
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# User metadata (demographics).
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Movie metadata (title and genres).
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

# ---- Build the training arrays ------------------------------------------
# Shuffle once with a fixed seed so the train/validation split below is
# reproducible across runs.
shuffled_ratings = ratings.sample(frac=1., random_state=1)

Users = shuffled_ratings['user_emb_id'].values
print('Users:', Users, ', shape =', Users.shape)

Movies = shuffled_ratings['movie_emb_id'].values
print('Movies:', Movies, ', shape =', Movies.shape)

Ratings = shuffled_ratings['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

# Dimensionality of the user/movie latent-factor embeddings.
K_FACTORS = 100

# The model itself; training is disabled below, so this module is used for
# its loaded data and constants (evaluate.py / inference.py star-import it).
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Training recipe (MSE loss, AdaMax optimiser) -- left disabled:
# model.compile(loss='mse', optimizer='adamax')

# Stop after 2 epochs without val_loss improvement and keep only the best
# weights on disk.
callbacks = [EarlyStopping('val_loss', patience=2),
             ModelCheckpoint('weights.h5', save_best_only=True)]

# 30 epochs, 90% training / 10% validation -- left disabled:
# history = model.fit([Users, Movies], Ratings, nb_epoch=30, validation_split=.1, callbacks=callbacks)

# Best validation RMSE -- left disabled (see evaluate.py):
# min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
# print('Minimum RMSE at epoch', '{:d}'.format(idx + 1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))