-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathutils.py
85 lines (70 loc) · 2.79 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
Utils
=====
"""
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
def create_X(df):
    """
    Generates a sparse user-by-movie matrix from a ratings dataframe.

    Ids are assigned indices in the sorted order returned by ``np.unique``.
    If the same (userId, movieId) pair appears more than once, scipy sums
    the duplicate ratings when building the matrix.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns:
        A 5-tuple, in this order:
        X: scipy.sparse.csr_matrix of shape (n_users, n_movies)
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # One pass per column: the sorted unique ids plus, for every row of df,
    # the position of its id inside that sorted array (the matrix index).
    user_ids, user_index = np.unique(df["userId"], return_inverse=True)
    movie_ids, item_index = np.unique(df["movieId"], return_inverse=True)

    M = len(user_ids)
    N = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix((df["rating"], (user_index, item_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    """
    Finds k-nearest neighbours for a given movie id.

    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix (users as rows, movies as columns)
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations

    Output: returns list of k similar movie ID's, nearest first, never
        including movie_id itself

    Raises:
        KeyError: if movie_id is not a key of movie_mapper
    """
    # The neighbour search compares rows, so put movies on the rows.
    X = X.T
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, np.ndarray):
        # A dense row comes back 1-D; kneighbors expects a 2-D query.
        movie_vec = movie_vec.reshape(1, -1)
    # use k+1 since kNN output includes the movieId of interest
    kNN = NearestNeighbors(n_neighbors=k + 1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    # Drop the query movie by index rather than by position: when another
    # movie has an identical vector the query is not guaranteed to be first.
    candidates = [int(n) for n in neighbour.ravel() if n != movie_ind]
    return [movie_inv_mapper[n] for n in candidates[:k]]
def get_mean_user_matrix(X):
    """
    Create matrix with rows containing user's mean ratings.

    The mean is taken over a user's stored ratings only (the sum of the row
    divided by its number of stored entries), not over all movies. A user
    with no stored ratings gets a mean of 0 instead of NaN.

    Note that the result is built from a dense (n_users, n_movies) array, so
    memory use grows with the full matrix size even though it is returned
    in sparse format.

    Args:
        X: scipy.sparse.csr_matrix of shape (n_users, n_movies)
            populated with original user-movie ratings

    Returns:
        a scipy.sparse.csr_matrix of shape (n_users, n_movies)
        populated with a user's mean rating for each row.
    """
    sum_ratings_per_user = np.asarray(X.sum(axis=1), dtype=float).reshape(-1, 1)
    n_ratings_per_user = np.asarray(X.getnnz(axis=1)).reshape(-1, 1)

    # Divide only where a user has ratings; the rest keep the 0 from `out`,
    # which avoids 0/0 -> NaN (and the accompanying RuntimeWarning).
    mean_rating_per_user = np.divide(
        sum_ratings_per_user,
        n_ratings_per_user,
        out=np.zeros_like(sum_ratings_per_user),
        where=n_ratings_per_user > 0,
    )

    # Repeat each user's mean across every movie column.
    X_mean_user = np.tile(mean_rating_per_user, (1, X.shape[1]))
    return csr_matrix(X_mean_user)