-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFINAL_SUBMISSION_CODE_Goodfellas(1).py
221 lines (170 loc) · 6.95 KB
/
FINAL_SUBMISSION_CODE_Goodfellas(1).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge as KRR
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
# Data Loading
# Paths are relative to the contest dataset layout; several of these files
# are large (hundreds of MB), so loading takes a while.
print("Preparing training files....")
data_folder = '../Data_contest/dataset/'
genome_scores_df=pd.read_csv(data_folder+'genome_scores.csv') # Large (500MB)
movies_df=pd.read_csv(data_folder+'movies.csv')
df_test=pd.read_csv(data_folder+'test.csv') # Large 500MB
df_submission = pd.read_csv(data_folder+'dummy_submission.csv')
df_train_partial = pd.read_csv(data_folder+'train.csv')
df_valid_partial = pd.read_csv(data_folder+'validation.csv')
#Concatenating train and validation set
# The validation split is folded into training for the final model; its
# timestamp column is dropped so the two frames have matching columns.
df_valid_partial=df_valid_partial.drop('timestamp',axis=1)
df_new_train = pd.concat([df_train_partial, df_valid_partial])
df_train_full=df_new_train.reset_index(drop=True)
# Aliases used interchangeably by the rest of the script.
train_df = df_train_full
train = train_df
test = df_test
test_df= df_test
print('done loading data')
### NORMAL REGRESSION ###
# create movie rating dataset from train
# Feature vector for the 10000 movies, each with a 1128 dimensional vector.
# If a movie doesn't appear in genome_scores we make it simply the 0 vector.
def generate_XY(num_movies=10000, feat_dim=1128):
    """Build the movie feature matrix and per-movie mean training ratings.

    Reads the module-level globals ``genome_scores_df``, ``train_df`` and
    ``movies_df``.

    Parameters
    ----------
    num_movies : int, default 10000
        Movies are assumed to have ids in ``range(num_movies)``.
    feat_dim : int, default 1128
        Length of the genome relevance vector for each movie that has one.

    Returns
    -------
    X_concat : ndarray of shape (num_movies, feat_dim + n_genres)
        Genome relevance vector concatenated with a 0/1 genre indicator
        vector; rows for movies absent from genome_scores stay all-zero.
    rating_movies : ndarray of shape (num_movies,)
        Mean training rating per movie, or -1.0 for movies with no ratings.
    """
    # Genome relevance features. Group the (large) frame once instead of
    # filtering it per movie id: O(n) instead of O(n * num_movies).
    X = np.zeros((num_movies, feat_dim))
    for movie_id, grp in genome_scores_df.groupby('movieId'):
        if 0 <= movie_id < num_movies:
            # Assumes each movie present in genome_scores has exactly
            # feat_dim relevance rows -- TODO confirm against the data.
            X[movie_id, :] = grp['relevance'].to_numpy()

    # Mean training rating per movie; -1 marks "not in the train set".
    rating_movies = -1 * np.ones(num_movies)
    for movie_id, grp in train_df.groupby('movieId'):
        if 0 <= movie_id < num_movies:
            rating_movies[movie_id] = grp['rating'].mean()

    # Collect genres in first-seen order over ascending movie id (matches
    # the original column ordering of the indicator matrix).
    sub = movies_df[(movies_df['movieId'] >= 0) & (movies_df['movieId'] < num_movies)]
    sub = sub.drop_duplicates(subset='movieId', keep='first').sort_values('movieId')
    all_genres = []
    genre_lists = {}
    for movie_id, genre_str in zip(sub['movieId'], sub['genres']):
        genres = genre_str.split('|')
        genre_lists[movie_id] = genres
        for genre in genres:
            if genre not in all_genres:
                all_genres.append(genre)

    # One-hot genre indicators; sized from the data rather than the
    # hard-coded 19 of the original (which broke if more genres appeared).
    genre_index = {genre: idx for idx, genre in enumerate(all_genres)}
    X_genre = np.zeros((num_movies, len(all_genres)))
    for movie_id, genres in genre_lists.items():
        for genre in genres:
            X_genre[movie_id, genre_index[genre]] = 1

    return np.concatenate((X, X_genre), axis=1), rating_movies
def SVR_Predictions(X, rating_movies):
    """Predict a rating for every movie with an RBF support-vector regressor.

    Parameters
    ----------
    X : ndarray of shape (n_movies, n_features)
        Feature vector per movie.
    rating_movies : ndarray of shape (n_movies,)
        Mean training rating per movie; values <= 0 mean "unobserved".

    Returns
    -------
    ndarray of shape (n_movies,)
        Predicted rating for every movie, including unobserved ones.
    """
    # Hyper-parameters selected offline.
    gamma_rbf = 0.1
    reg_strength = 10
    # Fit only on movies whose mean rating is known (sentinel is -1).
    observed = rating_movies > 0
    model = SVR(C=reg_strength, kernel='rbf', gamma=gamma_rbf)
    model.fit(X[observed], rating_movies[observed])
    # Predict for the full movie set so unrated movies get a score too.
    return model.predict(X)
# Build the movie features / mean ratings, then the movie-level SVR
# predictions used later as a fallback for users without a personal model.
X, rating_movies = generate_XY()
print('done generation of X')
Y_pred_all = SVR_Predictions(X, rating_movies)
print('done SVR Predictions')
### USER BASED REGRESSION ###
def userbased_regression():
    """Per-user kernel ridge regression over movie feature vectors.

    For each user id in 0..9999 that has both training ratings and test
    rows, fits a KernelRidge model (RBF kernel, library defaults) on the
    user's rated movies and predicts the user's test movies.

    Reads the module-level globals ``train_df``, ``test_df`` and ``X``.

    Returns
    -------
    rating_pred : ndarray of shape (10000, 10000)
        rating_pred[u, m] is the predicted rating of user u for movie m;
        entries that were never predicted remain 0.
    """
    # NOTE(review): the original computed kernel_param=0.1, C=10 and
    # alpha=1/(2*C) here but never passed them to KRR, so the model always
    # ran with library defaults. The dead locals are removed; confirm
    # whether KRR(alpha=..., gamma=...) was actually intended.
    rating_pred = np.zeros((10000, 10000))
    for userId in range(10000):
        user_train = train_df.loc[train_df['userId'] == userId]
        user_test = test_df.loc[test_df['userId'] == userId]
        # Skip users with no training ratings or nothing to predict.
        if len(user_train) == 0 or len(user_test) == 0:
            continue
        model = KRR(kernel='rbf')
        model.fit(X[user_train.movieId, :], user_train.rating)
        preds = model.predict(X[user_test.movieId, :])
        rating_pred[userId, user_test.movieId.values] = preds
    return rating_pred
rating_pred = userbased_regression()
# Rectify the per-user predictions: wherever the user-based model produced
# nothing (entry left at 0), fall back to the movie-level SVR prediction.
# Named-column access replaces the original positional iloc[i,0]/iloc[i,1],
# which was both fragile (breaks on column reordering) and slow per row.
user_regression = np.zeros(len(df_test))
test_users = df_test['userId'].values
test_movies = df_test['movieId'].values
for i, (userid, movieid) in enumerate(zip(test_users, test_movies)):
    rating_user = rating_pred[userid, movieid]
    if rating_user == 0:
        rating_user = Y_pred_all[movieid]  # movie-based fallback
    user_regression[i] = rating_user
# Free the large intermediates before building the SVD rating matrix.
del train_df
del rating_pred
del X
### TRUNCSVD ###
# Train and test ratings as one frame. Because validation was folded into
# train, (userId, movieId) pairs can repeat; keep the first occurrence.
movie_matrix = pd.concat([train, test]).drop_duplicates(subset=['userId', 'movieId'], keep='first')
# Pivot to a users x movies rating matrix (NaN where unrated). Keyword
# arguments are required: positional pivot() args were removed in pandas 2.0.
movie_matrix = movie_matrix.pivot(index='userId', columns='movieId', values='rating')
movie_means = movie_matrix.mean()        # per-movie (column) mean rating
user_means = movie_matrix.mean(axis=1)   # per-user (row) mean rating
# Mean-shift each column by its movie mean, then zero-fill missing cells.
movie_shifted_temp = movie_matrix - movie_means
movie_shifted = movie_shifted_temp.fillna(0)
# Boolean mask, True where a rating was actually observed. `~` is the
# supported boolean inversion; unary `-` on booleans is a TypeError in
# modern pandas/NumPy.
mask = ~movie_shifted_temp.isnull()
def repeated_matrix_reconstruction(num_pcs,num_iterations):
    """Iteratively impute the mean-shifted rating matrix via truncated SVD.

    Each iteration projects onto ``num_pcs`` components, reconstructs the
    full matrix, then clamps the observed cells back to their true
    (mean-shifted) ratings so only missing cells keep imputed values.

    Parameters
    ----------
    num_pcs : int
        Number of SVD components to keep.
    num_iterations : int
        Number of project/reconstruct/clamp passes.

    Returns
    -------
    pandas.DataFrame
        Users x movies rating matrix, shifted back by the movie means and
        clipped to the valid rating range [0.5, 5].

    NOTE(review): this mutates the module-level ``movie_shifted`` in place,
    so a second call continues from the previous call's result instead of
    starting fresh. The script calls it twice (20 then 15 components) for
    "bagging" -- confirm this coupling is intended.
    """
    global movie_shifted
    for i in range(num_iterations):
        SVD = TruncatedSVD(n_components=num_pcs,random_state=42)
        SVD.fit(movie_shifted)
        #For the ease of applying masks we work with pandas
        movie_represented = pd.DataFrame(SVD.inverse_transform(SVD.transform(movie_shifted)),columns=movie_shifted.columns,index=movie_shifted.index)
        # Reconstruction error reported on the observed (masked) cells.
        loss = mean_squared_error(movie_represented[mask].fillna(0),movie_shifted_temp[mask].fillna(0))
        print('Iteration: {} , Loss: {} '.format(i,loss))
        # Clamp observed cells back to their true mean-shifted ratings;
        # only missing cells keep the SVD-imputed values.
        movie_represented[mask] = movie_shifted_temp[mask]
        movie_shifted = movie_represented
    #Mean shifting it back
    movie_mat = movie_shifted + movie_means
    # Ratings are bounded to the valid [0.5, 5] range.
    movie_mat = movie_mat.clip(lower=0.5,upper=5)
    return movie_mat
print("Starting truncated svd with number of components as 20")
representative_matrix_20 = repeated_matrix_reconstruction(20,10)
print("Done")
print("Starting truncated svd with number of components as 15")
representative_matrix_15 = repeated_matrix_reconstruction(15,10)
print("Done")
# Bag the two reconstructions by simple averaging.
rating_matrix = (representative_matrix_15 + representative_matrix_20) / 2
# Look up each (user, movie) test pair. `.at` is an O(1) scalar lookup and
# replaces the original per-row boolean scan over the whole index; named
# columns replace fragile positional iloc access.
trunc_prediction = np.zeros(len(test))
for i, (userid, movieid) in enumerate(zip(test['userId'].values, test['movieId'].values)):
    trunc_prediction[i] = rating_matrix.at[userid, movieid]
# Cells that stayed NaN (movies with no observed ratings, so no column
# mean) fall back to the regression prediction.
nan_mask = np.isnan(trunc_prediction)
trunc_prediction[nan_mask] = user_regression[nan_mask]
# ENSEMBLING: weighted average, 2:1 in favour of the SVD reconstruction.
PRED = (2 * trunc_prediction + 1 * user_regression) / 3  # best 2:1
PRED = np.around(PRED, 1)
PRED = np.clip(PRED, a_min=0.5, a_max=5)
# SUBMISSION
df_submission.Prediction = PRED
df_submission.to_csv('./Goodfellas_submission.csv', index=False)
print('Done!!!')