forked from COOKIESROK/Movies
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2.data.manip.R
216 lines (180 loc) · 8.51 KB
/
2.data.manip.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
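# === description ===========================================================================
## Prepares the movie and rating data (genre indicator table, user x movie
## rating matrix), explores it, and fits an item-based collaborative filtering
## (IBCF) recommender on a train/test split.
#
## NOTE: this file contains no library() calls; it assumes the packages that
## provide tstrsplit(), dcast(), the recommenderlab functions and qplot() are
## already attached.
#
## Clear chunks of code use collapsible dividers (# ==== or # ----) and are
## numbered, with a brief description of each section listed here.
## Note that # ***** or #_____ do not create sections.
# ******************************************************************************
# 1) read raw data          load movies.csv and ratings.csv
# 2) creating data frames   split genres, build the 0/1 genre table
# 3) rating matrix          reshape ratings into a realRatingMatrix
# 4) available models       list the registered recommender methods
# 5) exploration            similarities, rating counts, views, filtering
# 6) model                  train/test split and IBCF model fit
# 7) predictions            inspect the model and recommend movies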
# === 1) read raw data =========================================================
# load the movie catalogue and the user ratings; both paths are relative, so
# the files are looked up in the current working directory
movie_data <- read.csv(file = "movies.csv")
rating_data <- read.csv(file = "ratings.csv")
# preview the first rows of each table
head(x = movie_data)
head(x = rating_data)
# === 2) creating data frames ==================================================
# one-column data frame holding the genre string of every movie
movie_genre <- as.data.frame(movie_data$genres, stringsAsFactors = FALSE)
# split each genre string on "|" so that every genre of a movie lands in its
# own column (rather than all genres sharing one cell)
movie_genre2 <- as.data.frame(
  tstrsplit(movie_genre[, 1], "[|]", type.convert = TRUE),
  stringsAsFactors = FALSE
)
# number the genre columns 1..k; k is taken from the split result instead of
# assuming exactly 10 columns, so other data sets do not raise a length error
colnames(movie_genre2) <- seq_len(ncol(movie_genre2))
# the genres that get their own indicator column; genre strings not listed
# here are ignored when the matrix is filled
list_genre <- c("Action", "Adventure", "Animation", "Children",
                "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
                "Sci-Fi", "Thriller", "War", "Western")
# matrix with one header row plus one row per movie and one column per genre,
# filled with 0; the size follows the data instead of a hard-coded 10330 x 18
genre_mat1 <- matrix(0, nrow(movie_genre2) + 1, length(list_genre))
# making our genre list the names of each column
colnames(genre_mat1) <- list_genre
# first row repeats the genre names (this turns the matrix into character;
# the row is dropped again below)
genre_mat1[1, ] <- list_genre
# for every genre column of movie_genre2, look up which indicator column each
# movie's genre belongs to and set that cell to 1; movies whose entry is NA or
# not in list_genre are left untouched
for (col in seq_len(ncol(movie_genre2))) {
  gen_col <- match(movie_genre2[[col]], list_genre)
  has_genre <- which(!is.na(gen_col))
  # row offset + 1 skips the header row
  genre_mat1[cbind(has_genre + 1, gen_col[has_genre])] <- 1
}
# removing first row, which was the genre list; drop = FALSE keeps the matrix
# shape even when only a single movie remains
genre_mat2 <- as.data.frame(genre_mat1[-1, , drop = FALSE],
                            stringsAsFactors = FALSE)
head(genre_mat2)
# converting every column from character to integer
genre_mat2[] <- lapply(genre_mat2, as.integer)
# checking str (to make sure we have integers now)
str(genre_mat2)
# adding movie id and movie title (first two columns of movie_data) in front
# of the genre indicators
SearchMatrix <- cbind(movie_data[, 1:2], genre_mat2)
head(SearchMatrix)
# === 3) rating matrix =========================================================
# reshape the long ratings table to wide form (one row per userId, one column
# per movieId, cells holding the rating), drop the leading userId column, and
# convert the result into a recommenderlab "realRatingMatrix"
ratingMatrix <- as(
  as.matrix(
    dcast(
      rating_data,
      userId ~ movieId,
      value.var = "rating",
      na.rm = FALSE
    )[, -1]
  ),
  "realRatingMatrix"
)
head(ratingMatrix)
# === 4) available models ======================================================
# fetch every recommender method registered for "realRatingMatrix" data
recommendation_model <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
# show the method names and the description stored with each of them
names(recommendation_model)
lapply(recommendation_model, function(entry) entry[["description"]])
# show the parameters registered for the IBCF entry
recommendation_model[["IBCF_realRatingMatrix"]][["parameters"]]
# === 5) exploring similarity and ratings ======================================
# cosine similarity between the first four users (rows) of the rating matrix
similarity_mat <- similarity(
  ratingMatrix[seq_len(4), ],
  method = "cosine",
  which = "users"
)
as.matrix(similarity_mat)
# plot the user similarity matrix as an image
image(as.matrix(similarity_mat), main = "User's Similarities")
# the same comparison for the first four movies (columns)
movie_similarity <- similarity(
  ratingMatrix[, seq_len(4)],
  method = "cosine",
  which = "items"
)
as.matrix(movie_similarity)
image(as.matrix(movie_similarity), main = "Movies similarity")
# flatten the stored rating values so the distinct ones can be listed
rating_values <- as.vector(ratingMatrix@data)
unique(rating_values)
# how often each rating value occurs
Table_of_Ratings <- table(rating_values)
Table_of_Ratings
# number of ratings each movie (column) received
movie_views <- colCounts(ratingMatrix)
# one row per movie: its id (taken from the column names) and its view count
table_views <- data.frame(movie = names(movie_views),
                          views = movie_views)
# most viewed movies first
table_views <- table_views[order(table_views$views,
                                 decreasing = TRUE), ]
# look up every movie's title in one vectorised step instead of looping over a
# hard-coded 1:10325; match() compares the ids as text, and an id with no entry
# in movie_data gets NA rather than aborting the script
table_views$title <- as.character(
  movie_data$title[match(table_views$movie, movie_data$movieId)]
)
head(table_views)
# keep only users with more than 50 ratings and movies with more than 50
# ratings
movie_ratings <- ratingMatrix[
  rowCounts(ratingMatrix) > 50,
  colCounts(ratingMatrix) > 50
]
# 98th percentile of ratings per user and of ratings per movie
minimum_movies <- quantile(rowCounts(movie_ratings), probs = 0.98)
minimum_users <- quantile(colCounts(movie_ratings), probs = 0.98)
# mean rating given by each user
average_ratings <- rowMeans(movie_ratings)
# histogram of the per-user average rating
qplot(average_ratings, fill = I("steelblue"), col = I("red")) +
  ggtitle("Distribution of the average rating per user")
# normalize the ratings, then count the users whose mean normalized rating is
# still noticeably above zero
normalized_ratings <- normalize(movie_ratings)
sum(rowMeans(normalized_ratings) > 0.00001)
# 95th percentile of ratings per user and of ratings per movie
binary_minimum_movies <- quantile(rowCounts(movie_ratings), probs = 0.95)
binary_minimum_users <- quantile(colCounts(movie_ratings), probs = 0.95)
# binarize the ratings using a minimum rating of 3
good_rated_films <- binarize(movie_ratings, minRating = 3)
# === 6) train/test split and model ============================================
# draw one TRUE/FALSE flag per user (row of movie_ratings); each flag is TRUE
# with probability 0.8, so roughly 80% of the users end up flagged TRUE.
# NOTE: no seed is set, so the split differs between runs
sampled_data <- sample(
  c(TRUE, FALSE),
  size = nrow(movie_ratings),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
# users flagged TRUE train the model; the remaining users are held out to
# check whether the model works
training_data <- movie_ratings[sampled_data, ]
testing_data <- movie_ratings[!sampled_data, ]
# look up the registered methods again and show the IBCF parameters
recommendation_system <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
recommendation_system[["IBCF_realRatingMatrix"]][["parameters"]]
# fit an item-based collaborative filtering model with k = 30
recommen_model <- Recommender(
  data = training_data,
  method = "IBCF",
  parameter = list(k = 30)
)
# report the class of the fitted model object
class(recommen_model)
# === 7) model inspection and predictions ======================================
# pull the fitted model details out of the recommender
model_info <- getModel(recommen_model)
# report the class and the dimensions of the model's similarity matrix
class(model_info[["sim"]])
dim(model_info[["sim"]])
# setting top items to 20
top_items <- 20
# count the positive similarity entries in each row and tabulate those counts
sum_rows <- rowSums(model_info[["sim"]] > 0)
table(sum_rows)
# the same count taken per column
sum_cols <- colSums(model_info[["sim"]] > 0)
# the number of movies to recommend to each user
top_recommendations <- 10
# predict the top recommendations for every user in the held-out test data
predicted_recommendations <- predict(object = recommen_model,
                                     newdata = testing_data,
                                     n = top_recommendations)
# recommended item positions for the first test user
user1 <- predicted_recommendations@items[[1]]
# the corresponding item labels (movie ids in character form)
movies_user1 <- predicted_recommendations@itemLabels[user1]
# translate the ids to titles in one vectorised lookup instead of looping over
# a hard-coded 1:10; this also works when the user received fewer
# recommendations than requested, and an id missing from movie_data yields NA
movies_user2 <- as.character(
  movie_data$title[match(movies_user1, movie_data$movieId)]
)
# one column of recommended movie ids per test user
# NOTE(review): sapply() only returns a matrix when every user gets the same
# number of recommendations; otherwise it returns a list and the column
# subset below fails - confirm this cannot happen for the data in use
recommendation_matrix <- sapply(predicted_recommendations@items,
                      function(x){ as.integer(colnames(movie_ratings)[x]) })
# show the recommendations for the first four test users
recommendation_matrix[, 1:4]
#___ end _______________________________________________________________________