-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
127 lines (97 loc) · 3.67 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import csv
import numpy as np
import pickle
import time
'''
The code herein will separate the A matrix into k separate data matrices,
with 1/k of the nonzero values held out of each one for validation:
A -> A1, A2, A3, A4, A5 ... Ak training matrices
-> Y1, Y2, Y3, Y4, Y5 ... Yk validation matrices
'''
# For each fold c (0 <= c < k): every nonzero value of A whose running index
# (counting nonzero values in row-major order) is congruent to c modulo k is
# moved out of training matrix c, where it becomes 0, and into validation
# matrix c, where it keeps its value A[i,j].
def k_cross(k=10, A=None):
    '''
    Create the training and validation matrices for k-fold cross validation.

    The nonzero entries of A are visited in row-major order and dealt out
    round-robin: the nonzero entry with running index `it` belongs to fold
    `it % k`. In that fold the entry is set to 0 in the training matrix,
    copied into the validation matrix and flagged in the boolean mask.

    param:
        k - number of training/validation sets, must be at least 1
        A - optional (m x n) numpy data matrix to split; when omitted the
            default data matrix is read with load_data_matrix()
    returns:
        training_matrices - list of k (m x n) copies of A, each with its own
                            fold of nonzero values set to 0
        prediction_matrices - list of k (m x n) validation matrices that are 0
                              everywhere except at the values omitted from the
                              corresponding training matrix
        index_matrices - list of k (m x n) boolean masks where each True
                         element marks a value that was transplanted from the
                         training matrix to the validation matrix
    raises:
        ValueError - if k is smaller than 1
    '''
    if k < 1:
        raise ValueError('k must be at least 1, got {}'.format(k))
    if A is None:
        A = load_data_matrix()
    m, n = A.shape
    print('A.shape = {}'.format(A.shape))

    # np.nonzero returns coordinates in row-major order, which is the same
    # order a nested "for i / for j" scan would meet the nonzero entries in.
    rows, cols = np.nonzero(A)
    fold_of_entry = np.arange(rows.size) % k

    training_matrices = []
    prediction_matrices = []
    index_lists = []
    for fold in range(k):
        in_fold = fold_of_entry == fold
        fold_rows = rows[in_fold]
        fold_cols = cols[in_fold]

        training = A.copy()
        training[fold_rows, fold_cols] = 0

        held_out = np.zeros((m, n))
        held_out[fold_rows, fold_cols] = A[fold_rows, fold_cols]

        moved = np.zeros((m, n), dtype=bool)
        moved[fold_rows, fold_cols] = True

        training_matrices.append(training)
        prediction_matrices.append(held_out)
        index_lists.append(moved)
    return training_matrices, prediction_matrices, index_lists
def load_data_matrix(filename='data_matrix.p', path='data'):
    '''
    Load a pickled data matrix from disk.

    The data sits in one location, so the defaults are normally all you need.
    param:
        filename - name of the pickle file, including its file extension
        path - directory holding the file; pass '' (or None) when the
               directory is already part of filename
    returns:
        A - whatever object the pickle file holds; for the default dataset
            this is described as a (users x movies) matrix of recommendation
            scores
    '''
    filepath = filename if not path else '{}/{}'.format(path, filename)
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    # The context manager closes the file handle even if unpickling fails.
    with open(filepath, 'rb') as handle:
        return pickle.load(handle)
def get_MSE(mat1, mask, mat2=''):
    '''
    Get the mean squared error between two matrices over the masked entries.

    param:
        mat1 - matrix to compare against the original
        mask - boolean mask with the same shape as the matrices that is True
               only for the values that mat1 filled in (this can be taken
               directly from the results of k_cross())
        mat2 - matrix holding the reference values; when left at its default
               ('' or None) the original data matrix is read with
               load_data_matrix()
    return:
        mse - float value of the mean squared error for predicted rating,
              NaN when the mask selects no entries
    '''
    # Check the "not supplied" sentinel by type instead of `mat2 == ''`:
    # comparing a numpy array with a string is an elementwise operation and
    # its result cannot be used as the condition of an `if`.
    if mat2 is None or (isinstance(mat2, str) and mat2 == ''):
        mat2 = load_data_matrix()
    # Work in float so small integer / unsigned dtypes cannot overflow or
    # wrap around when subtracting and squaring.
    reference = np.asarray(mat2[mask], dtype=float)
    predicted = np.asarray(mat1[mask], dtype=float)
    diff = reference - predicted
    if diff.size == 0:
        return np.nan
    return np.dot(diff, diff) / diff.size
if __name__ == '__main__':
    # This little test checks the most important quality of the split:
    # a value that was moved into a validation matrix must not still be
    # present in the corresponding training matrix.
    # This is a bit overboard, but helps me sleep at night.
    # You really don't need to run this often.
    k = 10
    train_mats, val_mats, masks = k_cross(k=k)
    print('MSE = {}'.format(get_MSE(train_mats[0], masks[0])))

    start = time.time()
    for train, val in zip(train_mats, val_mats):
        # Count the cells that are nonzero in both matrices of this fold in
        # one vectorised pass instead of visiting every cell from Python.
        clashes = np.count_nonzero((train != 0) & (val != 0))
        # One message per offending cell, as a cell-by-cell scan would print.
        for _ in range(clashes):
            print('we have a problem')
    end = time.time()
    print('you wasted {} seconds of my life'.format(end-start))