-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
206 lines (173 loc) · 8.12 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import numpy as np
#################################
# DO NOT IMPORT OHTER LIBRARIES
#################################
def get_k_means_plus_plus_center_indices(n, n_cluster, x, generator=np.random):
'''
:param n: number of samples in the data
:param n_cluster: the number of cluster centers required
:param x: data - numpy array of points
:param generator: random number generator. Use it in the same way as np.random.
In grading, to obtain deterministic results, we will be using our own random number generator.
:return: a list of length n_clusters with each entry being the *index* of a sample
chosen as centroid.
'''
p = generator.randint(0, n) #this is the index of the first center
centers = []
centers.append(p)
#############################################################################
# TODO: implement the rest of Kmeans++ initialization. To sample an example
# according to some distribution, first generate a random number between 0 and
# 1 using generator.rand(), then find the the smallest index n so that the
# cumulative probability from example 1 to example n is larger than r.
#############################################################################
for i in range(1, n_cluster):
prob = np.zeros((n, 1))
for j in range(0, n):
prob[j] = np.min(np.linalg.norm(x[centers] - x[j], axis = 1)) ** 2 + prob[j - 1]
num = generator.rand() * prob[n - 1]
centers.append(np.argmax(prob >= num))
# DO NOT CHANGE CODE BELOW THIS LINE
return centers
# Vanilla initialization method for KMeans
def get_lloyd_k_means(n, n_cluster, x, generator):
return generator.choice(n, size=n_cluster)
class KMeans():
'''
Class KMeans:
Attr:
n_cluster - Number of clusters for kmeans clustering (Int)
max_iter - maximum updates for kmeans clustering (Int)
e - error tolerance (Float)
'''
def __init__(self, n_cluster, max_iter=100, e=0.0001, generator=np.random):
self.n_cluster = n_cluster
self.max_iter = max_iter
self.e = e
self.generator = generator
def fit(self, x, centroid_func=get_lloyd_k_means):
'''
Finds n_cluster in the data x
params:
x - N X D numpy array
returns:
A tuple in the following order:
- final centroids, a n_cluster X D numpy array,
- a length (N,) numpy array where cell i is the ith sample's assigned cluster's index (start from 0),
- number of times you update the assignment, an Int (at most self.max_iter)
'''
assert len(x.shape) == 2, "fit function takes 2-D numpy arrays as input"
self.generator.seed(42)
N, D = x.shape
self.centers = x[centroid_func(len(x), self.n_cluster, x, self.generator)]
iteration = 0
membership = np.zeros(N, dtype=np.int64)
cost = float('inf')
###################################################################
# TODO: Update means and membership until convergence
# (i.e., average K-mean objective changes less than self.e)
# or until you have made self.max_iter updates.
###################################################################
while iteration < self.max_iter:
iteration = iteration + 1;
for i in range(0, N):
membership[i] = np.argmin(np.linalg.norm(self.centers - x[i], axis=1))
for i in range(0, self.n_cluster):
self.centers[i] = np.mean(x[membership == i], axis=0)
new_cost = np.mean(np.linalg.norm(x - self.centers[membership]) ** 2)
if abs(cost - new_cost) < self.e:
break
cost = new_cost
return (self.centers, membership, iteration)
class KMeansClassifier():
'''
Class KMeansClassifier:
Attr:
n_cluster - Number of clusters for kmeans clustering (Int)
max_iter - maximum updates for kmeans clustering (Int)
e - error tolerance (Float)
'''
def __init__(self, n_cluster, max_iter=100, e=1e-6, generator=np.random):
self.n_cluster = n_cluster
self.max_iter = max_iter
self.e = e
self.generator = generator
def fit(self, x, y, centroid_func=get_lloyd_k_means):
'''
Train the classifier
params:
x - N X D size numpy array
y - (N,) size numpy array of labels
returns:
None
Store following attributes:
self.centroids : centroids obtained by kmeans clustering (n_cluster X D numpy array)
self.centroid_labels : labels of each centroid obtained by
majority voting (numpy array of length n_cluster)
'''
assert len(x.shape) == 2, "x should be a 2-D numpy array"
assert len(y.shape) == 1, "y should be a 1-D numpy array"
assert y.shape[0] == x.shape[0], "y and x should have same rows"
self.generator.seed(42)
N, D = x.shape
################################################################
# TODO:
# - assign means to centroids (use KMeans class you implemented,
# and "fit" with the given "centroid_func" function)
# - assign labels to centroid_labels
################################################################
k_means = KMeans(self.n_cluster, self.max_iter, self.e)
centroids, membership, i = k_means.fit(x, centroid_func)
centroid_labels = np.zeros(self.n_cluster)
labels = np.unique(y)
for i in range(0, self.n_cluster):
centroid_labels[i] = np.bincount(y[membership == i]).argmax()
# DO NOT CHANGE CODE BELOW THIS LINE
self.centroid_labels = centroid_labels
self.centroids = centroids
assert self.centroid_labels.shape == (
self.n_cluster,), 'centroid_labels should be a numpy array of shape ({},)'.format(self.n_cluster)
assert self.centroids.shape == (
self.n_cluster, D), 'centroid should be a numpy array of shape {} X {}'.format(self.n_cluster, D)
def predict(self, x):
'''
Predict function
params:
x - N X D size numpy array
returns:
predicted labels - numpy array of size (N,)
'''
assert len(x.shape) == 2, "x should be a 2-D numpy array"
self.generator.seed(42)
N, D = x.shape
##########################################################################
# TODO:
# - for each example in x, predict its label using 1-NN on the stored
# dataset (self.centroids, self.centroid_labels)
##########################################################################
labels = np.zeros(N)
for i in range(0, N):
index = np.argmin(np.linalg.norm(self.centroids - x[i], axis=1))
labels[i] = self.centroid_labels[index]
return labels
def transform_image(image, code_vectors):
'''
Quantize image using the code_vectors (aka centroids)
Return a new image by replacing each RGB value in image with the nearest code vector
(nearest in euclidean distance sense)
returns:
numpy array of shape image.shape
'''
assert image.shape[2] == 3 and len(image.shape) == 3, \
'Image should be a 3-D array with size (?,?,3)'
assert code_vectors.shape[1] == 3 and len(code_vectors.shape) == 2, \
'code_vectors should be a 2-D array with size (?,3)'
##############################################################################
# TODO
# - replace each pixel (a 3-dimensional point) by its nearest code vector
##############################################################################
new_image = np.zeros(image.shape)
for i in range(0, image.shape[0]):
for j in range(0, image.shape[1]):
new_image[i][j] = code_vectors[np.argmin(np.linalg.norm(code_vectors - new_image[i][j], axis = 1))]
return new_image