Skip to content

Commit 275eb16

Browse files
author
quangh
committed
kmeans
1 parent f6c887e commit 275eb16

File tree

4 files changed

+212
-0
lines changed

4 files changed

+212
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
##### K-means Algorithm Mathematics base
2+
3+
###### Problem:
4+
5+
Given a data set $\{x_1, ..., x_N\} \subset \mathbb{R}^d$ and an integer $K \le N$, our goal is to partition the data set into K clusters.
6+
7+
-------------------------
8+
9+
Let $\{u_k\}$, with $u_k \in \mathbb{R}^d$ and $k = 1, ..., K$, be the set of cluster centers. Our goal is then to find an assignment of data points to clusters such that the sum of the squares of the distances of each data point to its closest center $u_k$ is minimum.
10+
11+
Find min:
12+
13+
$\sum_{i=1}^N \min_{k} \|x_i - u_k\|_2^2$
14+
15+
For each data point $x_i$, we introduce a set of binary indicators $\{y_{ij} \in \{0,1\}\}$, where $j = 1, ..., K$. If data point $x_i$ is assigned to cluster k, then $y_{ik} = 1$ and $y_{ij} = 0$ for $j \neq k$. Now, we can define an objective function:
16+
17+
​ $\mathcal{L}(\mathbf{Y}, \mathbf{U}) = \sum_{i=1}^N\sum_{j=1}^K y_{ij} \|\mathbf{x}_i - \mathbf{u}_j\|_2^2$
18+
19+
We can minimize this function through an iterative procedure in which each iteration
20+
involves two successive steps corresponding to successive optimizations with respect
21+
to the $y_{ij}$ and $u_k$.
22+
23+
- fixed U, find Y:
24+
25+
​ $\mathbf{y}_i = \arg\min_{\mathbf{y}_i} \sum_{j=1}^K y_{ij}\|\mathbf{x}_i - \mathbf{u}_j\|_2^2 ~~~ (3)$
26+
27+
​ $\text{subject to:} ~~ y_{ij} \in \{0, 1\}~~ \forall j;~~~ \sum_{j = 1}^K y_{ij} = 1$
28+
29+
$\Leftrightarrow y_{ij^*} = 1 ~~\text{where}~~ j^* = \arg\min_{j} \|\mathbf{x}_i - \mathbf{u}_j\|_2^2$
30+
31+
In other words, we simply assign point $x_i$ to the closest cluster center.
32+
33+
- fixed Y, find U:
34+
35+
​ $\mathbf{u}_j = \arg\min_{\mathbf{u}_j} \sum_{i = 1}^{N} y_{ij}\|\mathbf{x}_i - \mathbf{u}_j \|_2^2.$
36+
37+
Objective function is a quadratic function of $u_j$ so it can be minimized by setting its derivative to 0:
38+
39+
​ $\frac{\partial l(\mathbf{u}_j)}{\partial \mathbf{u}_j} = 2\sum_{i=1}^N y_{ij}(\mathbf{u}_j - \mathbf{x}_i) = 0$
40+
41+
​ $\Rightarrow \mathbf{u}_j = \frac{ \sum_{i=1}^N y_{ij} \mathbf{x}_i}{\sum_{i=1}^N y_{ij}}$
42+
43+
44+
45+
The denominator in this expression is equal to the number of points assigned to cluster j, and the numerator is the sum of all points assigned to cluster j. So this result has a simple interpretation: set $u_j$ equal to the mean of all of the data points $x_i$ assigned to cluster j.
46+
47+
--------------------------
48+
49+
Because each phase reduces the value of the objective function, convergence of the algorithm is assured. However, it may converge to a local rather than global minimum.
50+
51+
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
1.
3+
4+
O()
5+
6+
n : number of points
7+
K : number of clusters
8+
I : number of iterations
9+
d : number of attributes
10+
11+
"""
12+
13+
import numpy as np
14+
import matplotlib.pyplot as plt
15+
from scipy.spatial.distance import cdist
16+
17+
# Synthetic data set: three 2-D Gaussian blobs of N points each, drawn around
# the entries of `means` with a shared identity covariance, stacked row-wise.
means = [[0, 0], [1, 8], [5, 2]]
N = 500
cov = [[1, 0], [0, 1]]
# The generator is consumed left to right, so the blobs are sampled in the
# same order as three separate calls would sample them.
X0, X1, X2 = (np.random.multivariate_normal(mu, cov, N) for mu in means)
X = np.concatenate((X0, X1, X2), axis=0)
24+
25+
26+
def kmeans_display(X, label):
    """Scatter-plot the points of clusters 0, 1 and 2 and show the figure.

    X     : 2-D array of points; columns 0 and 1 are used as plot coordinates.
    label : per-point cluster ids, compared element-wise against 0, 1 and 2
            to build row masks (presumably a NumPy array as long as X --
            confirm against the caller).
    """
    # One matplotlib format string per cluster id, in id order:
    # blue triangles, green circles, red squares.
    formats = ('b^', 'go', 'rs')
    for cluster_id, fmt in enumerate(formats):
        members = X[label == cluster_id, :]
        plt.plot(members[:, 0], members[:, 1], fmt, markersize=4, alpha=.8)

    plt.axis('equal')
    plt.plot()
    plt.show()
38+
39+
def init_center(X, k):
    """Pick k distinct rows of X at random to serve as initial centers.

    Rows are chosen by index without replacement, so no row index is used
    twice. Returns the selected rows as an array with k rows.
    """
    n_points = X.shape[0]
    chosen_rows = np.random.choice(n_points, k, replace=False)
    return X[chosen_rows]
41+
42+
def assign_labels(X, centers):
    """Return, for every row of X, the index of its nearest center.

    Distances come from scipy's cdist with its default metric; the result is
    a 1-D integer array with one entry per row of X.
    """
    # Row i of the distance matrix holds the distances from X[i] to each center.
    return cdist(X, centers).argmin(axis=1)
45+
46+
def update_centers(X, labels, K):
    """Recompute each of the K centers as the mean of its assigned points.

    X      : 2-D array of points, one per row.
    labels : NumPy array of cluster ids, one per row of X.
    K      : number of clusters.

    Returns a (K, X.shape[1]) float array. A cluster that received no points
    is re-seeded with a randomly chosen row of X instead of being averaged.
    """
    centers = np.zeros((K, X.shape[1]))
    for k in range(K):
        members = X[labels == k, :]
        if members.shape[0] == 0:
            # The mean of an empty selection is NaN, and a NaN center never
            # compares equal to anything, so a convergence check based on
            # equality could never succeed. Re-seed from the data instead.
            centers[k, :] = X[np.random.choice(X.shape[0])]
        else:
            centers[k, :] = members.mean(axis=0)

    return centers
53+
54+
def is_converged(centers, new_centers):
    """Tell whether two collections of centers hold exactly the same points.

    Each center is turned into a tuple and the two collections are compared
    as sets, so row order is ignored and coordinates must match exactly.
    """
    previous = {tuple(row) for row in centers}
    current = {tuple(row) for row in new_centers}
    return previous == current
57+
58+
def kmeans(X, K, max_iter=300):
    """Cluster the rows of X into K groups with Lloyd's k-means iteration.

    X        : 2-D array of points, one per row.
    K        : number of clusters.
    max_iter : upper bound on the number of center updates. It guarantees
               termination even if the centers never compare equal between
               two consecutive rounds.

    Returns (centers, labels, count): the final centers, the label of every
    point with respect to those centers, and the number of center updates
    that were applied.
    """
    centers = init_center(X, K)
    labels = assign_labels(X, centers)
    count = 0
    while count < max_iter:
        new_centers = update_centers(X, labels, K)
        if is_converged(centers, new_centers):
            break

        count += 1
        centers = new_centers
        # Keep the labels in step with the centers that will be returned.
        labels = assign_labels(X, centers)

    return centers, labels, count
72+
73+
def bisecting_kmeans(X, K):
    """Cluster the rows of X into up to K groups by repeated bisection.

    Starts with a single cluster centered on the mean of X. While fewer than
    K clusters exist, the cluster with the largest sum of squared distances
    to its center is split in two with kmeans(..., 2).

    Returns (centers, labels): an array with one center per row and an
    integer label for every row of X. Fewer than K clusters are returned
    when no remaining cluster has any spread left to split.
    """
    labels = np.zeros(X.shape[0], dtype=int)
    centers = [np.mean(X, axis=0).tolist()]
    while len(centers) < K:
        # Find the cluster with the largest within-cluster squared error.
        # Starting the running maximum at 0 skips clusters whose points all
        # coincide with their center, since splitting them gains nothing.
        worst, worst_sse = None, 0.0
        for idx, cluster_center in enumerate(centers):
            members = X[labels == idx]
            sse = float(np.sum((members - np.asarray(cluster_center)) ** 2))
            if sse > worst_sse:
                worst, worst_sse = idx, sse

        if worst is None:
            break

        member_rows = np.flatnonzero(labels == worst)
        sub_centers, sub_labels, _ = kmeans(X[member_rows], 2)

        # The first half keeps the old cluster id; the second half gets a new one.
        centers[worst] = np.asarray(sub_centers[0]).tolist()
        centers.append(np.asarray(sub_centers[1]).tolist())
        labels[member_rows[np.asarray(sub_labels) == 1]] = len(centers) - 1

    return np.array(centers), labels


bisecting_kmeans(X, 3)
80+
+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
1. choose K random points as init centers
3+
2. assign each points to its closest center
4+
3. re-compute the centers
5+
4. if centers are unchanged, stop the algorithm
6+
5. back to step 2
7+
8+
O(n * K * I * d)
9+
10+
n : number of points
11+
K : number of clusters
12+
I : number of iterations
13+
d : number of attributes
14+
15+
"""
16+
17+
import numpy as np
18+
import matplotlib.pyplot as plt
19+
from scipy.spatial.distance import cdist
20+
21+
# Synthetic data set: three 2-D Gaussian blobs of N points each, drawn around
# the entries of `means` with a shared identity covariance, stacked row-wise.
means = [[0, 0], [1, 8], [5, 2]]
N = 500
cov = [[1, 0], [0, 1]]
# The generator is consumed left to right, so the blobs are sampled in the
# same order as three separate calls would sample them.
X0, X1, X2 = (np.random.multivariate_normal(mu, cov, N) for mu in means)
X = np.concatenate((X0, X1, X2), axis=0)
28+
29+
30+
def kmeans_display(X, label):
    """Scatter-plot the points of clusters 0, 1 and 2 and show the figure.

    X     : 2-D array of points; columns 0 and 1 are used as plot coordinates.
    label : per-point cluster ids, compared element-wise against 0, 1 and 2
            to build row masks (presumably a NumPy array as long as X --
            confirm against the caller).
    """
    # One matplotlib format string per cluster id, in id order:
    # blue triangles, green circles, red squares.
    formats = ('b^', 'go', 'rs')
    for cluster_id, fmt in enumerate(formats):
        members = X[label == cluster_id, :]
        plt.plot(members[:, 0], members[:, 1], fmt, markersize=4, alpha=.8)

    plt.axis('equal')
    plt.plot()
    plt.show()
42+
43+
def init_center(X, k):
    """Pick k distinct rows of X at random to serve as initial centers.

    Rows are chosen by index without replacement, so no row index is used
    twice. Returns the selected rows as an array with k rows.
    """
    n_points = X.shape[0]
    chosen_rows = np.random.choice(n_points, k, replace=False)
    return X[chosen_rows]
45+
46+
def assign_labels(X, centers):
    """Return, for every row of X, the index of its nearest center.

    Distances come from scipy's cdist with its default metric; the result is
    a 1-D integer array with one entry per row of X.
    """
    # Row i of the distance matrix holds the distances from X[i] to each center.
    return cdist(X, centers).argmin(axis=1)
49+
50+
def update_centers(X, labels, K):
    """Recompute each of the K centers as the mean of its assigned points.

    X      : 2-D array of points, one per row.
    labels : NumPy array of cluster ids, one per row of X.
    K      : number of clusters.

    Returns a (K, X.shape[1]) float array. A cluster that received no points
    is re-seeded with a randomly chosen row of X instead of being averaged.
    """
    centers = np.zeros((K, X.shape[1]))
    for k in range(K):
        members = X[labels == k, :]
        if members.shape[0] == 0:
            # The mean of an empty selection is NaN, and a NaN center never
            # compares equal to anything, so a convergence check based on
            # equality could never succeed. Re-seed from the data instead.
            centers[k, :] = X[np.random.choice(X.shape[0])]
        else:
            centers[k, :] = members.mean(axis=0)

    return centers
57+
58+
def is_converged(centers, new_centers):
    """Tell whether two collections of centers hold exactly the same points.

    Each center is turned into a tuple and the two collections are compared
    as sets, so row order is ignored and coordinates must match exactly.
    """
    previous = {tuple(row) for row in centers}
    current = {tuple(row) for row in new_centers}
    return previous == current
61+
62+
def kmean(X, K, max_iter=300):
    """Cluster the rows of X into K groups with Lloyd's k-means iteration.

    X        : 2-D array of points, one per row.
    K        : number of clusters.
    max_iter : upper bound on the number of center updates. It guarantees
               termination even if the centers never compare equal between
               two consecutive rounds.

    Returns (centers, labels, count): the final centers, the label of every
    point with respect to those centers, and the number of center updates
    that were applied.
    """
    centers = init_center(X, K)
    labels = assign_labels(X, centers)
    count = 0
    while count < max_iter:
        new_centers = update_centers(X, labels, K)
        if is_converged(centers, new_centers):
            break

        count += 1
        centers = new_centers
        # Keep the labels in step with the centers that will be returned.
        labels = assign_labels(X, centers)

    return centers, labels, count
76+
77+
# Run k-means with three clusters on the synthetic data, report how many
# center updates were needed, and plot the resulting partition.
centers, labels, count = kmean(X, 3)

# The function-call form is valid on both Python 2 and Python 3; the bare
# `print count` statement is a syntax error on Python 3.
print(count)
kmeans_display(X, labels)
81+

0 commit comments

Comments
 (0)