-
Notifications
You must be signed in to change notification settings - Fork 0
/
theta_vs_k.py
77 lines (61 loc) · 1.71 KB
/
theta_vs_k.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import sys
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from numpy import linalg as LA
import matplotlib.pyplot as plt
from cluster import Cluster, ClusterSet
# ---- Command line: <data_file> <min_k> <theta> --------------------------
# Fail fast with a usage message instead of a bare IndexError traceback
# when one of the three required arguments is missing.
if len(sys.argv) < 4:
    sys.exit("usage: %s <data_file> <min_k> <theta>" % sys.argv[0])

fileName = sys.argv[1]
minK = int(sys.argv[2])
# NOTE(review): the sweep loop further down reuses the name `theta` as its
# loop variable, so this command-line value is overwritten before it is
# ever read -- confirm whether that is intended.
theta = float(sys.argv[3])

# ---- Data file -----------------------------------------------------------
# The first line of the file holds a single integer: the number of value
# columns per row (vectorLength). Every following row is read as
# column 0 -> ids, columns 1..vectorLength -> data.
vectorLength = None
with open(fileName, 'r') as dataFile:
    firstLine = dataFile.readline()
    vectorLength = int(firstLine)

# skiprows=1 skips the header line that was parsed above.
data = np.loadtxt(fileName,
                  skiprows=1,
                  usecols=range(1, vectorLength + 1))
ids = np.loadtxt(fileName,
                 skiprows=1,
                 usecols=[0])
def plot_cluster(cluster):
    """Draw one cluster: every member curve plus its centroid, then show it.

    `cluster` must expose `points` (an iterable of sequences to plot),
    `centroid` (a sequence to plot) and `label` (used in the title).
    """
    # Each member of the cluster is drawn as a black line.
    for member in cluster.points:
        plt.plot(member, color='black')

    # The centroid is overlaid as hollow circles with a red outline.
    centroid_marker = {
        'markerfacecolor': 'None',
        'markeredgewidth': 2,
        'markeredgecolor': 'red',
    }
    plt.plot(cluster.centroid, 'o', **centroid_marker)

    heading = '#' + str(cluster.label)
    plt.title(heading)
    plt.xlabel('Hour')
    plt.ylabel('Normal Usage')
    plt.show()
K = minK
# Threshold values to sweep: 0.0001 * [8, 9], i.e. roughly 0.0008 and 0.0009.
thetaValues = .0001 * np.arange(8, 10)
# Final K reached for each entry of thetaValues, in the same order.
clusterSizes = []

for theta in thetaValues:
    # Single-argument print() behaves the same on Python 2 and Python 3.
    # The string is concatenated by hand so no separator space is inserted
    # after the newline, matching the output of the old print statement.
    print("Theta:\n" + str(theta))
    K = minK  # every sweep restarts from the minimum cluster count

    # Initial cluster set
    clusterSet = ClusterSet(data)
    clusterSet.normalize()

    # Refit with a growing K until findViolations reports nothing for
    # this theta; K grows by one per reported label on each pass.
    while True:
        clusterSet.fitData(K)
        n_v = clusterSet.findViolations(theta)
        K += len(n_v)
        print(K)

        # Explicit length check (rather than truthiness) because the
        # container type returned by findViolations is not visible here.
        if len(n_v) == 0:
            clusterSizes.append(K)
            break

        for label in n_v:
            clusterSet.splitLabel(label)
# Two scatter plots of the sweep results against theta: first the raw
# cluster counts, then their natural logarithm. Both share the same x range.
theta_axis_max = 1.1 * max(thetaValues)

for transform, title_text, y_text in (
        (lambda sizes: sizes,
         "Number of clusters as a function of $\\theta$",
         "Number of clusters"),
        (np.log,
         "log(Number of clusters) as a function of $\\theta$",
         "log(Number of clusters)"),
):
    y_values = transform(clusterSizes)
    plt.scatter(thetaValues, y_values)
    plt.title(title_text)
    plt.xlabel("$\\theta$")
    plt.ylabel(y_text)
    # Axes start at the origin and leave 10% headroom above the largest value.
    plt.xlim(0, theta_axis_max)
    plt.ylim(0, 1.1 * max(y_values))
    plt.show()