-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmean.py
183 lines (166 loc) · 6.06 KB
/
kmean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
'''
K-Means Algorithm
-----------------
'''
import csv
import random
import math
import pygal
def read_csv_fieldnames(filename, separator):
"""
Inputs:
filename - name of CSV file
separator - character that separates fields
Ouput:
A dictionary of tuple consist of x-y value in
the given CSV file and the maximum key value
"""
dict_location = {}
with open(filename, "rt", newline='') as csvfile:
csvreader = csv.reader(csvfile,
delimiter=separator)
index = 0
for row in csvreader:
dict_location[index] = tuple(row)
index = index + 1
return dict_location, index - 1
def distance_between(first_value, second_value):
"""
Inputs:
Two coordinate for which distance to be calculated
Oututs:
Distance
"""
x_coor = float(first_value[0]) - float(second_value[0])
y_coor = float(first_value[1]) - float(second_value[1])
sum_of_square = x_coor**2 + y_coor**2
distance = math.sqrt(sum_of_square)
return distance
def nearest_mean(plot_value, cluster_index):
"""
Inputs:
plot_value - tuple of x-y value
cluster_index - dictionary of list containing the
k-mean value
Output:
Key of the cluster_index to which plot_value is nearer to
"""
key = -1
min_dist = 99999
for index in cluster_index:
dist = distance_between(plot_value, cluster_index[index])
if dist < min_dist:
min_dist = dist
key = index
return key
def cal_new_mean(dict_location, cluster_list, k_value):
"""
Input:
dict_location - dictionary of tuple representing plots
cluster_list - list representing cluster number of each plots
k-value - Number of cluster
Output:
Calculate the new mean for each cluster
"""
new_means = {i : [0, 0] for i in range(k_value)}
no_of_points = {i : 0 for i in range(k_value)}
for plot in dict_location:
cluster_no = cluster_list[plot]
new_means[cluster_no][0] = new_means[cluster_no][0] + float(dict_location[plot][0])
new_means[cluster_no][1] = new_means[cluster_no][1] + float(dict_location[plot][1])
no_of_points[cluster_no] = no_of_points[cluster_no] + 1
for value in new_means:
new_means[value][0] = str(new_means[value][0]/no_of_points[value])
new_means[value][1] = str(new_means[value][1]/no_of_points[value])
return new_means
def check_similarity(old_means, new_means):
"""
Check for similarity
"""
if old_means==new_means:
return 1
else:
return 0
def calc_cluster(kmeaninfo, dict_location, max_key):
"""
Inputs:
kmeaninfo - information dictionary
dict_location - dictionary of tuple consist of x-y value
max_key - maximum value of key
Oututs:
A list representing plot cluster number
"""
cluster_list = [-1 for i in range(max_key + 1)]
cluster_index = {i : [0, 0] for i in range(int(kmeaninfo['k_value']))}
#print(cluster_index)
tag = 0
random_index = random.sample(range(max_key+1), int(kmeaninfo['k_value']))
#print(random_index)
index = 0
for plot_index in random_index:
cluster_index[index][0] = dict_location[plot_index][0]
cluster_index[index][1] = dict_location[plot_index][1]
index = index + 1
#print(cluster_index)
while tag == 0:
for plot_index in dict_location:
key = nearest_mean(dict_location[plot_index], cluster_index)
cluster_list[plot_index] = key
#print(cluster_list)
new_means = cal_new_mean(dict_location, cluster_list, int(kmeaninfo['k_value']))
#print(new_means)
tag = check_similarity(cluster_index, new_means)
#print(tag)
cluster_index = new_means
#print("-----------------")
return cluster_list
def render_cluster(dict_location, cluster_list):
"""
Input:
dict_location - dictionary of tuple consist of x-y value
cluster_list - list representing cluster number of each plots
Output:
render the cluster in browser
"""
cluster0 = []
cluster1 = []
cluster2 = []
for plot_no in dict_location:
if cluster_list[plot_no] == 0:
tup = tuple([float(dict_location[plot_no][0]), float(dict_location[plot_no][1])])
cluster0.append(tup)
elif cluster_list[plot_no] == 1:
tup = tuple([float(dict_location[plot_no][0]), float(dict_location[plot_no][1])])
cluster1.append(tup)
else:
tup = tuple([float(dict_location[plot_no][0]), float(dict_location[plot_no][1])])
cluster2.append(tup)
xy_chart = pygal.XY(stroke=False)
xy_chart.title = 'Cluster Plot'
xy_chart.add('Cluster 0', cluster0)
xy_chart.add('Cluster 1', cluster1)
xy_chart.add('Cluster 2', cluster2)
xy_chart.render_in_browser()
def output_cluster(kmeaninfo, outputfile):
(dict_location, max_key) = read_csv_fieldnames(kmeaninfo['data_file'], kmeaninfo['separator'])
#print(dict_location, max_key)
cluster_list = calc_cluster(kmeaninfo, dict_location, max_key)
#print(cluster_list)
with open(outputfile, "w") as writefile:
for row in dict_location:
writefile.write(str(row) + ' ' + str(cluster_list[row]) + '\n')
render_cluster(dict_location, cluster_list)
def test_kmean():
'''
code to implement the k-means algorithm and use it
to cluster the n locations into k clusters, such that
the locations in the same cluster are geographically
close to each other.
'''
kmeaninfo = {
"data_file" : "kmeans_data.csv",
"separator" : ",",
"k_value" : "3"
}
output_cluster(kmeaninfo, "output.txt")
test_kmean()