-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyi_ren_hclust.py
174 lines (160 loc) · 4.52 KB
/
yi_ren_hclust.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import sys
import math
import itertools
import heapq
def get_dimension_nop_list(data):
    """Load the points stored in a comma-separated file.

    Every line of the file becomes one point: all comma-separated fields
    except the last one are parsed as float coordinates; the last field is
    ignored here.

    Side effects: sets the module globals ``number_of_points`` (number of
    lines read) and ``dimensions`` (coordinate count of the first point, or
    0 when the file is empty).

    Args:
        data: path of the input file.

    Returns:
        A list with one ``[[index], [coordinates]]`` entry per line, where
        ``index`` is the zero-based line number.

    Raises:
        ValueError: if a coordinate field cannot be parsed as a float.
    """
    global number_of_points
    global dimensions
    points = []
    # The context manager closes the file even when a field fails to parse.
    with open(data) as handle:
        for index, line in enumerate(handle):
            fields = line.split(',')
            coordinates = [float(field) for field in fields[:-1]]
            points.append([[index], coordinates])
    number_of_points = len(points)
    # An empty file has no first point to measure, so report 0 dimensions
    # instead of failing with an IndexError.
    dimensions = len(points[0][1]) if points else 0
    return points
def No_of_point():
    """Return the point indices ``[0, 1, ..., number_of_points - 1]``.

    Reads the module global ``number_of_points``, so that global must have
    been assigned before this function is called.
    """
    return list(range(number_of_points))
def get_heap(List):
    """Build a min-heap holding the distance between every pair of entries.

    Args:
        List: list of ``[ids, coordinates]`` entries.

    Returns:
        A ``heapq``-ordered list whose items have the form
        ``[distance, [[ids_a, coords_a], [ids_b, coords_b]]]``, one item per
        unordered pair of entries in ``List``.
    """
    pair_heap = []
    for first, second in itertools.combinations(List, 2):
        entry = [
            distance(first[1], second[1]),
            [[first[0], first[1]], [second[0], second[1]]],
        ]
        heapq.heappush(pair_heap, entry)
    return pair_heap
def distance(list1, list2):
    """Return the Euclidean distance between two points.

    Args:
        list1: coordinates of the first point.
        list2: coordinates of the second point.

    Returns:
        The square root of the summed squared coordinate differences
        (``0.0`` for two empty points).

    Raises:
        ValueError: if the two points do not have the same number of
            coordinates.
    """
    # Comparing points of different dimension is meaningless; fail loudly
    # instead of silently ignoring the extra coordinates of the longer one.
    if len(list1) != len(list2):
        raise ValueError(
            'points have different dimensions: %d and %d'
            % (len(list1), len(list2)))
    squared_sum = 0
    for x, y in zip(list1, list2):
        squared_sum += (x - y) * (x - y)
    return math.sqrt(squared_sum)
def _detach_cluster(clusters, members):
    """Remove the cluster whose member indices are ``members`` from ``clusters``."""
    # A single point is stored as its bare index; a merged cluster is stored
    # as the list of its member indices.
    if len(members) == 1 and members[0] in clusters:
        clusters.remove(members[0])
    else:
        clusters.remove(members)


def hcluster(heap, No_of_points, List, K):
    """Agglomeratively merge the closest clusters until ``K`` remain.

    On each step the pair with the smallest distance is popped from the
    heap, the two entries are replaced by one merged entry, and the heap is
    rebuilt from the remaining entries with ``get_heap``.  The merged
    entry's coordinates are the unweighted midpoint of the two merged
    entries' coordinates (not a size-weighted mean).

    ``List`` and ``No_of_points`` are modified in place; merging stops as
    soon as ``K`` clusters remain.

    Args:
        heap: min-heap of ``[distance, [[ids_a, coords_a], [ids_b, coords_b]]]``
            items covering every pair of entries in ``List``.
        No_of_points: current clusters; a single point is a bare index, a
            merged cluster is a list of indices.
        List: list of ``[ids, coordinates]`` entries, one per current cluster.
        K: requested number of clusters (anything ``int()`` accepts).

    Returns:
        A new list describing the ``K`` clusters, in the same
        bare-index / list-of-indices form as ``No_of_points``.

    Raises:
        ValueError: if ``K`` is not between 1 and the number of entries in
            ``List``.
    """
    k = int(K)
    cluster_times = len(List)
    if k < 1 or k > cluster_times:
        raise ValueError(
            'K must be between 1 and %d, got %d' % (cluster_times, k))

    while cluster_times > k:
        closest = heapq.heappop(heap)
        first, second = closest[1]
        first_ids, first_coords = first
        second_ids, second_coords = second

        List.remove(first)
        List.remove(second)
        _detach_cluster(No_of_points, first_ids)
        _detach_cluster(No_of_points, second_ids)

        merged_ids = list(first_ids) + list(second_ids)
        No_of_points.append(merged_ids)

        merged_coords = [
            (value + second_coords[axis]) / 2
            for axis, value in enumerate(first_coords)
        ]
        List.append([merged_ids, merged_coords])

        cluster_times -= 1
        # The heap is only needed if another merge follows; skipping the
        # final rebuild avoids a useless pass over all pairs.
        if cluster_times > k:
            heap = get_heap(List)

    return list(No_of_points)
def _sorted_pairs(members):
    """Return every unordered pair of ``members`` as an ascending tuple."""
    return [tuple(sorted(pair)) for pair in itertools.combinations(members, 2)]


def precision_recall(data, hc):
    """Print pair-counting recall and precision of a clustering.

    The gold standard is read from ``data``: the last comma-separated field
    of each line is the label of the point whose index is the zero-based
    line number.  Two points form a gold pair when they share a label and a
    cluster pair when they share a cluster in ``hc``.

    recall    = matching pairs / gold pairs
    precision = matching pairs / cluster pairs

    A ratio whose denominator is zero (no pairs at all) is reported as 0.0.

    Args:
        data: path of the labelled input file.
        hc: clusters to evaluate; each item is either a list of point
            indices or a bare index standing for a single-point cluster.

    Returns:
        None.  Recall is printed first, then precision, one per line.
    """
    # Group point indices by label in a single pass over the file.
    gold_groups = {}
    with open(data) as handle:
        for index, line in enumerate(handle):
            label = line.split(',')[-1].rstrip('\r\n')
            gold_groups.setdefault(label, []).append(index)

    # A set makes each membership test below constant-time.
    gold_pairs = set()
    for members in gold_groups.values():
        gold_pairs.update(_sorted_pairs(members))

    cluster_pairs = []
    for cluster in hc:
        # A bare index is a single-point cluster and contributes no pairs.
        if isinstance(cluster, (list, tuple)):
            cluster_pairs.extend(_sorted_pairs(cluster))

    matched = sum(1 for pair in cluster_pairs if pair in gold_pairs)
    recall = float(matched) / len(gold_pairs) if gold_pairs else 0.0
    pre = float(matched) / len(cluster_pairs) if cluster_pairs else 0.0
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(recall)
    print(pre)
if __name__ == '__main__':
    # Command line: <script> <data_file> <number_of_clusters>
    if len(sys.argv) < 3:
        sys.stderr.write(
            'usage: %s <data_file> <number_of_clusters>\n' % sys.argv[0])
        sys.exit(1)
    inputdata = sys.argv[1]
    inputdata1 = sys.argv[2]
    List = get_dimension_nop_list(inputdata)
    heap = get_heap(List)
    No_of_points = No_of_point()
    hc = hcluster(heap, No_of_points, List, inputdata1)
    precision_recall(inputdata, hc)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(hc)