SimpleSVC.py (forked from josiahw/SimpleSVClustering)
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 11 20:40:56 2016
@author: josiahw
"""
import numpy
import numpy.linalg
def polyKernel(a, b, pwr):
    return numpy.dot(a, b)**pwr

def rbfKernel(a, b, gamma):
    # note: this uses the unsquared Euclidean norm, i.e. an exponential/Laplacian-style RBF
    return numpy.exp(-gamma * numpy.linalg.norm(a - b))
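# Usage sketch (illustrative, not part of the original module): extra keyword arguments
# given to SimpleSVClustering are forwarded to the kernel on every call, so rbfKernel's
# `gamma` is supplied at construction time, e.g.:
#
#     clf = SimpleSVClustering(C=0.1, tolerance=1e-10, kernel=rbfKernel, gamma=3.5)
#     clf.fit(data)               # `data` is an (n_samples, n_features) numpy array
#     labels = clf.predict(data)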
class SimpleSVClustering:
    w = None
    a = None
    b = None
    C = None
    sv = None
    kernel = None
    tolerance = None
    verbose = False

    def __init__(self,
                 C,
                 tolerance = 0.001,
                 kernel = numpy.dot,
                 **kwargs
                 ):
        """
        The parameters are:
         - C: SVC cost
         - tolerance: gradient descent solution accuracy
         - kernel: the kernel function to use, called as kernel(a, b, **kwargs)
         - kwargs: extra parameters passed through to the kernel
        """
        self.C = C
        self.kernel = kernel
        self.tolerance = tolerance
        self.kwargs = kwargs
        if self.verbose:
            print(self.kwargs)
    def _checkClass(self, a, b, n_checks = 5):
        """
        Do a straight-line interpolation between a and b, using n_checks segments.
        Returns True if a and b are connected by a high-probability region (every
        interpolated point falls inside the fitted radius), False otherwise.
        NOTE: the original authors suggested 20 segments, but that is slow, so we use 5.
        In practice it is pretty good.
        """
        for i in numpy.arange(1.0/n_checks, 1.0, 1.0/n_checks):
            if self._predict(i*a + (1-i)*b) > self.b:
                return False
        return True
    def _getAllClasses(self, X):
        """
        Assign a cluster label to each vector based on connected graph components.
        TODO: the outputs of this should really be saved in order to embed new points into the clusters.
        """
        # 1: build the connected clusters
        unvisited = list(range(len(X)))
        clusters = []
        while len(unvisited):
            # create a new cluster with the first unvisited node
            c = [unvisited[0]]
            unvisited.pop(0)
            i = 0
            while i < len(c) and len(unvisited):
                # for all nodes in the cluster, add all connected unvisited nodes
                # and remove them from the unvisited list
                unvisitedNew = []
                for j in unvisited:
                    (c if self._checkClass(X[c[i],:], X[j,:]) else unvisitedNew).append(j)
                unvisited = unvisitedNew
                i += 1
            clusters.append(c)
        # 2: assign each point the index of the cluster it belongs to
        classifications = numpy.zeros(len(X)) - 1
        for i in range(len(clusters)):
            for c in clusters[i]:
                classifications[c] = i
        return classifications
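    # Illustrative example of the loop above: with four points where _checkClass connects
    # 0-1 and 2-3 but not 0-2, the first pass grows c = [0, 1] and the second grows
    # c = [2, 3], so clusters = [[0, 1], [2, 3]] and the returned labels are [0., 0., 1., 1.].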
    def fit(self, X):
        """
        Fit to the (unlabelled) data X.
        """
        """
        Construct the Q matrix for solving
        """
        Q = numpy.zeros((len(X), len(X)))
        for i in range(len(X)):
            for j in range(i, len(X)):
                Qval = 1.
                Qval *= self.kernel(*((X[i,:], X[j,:])), **self.kwargs)
                Q[i,j] = Q[j,i] = Qval

        """
        Solve for a and w simultaneously by coordinate descent.
        This means no quadratic solver is needed!
        The support vectors correspond to non-zero values in a.
        """
        self.w = numpy.zeros(X.shape[1])
        self.a = numpy.zeros(X.shape[0])
        delta = 10000000000.0
        while delta > self.tolerance:
            delta = 0.
            for i in range(len(X)):
                g = numpy.dot(Q[i,:], self.a) - Q[i,i]
                adelta = self.a[i] - min(max(self.a[i] - g/Q[i,i], 0.0), self.C)
                self.w += adelta * X[i,:]
                delta += abs(adelta)
                self.a[i] -= adelta
            if self.verbose:
                print("Descent step magnitude:", delta)

        # keep only the data for the support vectors (coefficients above C/100)
        Qshrunk = Q[self.a >= self.C/100., :][:, self.a >= self.C/100.]
        self.sv = X[self.a >= self.C/100., :]
        self.a = (self.a)[self.a >= self.C/100.]

        # do an all-pairs contour check: weight Qshrunk by the contribution of all SVs
        for i in range(len(self.a)):
            for j in range(len(self.a)):
                Qshrunk[i,j] *= self.a[i] * self.a[j]

        # this constant term is needed for the radius calculation in _predict
        self.bOffset = numpy.sum(numpy.sum(Qshrunk))

        if self.verbose:
            print("Number of support vectors:", len(self.a))

        """
        Select support vectors and solve for b to get the final classifier
        """
        self.b = numpy.mean(self._predict(self.sv))
        if self.verbose:
            print("Bias value:", self.b)
    def _predict(self, X):
        """
        For SVClustering, we need to calculate radius rather than bias.
        """
        if len(X.shape) < 2:
            X = X.reshape((1, -1))
        clss = numpy.zeros(len(X))
        for i in range(len(X)):
            clss[i] += self.kernel(*((X[i,:], X[i,:])), **self.kwargs)
            for j in range(len(self.sv)):
                clss[i] -= 2 * self.a[j] * self.kernel(*((self.sv[j,:], X[i,:])), **self.kwargs)
        return (clss + self.bOffset)**0.5
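    # Note on _predict(): the value returned for each row x is the kernel-space distance
    # to the weighted centre of the fitted sphere,
    #     R(x) = sqrt( k(x, x) - 2 * sum_j a_j * k(sv_j, x) + sum_ij a_i * a_j * k(sv_i, sv_j) ),
    # where the constant double sum is precomputed in fit() and stored as self.bOffset.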
    def predict(self, X):
        """
        Predict cluster labels for data X.
        NOTE: this should really be done with either the fitting data or a superset of the fitting data.
        """
        return self._getAllClasses(X)
if __name__ == '__main__':
    import sklearn.datasets

    data, labels = sklearn.datasets.make_moons(400, noise=0.01, random_state=0)
    data -= numpy.mean(data, axis=0)

    # parameters can be sensitive; these ones work for the two moons data
    C = 0.1
    clss = SimpleSVClustering(C, 1e-10, rbfKernel, gamma=3.5)
    clss.fit(data)

    # check the assigned classes for the two moons as a classification error
    t = clss.predict(data)
    print("Error", numpy.sum((labels - t)**2) / float(len(data)))

    from matplotlib import pyplot

    # generate a heatmap of the radius function and display the classified clusters
    a = numpy.zeros((100, 100))
    for i in range(100):
        for j in range(100):
            a[j, i] = clss._predict(numpy.array([i*4/100. - 2, j*4/100. - 2]))[0]
    pyplot.imshow(a, cmap='hot', interpolation='nearest')
    data *= 25.
    data += 50.
    pyplot.scatter(data[t==0, 0], data[t==0, 1], c='r')
    pyplot.scatter(data[t==1, 0], data[t==1, 1], c='b')
    pyplot.show()
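    # To cluster previously unseen points, the NOTE on predict() suggests running it on
    # the training data plus the new points together (sketch; `new_points` is a
    # hypothetical (m, 2) array, not defined in this file):
    #     combined = numpy.vstack([data, new_points])
    #     all_labels = clss.predict(combined)
    #     new_labels = all_labels[len(data):]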