-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpectralClusteringAcceleratedRobust.py
197 lines (151 loc) · 7.18 KB
/
SpectralClusteringAcceleratedRobust.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import numpy as np
import scipy.sparse as sp
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import KMeans
from Nystrom import eigendecompositionNystrom
class SCAR:
"""
Implementation of SCAR as proposed in the Bachelor Thesis
'Acceleration Robust Spectral Clustering Using the Nyström Method'
This code is based on the implementation of Robust Spectral Clustering by:
Aleksandar Bojchevski, Yves Matkovic, and Stephan Günnemann.
2017. Robust Spectral Clustering for Noisy Data.
In Proceedings of KDD’17, August 13–17, 2017, Halifax, NS, Canada.
Ellen Hohma
"""
def __init__(self, k, nn, alpha, theta=20, m=0.5, laplacian=0, n_iter=50, normalize=False, weighted=False, verbose=False):
"""
:param k: number of clusters
:param nn: number of neighbours to consider for constructing the KNN graph (excluding the node itself)
:param alpha: percentage of landmark points selected as subsample for the Nyström method from the original dataset
:param theta: number of corrupted edges to remove
:param m: minimum percentage of neighbours to keep per node (omega_i constraints)
:param laplacian: which graph Laplacian to use: 0: L, 1: L_rw, 2: L_sym
:param n_iter: number of iterations of the alternating optimization procedure
:param normalize: whether to row normalize the eigenvectors before performing k-means
:param weighted: use weighted (True) or unweighted (False) k-nn as similarity graph
:param verbose: verbosity
"""
self.k = k
self.nn = nn
self.alpha = alpha
self.theta = theta
self.m = m
self.laplacian = laplacian
self.n_iter = n_iter
self.normalize = normalize
self.weighted = weighted
self.verbose = verbose
if laplacian == 0:
if self.verbose:
print('Using unnormalized Laplacian L')
elif laplacian == 1:
if self.verbose:
print('Using random walk based normalized Laplacian L_rw')
elif laplacian == 2:
if self.verbose:
print('Using symmetric normalized Laplacian L_sym')
else:
raise ValueError('Choice of graph Laplacian not valid. Please use 0, 1 or 2.')
def __latent_decomposition(self, X):
"""
:param X: input datapoints to be clustered
:return Ag: good graph, containing only edges not identified as noise
:return Ac: corrupted graph, containing all edges identified as noise
:return H: spectral embedding of the selected graph Laplacian
"""
# compute weighted k-nn graph
if self.weighted == True:
A = kneighbors_graph(X=X, n_neighbors=self.nn, metric='euclidean', include_self=False, mode='distance')
gamma = -1.0/(X.shape[0]*X.shape[1])
A.data = np.exp(gamma*(A.data**2))
# compute unweighted k-nn graph
else:
A = kneighbors_graph(X=X, n_neighbors=self.nn, metric='euclidean', include_self=False, mode='connectivity')
# make the graph undirected
A = A.maximum(A.T)
# number of nodes
N = A.shape[0]
# node degrees
deg = A.sum(0).A1
# check the trace for convergence
prev_trace = np.inf
Ag = A.copy()
for it in range(self.n_iter):
# form the unnormalized Laplacian
D = sp.diags(Ag.sum(0).A1).tocsc()
L = D - Ag
# form random-walk Laplacian
if self.laplacian == 1:
D_minus = D
D_minus.data = 1/D.data
L = D_minus.dot(L)
# form symmetric Laplacian
elif self.laplacian == 2:
D_minus_half = D
D_minus_half.data = 1/(np.sqrt(D.data))
L = L.dot(D_minus_half)
L = D_minus_half.dot(L)
elif self.laplacian != 0:
raise ValueError("the laplacian {} is not defined".format(self.laplacian))
# calculate eigendecomposition using the Nyström method
H, lam, subsample_pos = eigendecompositionNystrom(L, self.k, self.alpha)
trace = lam.sum()
if self.verbose:
print('Iter: {} Trace: {:.4f}'.format(it, trace))
if self.theta == 0:
# no edges are removed
Ac = sp.coo_matrix((N, N), [np.int32])
break
if prev_trace - trace < 1e-10:
# we have converged
break
allowed_to_remove_per_node = (deg * self.m).astype(np.int32)
prev_trace = trace
# consider only the edges on the lower triangular part since we are symmetric
edges = sp.tril(A).nonzero()
removed_edges = []
# for random-walk Laplacian
if self.laplacian == 1:
# fix for potential numerical instability of the eigenvalues computation
lam[np.isclose(lam, 0)] = 0
# refers to equation 3.11 in the thesis
p = np.linalg.norm(H[edges[0]] - H[edges[1]], axis=1) ** 2 \
- np.linalg.norm(H[edges[0]] * np.sqrt(lam), axis=1) ** 2 \
- np.linalg.norm(H[edges[1]] * np.sqrt(lam), axis=1) ** 2
# for unnormalized and symmetric Laplacian
else:
# refers to equation 3.10 in the thesis
p = np.linalg.norm(H[edges[0]] - H[edges[1]], axis=1) ** 2
# greedly remove the worst edges
for ind in p.argsort()[::-1]:
e_i, e_j, p_e = edges[0][ind], edges[1][ind], p[ind]
# remove the edge if it satisfies the constraints
if allowed_to_remove_per_node[e_i] > 0 and allowed_to_remove_per_node[e_j] > 0 and p_e > 0:
allowed_to_remove_per_node[e_i] -= 1
allowed_to_remove_per_node[e_j] -= 1
removed_edges.append((e_i, e_j))
if len(removed_edges) == self.theta:
break
removed_edges = np.array(removed_edges)
Ac = sp.coo_matrix((np.ones(len(removed_edges)), (removed_edges[:, 0], removed_edges[:, 1])), shape=(N, N))
Ac = Ac.maximum(Ac.T)
Ag = A - Ac
return Ag, Ac, H
def fit_predict(self, X):
"""
:param X: array-like or sparse matrix, shape (n_samples, n_features)
:return labels: cluster labels ndarray, shape (n_samples)
"""
Ag, Ac, H = self.__latent_decomposition(X)
self.Ag = Ag
self.Ac = Ac
# norm row-wise to increase eigenvector accuracy and stability
if self.normalize:
self.H = np.nan_to_num(H / np.linalg.norm(H, axis=1)[:, None])
else:
self.H = H
# cluster rows of obtained eigenvectors to obtain clutering labels
labels = KMeans(n_clusters=self.k).fit_predict(self.H)
self.labels = labels
return labels