utils.py
from copy import deepcopy
import numpy as np
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pickle
import scipy.stats  # knee() uses scipy.stats.norm; a bare "import scipy" does not expose .stats
import seaborn as sns
from itertools import cycle
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, FastICA, PCA, NMF
from sklearn.metrics import pairwise_distances
__all__ = ['read_data', 'log_scale', 'std_scale', '_ica', '_tsne', '_truncatedSVD', '_nmf', 'plot_res', 'pca', '_pca', 'knee', '_hc_cut', 'cluster_merge']
def read_data(file_path):
    '''
    Load a pickled dict and return its "matrix" and "mapping1" entries
    file_path: path to the pickle file
    return: (matrix, mapping)
    '''
    with open(file_path, "rb") as f:
        data = pickle.load(file=f)
    mapping = data["mapping1"]
    _data = data["matrix"]
    return _data, mapping
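# Illustrative usage sketch (not part of the original module): how read_data is
# meant to be called. The path below is hypothetical; the pickle is assumed to
# hold a dict with "matrix" and "mapping1" keys, matching what read_data unpacks.
def _example_read_data(file_path="data/traffic.pkl"):
    X, mapping = read_data(file_path)
    print("matrix shape:", X.shape)
    print("mapped entities:", len(mapping))
    return X, mapping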
def log_scale(X):
'''
    log scaling as used in the CloudCluster paper (NSDI '22): log(X + 1)
    X: non-negative matrix, size = (n_samples, n_features)
'''
assert X.ndim == 2
_X = X + 1.0
_X = np.log(_X)
return _X
def std_scale(X):
'''
Standard scale from sklearn
X: input matrix, size = (n_samples, n_features)
'''
scale = StandardScaler()
_X = scale.fit_transform(X)
return _X
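# Illustrative sketch: the two scalers above are typically chained, first the
# log transform for heavy-tailed non-negative counts, then standardization.
# The Poisson matrix below is synthetic and only stands in for real data.
def _example_preprocess():
    rng = np.random.default_rng(0)
    X = rng.poisson(lam=5.0, size=(100, 20)).astype(float)  # non-negative counts
    X_log = log_scale(X)      # log(X + 1)
    X_std = std_scale(X_log)  # zero mean, unit variance per feature
    return X_std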
def _ica(X, n_dim, max_iter = 1000, tol = 0.001, whiten = "unit-variance"):
'''
FastICA dimensionality reduction
X: input data, size = (n_samples, n_features)
return: _X, size = (n_samples, n_dim)
'''
transformer=FastICA(n_components=n_dim, whiten=whiten, max_iter=max_iter, tol=tol)
_X = transformer.fit_transform(X)
return _X
def _tsne(X, n_dim, n_iter = 1000, lr = 'auto'):
'''
TSNE dimensionality reduction
X: input raw data, size = (n_samples, n_features)
    n_dim: number of output dimensions (<= 3 with the default Barnes-Hut method)
return: _X, size = (n_samples, n_dim)
'''
    _X = TSNE(n_components=n_dim, n_iter=n_iter, learning_rate=lr).fit_transform(X)
return _X
def _nmf(X, n_dim, random_state = 0):
'''
NMF dimensionality reduction
X: input raw data, size = (n_samples, n_features)
    n_dim: number of components to keep
return: W, size = (n_samples, n_dim)
'''
nmf = NMF(n_components=n_dim, random_state=random_state)
W = nmf.fit_transform(X)
    H = nmf.components_  # (n_dim, n_features) basis matrix; computed but not returned
return W
def _truncatedSVD(X, n_dim, n_iter = 1000, random_state=43):
'''
TruncatedSVD dimensionality reduction
X: input raw data, size = (n_samples, n_features)
'''
_X = TruncatedSVD(n_components=n_dim, n_iter=n_iter, random_state=random_state).fit_transform(X)
return _X
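# Illustrative sketch: the reducers above share the same (X, n_dim) call
# pattern, so they are easy to sweep over. The "reducers" dict is an example
# construct, and X is assumed non-negative (e.g. the log-scaled matrix) so
# that NMF is applicable.
def _example_reducers(X, n_dim=2):
    reducers = {"ica": _ica, "svd": _truncatedSVD, "nmf": _nmf}
    results = {}
    for name, reduce_fn in reducers.items():
        results[name] = reduce_fn(X, n_dim)
        print("%s -> %s" % (name, results[name].shape))
    return results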
def _pca(X, n_dim):
    '''
    PCA dimensionality reduction (sklearn)
    X: input raw data, size = (n_samples, n_features)
    return: _X, size = (n_samples, n_dim); evr, explained variance ratio per component
    '''
# _X = PCA(n_components=n_dim).fit_transform(X)
pca = PCA(n_components=n_dim)
pca = pca.fit(X)
_X = pca.transform(X)
evr = pca.explained_variance_ratio_
return _X, evr
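# Illustrative sketch: using the explained-variance ratio returned by _pca to
# check how much variance a given n_dim keeps. The 0.9 cut-off is an arbitrary
# example value, not something the module prescribes.
def _example_pca_variance(X, n_dim=10):
    X_low, evr = _pca(X, n_dim)
    kept = float(np.sum(evr))
    print("explained variance with %d components: %.3f" % (n_dim, kept))
    if kept < 0.9:
        print("consider increasing n_dim")
    return X_low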
def pca(data, n_dim):
'''
    Plain-numpy PCA; the eigendecomposition of the D x D covariance is O(D^3)
data: (n_samples, n_features(D))
:param n_dim: target dimensions
:return: (n_samples, n_dim)
'''
data = data - np.mean(data, axis = 0, keepdims = True)
    cov = np.dot(data.T, data)  # unnormalized covariance (scatter) matrix; scaling does not change the projection
    eig_values, eig_vector = np.linalg.eigh(cov)  # cov is symmetric, so eigh keeps the result real
# print(eig_values)
indexs_ = np.argsort(-eig_values)[:n_dim]
picked_eig_values = eig_values[indexs_]
picked_eig_vector = eig_vector[:, indexs_]
data_ndim = np.dot(data, picked_eig_vector)
return data_ndim
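# Illustrative sketch: the plain-numpy pca above and the sklearn-based _pca
# should span the same subspace; individual components may only differ in
# sign, so the check compares absolute correlations. Synthetic data only.
def _example_compare_pca(n_dim=2):
    rng = np.random.default_rng(1)
    X = rng.normal(size=(200, 8))
    Z_manual = pca(X, n_dim)
    Z_sklearn, _ = _pca(X, n_dim)
    for j in range(n_dim):
        corr = np.corrcoef(Z_manual[:, j], Z_sklearn[:, j])[0, 1]
        print("component %d: |corr| = %.3f" % (j, abs(corr)))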
def plot_res(X = None, label = None, K = 3, tsne = False, save = False, alg=None, show=False):
'''
plot the clustering results
X: input data, size = (n_samples, n_features)
    label: clustering labels, size = (n_samples, )
    K: number of clusters
    tsne: project to 2-D with t-SNE for better visualization
'''
assert K == np.unique(label).shape[0]
# colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'b', 'g', 'r']
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
shape = cycle('+*o.')
_X = deepcopy(X)
if tsne:
_X = _tsne(_X, 2)
ax = plt.figure().subplots(1, 1)
for k, color, sp in zip(set(label), colors, shape):
data = _X[np.where(label == k)[0], :]
        print(data.shape[0])  # size of cluster k
x, y = data[:, 0], data[:, 1]
ax.scatter(x, y, c = color, marker = sp)
if show: plt.show()
if save: plt.savefig('fig/'+alg+str(K)+'.png')
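# Illustrative sketch: plot_res fed with labels from a KMeans fit. KMeans is
# imported locally because the module itself does not use it; the cluster
# count and the tsne flag are example choices, and X is assumed to have
# comfortably more samples than the default t-SNE perplexity (~30).
def _example_plot_kmeans(X, k=3):
    from sklearn.cluster import KMeans
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    plot_res(X=X, label=labels, K=k, tsne=True, alg="kmeans", show=True)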
def _hc_cut(n_clusters, children, n_leaves):
"""Function cutting the ward tree for a given number of clusters.
Parameters
----------
n_clusters : int or ndarray
The number of clusters to form.
children : ndarray of shape (n_nodes-1, 2)
The children of each non-leaf node. Values less than `n_samples`
correspond to leaves of the tree which are the original samples.
A node `i` greater than or equal to `n_samples` is a non-leaf
node and has children `children_[i - n_samples]`. Alternatively
at the i-th iteration, children[i][0] and children[i][1]
are merged to form node `n_samples + i`.
n_leaves : int
Number of leaves of the tree.
Returns
-------
labels : array [n_samples]
Cluster labels for each point.
"""
if n_clusters > n_leaves:
raise ValueError(
"Cannot extract more clusters than samples: "
"%s clusters where given for a tree with %s leaves."
% (n_clusters, n_leaves)
)
# In this function, we store nodes as a heap to avoid recomputing
# the max of the nodes: the first element is always the smallest
# We use negated indices as heaps work on smallest elements, and we
# are interested in largest elements
    # children[-1] is the root of the tree
    # nodes is a min-heap of negated node indices, so -nodes[0] is always the
    # largest (most recently formed) node still on the heap
nodes = [-(max(children[-1]) + 1)]
for _ in range(n_clusters - 1):
# As we have a heap, nodes[0] is the smallest element
these_children = children[-nodes[0] - n_leaves]
# Insert the 2 children and remove the largest node
heappush(nodes, -these_children[0])
heappushpop(nodes, -these_children[1])
label = np.zeros(n_leaves, dtype=np.intp)
for i, node in enumerate(nodes):
label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i
return label
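# Illustrative sketch: _hc_cut lets one fitted ward tree be cut at several
# cluster counts without refitting. AgglomerativeClustering and make_blobs are
# imported locally; the blob data and the list of cluster counts are example
# choices only.
def _example_hc_cut():
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=60, centers=4, random_state=0)
    model = AgglomerativeClustering(linkage="ward").fit(X)
    for k in (2, 3, 4):
        labels = _hc_cut(k, model.children_, model.n_leaves_)
        print("k=%d -> cluster sizes %s" % (k, np.bincount(labels).tolist()))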
def knee(data):
'''
    Plot the CDF of the inconsistency values, to eyeball the knee point
    data: np.ndarray or list of inconsistency values
'''
norm_cdf = scipy.stats.norm.cdf(data)
    sns.lineplot(x=data, y=norm_cdf)  # keyword args; newer seaborn no longer accepts positional x/y
    plt.hist(data, bins=10000, cumulative=True, histtype='step', density=True)
plt.show()
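# Illustrative sketch: knee is typically fed the inconsistency values of a
# hierarchical-clustering linkage. scipy's hierarchy helpers are imported
# locally, and the Gaussian data is only a synthetic stand-in.
def _example_knee():
    from scipy.cluster.hierarchy import linkage, inconsistent
    rng = np.random.default_rng(2)
    X = rng.normal(size=(100, 5))
    Z = linkage(X, method="ward")
    inc = inconsistent(Z)[:, 3]  # inconsistency coefficient of each merge
    knee(inc)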
def cluster_merge(X, labels, threshold = 0.001):
    '''
    Cluster-merge step from CloudCluster (NSDI '22): repeatedly merge the pair
    of clusters whose centroids are closest in cosine distance, until the
    smallest centroid distance exceeds threshold.
    '''
res = labels
tmp_labels = deepcopy(labels)
cnt = 0
    while True:
labels_ = np.unique(tmp_labels)
        centroid = []
        mapping = {}
        for idx, i in enumerate(labels_):
            # calculate the centroid of each remaining cluster
            centroid.append(np.mean(X[np.where(tmp_labels == i)[0]], axis=0))
            mapping[idx] = i
        center = np.array(centroid)
# distance matrix size = (n, n)
# n = center.shape[0]
distance = pairwise_distances(center, metric = 'cosine')
for i in range(distance.shape[0]):
distance[i, i] = np.inf
# size = (n, )
value_min = np.min(distance, axis = 1)
        if np.min(value_min) > threshold: break
idx_min = np.argmin(distance, axis = 1)
x = np.argmin(value_min)
y = idx_min[x]
x_label = mapping[x]
y_label = mapping[y]
        # merge cluster x into cluster y
tmp_labels[np.where(tmp_labels == x_label)[0]] = y_label
res = tmp_labels
cnt += 1
print("%d cluster(s) merged"%cnt)
return res
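# Illustrative sketch: cluster_merge applied to deliberately over-segmented
# KMeans labels. The blob data, the initial k=8, and the 0.01 cosine threshold
# are example values; the threshold used in the paper may differ.
def _example_cluster_merge():
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
    over_labels = KMeans(n_clusters=8, n_init=10, random_state=0).fit_predict(X)
    merged = cluster_merge(X, over_labels, threshold=0.01)
    print("clusters before: %d, after: %d"
          % (np.unique(over_labels).shape[0], np.unique(merged).shape[0]))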