
Commit 1e33614: Initial commit (0 parents)

1 file changed: supervisedclustering.py (+181, -0 lines)

@@ -0,0 +1,181 @@
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from scipy.spatial.distance import squareform


class CorrClust(object):
    '''
    An sklearn-style class which can be trained to perform correlation
    clustering on a new dataset
    '''

    def __init__(self, balance_training=False, forest_params=None,
                 num_random_starts=10, max_iters=100):
        # use None rather than a mutable {} default for the forest params
        self.forest_params = {} if forest_params is None else forest_params
        self.balance_training = balance_training
        self.num_random_starts = num_random_starts
        self.max_iters = max_iters

    def train(self, X, Y, subsample_length):
        '''
        training the model
        unsure if parameters should go here or in the initialisation function
        '''
        if self.balance_training:
            x_pairs, y_pairs = self._form_balanced_pairs(X, Y, subsample_length)
        else:
            x_pairs, y_pairs = self._form_pairs(X, Y, subsample_length)

        # y_pairs is True for same-cluster (+ve) pairs
        print("Training - there are %d +ve pairs and %d -ve ones" %
              ((y_pairs == 1).sum(), (y_pairs == 0).sum()))

        self.rf = RandomForestClassifier(**self.forest_params)
        self.rf.fit(x_pairs, y_pairs)

    def test(self, X):
        '''
        running the model on test data
        TODO - allow for just a subset of edges to be formed, thus creating
        a sparse matrix of edge probabilities
        '''
        x_pairwise = self._form_pairs(X)
        edge_probabilities = self.rf.predict_proba(x_pairwise)[:, 1]
        prob_matrix = squareform(edge_probabilities)
        # the clusterer returns (labels, energy) - keep just the labels
        y_prediction, _ = self._correlation_clusterer(prob_matrix)
        return y_prediction

    def _form_pairs(self, X, Y=None, subsample_length=None):

        # take all pairs of items from the data
        idxs1, idxs2 = self._pair_idxs(X.shape[0])

        if (subsample_length is not None) and (subsample_length < idxs1.shape[0]):
            print(subsample_length, idxs1.shape[0])
            to_use = np.random.choice(idxs1.shape[0], subsample_length, replace=False)
            idxs1 = idxs1[to_use]
            idxs2 = idxs2[to_use]

        # each pair is described by the absolute difference of its features
        x_pairs = np.abs(X[idxs1] - X[idxs2])

        if Y is not None:
            return x_pairs, Y[idxs1] == Y[idxs2]
        else:
            return x_pairs

    def _form_balanced_pairs(self, X, Y, subsample_length=None):
        '''
        forming pairs with equal numbers of +ve and -ve edges
        must be given a Y vector for this to work
        '''
        idxs1, idxs2 = self._pair_idxs(X.shape[0])
        classes = Y[idxs1] == Y[idxs2]

        # work out how many edges of each class can be used in total
        max_edges = min(classes.sum(), (~classes).sum())

        if subsample_length is not None:
            max_edges = min(max_edges, subsample_length // 2)

        # subsample each class in turn
        to_use = np.hstack([np.random.choice(
            np.where(classes == this_class)[0], max_edges, replace=False)
            for this_class in [0, 1]])

        x_pairs = np.abs(X[idxs1[to_use]] - X[idxs2[to_use]])
        y_pairs = classes[to_use]
        return x_pairs, y_pairs

    def _pair_idxs(self, num_data):

        # A[i, j] = i, so the condensed (upper-triangular) forms of A and
        # A.T give the row and column index of each unordered pair
        A = np.outer(np.arange(num_data), np.ones(num_data))
        idxs1 = squareform(A, force='tovector', checks=False)
        idxs2 = squareform(A.T, force='tovector', checks=False)
        return idxs1.astype(int), idxs2.astype(int)
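
    # (worked example, not in the original commit: _pair_idxs(3) returns
    # idxs1 = [0, 0, 1] and idxs2 = [1, 2, 2], i.e. the pairs
    # (0, 1), (0, 2) and (1, 2) in squareform's condensed ordering)
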
    def _correlation_clusterer(self, edge_probabilities):
        '''
        does the actual correlation clustering, given a square matrix of
        edge probabilities (sparse support is still a TODO - see test())
        '''

        # convert edge probabilities to log-odds weights, clipping away
        # from 0 and 1 so the logs stay finite
        eps = 0.0001
        edge_probabilities = np.clip(edge_probabilities, eps, 1.0 - eps)
        weights = np.log(edge_probabilities / (1.0 - edge_probabilities))
        np.fill_diagonal(weights, 0)

        self.weights = weights

        # form some different starting guesses
        # use: all in same cluster, all in own cluster, then random clusterings
        N = weights.shape[0]
        start_points = [np.ones(N), np.arange(N)] + \
            [np.random.randint(0, N, N) for _ in range(self.num_random_starts)]

        # keep track of the best solution found so far
        max_energy = -np.inf
        best_Y = None

        # for each starting point, run the solver
        for start_point in start_points:
            Y, energy = self._clustering_solver(weights, start_point.astype(int))
            if energy > max_energy:
                max_energy = energy
                best_Y = Y

        return best_Y, max_energy
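
    # (note, not from the commit: with independent edge probabilities,
    # maximising the sum of within-cluster log-odds weights is the usual
    # MAP formulation of correlation clustering)
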
    def _clustering_solver(self, W, start_labels):
        '''
        the actual code that does the clustering, using the AL_ICM algorithm
        trying to MAXIMISE the energy

        This could probably be done in a much more efficient way, without the
        need for bincount on each inner loop
        '''
        labels = start_labels.copy()
        n_items = W.shape[0]
        old_energy = -np.inf

        for iteration in range(self.max_iters):

            # assign each item in turn to the best cluster
            for j in range(n_items):

                # total weight from item j into each current cluster
                cluster_scores = np.bincount(labels, weights=W[j, :])

                if np.all(cluster_scores < 0):
                    # every cluster scores negative, so a new singleton
                    # cluster (score 0) is the best move
                    labels[j] = labels.max() + 1
                else:
                    # assigning to the best existing label
                    labels[j] = np.argmax(cluster_scores)

            if iteration % 15 == 0:
                # relabel to squeeze out empty clusters, for efficiency
                _, labels = np.unique(labels, return_inverse=True)

            energy = self._clustering_energy(W, labels)

            if energy < old_energy:
                # single-item moves cannot decrease the energy
                raise Exception("This should never happen!")
            elif energy == old_energy:
                break

            old_energy = energy
        else:
            print("Reached max iters (%d) without converging" % iteration)

        _, labels = np.unique(labels, return_inverse=True)
        return labels, energy

    def _clustering_energy(self, W, Y):
        '''
        sums up all the edges between items which have been given the same
        class label (W is symmetric with a zero diagonal, so each edge is
        counted twice)
        '''
        Y = Y[None, :]
        return (W * (Y == Y.T).astype(float)).sum()
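
A minimal usage sketch (not part of the commit). The data and the forest parameters here are illustrative assumptions, chosen just to show the train/test flow:

    import numpy as np

    # hypothetical toy data: 40 training items in 3 clusters, 5 features
    rng = np.random.RandomState(0)
    X_train = rng.randn(40, 5)
    Y_train = rng.randint(0, 3, 40)
    X_test = rng.randn(30, 5)

    model = CorrClust(balance_training=True,
                      forest_params={'n_estimators': 50})
    model.train(X_train, Y_train, subsample_length=200)
    labels = model.test(X_test)  # one integer cluster label per test item

The returned labels are only defined up to relabelling, and the log-odds weight matrix from the last clustering call is kept on self.weights for inspection.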
