-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultiview_cluster_ensemble.py
165 lines (140 loc) · 4.86 KB
/
multiview_cluster_ensemble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# coding: utf-8
"""
description: Multi-view document clustering via ensemble method
https://link.springer.com/article/10.1007/s10844-014-0307-6
Paper authors: Syed Fawad Hussain, Muhammad Mushtaq, Zahid Halim
author: Suraj Iyer
"""
__all__ = [
'cluster_based_similarity_matrix',
'pairwise_dissimilarity_matrix',
'affinity_matrix',
'aggregate_matrices',
'multiview_ensemble_similarity'
]
import pandas as pd
import time
import utils as npu
from scipy.spatial import distance
import numpy as np
from typing import Tuple
def cluster_based_similarity_matrix(
        partitions: pd.DataFrame) -> np.ndarray:
    """
    Calculate cluster based similarity matrix.

    Two points are similar in proportion to the number of views whose
    clustering placed them in the same cluster.

    Parameters:
    -----------
    partitions: pd.DataFrame
        Output of clustering algorithms on multiple views of the data.
        Each row corresponds to a data point and each column a view.
        The cells are the output of cluster algorithm(s) for given data point
        conditioned on each view.

    Returns:
    --------
    np.ndarray
        Cluster based similarity matrix of shape (n_points, n_points),
        values in [0, 1], ones on the diagonal.
    """
    # Encode each view's cluster labels as integer codes so that hamming
    # distance counts the fraction of views on which two points disagree.
    codes = partitions.apply(
        lambda s: s.astype('category').cat.codes, axis=0).values
    # pdist computes the condensed pairwise hamming distances; squareform
    # expands it to a symmetric matrix with a zero diagonal, so 1 - D has
    # ones on the diagonal — equivalent to the former manual loop, vectorized.
    dissimilarity = distance.squareform(
        distance.pdist(codes, metric='hamming'))
    return 1. - dissimilarity
def pairwise_dissimilarity_matrix(partitions: np.ndarray) -> np.ndarray:
    """
    Calculate pairwise dissimilarity matrix for given cluster partitions.

    Parameters:
    -----------
    partitions: np.ndarray
        Output of clustering algorithms on multiple views of the data.
        Each row corresponds to a data point and each column a view.
        The cells are the output of cluster algorithm(s) for given data point
        conditioned on each view.

    Returns:
    --------
    np.ndarray
        Pairwise dissimilarity matrix
    """
    # Row-wise dissimilarity of the partitions, then cosine similarity
    # between those dissimilarity rows (both delegated to the utils module).
    dissimilar_rows = npu.rowwise_dissimilarity(partitions)
    return npu.rowwise_cosine_similarity(dissimilar_rows)
def affinity_matrix(distance_matrix: np.ndarray, c: float) -> np.ndarray:
    """
    Calculate affinity matrix for given distance matrix.

    Applies a Gaussian kernel: exp(-d^2 / c), so zero distance maps to
    affinity 1 and large distances decay toward 0.

    Parameters:
    -----------
    distance_matrix: np.ndarray
        distance matrix (opposite of similarity matrix)
    c: float
        scaling factor

    Returns:
    --------
    np.ndarray
        Affinity matrix
    """
    squared_distances = np.square(distance_matrix)
    return np.exp(-squared_distances / c)
def aggregate_matrices(
        cbsm: np.ndarray, pdm: np.ndarray,
        afm: np.ndarray, tol: float = 1e-08) -> np.ndarray:
    """
    Combine the given similarity matrices into one similarity matrix.

    Parameters:
    -----------
    cbsm: np.ndarray
        Cluster based similarity matrix
    pdm: np.ndarray
        Pairwise dissimilarity matrix
    afm: np.ndarray
        Affinity matrix
    tol: float
        tolerance value for validating that the averaged distance matrix
        lies within [0, 1]

    Returns:
    --------
    np.ndarray
        Combined similarity matrix

    Raises:
    -------
    ValueError
        If the averaged distance matrix falls outside [0, 1] beyond ``tol``,
        i.e. the inputs were not valid similarity matrices.
    """
    # D = distance matrix; D = 1 - S, where S is the mean of the three
    # similarity matrices.
    D = 1. - ((cbsm + pdm + afm) / 3.)
    # Validate with an explicit exception rather than `assert`, which is
    # silently stripped when Python runs with -O.
    d_min, d_max = np.min(D), np.max(D)
    if d_min < -tol or (d_max - 1.) > tol:
        raise ValueError(
            "Averaged distance matrix must lie within [0, 1] "
            f"(tol={tol}); got range [{d_min}, {d_max}].")
    # Enforce the ultrametric inequality on D before mapping back to
    # similarity space.
    D_new = npu.convert_to_ultrametric(D)
    return 1. - D_new
def multiview_ensemble_similarity(
        partitions, *similarity_matrices,
        affinity_c=.1, verbose=True) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the multi-view ensemble similarity matrix.

    Builds the cluster-based similarity matrix, the pairwise dissimilarity
    matrix and the mean affinity matrix, then aggregates them via
    :func:`aggregate_matrices`.

    Parameters:
    -----------
    partitions: pd.DataFrame
        Per-view cluster assignments: one row per data point, one column
        per view (as accepted by :func:`cluster_based_similarity_matrix`).
    similarity_matrices: np.ndarray
        One distance matrix per view; each is converted to an affinity
        matrix and the results are averaged. At least one is required.
    affinity_c: float
        Scaling factor forwarded to :func:`affinity_matrix`.
    verbose: bool
        If True, print progress messages and per-step wall-clock timings.

    Returns:
    --------
    Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
        (combined similarity matrix, cluster based similarity matrix,
        pairwise dissimilarity matrix, mean affinity matrix)

    Raises:
    -------
    ValueError
        If no similarity matrices are given (averaging none would silently
        produce NaN).
    """
    if not similarity_matrices:
        raise ValueError("At least one similarity matrix is required.")
    if verbose:
        print("Creating cluster-based similarity matrix.")
    start = time.time()
    cbsm = cluster_based_similarity_matrix(partitions)
    if verbose:
        print(f"Time to run: {time.time() - start}")
        print("Creating pairwise dissimilarity matrix.")
    start = time.time()
    pdm = pairwise_dissimilarity_matrix(partitions.values)
    if verbose:
        print(f"Time to run: {time.time() - start}")
        print("Creating affinity matrix.")
    start = time.time()
    # Average the per-view affinity matrices into one consensus matrix.
    afm = np.mean(
        [affinity_matrix(sim_m, c=affinity_c)
         for sim_m in similarity_matrices], axis=0)
    if verbose:
        print(f"Time to run: {time.time() - start}")
        print("Aggregating the matrices.")
    start = time.time()
    similarity_matrix = aggregate_matrices(cbsm, pdm, afm)
    if verbose:
        print(f"Time to run: {time.time() - start}")
    return similarity_matrix, cbsm, pdm, afm