# Correlation.py
import time

import numpy as np
import tensorflow as tf
from scipy.spatial.distance import pdist, squareform
from scipy.stats import pearsonr, spearmanr, rankdata


class Correlation:
@staticmethod
def distance_corr(var_1, var_2, normedweight, power=1):
"""
https://github.com/gkasieczka/DisCo
var_1: First variable to decorrelate (eg mass)
var_2: Second variable to decorrelate (eg classifier output)
normedweight: Per-example weight. Sum of weights should add up to N (where N is the number of examples)
power: Exponent used in calculating the distance correlation
        var_1, var_2 and normedweight should all be 1D tf tensors with the same number of entries
Usage: Add to your loss function. total_loss = BCE_loss + lambda * distance_corr
"""
xx = tf.reshape(var_1, [-1, 1])
xx = tf.tile(xx, [1, tf.size(var_1)])
xx = tf.reshape(xx, [tf.size(var_1), tf.size(var_1)])
yy = tf.transpose(xx)
amat = tf.abs(xx-yy)
xx = tf.reshape(var_2, [-1, 1])
xx = tf.tile(xx, [1, tf.size(var_2)])
xx = tf.reshape(xx, [tf.size(var_2), tf.size(var_2)])
yy = tf.transpose(xx)
bmat = tf.abs(xx-yy)
amatavg = tf.reduce_mean(amat*normedweight, axis=1)
bmatavg = tf.reduce_mean(bmat*normedweight, axis=1)
minuend_1 = tf.tile(amatavg, [tf.size(var_1)])
minuend_1 = tf.reshape(minuend_1, [tf.size(var_1), tf.size(var_1)])
minuend_2 = tf.transpose(minuend_1)
Amat = amat-minuend_1-minuend_2+tf.reduce_mean(amatavg*normedweight)
minuend_1 = tf.tile(bmatavg, [tf.size(var_2)])
minuend_1 = tf.reshape(minuend_1, [tf.size(var_2), tf.size(var_2)])
minuend_2 = tf.transpose(minuend_1)
Bmat = bmat-minuend_1-minuend_2+tf.reduce_mean(bmatavg*normedweight)
ABavg = tf.reduce_mean(Amat*Bmat*normedweight,axis=1)
AAavg = tf.reduce_mean(Amat*Amat*normedweight,axis=1)
BBavg = tf.reduce_mean(Bmat*Bmat*normedweight,axis=1)
if power==1:
dCorr = tf.reduce_mean(ABavg*normedweight)/tf.sqrt(tf.reduce_mean(AAavg*normedweight)*tf.reduce_mean(BBavg*normedweight))
elif power==2:
dCorr = (tf.reduce_mean(ABavg*normedweight))**2/(tf.reduce_mean(AAavg*normedweight)*tf.reduce_mean(BBavg*normedweight))
else:
dCorr = (tf.reduce_mean(ABavg*normedweight)/tf.sqrt(tf.reduce_mean(AAavg*normedweight)*tf.reduce_mean(BBavg*normedweight)))**power
return dCorr
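
    # Usage sketch (a minimal illustration, not from the DisCo repo): the
    # docstring above suggests total_loss = BCE_loss + lambda * distance_corr.
    # Inside a custom training step this might look like the following, where
    # `model`, `features`, `labels`, `mass`, and `lam` are hypothetical names:
    #
    #     with tf.GradientTape() as tape:
    #         preds = tf.squeeze(model(features), axis=-1)
    #         bce = tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, preds))
    #         weights = tf.ones_like(mass)  # uniform weights summing to N
    #         total_loss = bce + lam * Correlation.distance_corr(mass, preds, weights)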
    @staticmethod
    def dist_corr(X, Y):
        """
        https://gist.github.com/satra/aa3d19a12b74e9ab7941
        Compute the distance correlation function
        >>> a = [1,2,3,4,5]
        >>> b = np.array([1,2,9,4,4])
        >>> Correlation.dist_corr(a, b)
        0.762676242417
        """
X = np.atleast_1d(X)
Y = np.atleast_1d(Y)
if np.prod(X.shape) == len(X):
X = X[:, None]
if np.prod(Y.shape) == len(Y):
Y = Y[:, None]
X = np.atleast_2d(X)
Y = np.atleast_2d(Y)
n = X.shape[0]
if Y.shape[0] != X.shape[0]:
raise ValueError('Number of samples must match')
a = squareform(pdist(X))
b = squareform(pdist(Y))
A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
dcov2_xy = (A * B).sum()/float(n * n)
dcov2_xx = (A * A).sum()/float(n * n)
dcov2_yy = (B * B).sum()/float(n * n)
dcor = np.sqrt(dcov2_xy)/np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
return dcor
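
    # Note: dist_corr is the NumPy counterpart of distance_corr above; with
    # uniform weights and power=1 the two should agree up to numerical error
    # (see compare_distance_corr at the bottom of the file for a quick check).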
    @staticmethod
    def pearson_corr_tf(v, u):
        # tf.nn.moments returns (mean, variance)
        mv, sv = tf.nn.moments(v, axes=[0])
        mu, su = tf.nn.moments(u, axes=[0])
        ev = v - mv
        eu = u - mu
        # Covariance: mean of the product of the deviations
        num = tf.reduce_mean(ev * eu)
        # Product of the standard deviations
        den = tf.sqrt(sv * su)
        # Absolute value of the Pearson correlation
        return tf.abs(num / den)
    @staticmethod
    def pearson_corr(X, Y):
        cor, _ = pearsonr(X, Y)
        return cor

    @staticmethod
    def spearman_corr(X, Y):
        cor, _ = spearmanr(X, Y)
        return cor
@staticmethod
def rdc(var_1, var_2, f=tf.sin, k=20, s=1/6., n=1):
        # Helper functions implementing the conditions of the while loop below
        # in tensor-friendly syntax, used to shrink k until the eigenvalues are real
def cov(x, y):
#x = tf.expand_dims(x, axis=0)
#y = tf.expand_dims(y, axis=0)
# Find mean (along k direction)
mean_x = tf.reshape(tf.reduce_mean(x, axis=1), shape=(-1,1))
mean_y = tf.reshape(tf.reduce_mean(y, axis=1), shape=(-1,1))
            # Subtract the mean from each value (along the k dimension)
sub_x = x - mean_x
sub_y = y - mean_y
# Do the matrix multiplication
cov = tf.einsum('ij,jk->ik', sub_x, tf.transpose(sub_y))
scale = 1. / tf.cast(tf.shape(x)[1] - 1, tf.float32)
return scale * cov
# What to do when eigs aren't real or not between zero and one
def nonreal(ub, lb, k):
ub -= 1
k = (ub + lb) // 2
return ub, lb, k
# What to do when eigs are real and between zero and one
def real(ub, lb, k):
'''
Should behave the same as:
if lb == ub: break
bound_manip(ub, lb, k)
'''
return tf.cond(tf.equal(lb, ub), lambda: (ub, lb, k), lambda: bound_manip(ub, lb, k))
# Manipulate lower bound when not the same as upper bound
def bound_manip(ub, lb, k):
'''
# Logic should be the same as the code below
lb = k
if ub == lb + 1:
k = ub
else:
k = (ub + lb) // 2
return ub, lb, k
'''
lb = k
k = tf.cond(tf.equal(ub, lb+1), lambda: ub, lambda: (ub + lb) // 2)
return ub, lb, k
# Case for determining if we have reached a value of k where the eigenvalues are in [0, 1] and real
def while_case(C, ub, lb, k, k0, eigs, case):
return tf.reduce_all([tf.not_equal(ub, lb), tf.logical_not(case)])
# While loop to determine real eigenvalues... but make it tensor friendly :)
def while_body(C, ub, lb, k, k0, eigs, case):
# Compute canonical correlations
Cxx = C[:k, :k]
Cyy = C[k0:k0+k, k0:k0+k]
Cxy = C[:k, k0:k0+k]
Cyx = C[k0:k0+k, :k]
eigs = tf.linalg.eigvals(
tf.matmul(
tf.matmul(tf.linalg.pinv(Cxx), tf.transpose(Cxy)),
tf.transpose(tf.matmul(tf.linalg.pinv(Cyy), tf.transpose(Cyx)))
))
mag = tf.math.real(eigs)
            # Case to determine if all eigenvalues are (numerically) real and within [0, 1]
            case = tf.reduce_all(tf.stack([
                tf.reduce_all(tf.less_equal(tf.abs(tf.math.imag(eigs)), 1e-7)),
                tf.less_equal(-1e-7, tf.reduce_min(mag)),
                tf.greater_equal(1. + 1e-7, tf.reduce_max(mag))]))
ub, lb, k = tf.cond(case, lambda: real(ub, lb, k), lambda: nonreal(ub, lb, k))
return[C, ub, lb, k, k0, eigs, case]
'''
# Original logic for computing proper eigenvalues by manipulating k
if not case:
ub -= 1
k = (ub + lb) // 2
continue
# Binary search if k is too large
if lb == ub: break
lb = k
if ub == lb + 1:
k = ub
else:
k = (ub + lb) // 2
'''
        # Copula transformation: map each variable to its normalized ranks
x = (1+tf.argsort(tf.argsort(tf.transpose(var_1)))) / tf.shape(var_1)[0]
y = (1+tf.argsort(tf.argsort(tf.transpose(var_2)))) / tf.shape(var_2)[0]
        # Append a column of ones to each tensor so that w.x + b is a single dot product
x = tf.reshape(x, (-1, 1))
y = tf.reshape(y, (-1, 1))
ones = tf.ones_like(x)
X = tf.concat([x, ones], 1)
Y = tf.concat([y, ones], 1)
# Random linear projection
normX = (s/tf.cast(tf.shape(X)[1],tf.float32))*tf.random.normal([X.shape[1], k])
normY = (s/tf.cast(tf.shape(Y)[1],tf.float32))*tf.random.normal([Y.shape[1], k])
X = tf.matmul(tf.cast(X, tf.float32), normX)
Y = tf.matmul(tf.cast(Y, tf.float32), normY)
        # Apply the nonlinear function f to the random projections
        fX = f(X)
        fY = f(Y)
        # Compute the joint covariance matrix; the eigenvalues are found in the while loop below
C = cov(tf.transpose(tf.concat((fX, fY), axis=1)), tf.transpose(tf.concat((fX, fY), axis=1)))
k = tf.constant(k)
k0 = k
lb = tf.constant(1)
ub = k
eigs = tf.reshape(tf.convert_to_tensor((), dtype=tf.complex64), shape=(-1,))
case = tf.constant(False)
C, ub, lb, k, k0, eigs, case = tf.while_loop(while_case, while_body, [C, ub, lb, k, k0, eigs, case], [C.get_shape(), ub.get_shape(), lb.get_shape(), k.get_shape(), k0.get_shape(), tf.TensorShape([None,]), case.get_shape()])
'''
# Original while loop logic (not tensor friendly)
while True:
# Compute canonical correlations
Cxx = C[:k, :k]
Cyy = C[k0:k0+k, k0:k0+k]
Cxy = C[:k, k0:k0+k]
Cyx = C[k0:k0+k, :k]
eigs = tf.linalg.eigvals(tf.matmul(tf.matmul(tf.linalg.pinv(Cxx), tf.transpose(Cxy)),
tf.transpose(tf.matmul(tf.linalg.pinv(Cyy), tf.transpose(Cyx)))))
mag = tf.abs(eigs)
case = tf.reduce_all(tf.stack([tf.reduce_all(tf.equal(0., tf.math.imag(eigs))), tf.reduce_all(tf.less_equal(0., tf.reduce_min(mag))), tf.reduce_all(tf.greater_equal(1., tf.reduce_max(mag)))]))
if case is None: continue
# Binary search if k is too large
ub, k = tf.cond(case, true_fn=lambda: (ub, k), false_fn=lambda: bounds(ub,lb,k) )
if lb == ub: break
lb = k
if ub == lb + 1:
k = ub
else:
k = (ub + lb) // 2
'''
return tf.sqrt(tf.reduce_max(tf.math.real(eigs)))
"""
Implements the Randomized Dependence Coefficient
David Lopez-Paz, Philipp Hennig, Bernhard Schoelkopf
http://papers.nips.cc/paper/5138-the-randomized-dependence-coefficient.pdf
"""
    def rdc_np(self, x, y, f=np.sin, k=20, s=1/6., n=1):
"""
Computes the Randomized Dependence Coefficient
x,y: numpy arrays 1-D or 2-D
If 1-D, size (samples,)
If 2-D, size (samples, variables)
f: function to use for random projection
k: number of random projections to use
s: scale parameter
n: number of times to compute the RDC and
return the median (for stability)
According to the paper, the coefficient should be relatively insensitive to
the settings of the f, k, and s parameters.
"""
        if n > 1:
            values = []
            for i in range(n):
                try:
                    values.append(self.rdc_np(x, y, f, k, s, 1))
                except np.linalg.LinAlgError:
                    pass
            return np.median(values)
if len(x.shape) == 1: x = x.reshape((-1, 1))
if len(y.shape) == 1: y = y.reshape((-1, 1))
# Copula Transformation
cx = np.column_stack([rankdata(xc, method='ordinal') for xc in x.T])/float(x.size)
cy = np.column_stack([rankdata(yc, method='ordinal') for yc in y.T])/float(y.size)
# Add a vector of ones so that w.x + b is just a dot product
O = np.ones(cx.shape[0])
X = np.column_stack([cx, O])
Y = np.column_stack([cy, O])
# Random linear projections
Rx = (s/X.shape[1])*np.random.randn(X.shape[1], k)
Ry = (s/Y.shape[1])*np.random.randn(Y.shape[1], k)
X = np.dot(X, Rx)
Y = np.dot(Y, Ry)
# Apply non-linear function to random projections
fX = f(X)
fY = f(Y)
        C = np.cov(np.hstack([fX, fY]).T)
# Due to numerical issues, if k is too large,
# then rank(fX) < k or rank(fY) < k, so we need
# to find the largest k such that the eigenvalues
# (canonical correlations) are real-valued
k0 = k
lb = 1
ub = k
while True:
# Compute canonical correlations
Cxx = C[:k, :k]
Cyy = C[k0:k0+k, k0:k0+k]
Cxy = C[:k, k0:k0+k]
Cyx = C[k0:k0+k, :k]
            eigs = np.linalg.eigvals(np.dot(np.dot(np.linalg.pinv(Cxx), Cxy),
                                            np.dot(np.linalg.pinv(Cyy), Cyx)))
# Binary search if k is too large
if not (np.all(np.isreal(eigs)) and
0 <= np.min(eigs) and
np.max(eigs) <= 1):
ub -= 1
k = (ub + lb) // 2
continue
if lb == ub: break
lb = k
if ub == lb + 1:
k = ub
else:
k = (ub + lb) // 2
return np.sqrt(np.max(eigs))
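

# Quick sanity check (a minimal sketch, not part of the original API): with
# uniform per-example weights summing to N and power=1, the TensorFlow
# distance_corr should agree with the NumPy dist_corr up to numerical error.
def compare_distance_corr(n=1000):
    c = Correlation()
    a = tf.random.uniform(shape=(n,), minval=0.0, maxval=1.0)
    b = tf.random.uniform(shape=(n,), minval=0.0, maxval=1.0)
    weights = tf.ones_like(a)  # uniform weights, sum equals n
    print("Distance corr TF: ", c.distance_corr(a, b, weights).numpy())
    print("Distance corr NP: ", c.dist_corr(a.numpy(), b.numpy()))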
def compare_rdc():
c = Correlation()
a = tf.random.uniform(shape=(10000,), minval=0.0, maxval=1.0)
b = tf.random.uniform(shape=(10000,), minval=0.0, maxval=1.0)
start = time.time()
for i in range(10):
print("RDC TF: ", c.rdc(a, b, k=20).numpy())
end = time.time()
print("Calculation Time: {}".format(end - start))
start = time.time()
for i in range(10):
print("RDC NP: ", c.rdc_np(a.numpy(), b.numpy(), k=20))
end = time.time()
print("Calculation Time: {}".format(end - start))
if __name__ == "__main__":
compare_rdc()