#!/usr/bin/env python3
# coding: utf-8
import pandas as pd
from scipy.stats import entropy
import gensim
from scipy.spatial.distance import pdist, squareform
import numpy as np


# Jensen–Shannon divergence: JSD(P, Q) = 1/2 * D(P, M) + 1/2 * D(Q, M),
# where M = 1/2 * (P + Q) and D is the Kullback–Leibler divergence
# (computed here with scipy.stats.entropy).
def jensen_shannon(P, Q):
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))
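

# Worked example (exact values): for two disjoint distributions
# P = np.array([1.0, 0.0]) and Q = np.array([0.0, 1.0]), M = [0.5, 0.5]
# and each KL term equals log(2), so jensen_shannon(P, Q) returns
# log(2) ~ 0.693 (scipy's entropy uses the natural logarithm).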


def js_acp(distributions):
    dist_matrix = squareform(pdist(distributions, metric=jensen_shannon))
    return acp(dist_matrix)


def acp(pair_dists, n_components=2):
    # Principal coordinate analysis (classical MDS) of a symmetric
    # pairwise distance matrix.
    pair_dists = np.asarray(pair_dists, np.float64)
    # Eigendecomposition of the double-centred squared-distance matrix
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = -H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eig(B)
    # keep the n_components largest eigenvalues and their eigenvectors
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]
    # clamp negative eigenvalues (numerical noise, or a non-Euclidean input
    # distance) to zero so the square root below stays real
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)
    return np.sqrt(eigvals) * eigvecs
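

# Example (exact): acp(np.array([[0.0, 1.0], [1.0, 0.0]])) embeds the two
# points at x = +0.5 and x = -0.5 (y = 0), recovering their unit distance.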


def _df_with_names(data, index_name, columns_name):
    if isinstance(data, pd.DataFrame):
        # we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df


def _topic_coordinates(topic_term_dists, topic_proportion):
    K = topic_term_dists.shape[0]
    mds_res = js_acp(topic_term_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'topics': range(1, K + 1),
                           'Frequence': topic_proportion * 100})
    return mds_df
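

# The resulting DataFrame has one row per topic: 'x'/'y' are the 2-D PCoA
# coordinates, 'topics' numbers the topics from 1, and 'Frequence' is each
# topic's share of the corpus as a percentage.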


def _series_with_name(data, name):
    if isinstance(data, pd.Series):
        data.name = name
        # ensures a numeric index
        return data.reset_index()[name]
    else:
        return pd.Series(data, name=name)


def acp_lda(ldamodel, mycorpus, mydict):
    lda = ldamodel
    if not gensim.matutils.ismatrix(mycorpus):
        corpus_csc = gensim.matutils.corpus2csc(mycorpus, num_terms=len(mydict))
    else:
        corpus_csc = mycorpus
    vocab = list(mydict.token2id.keys())
    beta = 0.01
    fnames_argsort = np.asarray(list(mydict.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    # replace zero counts with the smoothing constant beta
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()
    assert term_freqs.shape[0] == len(mydict), \
        'Term frequencies and mydict have different shape {} != {}'.format(
            term_freqs.shape[0], len(mydict))
    assert doc_lengths.shape[0] == len(mycorpus), \
        'Document lengths and mycorpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(mycorpus))
    # models exposing lda_alpha/lda_beta (e.g. wrapped HDP models) are
    # handled separately from a plain gensim LdaModel
    if hasattr(lda, 'lda_alpha'):
        num_topics = len(lda.lda_alpha)
    else:
        num_topics = lda.num_topics
    if hasattr(lda, 'lda_beta'):
        gamma = lda.inference(mycorpus)
    else:
        gamma, _ = lda.inference(mycorpus)
    # normalise the variational parameters into per-document topic distributions
    doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)
    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(lda, 'lda_beta'):
        topic = lda.lda_beta
    else:
        topic = lda.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]
    assert topic_term_dists.shape[0] == num_topics, \
        'Topic-term distributions and number of topics do not match {} != {}'.format(
            topic_term_dists.shape[0], num_topics)
    topic_term_dists = _df_with_names(topic_term_dists, 'topic', 'term')
    doc_topic_dists = _df_with_names(doc_topic_dists, 'doc', 'topic')
    term_frequency = _series_with_name(term_freqs, 'term_frequency')
    doc_lengths = _series_with_name(doc_lengths, 'doc_length')
    vocab = _series_with_name(vocab, 'vocab')
    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
    topic_proportion = topic_freq / topic_freq.sum()
    topic_order = topic_proportion.index
    # align all data with the topic ordering (the proportions are not sorted
    # here, so this keeps the original topic order)
    topic_freq = topic_freq[topic_order]
    topic_term_dists = topic_term_dists.iloc[topic_order]
    doc_topic_dists = doc_topic_dists[topic_order]
    term_topic_freq = (topic_term_dists.T * topic_freq).T
    term_frequency = np.sum(term_topic_freq, axis=0)
    topic_coordinates = _topic_coordinates(topic_term_dists, topic_proportion)
    return topic_coordinates
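

if __name__ == '__main__':
    # Minimal smoke test: a sketch, not part of the original module. The toy
    # documents below are made up; it assumes a standard gensim install and
    # uses only gensim.corpora.Dictionary and gensim.models.LdaModel.
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [['cat', 'dog', 'pet'],
             ['dog', 'bone', 'pet'],
             ['stock', 'market', 'trade'],
             ['market', 'price', 'trade']]
    mydict = Dictionary(texts)
    mycorpus = [mydict.doc2bow(text) for text in texts]
    lda = LdaModel(mycorpus, id2word=mydict, num_topics=2,
                   passes=10, random_state=0)
    # one row per topic, with PCoA coordinates and relative frequency
    print(acp_lda(lda, mycorpus, mydict))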