-
Notifications
You must be signed in to change notification settings - Fork 60
/
helpers.py
88 lines (78 loc) · 2.78 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def parse_raw_message(raw_message):
    """Extract the sender, recipient and body text from one raw e-mail string.

    The message is processed one line at a time:

    * A line without a ':' is treated as body text. It is stripped and
      appended to the body; lines are concatenated with no separator.
    * A line with a ':' is treated as a ``key: value`` header and split at
      the first ':' only. The key is lower-cased, the value is stripped,
      and only the ``from`` and ``to`` keys are kept. A later occurrence of
      the same key overwrites an earlier one.

    Args:
        raw_message: The complete message text with newline-separated lines.

    Returns:
        A dict containing any of the keys ``'from'``, ``'to'`` and
        ``'body'``. A key is absent when no matching line was found.
    """
    wanted_headers = ('from', 'to')
    email = {}
    body_parts = []
    for line in raw_message.split('\n'):
        if ':' not in line:
            body_parts.append(line.strip())
            continue
        # Split on the first ':' only, so a value that itself contains a
        # colon is kept whole instead of being cut at the second colon.
        key, _, val = line.partition(':')
        key = key.lower()
        if key in wanted_headers:
            email[key] = val.strip()
    # 'body' is only present when at least one colon-free line was seen
    # (an empty line counts and yields an empty contribution).
    if body_parts:
        email['body'] = ''.join(body_parts)
    return email
def parse_into_emails(messages):
    """Parse raw messages and return their fields as parallel lists.

    Each message is parsed with ``parse_raw_message``; the per-message
    dicts are then turned into one list per field with ``map_to_list``.

    Args:
        messages: An iterable of raw message strings.

    Returns:
        A dict with the keys ``'body'``, ``'to'`` and ``'from_'``. Each value
        is a list with one entry per message. ``'from_'`` holds the values
        parsed under the ``'from'`` key.
    """
    parsed = list(map(parse_raw_message, messages))
    # (output column name, key in the parsed e-mail dict)
    column_sources = (('body', 'body'), ('to', 'to'), ('from_', 'from'))
    columns = {}
    for column, source_key in column_sources:
        columns[column] = map_to_list(parsed, source_key)
    return columns
def map_to_list(emails, key):
    """Collect the value stored under ``key`` from every e-mail dict.

    Args:
        emails: An iterable of dicts, one per parsed e-mail.
        key: The dictionary key to read from each e-mail.

    Returns:
        A list with one entry per e-mail, in input order. An e-mail that
        does not contain ``key`` contributes an empty string.
    """
    return [email.get(key, '') for email in emails]
def top_tfidf_feats(row, features, top_n=20):
    """Return the ``top_n`` highest-scoring entries of ``row`` with their names.

    Args:
        row: A one-dimensional sequence of scores, indexable by position.
        features: Feature names, indexable by the same positions as ``row``.
        top_n: Maximum number of entries to return.

    Returns:
        A DataFrame with the columns ``'features'`` and ``'score'``, ordered
        from the highest score downwards (the reverse of ``np.argsort``).
    """
    ascending = np.argsort(row)
    best_first = ascending[::-1][:top_n]
    names = [features[pos] for pos in best_first]
    scores = [row[pos] for pos in best_first]
    return pd.DataFrame(list(zip(names, scores)), columns=['features', 'score'])
def top_feats_in_doc(X, features, row_id, top_n=25):
    """Return the top-scoring features of a single row of ``X``.

    Args:
        X: A matrix whose rows can be indexed and converted with
            ``.toarray()`` (presumably a SciPy sparse TF-IDF matrix --
            confirm against the caller).
        features: Feature names aligned with the columns of ``X``.
        row_id: Index of the row to inspect.
        top_n: Maximum number of features to return.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for that row.
    """
    dense_row = X[row_id].toarray()
    # Drop the length-1 axes so the scores form a flat vector.
    scores = np.squeeze(dense_row)
    return top_tfidf_feats(scores, features, top_n)
def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the features with the highest mean score over selected rows.

    Scores below ``min_tfidf`` are set to zero before the column-wise mean
    is taken, so weak occurrences do not contribute to the average.

    Args:
        X: A matrix supporting row indexing and ``.toarray()`` (presumably
            a SciPy sparse TF-IDF matrix -- confirm against the caller).
        features: Feature names aligned with the columns of ``X``.
        grp_ids: Row selector applied as ``X[grp_ids]``. ``None`` (the
            default) means all rows. Any other value, including index ``0``
            or an empty selection, is used as given.
        min_tfidf: Scores strictly below this value are treated as zero.
        top_n: Maximum number of features to return.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for the mean scores.
    """
    # Compare against None explicitly. Truth-testing the selector raises
    # ValueError for a multi-element NumPy array and would wrongly treat
    # index 0 as "no selection".
    selected = X if grp_ids is None else X[grp_ids]
    D = selected.toarray()
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
    """Compute the top mean-score features separately for each label in ``y``.

    Args:
        X: Score matrix passed through to ``top_mean_feats``.
        y: Labels, one per row of ``X``. Any array-like is accepted.
        features: Feature names aligned with the columns of ``X``.
        min_tfidf: Threshold forwarded to ``top_mean_feats``.
        top_n: Maximum number of features per label.

    Returns:
        A list of DataFrames, one per unique label in the order returned by
        ``np.unique``. Each DataFrame carries its label in a ``label``
        attribute set on the DataFrame object.
    """
    # Coerce to an array so ``y == label`` is an element-wise comparison.
    # With a plain list it would be a single False and select no rows.
    y = np.asarray(y)
    dfs = []
    for label in np.unique(y):
        ids = np.where(y == label)
        feats_df = top_mean_feats(X, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Stored as an attribute (not a column); the plotting helper reads it.
        feats_df.label = label
        dfs.append(feats_df)
    return dfs
def plot_tfidf_classfeats_h(dfs):
    """Draw one horizontal bar chart per DataFrame, side by side, and show it.

    Args:
        dfs: A list of DataFrames with ``features`` and ``score`` columns
            and a ``label`` attribute (as produced by
            ``top_feats_per_cluster``). An empty list draws nothing.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if not dfs:
        # Nothing to plot; avoid opening an empty figure window.
        return
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    n_panels = len(dfs)
    for i, df in enumerate(dfs):
        # Bar positions are computed per frame so frames of different
        # lengths can be plotted together.
        positions = np.arange(len(df))
        ax = fig.add_subplot(1, n_panels, i + 1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("cluster = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))
        ax.barh(positions, df.score, align='center', color='#7530FF')
        ax.set_yticks(positions)
        # One unit of padding below the first bar and above the last one.
        ax.set_ylim([-1, len(df)])
        ax.set_yticklabels(df.features)
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()