-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsacha_test.py
104 lines (57 loc) · 1.98 KB
/
sacha_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/python3
import os
import unsec
import logging
from unsec.algorithm import HierarchicalAlgo, KMeanAlgo, SKMeanAlgo
from unsec.vectorizer import TfidfVectorizer, LogicVectorizer
from unsec import EmailCollection, TestEmailCollection
from unsec import Clusterizer, Assessor
from unsec import tools
from sklearn import metrics
import numpy as np
import multiprocessing as mp
from threading import Thread
# logging.basicConfig(level=logging.INFO)
# Build the email collection from the dataset at unsec.SMALL_DATASET_PATH.
collection = EmailCollection(unsec.SMALL_DATASET_PATH)
# Language filtering is currently disabled:
# collection.keep_lang("fr")

# Configure the clustering engine with HierarchicalAlgo and TfidfVectorizer.
# NOTE(review): target="both" presumably means subject + body -- confirm
# against the Clusterizer implementation.
engine = Clusterizer(
    collection,
    target="both",
    algorithm=HierarchicalAlgo(),
    vectorizer=TfidfVectorizer(),
)
engine.compute()

# Print each cluster, then the subject, body and `clean` attribute of every
# email it contains.
for cluster in engine.clusters:
    print("==cluster==")
    for message in cluster:
        print("---email---")
        print("subject :", message.get_subject())
        print("body :", message.get_body())
        print("clean :", message.clean)
# engine.run_cleaner()
# engine.run_vectorizer()
# for n in range(2,100):
# engine.algorithm.n_clusters = n
# engine.run_algorithm()
# intra = sum(engine.cluster_similarity()) / n
# inter = engine.cluster_linkage()
# sil = engine.cluster_silhouette_score()
# print(n,intra, inter, sil,sep="\t")
# print(tester.categories)
# #collection.keep_lang("fr")
# c = Clusterizer(collection)
# c.set_vectorizer(TfidfVectorizer())
# c.set_algorithm(HierarchicalAlgo(n_clusters=2))
# c.compute()
# c.print_table()
# c.print_table()
# collection = EmailCollection()
# collection.add_from_directory(unsec.LARGE_DATASET_PATH)
# collection.keep_lang("fr")
# engine = Clusterizer(collection)
# engine.target = "body"
# engine.set_vectorizer(TfidfVectorizer())
# engine.run_cleaner()
# engine.run_vectorizer()
# engine.set_algorithm(HierarchicalAlgo(n_clusters = 4, affinity = "cosine"))
# engine.run_algorithm()
# engine.compute_clusters()
# engine.print_table()