-
Notifications
You must be signed in to change notification settings - Fork 0
/
spam.lisp
246 lines (199 loc) · 7.83 KB
/
spam.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
(defpackage :com.gigamonkeys.spam
(:use :common-lisp :com.gigamonkey.pathnames))
(in-package :com.gigamonkeys.spam)
(defun classify (text)
(classification (score (extract-features text))))
(defparameter *max-ham-score* .4)
(defparameter *min-spam-score* .6)
(defun classification (score)
(values
(cond
((<= score *max-ham-score*) 'ham)
((>= score *min-spam-score*) 'spam)
(t 'unsure))
score))
(defclass word-feature ()
((word
:initarg :word
:accessor word
:initform (error "Must supply :word")
:documentation "The word this feature represents.")
(spam-count
:initarg :spam-count
:accessor spam-count
:initform 0
:documentation "Number of spams we have seen this feature in.")
(ham-count
:initarg :ham-count
:accessor ham-count
:initform 0
:documentation "Number of hams we have seen this feature in.")))
(defvar *feature-database* (make-hash-table :test #'equal))
(defun clear-database ()
(setf *feature-database* (make-hash-table :test #'equal)
*total-spams* 0
*total-hams* 0))
(defun intern-feature (word)
(or (gethash word *feature-database*)
(setf (gethash word *feature-database*)
(make-instance 'word-feature :word word))))
(defun extract-words (text)
(delete-duplicates
(cl-ppcre:all-matches-as-strings "[a-zA-Z]{3,}" text)
:test #'string=))
(defun extract-features (text)
(mapcar #'intern-feature (extract-words text)))
(defmethod print-object ((object word-feature) stream)
(print-unreadable-object (object stream :type t)
(with-slots (word ham-count spam-count) object
(format stream "~s :hams ~d :spams ~d" word ham-count spam-count))))
(defun train (text type)
(dolist (feature (extract-features text))
(increment-count feature type))
(increment-total-count type))
(defun increment-count (feature type)
(ecase type
(ham (incf (ham-count feature)))
(spam (incf (spam-count feature)))))
(defvar *total-spams* 0)
(defvar *total-hams* 0)
(defun increment-total-count (type)
(ecase type
(ham (incf *total-hams*))
(spam (incf *total-spams*))))
(defun spam-probability (feature)
(with-slots (spam-count ham-count) feature
(let ((spam-frequency (/ spam-count (max 1 *total-spams*)))
(ham-frequency (/ ham-count (max 1 *total-hams*))))
(/ spam-frequency (+ spam-frequency ham-frequency)))))
(defun bayesian-spam-probability (feature &optional
(assumed-probability 1/2)
(weight 1))
(let ((basic-probability (spam-probability feature))
(data-points (+ (spam-count feature) (ham-count feature))))
(/ (+ (* weight assumed-probability)
(* data-points basic-probability))
(+ weight data-points))))
(defun score (features)
(let ((spam-probs ())
(ham-probs ())
(number-of-probs 0))
(dolist (feature features)
(unless (untrained-p feature)
(let ((spam-prob (float (bayesian-spam-probability feature) 0.0d0)))
(push spam-prob spam-probs)
(push (- 1.0d0 spam-prob) ham-probs)
(incf number-of-probs))))
(let ((h (- 1 (fisher spam-probs number-of-probs)))
(s (- 1 (fisher ham-probs number-of-probs))))
(/ (+ (- 1 h) s) 2.0d0))))
(defun untrained-p (feature)
(with-slots (spam-count ham-count) feature
(and (zerop spam-count) (zerop ham-count))))
(defun fisher (probs number-of-probs)
"The Fischer computation described by Robinson."
(inverse-chi-square
(* -2 (reduce #'+ probs :key #'log))
(* 2 number-of-probs)))
(defun inverse-chi-square (value degrees-of-freedom)
(assert (evenp degrees-of-freedom))
(min
(loop with m = (/ value 2)
for i below (/ degrees-of-freedom 2)
for prob = (exp (- m)) then (* prob (/ m i))
summing prob)
1.0))
(defun add-file-to-corpus (filename type corpus)
(vector-push-extend (list filename type) corpus))
(defparameter *corpus* (make-array 1000 :adjustable t :fill-pointer 0))
(defun add-directory-to-corpus (dir type corpus)
(dolist (filename (list-directory dir))
(add-file-to-corpus filename type corpus)))
(defun test-classifier (corpus testing-fraction)
(clear-database)
(let* ((shuffled (shuffle-vector corpus))
(size (length corpus))
(train-on (floor (* size (- 1 testing-fraction)))))
(train-from-corpus shuffled :start 0 :end train-on)
(test-from-corpus shuffled :start train-on)))
(defparameter *max-chars* (* 10 1024))
(defun train-from-corpus (corpus &key (start 0) end)
(loop for idx from start below (or end (length corpus)) do
(destructuring-bind (file type)
(aref corpus idx) (train (start-of-file file *max-chars*) type))))
(defun test-from-corpus (corpus &key (start 0) end)
(loop for idx from start below (or end (length corpus))
collect (destructuring-bind
(file type) (aref corpus idx)
(multiple-value-bind (classification score)
(classify (start-of-file file *max-chars*))
(list :file file :type type
:classification classification
:score score)))))
(defun nshuffle-vector (vector)
(loop for idx downfrom (1- (length vector))
for other = (random (1+ idx))
do (unless (= idx other)
(rotatef (aref vector idx) (aref vector other))))
vector)
(defun shuffle-vector (vector)
(nshuffle-vector (copy-seq vector)))
(defun start-of-file (file max-chars)
(with-open-file (in file)
(let* ((length (min (file-length in) max-chars))
(text (make-string length))
(read (read-sequence text in)))
(if (< read length)
(subseq text 0 read)
text))))
(defun result-type (result)
(destructuring-bind (&key type classification &allow-other-keys) result
(ecase type
(ham (ecase classification
(ham 'correct)
(spam 'false-positive)
(unsure 'missed-ham)))
(spam (ecase classification
(ham 'false-negative)
(spam 'correct)
(unsure 'missed-spam))))))
(defun false-positive-p (result)
(eql (result-type result) 'false-positive))
(defun false-negative-p (result)
(eql (result-type result) 'false-negative))
(defun missed-ham-p (result)
(eql (result-type result) 'missed-ham))
(defun missed-spam-p (result)
(eql (result-type result) 'missed-spam))
(defun correct-p (result)
(eql (result-type result) 'correct))
(defun analyze-results (results)
(let* ((keys '(total correct false-positive
false-negative missed-ham missed-spam))
(counts (loop for x in keys collect (cons x 0))))
(dolist (item results)
(incf (cdr (assoc 'total counts)))
(incf (cdr (assoc (result-type item) counts))))
(loop with total = (cdr (assoc 'total counts))
for (label . count) in counts
do (format t "~&~@(~a~):~20t~5d~,5t: ~6,2f%~%"
label count (* 100 (/ count total))))))
(defun explain-classification (file)
(let* ((text (start-of-file file *max-chars*))
(features (extract-features text))
(score (score features))
(classification (classification score)))
(show-summary file text classification score)
(dolist (feature (sorted-interesting features))
(show-feature feature))))
(defun show-summary (file text classification score)
(format t "~&~a" file)
(format t "~2%~a~2%" text)
(format t "Classified as ~a with score of ~,5f~%" classification score))
(defun show-feature (feature)
(with-slots (word ham-count spam-count) feature
(format t "~&~2t~a~30thams: ~5d; spams: ~5d;~,10tprob: ~,f~%"
word ham-count spam-count (bayesian-spam-probability feature))))
(defun sorted-interesting (features)
(sort (remove-if #'untrained-p features)
#'< :key #'bayesian-spam-probability))