-
Notifications
You must be signed in to change notification settings - Fork 0
/
myclassifier.py
158 lines (124 loc) · 5.84 KB
/
myclassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import numpy as np
class SpamClassifier:
    """Multinomial naive Bayes spam classifier over binary word features.

    After :meth:`train` has run the instance holds two attributes:

    * ``log_class_priors`` -- array of length 2.
    * ``log_class_conditional_likelihoods`` -- array of shape
      ``[2, n_features]``.

    NOTE: in both arrays index 0 refers to spam (label 1) and index 1 to ham
    (label 0). :meth:`predict` relies on that ordering.
    """

    def train(self, training_path="training_spam.csv"):
        """Fit the classifier from a comma-separated training file.

        :param training_path: path to a CSV whose first column is the binary
            label (1 = spam, 0 = ham) and whose remaining columns are the
            features. Defaults to ``"training_spam.csv"``.
        """
        # Passing the path lets numpy open *and close* the file itself.
        # ``np.int`` was removed from NumPy; the builtin ``int`` is equivalent.
        # ``ndmin=2`` keeps a single-row file two-dimensional.
        training_data = np.loadtxt(
            training_path, delimiter=",", ndmin=2
        ).astype(int)
        self.log_class_priors = self.estimate_log_class_priors(training_data)
        self.log_class_conditional_likelihoods = (
            self.estimate_log_class_conditional_likelihoods(training_data)
        )

    def estimate_log_class_priors(self, data):
        """Return the logarithm of the empirical class priors.

        :param data: a two-dimensional numpy array with
            shape = [n_samples, 1 + n_features]; the first column contains
            the binary response (coded as 0s and 1s).
        :return log_class_priors: a numpy array of length two,
            ``[log(p(C=1)), log(p(C=0))]`` -- i.e. spam first, then ham.
        """
        # The label column is 0/1, so its sum is the number of spam samples.
        prob_spam = data[:, 0].sum() / len(data)
        # P(spam) + P(ham) = 1
        prob_ham = 1 - prob_spam
        # Work in log space to reduce the risk of precision loss later on.
        return np.array([np.log(prob_spam), np.log(prob_ham)])

    def estimate_log_class_conditional_likelihoods(self, data, alpha=1):
        """Return the log class-conditional likelihoods log(P(w_i | c)).

        Assumes a multinomial feature distribution and applies Laplace
        smoothing when ``alpha > 0``:

            theta[c, i] = log((n_{c,i} + alpha) / (n_c + k * alpha))

        where ``n_{c,i}`` is the total count of feature ``i`` in class ``c``,
        ``n_c`` is the total feature count in class ``c`` and ``k`` is the
        number of features.

        :param data: a two-dimensional numpy array with
            shape = [n_samples, 1 + n_features]; the first column is the
            binary label.
        :param alpha: Laplace smoothing constant (default 1).
        :return theta: a numpy array of shape = [2, n_features]. Row 0 holds
            the spam (label 1) likelihoods and row 1 the ham (label 0)
            likelihoods.
        """
        labels = data[:, 0]
        features = data[:, 1:]
        # k must count features only; the label column is excluded so the
        # smoothed probabilities of each class sum to one.
        k = features.shape[1]

        def log_theta(class_rows):
            # Per-feature counts within the class; an empty class yields zeros.
            counts = class_rows.sum(axis=0)
            return np.log((counts + alpha) / (counts.sum() + k * alpha))

        theta_spam = log_theta(features[labels == 1])
        theta_ham = log_theta(features[labels == 0])
        return np.array([theta_spam, theta_ham])

    def predict(self, new_data):
        """Predict the class of every row of ``new_data``.

        Reads ``self.log_class_priors`` and
        ``self.log_class_conditional_likelihoods`` (set by :meth:`train`).

        :param new_data: a two-dimensional numpy array with
            shape = [n_test_samples, n_features].
        :return class_predictions: a float numpy array of length
            n_test_samples containing 1 for rows scored as spam and 0
            otherwise. Ties are resolved as 0 (ham).
        """
        new_data = np.asarray(new_data)
        theta_spam, theta_ham = self.log_class_conditional_likelihoods
        # log P(c) + sum_i w_i * theta[c, i], evaluated for all rows at once.
        spam_scores = self.log_class_priors[0] + new_data @ theta_spam
        ham_scores = self.log_class_priors[1] + new_data @ theta_ham
        return np.where(spam_scores > ham_scores, 1.0, 0.0)
def create_classifier():
    """Build a ``SpamClassifier``, train it, and hand back the fitted object.

    :return: a ``SpamClassifier`` instance on which ``train()`` has already
        been called.
    """
    trained_model = SpamClassifier()
    # train() takes no arguments here; it locates its own training data.
    trained_model.train()
    return trained_model
# Train once at module load; ``classifier`` is the fitted module-level model.
classifier = create_classifier()

# Evaluate the fitted classifier on the held-out test set.
# Passing the path lets numpy open and close the file itself, and the builtin
# ``int`` replaces the removed ``np.int`` alias.
testing_spam = np.loadtxt("testing_spam.csv", delimiter=",", ndmin=2).astype(int)
test_data = testing_spam[:, 1:]   # feature columns
test_labels = testing_spam[:, 0]  # first column is the 0/1 label

# The original looped over ``[classifier1]``, an undefined name; the only
# classifier in this module is ``classifier``.
predictions = classifier.predict(test_data)

accuracy = np.count_nonzero(predictions == test_labels) / test_labels.shape[0]
print(f"Accuracy on test data is: {accuracy}")

# Confusion-matrix counts: split rows by whether the prediction matches the
# label, then by whether the prediction was spam (1).
mismatch = predictions != test_labels
predicted_spam = predictions == 1
fp = int(np.count_nonzero(mismatch & predicted_spam))
fn = int(np.count_nonzero(mismatch & ~predicted_spam))
tp = int(np.count_nonzero(~mismatch & predicted_spam))
tn = int(np.count_nonzero(~mismatch & ~predicted_spam))

print(f"FPs on test data is: {fp}")
print(f"FNs on test data is: {fn}")
print(f"TPs on test data is: {tp}")
print(f"TNs on test data is: {tn}")