-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: featmultinomial.py
84 lines (69 loc) · 3.05 KB
/
featmultinomial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import numpy as np
from math import log
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.extmath import safe_sparse_dot
def _sanitize_logarithms(X):
    """Replace non-finite logarithm values in X.

    NOTE(review): unimplemented stub — the body is only this docstring, so
    the function currently does nothing and returns None.  The original
    (truncated) docstring said "Changes the nan", so it was presumably meant
    to map nan / -inf entries produced by log(0) to a finite value; confirm
    the intended behavior before relying on it.  Nothing in this file calls
    it at the moment.
    """
class FeatMultinomalNB(MultinomialNB):
    """A MultinomialNB classifier that can be trained using labeled features.

    In addition to standard multinomial Naive Bayes training, ``fit`` accepts
    an optional per-(class, feature) boost matrix that is installed as the
    smoothing parameter ``alpha``, and after fitting it computes the
    information gain of every feature (stored in
    ``self.feat_information_gain``).

    NOTE: the class name keeps the original spelling ("Multinomal") because
    external callers reference it.
    """

    def fit(self, X, Y, sample_weight=None, features=None):
        """Fit the Naive Bayes classifier according to X, Y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        Y : array-like, shape = [n_samples]
            Target values.
        sample_weight : array-like, shape = [n_samples], optional
            Weights applied to individual samples (1. for unweighted).
        features : array-like, shape = [n_classes, n_features], optional
            Boost for the prior probability of a feature given a class. For
            no boost use the value alpha given on the initialization.

        Returns
        -------
        self : object
            Returns self.
        """
        # BUG FIX: the original tested ``features != None``.  When *features*
        # is a numpy array (its documented type) that comparison is
        # elementwise and its truth value raises "The truth value of an
        # array with more than one element is ambiguous".  Identity test
        # against None is the correct check.
        if features is not None:
            # sklearn's MultinomialNB accepts an array-valued alpha, so the
            # labeled-feature boosts flow into the smoothed feature counts.
            self.alpha = features
        # Remember the training-set size for _information_gain below.
        self.instance_num = X.shape[0]
        return_value = super(FeatMultinomalNB, self).fit(X, Y, sample_weight)
        self._information_gain()
        return return_value

    def _count(self, X, Y):
        """Count feature occurrences, also recording per-class presence.

        Extends the parent's counting with ``count_feat_and_class``: the
        number of instances with class j and presence of feature k, of
        shape (n_classes, n_features).
        """
        super(FeatMultinomalNB, self)._count(X, Y)
        # (X > 0) turns counts into presence indicators; Y.T @ indicator
        # accumulates how many instances of each class contain each feature.
        self.count_feat_and_class = safe_sparse_dot(Y.T, (X > 0))

    def _information_gain(self):
        """Calculate the information gain for each feature.

        Stores the result in ``self.feat_information_gain``.  Probabilities
        of zero produce log(0) = -inf / nan intermediates; those terms are
        zeroed out with ``nan_to_num`` before summing, so the divide/invalid
        warnings they would emit are deliberately silenced.
        """
        with np.errstate(divide='ignore', invalid='ignore'):
            # P(feature k present AND class c), and P(feature k present).
            prob_Ik1_and_class = self.count_feat_and_class / self.instance_num
            prob_Ik1 = (self.count_feat_and_class.sum(axis=0) /
                        self.instance_num)
            # Contribution of the "feature present" events.
            term1 = (prob_Ik1_and_class *
                     ((np.log(prob_Ik1_and_class) - np.log(prob_Ik1)).T -
                      self.class_log_prior_).T)
            # P(feature k absent AND class c) = (class_count - joint) / N.
            prob_Ik0_and_class = ((self.count_feat_and_class.T -
                                   self.class_count_).T * -1 /
                                  self.instance_num)
            # Contribution of the "feature absent" events.
            term2 = (prob_Ik0_and_class *
                     ((np.log(prob_Ik0_and_class) - np.log(1 - prob_Ik1)).T -
                      self.class_log_prior_).T)
        # nan_to_num maps the 0 * log(0) artifacts (nan/inf) to 0.
        self.feat_information_gain = (np.nan_to_num(term1).sum(axis=0) +
                                      np.nan_to_num(term2).sum(axis=0))

    def instance_proba(self, X):
        """Calculate the probability of each instance in X under the model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]

        Returns
        -------
        array-like, shape = [n_samples]
        """
        # Marginal feature probability: P(f) = sum_c P(c) * P(f | c).
        feat_prob = safe_sparse_dot(np.exp(self.class_log_prior_),
                                    np.exp(self.feature_log_prob_)).T
        # Count-weighted sum of log feature probabilities per instance,
        # exponentiated back to a probability.
        instance_log_prob = safe_sparse_dot(X, np.log(feat_prob))
        return np.exp(instance_log_prob)