forked from vc1492a/PyNomaly
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathloop.py
177 lines (145 loc) · 7.73 KB
/
loop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from math import erf
import numpy as np
import sys
import warnings
# Single-source package version string (PEP 396 style).
__version__ = '0.1.0'
class LocalOutlierProbability(object):
    """
    Computes Local Outlier Probabilities (LoOP): for each sample, a score in
    [0, 1) expressing how strongly the sample deviates in density from its
    n_neighbors nearest neighbors, optionally restricted to the sample's
    cluster.

    Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in
    LoOP: Local Outlier Probabilities.

    :param data: a Pandas DataFrame or Numpy array of float data
    :param extent: a parameter value strictly between 0 and 1 that controls
        the statistical extent (optional, default 0.997)
    :param n_neighbors: the total number of neighbors to consider w.r.t. each
        sample (optional, default 10)
    :param cluster_labels: a numpy array of cluster assignments w.r.t. each
        sample (optional, default None)
    :return:

    References
    ----------
    .. [1] Aggarwal C., Hinneburg A., Keim D. On the Surprising Behavior of
           Distance Metrics in High Dimensional Space. ICDT Conference (2001).
    .. [2] Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying
           Density-based Local Outliers. ACM SIGMOD International Conference
           on Management of Data (2000).
    .. [3] Kriegel H., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier
           Probabilities. 18th ACM Conference on Information and Knowledge
           Management, CIKM (2009).
    """

    # Column layout of the intermediate store grown column-by-column by fit():
    #   0: cluster id                       6: E[prob. set distance] per cluster
    #   1: mean neighborhood distance       7: prob. local outlier factor (PLOF)
    #   2: closest-neighbor distance        8: E[PLOF**2] per cluster
    #   3: cluster sum of squared dists     9: normalized PLOF (nPLOF)
    #   4: standard distance               10: local outlier probability
    #   5: probabilistic set distance

    def __init__(self, data, extent=0.997, n_neighbors=10, cluster_labels=None):
        self.data = data
        self.extent = extent
        self.n_neighbors = n_neighbors
        self.cluster_labels = cluster_labels
        self.local_outlier_probabilities = None

    @staticmethod
    def _standard_distance(mean_distance, sum_squared_distance):
        """Per-sample standard distance from the cluster's sum of squared
        distances and the sample's mean neighborhood distance."""
        st_dist = np.sqrt(sum_squared_distance / np.fabs(mean_distance))
        return st_dist

    @staticmethod
    def _prob_set_distance(extent, standard_distance):
        """Probabilistic set distance; infinite when standard_distance is 0
        (e.g. a sample identical to all of its neighbors)."""
        return 1.0 / (extent * standard_distance)

    @staticmethod
    def _prob_outlier_factor(probabilistic_set_distance, ev_prob_dist):
        """Probabilistic local outlier factor (PLOF) of a sample relative to
        its cluster's expected probabilistic set distance."""
        return (probabilistic_set_distance / ev_prob_dist) - 1

    @staticmethod
    def _norm_prob_outlier_factor(extent, ev_probabilistic_outlier_factor):
        """Normalization term nPLOF derived from E[PLOF**2]."""
        return extent * np.sqrt(ev_probabilistic_outlier_factor)

    @staticmethod
    def _local_outlier_probability(plof_val, nplof_val):
        """Map PLOF to a probability in [0, 1) with the Gaussian error
        function; non-positive PLOF (denser than neighbors) clamps to 0."""
        erf_vec = np.vectorize(erf)
        return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))

    def _n_observations(self):
        """Number of samples in the input data."""
        return len(self.data)

    def _store(self):
        """Allocate the (n_observations, 3) seed array that fit() widens."""
        return np.empty([self._n_observations(), 3])

    def _cluster_labels(self):
        """Per-sample cluster labels; a single all-zero cluster when the
        caller supplied none."""
        if self.cluster_labels is None:
            return np.array([0] * len(self.data))
        else:
            return self.cluster_labels

    def _distances(self, data_store):
        """Fill columns 0-2: cluster id, mean distance to the n_neighbors
        nearest same-cluster samples, and closest-neighbor distance.

        The per-pair distance is the mean per-dimension absolute difference
        (np.sqrt(d ** 2) is |d|), not the Euclidean norm.
        """
        cluster_labels = self._cluster_labels()  # hoisted: needed twice below
        for cluster_id in set(cluster_labels):
            indices = np.where(cluster_labels == cluster_id)
            if self.data.__class__.__name__ == 'DataFrame':
                points_vector = self.data.iloc[indices].values
            elif self.data.__class__.__name__ == 'ndarray':
                points_vector = self.data.take(indices, axis=0)
                points_vector = points_vector.reshape(points_vector.shape[1:])
            # Pairwise coordinate deltas within the cluster via broadcasting.
            d = (points_vector[:, np.newaxis] - points_vector)
            for vec in range(d.shape[1]):
                # Sorted distances; entry 0 is the sample's distance to
                # itself (0), so keep the next n_neighbors entries.
                neighborhood_distances = np.sort(np.mean(np.sqrt(d[:, vec] ** 2), axis=1))[1:self.n_neighbors + 1]
                neighborhood_dist = np.mean(neighborhood_distances)
                # BUGFIX: index 0 of the self-excluded slice is the closest
                # neighbor. The original used the slice [1:2] -- the
                # second-closest, and a 1-element *array*, which makes the
                # np.array([...]) row below inhomogeneous and raises a
                # ValueError on NumPy >= 1.24. This column is not read by
                # any later pipeline stage, so scores are unaffected.
                closest_neighbor_distance = neighborhood_distances[0]
                data_store[indices[0][vec]] = np.array([cluster_id, neighborhood_dist, closest_neighbor_distance])
        return data_store

    def _ssd(self, data_store):
        """Append column 3: each cluster's sum of squared mean neighborhood
        distances, broadcast to every sample of the cluster. Also caches the
        unique cluster ids on self.cluster_labels_u for later stages."""
        self.cluster_labels_u = np.unique(data_store[:, 0])
        ssd_dict = {}
        for cluster_id in self.cluster_labels_u:
            indices = np.where(data_store[:, 0] == cluster_id)
            cluster_distances = np.take(data_store[:, 1], indices)
            # NaNs are excluded from the sum (presumably produced upstream by
            # undersized clusters or missing input values -- TODO confirm).
            cluster_distances_nonan = cluster_distances[np.logical_not(np.isnan(cluster_distances))]
            ssd = np.sum(cluster_distances_nonan ** 2.0)
            ssd_dict[cluster_id] = ssd
        data_store = np.hstack((data_store, np.array([[ssd_dict[x] for x in data_store[:, 0].tolist()]]).T))
        return data_store

    def _standard_distances(self, data_store):
        """Append column 4: per-sample standard distance."""
        return np.hstack(
            (data_store,
             np.array([np.apply_along_axis(self._standard_distance, 0, data_store[:, 1], data_store[:, 3])]).T))

    def _prob_set_distances(self, data_store):
        """Append column 5: probabilistic set distance."""
        return np.hstack((data_store, np.array([self._prob_set_distance(self.extent, data_store[:, 4])]).T))

    def _prob_set_distances_ev(self, data_store):
        """Append column 6: cluster-wise mean probabilistic set distance,
        NaNs excluded."""
        prob_set_distance_ev_dict = {}
        for cluster_id in self.cluster_labels_u:
            indices = np.where(data_store[:, 0] == cluster_id)
            prob_set_distances = np.take(data_store[:, 5], indices)
            prob_set_distances_nonan = prob_set_distances[np.logical_not(np.isnan(prob_set_distances))]
            prob_set_distance_ev_dict[cluster_id] = np.mean(prob_set_distances_nonan)
        data_store = np.hstack(
            (data_store, np.array([[prob_set_distance_ev_dict[x] for x in data_store[:, 0].tolist()]]).T))
        return data_store

    def _prob_local_outlier_factors(self, data_store):
        """Append column 7: PLOF of each sample."""
        return np.hstack(
            (data_store,
             np.array([np.apply_along_axis(self._prob_outlier_factor, 0, data_store[:, 5], data_store[:, 6])]).T))

    def _prob_local_outlier_factors_ev(self, data_store):
        """Append column 8: cluster-wise E[PLOF**2], NaNs excluded."""
        prob_local_outlier_factor_ev_dict = {}
        for cluster_id in self.cluster_labels_u:
            indices = np.where(data_store[:, 0] == cluster_id)
            prob_local_outlier_factors = np.take(data_store[:, 7], indices)
            prob_local_outlier_factors_nonan = prob_local_outlier_factors[
                np.logical_not(np.isnan(prob_local_outlier_factors))]
            prob_local_outlier_factor_ev_dict[cluster_id] = np.sum(prob_local_outlier_factors_nonan ** 2.0) / float(
                prob_local_outlier_factors_nonan.size)
        data_store = np.hstack(
            (data_store, np.array([[prob_local_outlier_factor_ev_dict[x] for x in data_store[:, 0].tolist()]]).T))
        return data_store

    def _norm_prob_local_outlier_factors(self, data_store):
        """Append column 9: nPLOF."""
        return np.hstack((data_store, np.array([self._norm_prob_outlier_factor(self.extent, data_store[:, 8])]).T))

    def _local_outlier_probabilities(self, data_store):
        """Append column 10: the final local outlier probability."""
        return np.hstack(
            (data_store,
             np.array([np.apply_along_axis(self._local_outlier_probability, 0, data_store[:, 7], data_store[:, 9])]).T))

    def fit(self):
        """
        Validate the inputs, run the LoOP pipeline, and return one outlier
        probability per sample (also stored on
        self.local_outlier_probabilities).

        Invalid parameters or data types emit a UserWarning and terminate the
        process via sys.exit() -- NOTE(review): raising ValueError would be
        friendlier in a library, but callers may depend on the current
        behavior, so it is left unchanged.
        """
        if not self.n_neighbors > 0.:
            warnings.warn('n_neighbors must be greater than 0. Execution halted.', UserWarning)
            sys.exit()
        if not 0. < self.extent < 1.:
            # BUGFIX: the check is a strict open interval, so the message now
            # says (0, 1); the original claimed the closed interval [0,1].
            warnings.warn('Statistical extent must be in (0, 1). Execution halted.', UserWarning)
            sys.exit()
        # Class-name comparison (instead of isinstance) avoids importing
        # pandas when only numpy input is used.
        if self.data.__class__.__name__ == 'DataFrame' or self.data.__class__.__name__ == 'ndarray':
            pass
        else:
            warnings.warn('Not a Pandas DataFrame or Numpy array. Execution halted.', UserWarning)
            sys.exit()
        if np.any(np.isnan(self.data)):
            warnings.warn('Input data contains missing values. Some scores may not be returned.', UserWarning)
        store = self._store()
        store = self._distances(store)
        store = self._ssd(store)
        store = self._standard_distances(store)
        store = self._prob_set_distances(store)
        store = self._prob_set_distances_ev(store)
        store = self._prob_local_outlier_factors(store)
        store = self._prob_local_outlier_factors_ev(store)
        store = self._norm_prob_local_outlier_factors(store)
        store = self._local_outlier_probabilities(store)
        self.local_outlier_probabilities = store[:, 10]
        return self.local_outlier_probabilities