-
Notifications
You must be signed in to change notification settings - Fork 0
/
gmeans.py
99 lines (74 loc) · 2.81 KB
/
gmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sbn
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import scale
from sklearn import datasets
from scipy.stats import anderson
class GMeans(object):
    """Recursive 2-means clustering that stops splitting at Gaussian clusters.

    Starting from the full data set, each cluster is split in two with
    ``MiniBatchKMeans``. The points are projected onto the vector joining the
    two resulting centers and the projection is tested for normality with the
    Anderson-Darling test. A cluster is kept as a leaf (not split) when the
    projection looks Gaussian, when ``max_depth`` is reached, or when a split
    would produce a child with ``min_obs`` observations or fewer.

    Parameters
    ----------
    min_obs : int
        A split is rejected if any child would hold ``min_obs`` or fewer rows.
    max_depth : int
        Recursion depth at which splitting stops unconditionally.
    random_state : int, RandomState instance or None
        Passed through to ``MiniBatchKMeans``.
    strictness : int, 0 to 4
        Index into the critical values returned by ``scipy.stats.anderson``.
        0: not at all strict
        4: very strict
        (Per the SciPy documentation the critical values for the normal
        distribution correspond to the 15%, 10%, 5%, 2.5% and 1% significance
        levels, in that order.)

    Attributes set by ``fit``
    -------------------------
    data : the object passed to ``fit``.
    data_index : int array of shape (n_obs, 2); column 0 is the row number,
        column 1 the cluster label.
    labels_ : column 1 of ``data_index``.
    stopping_criteria : list of str, one entry per leaf, each one of
        'max_depth', 'gaussian' or 'min_obs'.
    """

    def __init__(self, min_obs=1, max_depth=10, random_state=None, strictness=4):
        super(GMeans, self).__init__()
        self.max_depth = max_depth
        self.min_obs = min_obs
        self.random_state = random_state
        # bool is a subclass of int and floats compare equal to ints, but
        # neither is a usable index into the critical-value array.
        is_integer = (isinstance(strictness, (int, np.integer))
                      and not isinstance(strictness, bool))
        if not is_integer or not 0 <= strictness <= 4:
            raise ValueError("strictness parameter must be integer from 0 to 4")
        self.strictness = strictness
        self.stopping_criteria = []
        # Label 0 is the root cluster; every accepted child gets the next id.
        self._next_label = 1

    def _gaussianCheck(self, vector):
        """
        check whether a given input vector follows a gaussian distribution
        H0: vector is distributed gaussian
        H1: vector is not distributed gaussian

        Returns True when the Anderson-Darling statistic does not exceed the
        critical value selected by ``self.strictness`` (H0 is not rejected),
        False otherwise.
        """
        result = anderson(vector)
        statistic, critical_values = result[0], result[1]
        return bool(statistic <= critical_values[self.strictness])

    def _newLabel(self):
        """Return a cluster label that has not been used in the current fit."""
        label = self._next_label
        self._next_label += 1
        return label

    def _assignLeaf(self, index, reason):
        """Write the rows of ``index`` back into ``data_index`` and log why
        this cluster was not split any further."""
        self.data_index[index[:, 0]] = index
        self.stopping_criteria.append(reason)

    def _recursiveClustering(self, data, depth, index):
        """
        recursively run kmeans with k=2 on your data until a max_depth is reached or we have
        gaussian clusters

        data  : observations of the current cluster (rows are observations).
        depth : depth of the parent call; incremented on entry.
        index : (n, 2) int array for the rows of ``data``; column 0 holds the
                row positions in ``self.data_index``, column 1 the label of
                the current cluster.
        """
        depth += 1
        # ">=" rather than "==" so that max_depth <= 0 also terminates.
        if depth >= self.max_depth:
            self._assignLeaf(index, 'max_depth')
            return

        km = MiniBatchKMeans(n_clusters=2, random_state=self.random_state)
        km.fit(data)
        centers = km.cluster_centers_

        # Project every observation onto the line through the two centers and
        # standardise the projection before testing it for normality.
        v = centers[0] - centers[1]
        x_prime = scale(data.dot(v) / (v.dot(v)))
        if self._gaussianCheck(x_prime):
            self._assignLeaf(index, 'gaussian')
            return

        # Check every child BEFORE recursing into any of them: rejecting the
        # split after a sibling was already processed would overwrite that
        # sibling's labels and leave stale entries in stopping_criteria.
        masks = [km.labels_ == k for k in np.unique(km.labels_)]
        if any(np.count_nonzero(mask) <= self.min_obs for mask in masks):
            self._assignLeaf(index, 'min_obs')
            return

        for mask in masks:
            # Boolean indexing copies, so relabelling the child leaves the
            # parent's ``index`` untouched.
            child_index = index[mask]
            child_index[:, 1] = self._newLabel()
            self._recursiveClustering(data=data[mask], depth=depth, index=child_index)

    def fit(self, data):
        """
        fit the recursive clustering model to the data

        ``data`` must expose ``shape``, ``dot`` and boolean-mask row selection
        (e.g. a 2-D numpy array). Results are stored in ``labels_``,
        ``data_index`` and ``stopping_criteria``; state from a previous call
        is discarded. Returns ``self``.
        """
        self.data = data
        # Start every fit from a clean slate so repeated calls do not
        # accumulate stopping criteria or continue the old label sequence.
        self.stopping_criteria = []
        self._next_label = 1

        n_obs = data.shape[0]
        data_index = np.zeros((n_obs, 2), dtype=np.int64)
        data_index[:, 0] = np.arange(n_obs)
        self.data_index = data_index

        self._recursiveClustering(data=data, depth=0, index=data_index)
        self.labels_ = self.data_index[:, 1]
        return self