forked from alicezheng/feature-engineering-book
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeans_featurizer.py
118 lines (95 loc) · 4.69 KB
/
kmeans_featurizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
class KMeansFeaturizer:
"""Transforms numeric data into k-means cluster memberships.
This transformer runs k-means on the input data and converts each data point
into the id of the closest cluster. If a target variable is present, it is
scaled and included as input to k-means in order to derive clusters that
obey the classification boundary as well as group similar points together.
Parameters
----------
k: integer, optional, default 100
The number of clusters to group data into.
target_scale: float, [0, infty], optional, default 5.0
The scaling factor for the target variable. Set this to zero to ignore
the target. For classification problems, larger `target_scale` values
will produce clusters that better respect the class boundary.
random_state : integer or numpy.RandomState, optional
This is passed to k-means as the generator used to initialize the
kmeans centers. If an integer is given, it fixes the seed. Defaults to
the global numpy random number generator.
Attributes
----------
cluster_centers_ : array, [k, n_features]
Coordinates of cluster centers. n_features does count the target column.
"""
def __init__(self, k=100, target_scale=5.0, random_state=None):
self.k = k
self.target_scale = target_scale
self.random_state = random_state
self.cluster_encoder = OneHotEncoder().fit(np.array(range(k)).reshape(-1,1))
def fit(self, X, y=None):
"""Runs k-means on the input data and find centroids.
If no target is given (`y` is None) then run vanilla k-means on input
`X`.
If target `y` is given, then include the target (weighted by
`target_scale`) as an extra dimension for k-means clustering. In this
case, run k-means twice, first with the target, then an extra iteration
without.
After fitting, the attribute `cluster_centers_` are set to the k-means
centroids in the input space represented by `X`.
Parameters
----------
X : array-like or sparse matrix, shape=(n_data_points, n_features)
y : vector of length n_data_points, optional, default None
If provided, will be weighted with `target_scale` and included in
k-means clustering as hint.
"""
if y is None:
# No target variable, just do plain k-means
km_model = KMeans(n_clusters=self.k,
n_init=20,
random_state=self.random_state)
km_model.fit(X)
self.km_model_ = km_model
self.cluster_centers_ = km_model.cluster_centers_
return self
# There is target information. Apply appropriate scaling and include
# into input data to k-means
data_with_target = np.hstack((X, y[:,np.newaxis]*self.target_scale))
# Build a pre-training k-means model on data and target
km_model_pretrain = KMeans(n_clusters=self.k,
n_init=20,
random_state=self.random_state)
km_model_pretrain.fit(data_with_target)
# Run k-means a second time to get the clusters in the original space
# without target info. Initialize using centroids found in pre-training.
# Go through a single iteration of cluster assignment and centroid
# recomputation.
km_model = KMeans(n_clusters=self.k,
init=km_model_pretrain.cluster_centers_[:,:2],
n_init=1,
max_iter=1)
km_model.fit(X)
self.km_model_ = km_model
self.cluster_centers_ = km_model.cluster_centers_
return self
def transform(self, X, y=None):
"""Output the closest cluster id for each input data point.
Parameters
----------
X : array-like or sparse matrix, shape=(n_data_points, n_features)
y : vector of length n_data_points, optional, default None
Target vector is ignored even if provided.
Returns
-------
cluster_ids : array, shape[n_data_points,1]
"""
clusters = self.km_model_.predict(X)
return self.cluster_encoder.transform(clusters.reshape(-1,1))
def fit_transform(self, X, y=None):
"""Runs fit followed by transform.
"""
self.fit(X, y)
return self.transform(X, y)