Classifiers that respect categoricals #34

Open
wants to merge 13 commits into master
176 changes: 175 additions & 1 deletion catwalk/estimators/classifiers.py
@@ -4,8 +4,15 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

from catwalk.estimators.transformers import CutOff
from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals

import numpy as np
import random

MAX_INT = np.iinfo(np.int32).max

class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
"""
@@ -76,3 +83,170 @@ def predict(self, X):

def score(self, X, y):
return self.pipeline.score(X,y)


class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
"""
Fit a decision tree with a subset of features that respects categoricals

Args:
categoricals : list,
List of groups of column indices to be considered associated
with one another as categoricals. For instance [[1,2], [7,8,9]]
would mean columns 1 & 2 are associated as one categorical and
7, 8, and 9 are associated as a second one.
"""
def __init__(self,
categoricals,
max_features='sqrt',
random_state=None,
criterion="gini",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_leaf_nodes=None,
min_impurity_split=1e-07,
class_weight=None,
presort=False):

self.categoricals = categoricals
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_split = min_impurity_split
self.class_weight = class_weight
self.presort = presort

self.subset_cols = SubsetWithCategoricals(
categoricals=categoricals, max_features=max_features, random_state=random_state
)
self.tree = DecisionTreeClassifier(
criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes,
min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort
)

self.pipeline = Pipeline([
('subset_cols', self.subset_cols),
('tree', self.tree)
])

def fit(self, X, y):

# set the underlying random states before fitting
# doing this here rather than in the constructor because self.random_state might
# have been modified by an ensemble method
self.pipeline.named_steps['subset_cols'].set_params(random_state=self.random_state)
self.pipeline.named_steps['tree'].set_params(random_state=self.random_state)

self.pipeline.fit(X, y)

self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_
self.subset_indices = self.pipeline.named_steps['subset_cols'].subset_indices

self.classes_ = self.pipeline.named_steps['tree'].classes_
self.n_classes_ = self.pipeline.named_steps['tree'].n_classes_
self.n_features_ = self.pipeline.named_steps['tree'].n_features_
self.n_outputs_ = self.pipeline.named_steps['tree'].n_outputs_
self.tree_ = self.pipeline.named_steps['tree'].tree_

# feature importances need to reference full column set but underlying tree
# was trained on the subset, so fill in others with zeros
fi = self.pipeline.named_steps['tree'].feature_importances_
fi_dict = dict(zip(self.subset_indices, fi))
self.feature_importances_ = np.array([fi_dict.get(i, 0) for i in range(X.shape[1])])
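# e.g. subset_indices == [1, 2, 4] with tree importances [0.5, 0.2, 0.3] on a
# six-column matrix yields [0, 0.5, 0.2, 0, 0.3, 0]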

return self

def apply(self, X):
# sklearn's Pipeline doesn't delegate apply(), so subset the columns and
# call the underlying tree directly
return self.tree.apply(self.subset_cols.transform(X))

def decision_path(self, X):
# likewise, decision_path() isn't exposed by Pipeline
return self.tree.decision_path(self.subset_cols.transform(X))

def predict(self, X):
return self.pipeline.predict(X)

def predict_log_proba(self, X):
return self.pipeline.predict_log_proba(X)

def predict_proba(self, X):
return self.pipeline.predict_proba(X)

def score(self, X, y):
return self.pipeline.score(X, y)
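
# A minimal usage sketch (illustrative only -- the toy data and the column
# grouping below are assumptions, not part of this diff). Columns 1 & 2 and
# columns 3-5 are each sampled as a single one-hot-encoded categorical, so a
# tree either sees all of a categorical's columns or none of them:
#
#   >>> X = np.random.rand(100, 6)
#   >>> y = np.random.randint(0, 2, 100)
#   >>> clf = CatInATreeClassifier(categoricals=[[1, 2], [3, 4, 5]],
#   ...                            max_features='sqrt', random_state=42)
#   >>> clf.fit(X, y)
#   >>> clf.feature_importances_  # full-width; unselected columns get 0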


class CatInAForestClassifier(BaggingClassifier):
"""
Bagged classifier using CatInATreeClassifier as the base estimator.
Note that the per-tree feature subsetting (which respects categorical
groupings) is handled by the base estimator via `max_features_tree`;
the bagging step itself passes all columns to every tree, with no
option for feature bootstrapping.
"""
def __init__(self, categoricals, max_features_tree='sqrt', random_state=None,
n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False,
warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best",
max_depth=None, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07,
class_weight=None, presort=False):

# if isinstance(random_state, int):
# random.seed(random_state)
# elif isinstance(random_state, np.random.RandomState):
# random.seed(random_state.randint(MAX_INT))

# set up the base estimator as a CatInATreeClassifier()
self.base_estimator = CatInATreeClassifier(
categoricals=categoricals, max_features=max_features_tree, criterion=criterion,
splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split,
class_weight=class_weight, presort=presort
)

# Call the super-class's constructor
# Here, we force each tree to consider all features (without bootstrapping)
# as we'll handle the subsetting in the base estimator to have control over
# sampling categoricals. Also note that calling the BaggingClassifier
# constructor will set an object parameter `max_features`=1.0, so we've
# named the class parameter `max_features_tree` to avoid a collision.
BaggingClassifier.__init__(
self,
base_estimator=self.base_estimator,
n_estimators=n_estimators,
max_samples=max_samples,
max_features=1.0,
bootstrap=bootstrap,
bootstrap_features=False,
oob_score=oob_score,
warm_start=warm_start,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose
)

self.categoricals = categoricals
self.max_features_tree = max_features_tree
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_split = min_impurity_split
self.class_weight = class_weight
self.presort = presort
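
# A minimal usage sketch (again illustrative; the data and grouping are the
# same made-up ones as above). Each bagged tree draws its own bootstrap
# sample of rows, while the column subsetting that respects categorical
# groups happens inside each CatInATreeClassifier base estimator:
#
#   >>> forest = CatInAForestClassifier(categoricals=[[1, 2], [3, 4, 5]],
#   ...                                 max_features_tree='sqrt',
#   ...                                 n_estimators=50, random_state=42)
#   >>> forest.fit(X, y)
#   >>> forest.predict_proba(X)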
100 changes: 100 additions & 0 deletions catwalk/estimators/transformers.py
@@ -3,11 +3,28 @@
import warnings

import numpy as np
from math import log, sqrt
import random

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

MAX_INT = np.iinfo(np.int32).max

def flatten_list(l):
"""
Simple utility to flatten a list down to one dimension even if the list
contains elements of differing depth
"""
res = []
for i in l:
if isinstance(i, list):
res = res + flatten_list(i)
else:
res = res + [i]
return res
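
# quick sanity check (not part of the diff):
#   >>> flatten_list([[1, 2], 3, [4, [5]]])
#   [1, 2, 3, 4, 5]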

DEPRECATION_MSG_1D = (
"Passing 1d arrays as data is deprecated in 0.17 and will "
"raise ValueError in 0.19. Reshape your data either using "
@@ -69,3 +86,86 @@ def transform(self, X):
X[X < feature_range[0]] = feature_range[0]

return X


# feels pretty gross to have to specify the categorical columns in the constructor
# even before the object is aware of the data it's operating on, but doesn't seem
# like the fit method is flexible enough to specify it there if we're going to
# use it in a pipeline. ugh.
class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
"""
Subsets features of an array treating categoricals as a group

Args:
max_features : int, float, string or None, optional (default=None)
The number of features to subset down to:
- If int, then subset to `max_features` features.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are used.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

categoricals : list,
List of groups of column indices to be considered associated
with one another as categoricals. For instance [[1,2], [7,8,9]]
would mean columns 1 & 2 are associated as one categorical and
7, 8, and 9 are associated as a second one.

Attributes:
subset_indices : list,
Indices of the chosen subset of columns in the original array.

max_features_ : int,
The inferred value of max_features.
"""
def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
self.max_features = max_features
self.categoricals = categoricals
self.random_state = random_state
self.copy = copy

def _infer_max_features(self, num_features):
if isinstance(self.max_features, float):
return int(self.max_features*num_features)
elif isinstance(self.max_features, int):
return self.max_features
elif self.max_features in ['auto', 'sqrt']:
return int(sqrt(num_features))
elif self.max_features == 'log2':
return int(log(num_features, 2))
elif self.max_features is None:
return num_features
else:
raise ValueError('Invalid value for max_features: %s' % self.max_features)

def fit(self, X, y=None):
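# note: this seeds Python's *global* random module, not a local RandomState,
# so repeated fits are reproducible but seeding state is shared process-wide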
if isinstance(self.random_state, int):
random.seed(self.random_state)
elif isinstance(self.random_state, np.random.RandomState):
random.seed(self.random_state.randint(MAX_INT))

features = list(range(X.shape[1]))

all_cats = set(flatten_list(self.categoricals))
non_cats = set(features) - all_cats

# this will be a mixed list of column indices for non-categoricals
# and lists of indices for categoricals
distinct_features = list(non_cats) + self.categoricals

self.max_features_ = self._infer_max_features(len(distinct_features))
if self.max_features_ > len(distinct_features):
raise ValueError('Cannot subset to more features than the number of distinct features: %s vs %s' % (
self.max_features_, len(distinct_features)))

self.subset_indices = sorted(flatten_list(
random.sample(distinct_features, self.max_features_)
))

return self

def transform(self, X):
X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
return X[:, self.subset_indices]
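
# A minimal sketch of the transformer on its own (toy data; the grouping is
# an assumption for illustration). Columns {0} and {3} plus the groups [1, 2]
# and [4, 5] give 4 distinct features, so max_features='sqrt' resolves to
# int(sqrt(4)) == 2 of them, which are then expanded back to raw column indices:
#
#   >>> sub = SubsetWithCategoricals(categoricals=[[1, 2], [4, 5]],
#   ...                              max_features='sqrt', random_state=0)
#   >>> Xt = sub.fit_transform(np.arange(24).reshape(4, 6))
#   >>> sub.max_features_   # 2
#   >>> sub.subset_indices  # e.g. [1, 2, 3] -- one categorical plus column 3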
17 changes: 15 additions & 2 deletions catwalk/model_trainers.py
@@ -14,7 +14,9 @@
filename_friendly_hash, \
retrieve_model_id_from_hash, \
db_retry, \
save_db_objects
save_db_objects, \
bag_of_cats, \
find_cats

from results_schema import Model, FeatureImportance

@@ -38,6 +40,7 @@ def __init__(
model_storage_engine,
db_engine,
model_group_keys,
feature_config,
replace=True
):
self.project_path = project_path
@@ -46,6 +49,7 @@
self.db_engine = db_engine
self.sessionmaker = sessionmaker(bind=self.db_engine)
self.model_group_keys = model_group_keys
self.feature_config = feature_config
self.replace = replace

def unique_parameters(self, parameters):
@@ -101,8 +105,17 @@ def _train(self, matrix_store, class_path, parameters):
module_name, class_name = class_path.rsplit(".", 1)
module = importlib.import_module(module_name)
cls = getattr(module, class_name)
instance = cls(**parameters)
y = matrix_store.labels()
model_params = parameters.copy() # copy since we may modify

# if using a classifier that samples respecting categoricals, detect the
# groups of categoricals and add them to the model parameter set
if class_name in ['CatInATreeClassifier', 'CatInAForestClassifier']:
cats_regex = bag_of_cats(self.feature_config)
categoricals = find_cats(matrix_store.matrix.columns.values, cats_regex)
model_params['categoricals'] = categoricals

instance = cls(**model_params)

return instance.fit(matrix_store.matrix, y), matrix_store.matrix.columns
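
# For illustration (hypothetical column names -- the real regexes come from
# bag_of_cats() applied to the experiment's feature_config): given matrix
# columns like
#
#   ['age', 'zip_11201_max', 'zip_60601_max', 'color_red_max', 'color_blue_max']
#
# find_cats() would be expected to return index groups such as [[1, 2], [3, 4]],
# so each one-hot-encoded categorical gets sampled as a unit by the classifier.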
