Classifiers that respect categoricals #34

Open
wants to merge 13 commits into master
176 changes: 175 additions & 1 deletion catwalk/estimators/classifiers.py
@@ -4,8 +4,15 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

from catwalk.estimators.transformers import CutOff
from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals

import numpy as np
import random

MAX_INT = np.iinfo(np.int32).max

class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
"""
@@ -76,3 +83,170 @@ def predict(self, X):

def score(self, X, y):
return self.pipeline.score(X,y)


class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
"""
Fit a decision tree with a subset of features that respects categoricals

Args:
categoricals : list,
List of groups of column indices to be considered associated
with one another as categoricals. For instance [[1,2], [7,8,9]]
would mean columns 1 & 2 are associated as one categorical and
7, 8, and 9 are associated as a second one.
"""
def __init__(self,
categoricals,
max_features='sqrt',
random_state=None,
criterion="gini",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_leaf_nodes=None,
min_impurity_split=1e-07,
class_weight=None,
presort=False):

self.categoricals = categoricals
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_split = min_impurity_split
self.class_weight = class_weight
self.presort = presort

self.subset_cols = SubsetWithCategoricals(
categoricals=categoricals, max_features=max_features, random_state=random_state
)
self.tree = DecisionTreeClassifier(
criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes,
min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort
)

self.pipeline = Pipeline([
('subset_cols', self.subset_cols),
('tree', self.tree)
])

def fit(self, X, y):

# set the underlying random states before fitting
# doing this here rather than in the constructor because self.random_state might
# have been modified by an ensemble method
self.pipeline.named_steps['subset_cols'].set_params(random_state=self.random_state)
self.pipeline.named_steps['tree'].set_params(random_state=self.random_state)

self.pipeline.fit(X, y)

self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_
self.subset_indices = self.pipeline.named_steps['subset_cols'].subset_indices

self.classes_ = self.pipeline.named_steps['tree'].classes_
self.n_classes_ = self.pipeline.named_steps['tree'].n_classes_
self.n_features_ = self.pipeline.named_steps['tree'].n_features_
self.n_outputs_ = self.pipeline.named_steps['tree'].n_outputs_
self.tree_ = self.pipeline.named_steps['tree'].tree_

# feature importances need to reference full column set but underlying tree
# was trained on the subset, so fill in others with zeros
fi = self.pipeline.named_steps['tree'].feature_importances_
fi_dict = dict(zip(self.subset_indices, fi))
self.feature_importances_ = np.array([fi_dict.get(i, 0) for i in range(X.shape[1])])
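# e.g. subset_indices == [1, 2, 4] with tree importances [0.5, 0.2, 0.3] on a
# six-column matrix yields [0, 0.5, 0.2, 0, 0.3, 0]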

return self

def apply(self, X):
# sklearn's Pipeline doesn't delegate apply(), so subset the columns and
# call the underlying tree directly
return self.tree.apply(self.subset_cols.transform(X))

def decision_path(self, X):
# likewise, decision_path() isn't exposed by Pipeline
return self.tree.decision_path(self.subset_cols.transform(X))

def predict(self, X):
return self.pipeline.predict(X)

def predict_log_proba(self, X):
return self.pipeline.predict_log_proba(X)

def predict_proba(self, X):
return self.pipeline.predict_proba(X)

def score(self, X, y):
return self.pipeline.score(X, y)
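
# A minimal usage sketch (illustrative only -- the toy data and the column
# grouping below are assumptions, not part of this diff). Columns 1 & 2 and
# columns 3-5 are each sampled as a single one-hot-encoded categorical, so a
# tree either sees all of a categorical's columns or none of them:
#
#   >>> X = np.random.rand(100, 6)
#   >>> y = np.random.randint(0, 2, 100)
#   >>> clf = CatInATreeClassifier(categoricals=[[1, 2], [3, 4, 5]],
#   ...                            max_features='sqrt', random_state=42)
#   >>> clf.fit(X, y)
#   >>> clf.feature_importances_  # full-width; unselected columns get 0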


class CatInAForestClassifier(BaggingClassifier):
"""
Bagged classifier using CatInATreeClassifier as the base estimator.
Note that the per-tree feature subsetting (which respects categorical
groupings) is handled by the base estimator via `max_features_tree`;
the bagging step itself passes all columns to every tree, with no
option for feature bootstrapping.
"""
def __init__(self, categoricals, max_features_tree='sqrt', random_state=None,
n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False,
warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best",
max_depth=None, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07,
class_weight=None, presort=False):

# if isinstance(random_state, int):
# random.seed(random_state)
# elif isinstance(random_state, np.random.RandomState):
# random.seed(random_state.randint(MAX_INT))

# set up the base estimator as a CatInATreeClassifier()
self.base_estimator = CatInATreeClassifier(
categoricals=categoricals, max_features=max_features_tree, criterion=criterion,
splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split,
class_weight=class_weight, presort=presort
)

# Call the super-class's constructor
# Here, we force each tree to consider all features (without bootstrapping)
# as we'll handle the subsetting in the base estimator to have control over
# sampling categoricals. Also note that calling the BaggingClassifier
# constructor will set an object parameter `max_features`=1.0, so we've
# named the class parameter `max_features_tree` to avoid a collision.
BaggingClassifier.__init__(
self,
base_estimator=self.base_estimator,
n_estimators=n_estimators,
max_samples=max_samples,
max_features=1.0,
bootstrap=bootstrap,
bootstrap_features=False,
oob_score=oob_score,
warm_start=warm_start,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose
)

self.categoricals = categoricals
self.max_features_tree = max_features_tree
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_split = min_impurity_split
self.class_weight = class_weight
self.presort = presort
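
# A minimal usage sketch (again illustrative; the data and grouping are the
# same made-up ones as above). Each bagged tree draws its own bootstrap
# sample of rows, while the column subsetting that respects categorical
# groups happens inside each CatInATreeClassifier base estimator:
#
#   >>> forest = CatInAForestClassifier(categoricals=[[1, 2], [3, 4, 5]],
#   ...                                 max_features_tree='sqrt',
#   ...                                 n_estimators=50, random_state=42)
#   >>> forest.fit(X, y)
#   >>> forest.predict_proba(X)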
100 changes: 100 additions & 0 deletions catwalk/estimators/transformers.py
@@ -3,11 +3,28 @@
import warnings

import numpy as np
from math import log, sqrt
import random

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

MAX_INT = np.iinfo(np.int32).max

def flatten_list(l):
"""
Simple utility to flatten a list down to one dimension even if the list
contains elements of differing depth
"""
res = []
for i in l:
if isinstance(i, list):
res = res + flatten_list(i)
else:
res = res + [i]
return res
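
# quick sanity check (not part of the diff):
#   >>> flatten_list([[1, 2], 3, [4, [5]]])
#   [1, 2, 3, 4, 5]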

DEPRECATION_MSG_1D = (
"Passing 1d arrays as data is deprecated in 0.17 and will "
"raise ValueError in 0.19. Reshape your data either using "
@@ -69,3 +86,86 @@ def transform(self, X):
X[X < feature_range[0]] = feature_range[0]

return X


# feels pretty gross to have to specify the categorical columns in the constructor
# even before the object is aware of the data it's operating on, but doesn't seem
# like the fit method is flexible enough to specify it there if we're going to
# use it in a pipeline. ugh.
class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
"""
Subsets features of an array treating categoricals as a group

Args:
max_features : int, float, string or None, optional (default=None)
The number of features to subset down to:
- If int, then subset to `max_features` features.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are used.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

categoricals : list,
List of groups of column indices to be considered associated
with one another as categoricals. For instance [[1,2], [7,8,9]]
would mean columns 1 & 2 are associated as one categorical and
7, 8, and 9 are associated as a second one.

Attributes:
subset_indices : list,
Indices of the chosen subset of columns in the original array.

max_features_ : int,
The inferred value of max_features.
"""
def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
self.max_features = max_features
self.categoricals = categoricals
self.random_state = random_state
self.copy = copy

def _infer_max_features(self, num_features):
if isinstance(self.max_features, float):
return int(self.max_features*num_features)
elif isinstance(self.max_features, int):
return self.max_features
elif self.max_features in ['auto', 'sqrt']:
return int(sqrt(num_features))
elif self.max_features == 'log2':
return int(log(num_features, 2))
elif self.max_features is None:
return num_features
else:
raise ValueError('Invalid value for max_features: %s' % self.max_features)

def fit(self, X, y=None):
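# note: this seeds Python's *global* random module, not a local RandomState,
# so repeated fits are reproducible but seeding state is shared process-wide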
if isinstance(self.random_state, int):
random.seed(self.random_state)
elif isinstance(self.random_state, np.random.RandomState):
random.seed(self.random_state.randint(MAX_INT))

features = list(range(X.shape[1]))

all_cats = set(flatten_list(self.categoricals))
non_cats = set(features) - all_cats

# this will be a mixed list of column indices for non-categoricals
# and lists of indices for categoricals
distinct_features = list(non_cats) + self.categoricals

self.max_features_ = self._infer_max_features(len(distinct_features))
if self.max_features_ > len(distinct_features):
raise ValueError('Cannot subset to more features than the number of distinct features: %s vs %s' % (
self.max_features_, len(distinct_features)))

self.subset_indices = sorted(flatten_list(
random.sample(distinct_features, self.max_features_)
))

return self

def transform(self, X):
X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
return X[:, self.subset_indices]
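
# A minimal sketch of the transformer on its own (toy data; the grouping is
# an assumption for illustration). Columns {0} and {3} plus the groups [1, 2]
# and [4, 5] give 4 distinct features, so max_features='sqrt' resolves to
# int(sqrt(4)) == 2 of them, which are then expanded back to raw column indices:
#
#   >>> sub = SubsetWithCategoricals(categoricals=[[1, 2], [4, 5]],
#   ...                              max_features='sqrt', random_state=0)
#   >>> Xt = sub.fit_transform(np.arange(24).reshape(4, 6))
#   >>> sub.max_features_   # 2
#   >>> sub.subset_indices  # e.g. [1, 2, 3] -- one categorical plus column 3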
17 changes: 15 additions & 2 deletions catwalk/model_trainers.py
@@ -14,7 +14,9 @@
filename_friendly_hash, \
retrieve_model_id_from_hash, \
db_retry, \
save_db_objects
save_db_objects, \
bag_of_cats, \
find_cats

from results_schema import Model, FeatureImportance

@@ -38,6 +40,7 @@ def __init__(
model_storage_engine,
db_engine,
model_group_keys,
feature_config,
replace=True
):
self.project_path = project_path
@@ -46,6 +49,7 @@
self.db_engine = db_engine
self.sessionmaker = sessionmaker(bind=self.db_engine)
self.model_group_keys = model_group_keys
self.feature_config = feature_config
self.replace = replace

def unique_parameters(self, parameters):
@@ -101,8 +105,17 @@ def _train(self, matrix_store, class_path, parameters):
module_name, class_name = class_path.rsplit(".", 1)
module = importlib.import_module(module_name)
cls = getattr(module, class_name)
instance = cls(**parameters)
y = matrix_store.labels()
model_params = parameters.copy() # copy since we may modify

# if using a classifier that samples respecting categoricals, detect the
# groups of categoricals and add them to the model parameter set
if class_name in ['CatInATreeClassifier', 'CatInAForestClassifier']:
cats_regex = bag_of_cats(self.feature_config)
categoricals = find_cats(matrix_store.matrix.columns.values, cats_regex)
model_params['categoricals'] = categoricals

instance = cls(**model_params)

return instance.fit(matrix_store.matrix, y), matrix_store.matrix.columns
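
# For illustration (hypothetical column names -- the real regexes come from
# bag_of_cats() applied to the experiment's feature_config): given matrix
# columns like
#
#   ['age', 'zip_11201_max', 'zip_60601_max', 'color_red_max', 'color_blue_max']
#
# find_cats() would be expected to return index groups such as [[1, 2], [3, 4]],
# so each one-hot-encoded categorical gets sampled as a unit by the classifier.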
