From 792f5e125e899327f0047f10df1af7d43e82a265 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa
Date: Thu, 19 Oct 2017 01:12:54 +0000
Subject: [PATCH 01/13] classifier that respects categoricals

Creates a new pair of classifiers, CatInATree and CatInAForest, that respect
categoricals when randomly sampling features.
---
 catwalk/estimators/classifiers.py  | 163 ++++++++++++++++++++++++++++-
 catwalk/estimators/transformers.py | 100 ++++++++++++++++++
 catwalk/model_trainers.py          |  17 ++-
 catwalk/utils.py                   |  60 +++++++++++
 4 files changed, 337 insertions(+), 3 deletions(-)

diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py
index 0ee7cf0..876bd66 100644
--- a/catwalk/estimators/classifiers.py
+++ b/catwalk/estimators/classifiers.py
@@ -4,8 +4,10 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import BaggingClassifier
 
-from catwalk.estimators.transformers import CutOff
+from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals
 
 class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
     """
@@ -76,3 +78,162 @@ def predict(self, X):
 
     def score(self, X, y):
         return self.pipeline.score(X,y)
+
+
+class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
+    """
+    Fit a decision tree with a subset of features that respects categoricals
+
+    Args:
+        categoricals : list,
+            List of groups of column indices to be considered associated
+            with one another as categoricals. For instance [[1,2], [7,8,9]]
+            would mean columns 1 & 2 are associated as one categorical and
+            7, 8, and 9 are associated as a second one.
+    """
+    def __init__(self,
+                 categoricals,
+                 max_features='sqrt',
+                 random_state=None,
+                 criterion="gini",
+                 splitter="best",
+                 max_depth=None,
+                 min_samples_split=2,
+                 min_samples_leaf=1,
+                 min_weight_fraction_leaf=0.,
+                 max_leaf_nodes=None,
+                 min_impurity_decrease=0.,
+                 min_impurity_split=None,
+                 class_weight=None,
+                 presort=False):
+
+        self.categoricals = categoricals
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.random_state = random_state
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+        self.class_weight = class_weight
+        self.presort = presort
+
+        self.subset_cols = SubsetWithCategoricals(categoricals=categoricals, max_features=max_features)
+        self.tree = DecisionTreeClassifier(
+            criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
+            max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes,
+            min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split,
+            class_weight=class_weight, presort=presort
+        )
+
+        self.pipeline = Pipeline([
+            ('subset_cols', self.subset_cols),
+            ('tree', self.tree)
+        ])
+
+    def fit(self, X, y):
+
+        self.pipeline.fit(X, y)
+
+        self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_
+        self.subset_indices = self.pipeline.named_steps['subset_cols'].subset_indices
+
+        self.classes_ = self.pipeline.named_steps['tree'].classes_
+        self.n_classes_ = self.pipeline.named_steps['tree'].n_classes_
+        self.n_features_ = self.pipeline.named_steps['tree'].n_features_
+        self.n_outputs_ = self.pipeline.named_steps['tree'].n_outputs_
+        self.tree_ = self.pipeline.named_steps['tree'].tree_
+
+        # feature importances need to reference full column set but underlying tree
+        # was trained on the subset, so fill in others with zeros
+        fi = self.pipeline.named_steps['tree'].feature_importances_
+        fi_dict = dict(zip(self.subset_indices, fi))
+        fi_full = []
+        for i in range(X.shape[1]):
+            fi_full.append(fi_dict.get(i, 0))
+        self.feature_importances_ = fi_full
+
+        return self
+
+    def apply(self, X):
+        return self.pipeline.apply(X)
+
+    def decision_path(self, X):
+        return self.pipeline.decision_path(X)
+
+    def predict(self, X):
+        return self.pipeline.predict(X)
+
+    def predict_log_proba(self, X):
+        return self.pipeline.predict_log_proba(X)
+
+    def predict_proba(self, X):
+        return self.pipeline.predict_proba(X)
+
+    def score(self, X, y):
+        return self.pipeline.score(X, y)
+
+
+class CatInAForestClassifier(BaggingClassifier):
+    """
+    Bagged classifier using CatInATreeClassifiers as estimators.
+    Note that max_features is required here for the underlying
+    subsetting and that the bagging classifier will use all selected
+    features for each tree with no option for feature bootstrapping.
+    """
+    def __init__(self, categoricals, max_features_tree='sqrt', random_state=None,
+                 n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False,
+                 warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best",
+                 max_depth=None, min_samples_split=2, min_samples_leaf=1,
+                 min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_decrease=0.,
+                 min_impurity_split=None, class_weight=None, presort=False):
+
+        # set up the base estimator as a CatInATreeClassifier()
+        self.base_estimator = CatInATreeClassifier(
+            categoricals=categoricals, max_features=max_features_tree, random_state=random_state,
+            criterion=criterion, splitter=splitter, max_depth=max_depth,
+            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
+            min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes,
+            min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split,
+            class_weight=class_weight, presort=presort
+        )
+
+        # Call the super-class's constructor
+        # Here, we force each tree to consider all features (without bootstrapping)
+        # as we'll handle the subsetting in the base estimator to have control over
+        # sampling categoricals. Also note that calling the BaggingClassifier
+        # constructor will set an object parameter `max_features`=1.0, so we've
+        # named the class parameter `max_features_tree` to avoid collision.
+        BaggingClassifier.__init__(
+            self,
+            base_estimator=self.base_estimator,
+            n_estimators=n_estimators,
+            max_samples=max_samples,
+            max_features=1.0,
+            bootstrap=bootstrap,
+            bootstrap_features=False,
+            oob_score=oob_score,
+            warm_start=warm_start,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose
+        )
+
+        self.categoricals = categoricals
+        self.max_features_tree = max_features_tree
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+        self.class_weight = class_weight
+        self.presort = presort
diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py
index bcf75c2..fe9448f 100644
--- a/catwalk/estimators/transformers.py
+++ b/catwalk/estimators/transformers.py
@@ -3,11 +3,26 @@
 import warnings
 
 import numpy as np
+from math import log, sqrt
+import random
 
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import check_array
 from sklearn.utils.validation import FLOAT_DTYPES
 
+def flatten_list(l):
+    """
+    Simple utility to flatten a list down to one dimension even if the list
+    contains elements of differing depth
+    """
+    res = []
+    for i in l:
+        if isinstance(i, list):
+            res = res + flatten_list(i)
+        else:
+            res = res + [i]
+    return res
+
 DEPRECATION_MSG_1D = (
     "Passing 1d arrays as data is deprecated in 0.17 and will "
     "raise ValueError in 0.19. Reshape your data either using "
@@ -69,3 +84,88 @@ def transform(self, X):
         X[X < feature_range[0]] = feature_range[0]
 
         return X
+
+
+# feels pretty gross to have to specify the categorical columns in the constructor
+# even before the object is aware of the data it's operating on, but doesn't seem
+# like the fit method is flexible enough to specify it there if we're going to
+# use it in a pipeline. ugh.
+class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
+    """
+    Subsets features of an array treating categoricals as a group
+
+    Args:
+        max_features : int, float, string or None, optional (default=None)
+            The number of features to subset down to:
+                - If int, then subset to `max_features` features.
+                - If float, then `max_features` is a percentage and
+                  `int(max_features * n_features)` features are used.
+                - If "auto", then `max_features=sqrt(n_features)`.
+                - If "sqrt", then `max_features=sqrt(n_features)`.
+                - If "log2", then `max_features=log2(n_features)`.
+                - If None, then `max_features=n_features`.
+
+        categoricals : list,
+            List of groups of column indices to be considered associated
+            with one another as categoricals. For instance [[1,2], [7,8,9]]
+            would mean columns 1 & 2 are associated as one categorical and
+            7, 8, and 9 are associated as a second one.
+
+    Attributes:
+        subset_indices : list,
+            Indices of the chosen subset of columns in the original array.
+
+        max_features_ : int,
+            The inferred value of max_features.
+    """
+    def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
+        self.max_features = max_features
+        self.categoricals = categoricals
+        self.copy = copy
+        if isinstance(random_state, int):
+            random.seed(random_state)
+        elif isinstance(random_state, np.random.RandomState):
+            # feels like a bit of a hack, but np doesn't seem to handle
+            # lists of mixed types so well, so using random.sample()
+            # and pretty sure this should be deterministic if a RandomState
+            # object is passed
+            random.seed(random_state.get_state()[1].sum())
+
+    def _infer_max_features(self, num_features):
+        if isinstance(self.max_features, float):
+            return int(self.max_features*num_features)
+        elif isinstance(self.max_features, int):
+            return self.max_features
+        elif self.max_features in ['auto', 'sqrt']:
+            return int(sqrt(num_features))
+        elif self.max_features == 'log2':
+            return int(log(num_features, 2))
+        elif self.max_features is None:
+            return num_features
+        else:
+            raise ValueError('Invalid value for max_features: %s' % self.max_features)
+
+    def fit(self, X, y=None):
+        features = list(range(X.shape[1]))
+
+        all_cats = set(flatten_list(self.categoricals))
+        non_cats = set(features) - all_cats
+
+        # this will be a mixed list of column indices for non-categoricals
+        # and lists of indices for categoricals
+        distinct_features = features + self.categoricals
+
+        self.max_features_ = self._infer_max_features(len(distinct_features))
+        if self.max_features_ > len(distinct_features):
+            raise ValueError('Cannot subset to more than distinct features: %s vs %s' % (
+                self.max_features_, len(distinct_features)))
+
+        self.subset_indices = sorted(flatten_list(
+            random.sample(distinct_features, self.max_features_)
+        ))
+
+        return self
+
+    def transform(self, X):
+        X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
+        return X[:, self.subset_indices]
diff --git a/catwalk/model_trainers.py b/catwalk/model_trainers.py
index fb277eb..8d8c991 100644
--- a/catwalk/model_trainers.py
+++ b/catwalk/model_trainers.py
@@ -14,7 +14,9 @@
     filename_friendly_hash, \
     retrieve_model_id_from_hash, \
     db_retry, \
-    save_db_objects
+    save_db_objects, \
+    bag_of_cats, \
+    find_cats
 
 from results_schema import Model, FeatureImportance
 
@@ -38,6 +40,7 @@ def __init__(
         model_storage_engine,
         db_engine,
         model_group_keys,
+        feature_config,
         replace=True
     ):
         self.project_path = project_path
@@ -46,6 +49,7 @@ def __init__(
         self.db_engine = db_engine
         self.sessionmaker = sessionmaker(bind=self.db_engine)
         self.model_group_keys = model_group_keys
+        self.feature_config = feature_config
        self.replace = replace
 
     def unique_parameters(self, parameters):
@@ -101,8 +105,17 @@ def _train(self, matrix_store, class_path, parameters):
         module_name, class_name = class_path.rsplit(".", 1)
         module = importlib.import_module(module_name)
         cls = getattr(module, class_name)
-        instance = cls(**parameters)
         y = matrix_store.labels()
+        model_params = parameters.copy() # copy since we may modify
+
+        # if using a classifier that samples respecting categoricals, detect the
+        # groups of categoricals and add them to the model parameter set
+        if class_name in ['CatInATreeClassifier', 'CatInAForestClassifier']:
+            cats_regex = bag_of_cats(self.feature_config)
+            categoricals = find_cats(matrix_store.matrix.columns.values, cats_regex)
+            model_params['categoricals'] = categoricals
+
+        instance = cls(**model_params)
         return instance.fit(matrix_store.matrix, y), matrix_store.matrix.columns

diff --git a/catwalk/utils.py b/catwalk/utils.py
index 1a3233f..5f25288 100644
--- a/catwalk/utils.py
+++ b/catwalk/utils.py
@@ -13,6 +13,8 @@
 import sqlalchemy
 import csv
 import postgres_copy
+from itertools import product
+import re
 
 
 def split_s3_path(path):
@@ -199,3 +201,61 @@
     ])
     f.seek(0)
     postgres_copy.copy_from(f, type(db_objects[0]), db_engine, format='csv')
+
+
+# Two methods for identifying and grouping categorical columns
+def bag_of_cats(feature_config):
+    """
+    Parse a feature config to create regex patterns to match
+    categorical columns. Note that this assumes there's no
+    column name truncation
+    """
+    cats_regex = []
+    for fg in feature_config:
+        prefix = fg['prefix']
+        groups = fg['groups']
+        intervals = fg['intervals']
+        cats = fg.get('categoricals', [])
+        for cat in cats:
+            col = cat['column']
+            metrics = cat['metrics']
+
+            for group, interval, metric in product(
+                groups, intervals, metrics
+            ):
+                cats_regex.append(r'^%s_%s_%s_%s_(.*)_%s$' % (
+                    prefix, group, interval, col, metric
+                ))
+
+    return cats_regex
+
+
+# assumes no column name truncation!!
+def find_cats(matrix_cols, cats_regex, exclude_cols=None):
+    """
+    Assign matrix columns (by their numerical indices) to groups
+    of categoricals based on matching to a regex pattern
+    """
+
+    # be sure we exclude entity id, date, and label
+    if exclude_cols is None:
+        exclude_cols = ['entity_id', 'as_of_date', 'outcome']
+    feature_cols = [c for c in matrix_cols if c not in exclude_cols]
+
+    # We want the sets of numerical indices of columns that match our
+    # categorical patterns, so loop through the column names then through
+    # the patterns, checking each one for a match. Here, `cats_dict`
+    # will act as a collector to hold the matches associated with each
+    # pattern. Note that if a column matches two patterns, it will get
+    # assigned to the first categorical that matches, though this
+    # shouldn't happen if the regex is matching the full string...
+ cats_dict = {r:[] for r in cats_regex} + for i, fc in enumerate(feature_cols): + for regex in cats_regex: + m = re.match(regex, fc) + if m is not None: + cats_dict[regex].append(i) + break + + # collapse the dict into a list of lists to return + return [v for v in cats_dict.values() if len(v) > 0] From 4ae548312b12e446b969f532aa3ec749757598f1 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 01:42:41 +0000 Subject: [PATCH 02/13] pass feature_config to ModelTrainer tests --- tests/test_integration.py | 3 ++- tests/test_model_trainers.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index a64d26a..8edae4c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -77,7 +77,8 @@ def test_integration(): experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) predictor = Predictor( project_path, diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 60e61a3..598537b 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -57,7 +57,8 @@ def test_model_trainer(): experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix_store = InMemoryMatrixStore(matrix, metadata) model_ids = trainer.train_models( @@ -136,6 +137,7 @@ def test_model_trainer(): model_storage_engine=model_storage_engine, db_engine=engine, model_group_keys=['label_name', 'label_window'], + feature_config=[], replace=True ) new_model_ids = trainer.train_models( @@ -205,7 +207,8 @@ def test_n_jobs_not_new_model(): experiment_hash=None, model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ @@ -264,7 +267,8 @@ def test_retry_max(self): experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ @@ -309,7 +313,8 @@ def test_retry_recovery(self): experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ From fed9977ac5f343551bcc039625523c008ff50b53 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 02:25:24 +0000 Subject: [PATCH 03/13] use sklearn 0.18 interface remove the min_impurity_decrease parameter from DecisionTreeClassifier call to conform to sklearn 0.18 --- catwalk/estimators/classifiers.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py index 876bd66..a4fb2c8 100644 --- a/catwalk/estimators/classifiers.py +++ b/catwalk/estimators/classifiers.py @@ -102,7 +102,6 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, - min_impurity_decrease=0., min_impurity_split=None, class_weight=None, 
presort=False): @@ -117,7 +116,6 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.presort = presort @@ -127,8 +125,7 @@ def __init__(self, criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - class_weight=class_weight, presort=presort + min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort ) self.pipeline = Pipeline([ @@ -190,8 +187,8 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False, warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_decrease=0., - min_impurity_split=None, class_weight=None, presort=False): + min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=None, + class_weight=None, presort=False): # set up the base estimator as a CatInATreeClassifier() self.base_estimator = CatInATreeClassifier( @@ -199,8 +196,7 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - class_weight=class_weight, presort=presort + min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort ) # Call the super-class's constructor @@ -233,7 +229,6 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.presort = presort From a775f625ac326eab613d53a11837eaf2cfbc1766 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 02:31:30 +0000 Subject: [PATCH 04/13] min_impurity_split cannot be None in sklearn 0.18 --- catwalk/estimators/classifiers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py index a4fb2c8..ba54f08 100644 --- a/catwalk/estimators/classifiers.py +++ b/catwalk/estimators/classifiers.py @@ -102,7 +102,7 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, - min_impurity_split=None, + min_impurity_split=1e-07, class_weight=None, presort=False): @@ -187,7 +187,7 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False, warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=None, + min_weight_fraction_leaf=0., 
max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False): # set up the base estimator as a CatInATreeClassifier() From f1d9fb695152761d1924ba4b464fb1a669b01749 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 18:24:53 +0000 Subject: [PATCH 05/13] new utils tests --- tests/test_utils.py | 74 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index aecf051..6adc735 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,12 +1,14 @@ from catwalk.utils import filename_friendly_hash, \ save_experiment_and_get_hash, \ - sort_predictions_and_labels + sort_predictions_and_labels, \ + bag_of_cats, find_cats from catwalk.db import ensure_db from sqlalchemy import create_engine import testing.postgresql import datetime import logging import re +import pandas as pd def test_filename_friendly_hash(): @@ -104,3 +106,73 @@ def test_sort_predictions_and_labels(): ) assert sorted_predictions == (0.6, 0.5, 0.5, 0.4) assert sorted_labels == (True, False, True, False) + +def test_bag_of_cats(): + feature_config = [ + { + 'prefix': 'first', + 'aggregates': [ + {'quantity': 'a1', 'metrics': ['min', 'max']} + ], + 'categoricals': [ + {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']}, + {'column': 'c2', 'choices': ['up', 'down'], 'metrics': ['sum', 'max']} + ], + 'intervals': ['1y', '5y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'second', + 'categoricals': [ + {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']}, + {'column': 'c4', 'choices': ['three', 'four'], 'metrics': ['max']} + ], + 'intervals': ['1y', '10y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'third', + 'aggregates': [ + {'quantity': 'a2', 'metrics': ['min', 'max']} + ], + 'intervals': ['6month'], + 'groups': ['entity_id'] + } + ] + + cat_regex = set(bag_of_cats(feature_config)) + + assert cat_regex == set([ + r'^first_entity_id_1y_c1_(.*)_min$', r'^first_entity_id_5y_c1_(.*)_min$', + r'^first_entity_id_1y_c2_(.*)_sum$', r'^first_entity_id_1y_c2_(.*)_max$', + r'^first_entity_id_5y_c2_(.*)_sum$', r'^first_entity_id_5y_c2_(.*)_max$', + r'^second_entity_id_1y_c3_(.*)_sum$', r'^second_entity_id_10y_c3_(.*)_sum$', + r'^second_entity_id_1y_c4_(.*)_max$', r'^second_entity_id_10y_c4_(.*)_max$' + ]) + +def test_find_cats(): + cat_regex = [r'^first_entity_id_1y_c1_(.*)_min$', r'^second_entity_id_10y_c3_(.*)_sum$'] + df = pd.DataFrame({ + 'entity_id': [1,2,3,4], + 'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 'outcome': [0,1,0,0] + }) + # ensure column order + df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + + cat_cols = find_cats(df.columns.values, cat_regex) + + assert cat_cols == [[0, 1, 2], [6, 7]] From 8d3633ddc03c7939c7d5e60e0e5d563672d9ee3c Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 
19 Oct 2017 18:44:52 +0000 Subject: [PATCH 06/13] fix subsetting bug --- catwalk/estimators/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py index fe9448f..4a6a10e 100644 --- a/catwalk/estimators/transformers.py +++ b/catwalk/estimators/transformers.py @@ -153,7 +153,7 @@ def fit(self, X, y=None): # this will be a mixed list of column indices for non-categoricals # and lists of indices for categorics - distinct_features = features + self.categoricals + distinct_features = list(non_cats) + self.categoricals self.max_features_ = self._infer_max_features(len(distinct_features)) if self.max_features_ > len(distinct_features): From 0562d19234edf9e0c0412bbe59d7baae17ce0982 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 03:28:12 +0000 Subject: [PATCH 07/13] unit tests and random_state fixes --- catwalk/estimators/classifiers.py | 30 +++++++-- catwalk/estimators/transformers.py | 16 ++--- tests/test_estimators.py | 103 ++++++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 16 deletions(-) diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py index ba54f08..ad17284 100644 --- a/catwalk/estimators/classifiers.py +++ b/catwalk/estimators/classifiers.py @@ -9,6 +9,11 @@ from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals +import numpy as np +import random + +MAX_INT = np.iinfo(np.int32).max + class ScaledLogisticRegression(BaseEstimator, ClassifierMixin): """ An in-place replacement for the scikit-learn's LogisticRegression. @@ -120,7 +125,9 @@ def __init__(self, self.class_weight = class_weight self.presort = presort - self.subset_cols = SubsetWithCategoricals(categoricals=categoricals, max_features=max_features) + self.subset_cols = SubsetWithCategoricals( + categoricals=categoricals, max_features=max_features, random_state=random_state + ) self.tree = DecisionTreeClassifier( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, @@ -135,6 +142,12 @@ def __init__(self, def fit(self, X, y): + # set the underlying random states before fitting + # doing this here rather than in the constructor because self.random_state might + # have been modified by an ensemble method + self.pipeline.named_steps['subset_cols'].set_params(random_state=self.random_state) + self.pipeline.named_steps['tree'].set_params(random_state=self.random_state) + self.pipeline.fit(X, y) self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_ @@ -190,13 +203,18 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False): + # if isinstance(random_state, int): + # random.seed(random_state) + # elif isinstance(random_state, np.random.RandomState): + # random.seed(random_state.randint(MAX_INT)) + # set up the base estimator as a CatInATreeClassifier() self.base_estimator = CatInATreeClassifier( - categoricals=categoricals, max_features=max_features_tree, random_state=random_state, - criterion=criterion, splitter=splitter, max_depth=max_depth, - min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes, - min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort 
+ categoricals=categoricals, max_features=max_features_tree, criterion=criterion, + splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, + max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split, + class_weight=class_weight, presort=presort ) # Call the super-class's constructor diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py index 4a6a10e..5eb2f5f 100644 --- a/catwalk/estimators/transformers.py +++ b/catwalk/estimators/transformers.py @@ -10,6 +10,8 @@ from sklearn.utils import check_array from sklearn.utils.validation import FLOAT_DTYPES +MAX_INT = np.iinfo(np.int32).max + def flatten_list(l): """ Simple utility to flatten a list down to one dimension even if the list @@ -121,15 +123,8 @@ class SubsetWithCategoricals(BaseEstimator, TransformerMixin): def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True): self.max_features = max_features self.categoricals = categoricals + self.random_state = random_state self.copy = copy - if isinstance(random_state, int): - random.seed(random_state) - elif isinstance(random_state, np.random.RandomState): - # feels like a bit of a hack, but np doesn't seem to handle - # lists of mixed types so well, so using random.sample() - # and pretty sure this should be deterministic if a RandomState - # object is passed - random.seed(random_state.get_state()[1].sum()) def _infer_max_features(self, num_features): if isinstance(self.max_features, float): @@ -146,6 +141,11 @@ def _infer_max_features(self, num_features): raise ValueError('Invalid value for max_features: %s' % self.max_features) def fit(self, X, y=None): + if isinstance(self.random_state, int): + random.seed(self.random_state) + elif isinstance(self.random_state, np.random.RandomState): + random.seed(self.random_state.randint(MAX_INT)) + features = list(range(X.shape[1])) all_cats = set(flatten_list(self.categoricals)) diff --git a/tests/test_estimators.py b/tests/test_estimators.py index 47dea4b..86ddc2b 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -1,11 +1,14 @@ import numpy as np +import pandas as pd import warnings import pytest -from catwalk.estimators.transformers import CutOff -from catwalk.estimators.classifiers import ScaledLogisticRegression +from catwalk.estimators.transformers import CutOff, \ + SubsetWithCategoricals, flatten_list +from catwalk.estimators.classifiers import ScaledLogisticRegression, \ + CatInATreeClassifier, CatInAForestClassifier from sklearn import linear_model @@ -74,3 +77,99 @@ def test_dsapp_lr(data): pipeline.fit(data['X_train'], data['y_train']) assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test'])) + +def test_flatten_list(): + assert flatten_list([1, [2,3], [4, [5]], [], 6]) == [1,2,3,4,5,6] + assert flatten_list([]) == [] + assert flatten_list([1,2,3]) == [1,2,3] + assert flatten_list([[1,2]]) == [1,2] + +def test_subset_with_categoricals(): + df = pd.DataFrame({ + 'entity_id': [1,2,3,4], + 'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 
'outcome': [0,1,0,0] + }) + # ensure column order + df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + + # random seed 0 + sc = SubsetWithCategoricals( + categoricals=[[0, 1, 2], [6, 7]], + random_state=0 + ) + + samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values) + + assert np.all(samp == np.array([ + [ 0., 1., 0., 1., 0.], + [ 1., 0., 0., 1., 0.], + [ 0., 0., 1., 0., 1.], + [ 0., 0., 0., 1., 0.] + ])) + assert sc.max_features_ == 2 + assert sc.subset_indices == [0, 1, 2, 6, 7] + + # random seed 1 + sc = SubsetWithCategoricals( + categoricals=[[0, 1, 2], [6, 7]], + random_state=1 + ) + + samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values) + + assert np.all(samp == np.array([ + [ 12., 3.], + [ 7., 1.], + [ 0., 4.], + [ 2., 1.] + ])) + assert sc.max_features_ == 2 + assert sc.subset_indices == [3,4] + +def test_cat_in_a_tree(data): + # just for the purposes of testing, assuming several of the columns are categoricals + categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]] + + clf = CatInATreeClassifier(categoricals=categoricals, max_features=7, random_state=12345) + clf.fit(data['X_train'], data['y_train']) + + assert clf.max_features_ == 7 + assert clf.subset_indices == [0, 7, 8, 9, 10, 11, 12, 16, 19, 21, 27] + + pred = clf.predict_proba(data['X_test']) + assert len(pred) == len(data['y_test']) + # specific to the breast cancer data... + assert round(sum([p[1] for p in pred])) == 102 + + +def test_cat_in_a_forest(data): + # just for the purposes of testing, assuming several of the columns are categoricals + categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]] + + clf = CatInAForestClassifier(categoricals=categoricals, max_features_tree=7, n_estimators=3, random_state=12345) + clf.fit(data['X_train'], data['y_train']) + + assert clf.estimators_[0].max_features_ == 7 + assert clf.estimators_[0].subset_indices == [0, 1, 6, 12, 13, 14, 18, 22, 23, 24, 25] + assert clf.estimators_[1].subset_indices == [0, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 21] + assert clf.estimators_[2].subset_indices == [0, 2, 3, 4, 12, 13, 14, 15, 27, 28] + + pred = clf.predict_proba(data['X_test']) + assert len(pred) == len(data['y_test']) + # specific to the breast cancer data... 
+ # even with + assert round(sum([p[1] for p in pred])) == 108 From 931f3b31700dd3f30f9009f35f2a7dbdbb22f0df Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 04:04:03 +0000 Subject: [PATCH 08/13] model trainers unit test --- tests/test_model_trainers.py | 113 +++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 598537b..5c3532f 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -182,6 +182,119 @@ def test_model_trainer(): sorted([model_id for model_id in new_model_ids]) +def test_model_trainer_categoricals(): + with testing.postgresql.Postgresql() as postgresql: + engine = create_engine(postgresql.url()) + ensure_db(engine) + + grid_config = { + 'catwalk.estimators.classifiers.CatInAForestClassifier': { + 'max_features_tree': [3], + 'n_estimators': [3], + 'random_state': [2193] + } + } + + with mock_s3(): + s3_conn = boto3.resource('s3') + s3_conn.create_bucket(Bucket='econ-dev') + + feature_config = [ + { + 'prefix': 'first', + 'aggregates': [ + {'quantity': 'a1', 'metrics': ['sum']}, + {'quantity': 'a2', 'metrics': ['max']} + ], + 'categoricals': [ + {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']} + ], + 'intervals': ['1y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'second', + 'aggregates': [ + {'quantity': 'a3', 'metrics': ['sum']} + ], + 'categoricals': [ + {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']} + ], + 'intervals': ['10y'], + 'groups': ['entity_id'] + } + ] + + # create training set + matrix = pandas.DataFrame.from_dict({ + 'entity_id': [1,2,3,4], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 'outcome': [0,1,0,0] + }) + # ensure column order + matrix = matrix[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + metadata = { + 'beginning_of_time': datetime.date(2012, 12, 20), + 'end_time': datetime.date(2016, 12, 20), + 'label_name': 'outcome', + 'label_window': '1y', + 'metta-uuid': '1234', + 'feature_names': ['first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum' + ], + 'indices': ['entity_id'], + } + project_path = 'econ-dev/inspections' + model_storage_engine = S3ModelStorageEngine(s3_conn, project_path) + trainer = ModelTrainer( + project_path=project_path, + experiment_hash=None, + model_storage_engine=model_storage_engine, + db_engine=engine, + model_group_keys=['label_name', 'label_window'], + feature_config=feature_config + ) + matrix_store = InMemoryMatrixStore(matrix, metadata) + model_ids = trainer.train_models( + grid_config=grid_config, + misc_db_parameters=dict(), + matrix_store=matrix_store + ) + + # assert categoricals were properly detected and passed to model + records = [ 
+ row for row in + engine.execute('select model_hash from results.models') + ] + + cache_keys = [ + model_cache_key(project_path, model_row[0], s3_conn) + for model_row in records + ] + + model_pickles = [ + pickle.loads(cache_key.get()['Body'].read()) + for cache_key in cache_keys + ] + + assert model_pickles[0].categoricals == [[0, 1, 2], [6, 7]] + + def test_n_jobs_not_new_model(): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { From 105d999bc0016f8d77b10800ce561b6259689f0e Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 04:23:15 +0000 Subject: [PATCH 09/13] debug model trainers test --- tests/test_model_trainers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 5c3532f..3599b3b 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -183,7 +183,11 @@ def test_model_trainer(): def test_model_trainer_categoricals(): - with testing.postgresql.Postgresql() as postgresql: + # DELETE ME + pgpath = '/usr/lib/postgresql/9.6/bin/' + # DELETE ME + with testing.postgresql.Postgresql(initdb=pgpath+'initdb', postgres=pgpath+'postgres') as postgresql: +# with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) @@ -239,7 +243,7 @@ def test_model_trainer_categoricals(): 'outcome': [0,1,0,0] }) # ensure column order - matrix = matrix[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + matrix = matrix[['entity_id', 'first_entity_id_1y_c1_top_min', 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', From 8b92dc84c60b65e11afcdc7e9be76f28227664fd Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 04:32:06 +0000 Subject: [PATCH 10/13] sort test result --- tests/test_model_trainers.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 3599b3b..0b83302 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -183,11 +183,7 @@ def test_model_trainer(): def test_model_trainer_categoricals(): - # DELETE ME - pgpath = '/usr/lib/postgresql/9.6/bin/' - # DELETE ME - with testing.postgresql.Postgresql(initdb=pgpath+'initdb', postgres=pgpath+'postgres') as postgresql: -# with testing.postgresql.Postgresql() as postgresql: + with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) @@ -296,7 +292,7 @@ def test_model_trainer_categoricals(): for cache_key in cache_keys ] - assert model_pickles[0].categoricals == [[0, 1, 2], [6, 7]] + assert sorted([sorted(c) for c in model_pickles[0].categoricals]) == [[0, 1, 2], [6, 7]] def test_n_jobs_not_new_model(): From d607cd3852a2279a2f93eafd6fe0e8f8b009d0c6 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 18:37:42 +0000 Subject: [PATCH 11/13] sort test_find_cats result --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6adc735..3307ca6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -175,4 +175,4 @@ def test_find_cats(): cat_cols = find_cats(df.columns.values, cat_regex) - assert cat_cols == [[0, 1, 2], [6, 7]] + assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [6, 7]] From 904b33150052ab6ace5ee68c0e387be765a0ec1d Mon Sep 17 
00:00:00 2001 From: Kit Rodolfa Date: Sat, 21 Oct 2017 01:52:39 +0000 Subject: [PATCH 12/13] handle imputed flags for non-categorical columns where imputation was performed, ensure the flag and underlying column always come together for models that respect categoricals (in the future, we may want to consider passing these separately but for the purposes here we just add them in with the categoricals) --- catwalk/utils.py | 7 +++++++ tests/test_utils.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/catwalk/utils.py b/catwalk/utils.py index 5f25288..1dcee1d 100644 --- a/catwalk/utils.py +++ b/catwalk/utils.py @@ -242,6 +242,13 @@ def find_cats(matrix_cols, cats_regex, exclude_cols=None): exclude_cols = ['entity_id', 'as_of_date', 'outcome'] feature_cols = [c for c in matrix_cols if c not in exclude_cols] + # add in regex to make sure imputed flags always come along with + # their reference columns + imp_regex = [ + r'^%s(_imp)?$' % col[:-4] for col in matrix_cols if col[-4:] == '_imp' + ] + cats_regex += imp_regex + # We want the sets of numberical indices of columns that match our # categorical patterns, so loop trough the column names then through # the patterns, checking each one for a match. Here, `cats_dict` diff --git a/tests/test_utils.py b/tests/test_utils.py index 3307ca6..6018caa 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -151,6 +151,7 @@ def test_bag_of_cats(): ]) def test_find_cats(): + # test with just categoricals cat_regex = [r'^first_entity_id_1y_c1_(.*)_min$', r'^second_entity_id_10y_c3_(.*)_sum$'] df = pd.DataFrame({ 'entity_id': [1,2,3,4], @@ -176,3 +177,10 @@ def test_find_cats(): cat_cols = find_cats(df.columns.values, cat_regex) assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [6, 7]] + + # test with categoricals and imputed flags + df['first_entity_id_1y_a1_sum_imp'] = [0,0,0,1] + df['second_entity_id_10y_a3_sum_imp'] = [0,1,0,1] + + cat_cols = find_cats(df.columns.values, cat_regex) + assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [3,8], [5,9], [6, 7]] From 8a72cbfcbc8a0d9d82291358fe701ee2672c91c1 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Sat, 21 Oct 2017 02:00:31 +0000 Subject: [PATCH 13/13] add comments --- catwalk/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/catwalk/utils.py b/catwalk/utils.py index 1dcee1d..c4e59a5 100644 --- a/catwalk/utils.py +++ b/catwalk/utils.py @@ -235,6 +235,10 @@ def find_cats(matrix_cols, cats_regex, exclude_cols=None): """ Assign matrix columns (by their numerical indices) to groups of categoricals based on matching to a regex pattern + + Note that groupings of imputed columns along with their + underlying columns will be included in the returned result + as well. """ # be sure we exclude entity id, date, and label @@ -244,6 +248,8 @@ def find_cats(matrix_cols, cats_regex, exclude_cols=None): # add in regex to make sure imputed flags always come along with # their reference columns + # TODO: maybe return these as a separate list to allow models to + # treat them differently than categoricals. imp_regex = [ r'^%s(_imp)?$' % col[:-4] for col in matrix_cols if col[-4:] == '_imp' ]
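
Taken together, the pieces in this series are meant to be used in sequence: `bag_of_cats` turns a feature config into regex patterns, `find_cats` groups the matching matrix columns (and, after PATCH 12, imputation-flag `_imp` pairs) by their indices among the feature columns, and that list is passed to the new classifiers as their `categoricals` argument, which is what `ModelTrainer._train` does for `CatInATreeClassifier` and `CatInAForestClassifier`. A minimal sketch of that flow outside the trainer is below; the toy feature config, matrix, and parameter values are illustrative assumptions, not code from these patches.

    import pandas as pd

    from catwalk.estimators.classifiers import CatInAForestClassifier
    from catwalk.utils import bag_of_cats, find_cats

    # hypothetical feature config: one plain aggregate (a1) and one categorical (c1)
    feature_config = [{
        'prefix': 'first',
        'groups': ['entity_id'],
        'intervals': ['1y'],
        'aggregates': [{'quantity': 'a1', 'metrics': ['sum']}],
        'categoricals': [{'column': 'c1', 'choices': ['top', 'bottom'], 'metrics': ['min']}],
    }]

    # hypothetical training matrix using the column naming the regexes expect;
    # fix the column order explicitly (as the tests do) so the indices below are stable
    train = pd.DataFrame({
        'entity_id': [1, 2, 3, 4],
        'first_entity_id_1y_c1_top_min': [0, 1, 0, 0],
        'first_entity_id_1y_c1_bottom_min': [1, 0, 1, 0],
        'first_entity_id_1y_a1_sum': [12, 7, 0, 2],
        'outcome': [0, 1, 0, 1],
    })
    train = train[['entity_id', 'first_entity_id_1y_c1_top_min',
                   'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_a1_sum',
                   'outcome']]

    # build regexes from the config, then group the matching feature columns by index;
    # entity_id / as_of_date / outcome are excluded, so indices refer to feature columns only
    cats_regex = bag_of_cats(feature_config)
    categoricals = find_cats(train.columns.values, cats_regex)  # [[0, 1]] with this column order

    X = train.drop(['entity_id', 'outcome'], axis=1)
    y = train['outcome']

    # each bagged tree does its own feature subsetting, keeping every categorical group intact
    clf = CatInAForestClassifier(
        categoricals=categoricals,
        max_features_tree='sqrt',
        n_estimators=5,
        random_state=1234,
    )
    clf.fit(X, y)
    scores = clf.predict_proba(X)[:, 1]

Note that the indices produced by `find_cats` are positions among the feature columns only, which is why the id and label columns are dropped before `fit`.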