From 792f5e125e899327f0047f10df1af7d43e82a265 Mon Sep 17 00:00:00 2001
From: Kit Rodolfa
Date: Thu, 19 Oct 2017 01:12:54 +0000
Subject: [PATCH 01/13] classifier that respects categoricals

Creates a new pair of classifiers, CatInATree and CatInAForest, that respect
categoricals when randomly sampling features.
---
 catwalk/estimators/classifiers.py  | 163 ++++++++++++++++++++++++++++-
 catwalk/estimators/transformers.py | 100 ++++++++++++++++++
 catwalk/model_trainers.py          |  17 ++-
 catwalk/utils.py                   |  60 +++++++++++
 4 files changed, 337 insertions(+), 3 deletions(-)

diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py
index 0ee7cf0..876bd66 100644
--- a/catwalk/estimators/classifiers.py
+++ b/catwalk/estimators/classifiers.py
@@ -4,8 +4,10 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import BaggingClassifier
 
-from catwalk.estimators.transformers import CutOff
+from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals
 
 class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
     """
@@ -76,3 +78,162 @@ def predict(self, X):
 
     def score(self, X, y):
         return self.pipeline.score(X,y)
+
+
+class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
+    """
+    Fit a decision tree with a subset of features that respects categoricals
+
+    Args:
+        categoricals : list,
+            List of groups of column indices to be considered associated
+            with one another as categoricals. For instance [[1,2], [7,8,9]]
+            would mean columns 1 & 2 are associated as one categorical and
+            7, 8, and 9 are associated as a second one.
+    """
+    def __init__(self,
+                 categoricals,
+                 max_features='sqrt',
+                 random_state=None,
+                 criterion="gini",
+                 splitter="best",
+                 max_depth=None,
+                 min_samples_split=2,
+                 min_samples_leaf=1,
+                 min_weight_fraction_leaf=0.,
+                 max_leaf_nodes=None,
+                 min_impurity_decrease=0.,
+                 min_impurity_split=None,
+                 class_weight=None,
+                 presort=False):
+
+        self.categoricals = categoricals
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.random_state = random_state
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+        self.class_weight = class_weight
+        self.presort = presort
+
+        self.subset_cols = SubsetWithCategoricals(categoricals=categoricals, max_features=max_features)
+        self.tree = DecisionTreeClassifier(
+            criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
+            max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes,
+            min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split,
+            class_weight=class_weight, presort=presort
+        )
+
+        self.pipeline = Pipeline([
+            ('subset_cols', self.subset_cols),
+            ('tree', self.tree)
+        ])
+
+    def fit(self, X, y):
+
+        self.pipeline.fit(X, y)
+
+        self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_
+        self.subset_indices = self.pipeline.named_steps['subset_cols'].subset_indices
+
+        self.classes_ = self.pipeline.named_steps['tree'].classes_
+        self.n_classes_ = self.pipeline.named_steps['tree'].n_classes_
+        self.n_features_ = self.pipeline.named_steps['tree'].n_features_
+        self.n_outputs_ = self.pipeline.named_steps['tree'].n_outputs_
+        self.tree_ = self.pipeline.named_steps['tree'].tree_
+
+        # feature importances need to reference full column set but underlying tree
+        # was trained on the subset, so fill in others with zeros
+        fi = self.pipeline.named_steps['tree'].feature_importances_
+        fi_dict = dict(zip(self.subset_indices, fi))
+        fi_full = []
+        for i in range(X.shape[1]):
+            fi_full.append(fi_dict.get(i, 0))
+        self.feature_importances_ = fi_full
+
+        return self
+
+    def apply(self, X):
+        return self.pipeline.apply(X)
+
+    def decision_path(self, X):
+        return self.pipeline.decision_path(X)
+
+    def predict(self, X):
+        return self.pipeline.predict(X)
+
+    def predict_log_proba(self, X):
+        return self.pipeline.predict_log_proba(X)
+
+    def predict_proba(self, X):
+        return self.pipeline.predict_proba(X)
+
+    def score(self, X, y):
+        return self.pipeline.score(X, y)
+
+
+class CatInAForestClassifier(BaggingClassifier):
+    """
+    Bagged classifier using CatInATreeClassifiers as estimators.
+    Note that max_features is required here for the underlying
+    subsetting and that the bagging classifier will use all selected
+    features for each tree with no option for feature bootstrapping.
+    """
+    def __init__(self, categoricals, max_features_tree='sqrt', random_state=None,
+                 n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False,
+                 warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best",
+                 max_depth=None, min_samples_split=2, min_samples_leaf=1,
+                 min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_decrease=0.,
+                 min_impurity_split=None, class_weight=None, presort=False):
+
+        # set up the base estimator as a CatInATreeClassifier()
+        self.base_estimator = CatInATreeClassifier(
+            categoricals=categoricals, max_features=max_features_tree, random_state=random_state,
+            criterion=criterion, splitter=splitter, max_depth=max_depth,
+            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
+            min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes,
+            min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split,
+            class_weight=class_weight, presort=presort
+        )
+
+        # Call the super-class's constructor
+        # Here, we force each tree to consider all features (without bootstrapping)
+        # as we'll handle the subsetting in the base estimator to have control over
+        # sampling categoricals. Also note that calling the BaggingClassifier
+        # constructor will set an object parameter `max_features`=1.0, so we've
+        # named the class parameter `max_features_tree` to avoid collision.
+        BaggingClassifier.__init__(
+            self,
+            base_estimator=self.base_estimator,
+            n_estimators=n_estimators,
+            max_samples=max_samples,
+            max_features=1.0,
+            bootstrap=bootstrap,
+            bootstrap_features=False,
+            oob_score=oob_score,
+            warm_start=warm_start,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose
+        )
+
+        self.categoricals = categoricals
+        self.max_features_tree = max_features_tree
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+        self.class_weight = class_weight
+        self.presort = presort
diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py
index bcf75c2..fe9448f 100644
--- a/catwalk/estimators/transformers.py
+++ b/catwalk/estimators/transformers.py
@@ -3,11 +3,26 @@
 import warnings
 
 import numpy as np
+from math import log, sqrt
+import random
 
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import check_array
 from sklearn.utils.validation import FLOAT_DTYPES
 
+def flatten_list(l):
+    """
+    Simple utility to flatten a list down to one dimension even if the list
+    contains elements of differing depth
+    """
+    res = []
+    for i in l:
+        if isinstance(i, list):
+            res = res + flatten_list(i)
+        else:
+            res = res + [i]
+    return res
+
 DEPRECATION_MSG_1D = (
     "Passing 1d arrays as data is deprecated in 0.17 and will "
     "raise ValueError in 0.19. Reshape your data either using "
@@ -69,3 +84,88 @@ def transform(self, X):
         X[X < feature_range[0]] = feature_range[0]
 
         return X
+
+
+# feels pretty gross to have to specify the categorical columns in the constructor
+# even before the object is aware of the data it's operating on, but doesn't seem
+# like the fit method is flexible enough to specify it there if we're going to
+# use it in a pipeline. ugh.
+class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
+    """
+    Subsets features of an array treating categoricals as a group
+
+    Args:
+        max_features : int, float, string or None, optional (default=None)
+            The number of features to subset down to:
+                - If int, then subset to `max_features` features.
+                - If float, then `max_features` is a percentage and
+                  `int(max_features * n_features)` features are used.
+                - If "auto", then `max_features=sqrt(n_features)`.
+                - If "sqrt", then `max_features=sqrt(n_features)`.
+                - If "log2", then `max_features=log2(n_features)`.
+                - If None, then `max_features=n_features`.
+
+        categoricals : list,
+            List of groups of column indices to be considered associated
+            with one another as categoricals. For instance [[1,2], [7,8,9]]
+            would mean columns 1 & 2 are associated as one categorical and
+            7, 8, and 9 are associated as a second one.
+
+    Attributes:
+        subset_indices : list,
+            Indices of the chosen subset of columns in the original array.
+
+        max_features_ : int,
+            The inferred value of max_features.
+    """
+    def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
+        self.max_features = max_features
+        self.categoricals = categoricals
+        self.copy = copy
+        if isinstance(random_state, int):
+            random.seed(random_state)
+        elif isinstance(random_state, np.random.RandomState):
+            # feels like a bit of a hack, but np doesn't seem to handle
+            # lists of mixed types so well, so using random.sample()
+            # and pretty sure this should be deterministic if a RandomState
+            # object is passed
+            random.seed(random_state.get_state()[1].sum())
+
+    def _infer_max_features(self, num_features):
+        if isinstance(self.max_features, float):
+            return int(self.max_features*num_features)
+        elif isinstance(self.max_features, int):
+            return self.max_features
+        elif self.max_features in ['auto', 'sqrt']:
+            return int(sqrt(num_features))
+        elif self.max_features == 'log2':
+            return int(log(num_features, 2))
+        elif self.max_features is None:
+            return num_features
+        else:
+            raise ValueError('Invalid value for max_features: %s' % self.max_features)
+
+    def fit(self, X, y=None):
+        features = list(range(X.shape[1]))
+
+        all_cats = set(flatten_list(self.categoricals))
+        non_cats = set(features) - all_cats
+
+        # this will be a mixed list of column indices for non-categoricals
+        # and lists of indices for categoricals
+        distinct_features = features + self.categoricals
+
+        self.max_features_ = self._infer_max_features(len(distinct_features))
+        if self.max_features_ > len(distinct_features):
+            raise ValueError('Cannot subset to more than distinct features: %s vs %s' % (
+                self.max_features_, len(distinct_features)))
+
+        self.subset_indices = sorted(flatten_list(
+            random.sample(distinct_features, self.max_features_)
+        ))
+
+        return self
+
+    def transform(self, X):
+        X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
+        return X[:, self.subset_indices]
diff --git a/catwalk/model_trainers.py b/catwalk/model_trainers.py
index fb277eb..8d8c991 100644
--- a/catwalk/model_trainers.py
+++ b/catwalk/model_trainers.py
@@ -14,7 +14,9 @@
     filename_friendly_hash, \
     retrieve_model_id_from_hash, \
     db_retry, \
-    save_db_objects
+    save_db_objects, \
+    bag_of_cats, \
+    find_cats
 
 from results_schema import Model, FeatureImportance
 
@@ -38,6 +40,7 @@ def __init__(
         model_storage_engine,
         db_engine,
         model_group_keys,
+        feature_config,
         replace=True
     ):
         self.project_path = project_path
@@ -46,6 +49,7 @@ def __init__(
         self.db_engine = db_engine
         self.sessionmaker = sessionmaker(bind=self.db_engine)
         self.model_group_keys = model_group_keys
+        self.feature_config = feature_config
        self.replace = replace
 
     def unique_parameters(self, parameters):
@@ -101,8 +105,17 @@ def _train(self, matrix_store, class_path, parameters):
         module_name, class_name = class_path.rsplit(".", 1)
         module = importlib.import_module(module_name)
         cls = getattr(module, class_name)
-        instance = cls(**parameters)
         y = matrix_store.labels()
+        model_params = parameters.copy() # copy since we may modify
+
+        # if using a classifier that samples respecting categoricals, detect the
+        # groups of categoricals and add them to the model parameter set
+        if class_name in ['CatInATreeClassifier', 'CatInAForestClassifier']:
+            cats_regex = bag_of_cats(self.feature_config)
+            categoricals = find_cats(matrix_store.matrix.columns.values, cats_regex)
+            model_params['categoricals'] = categoricals
+
+        instance = cls(**model_params)
         return instance.fit(matrix_store.matrix, y), matrix_store.matrix.columns

diff --git a/catwalk/utils.py b/catwalk/utils.py
index 1a3233f..5f25288 100644
--- a/catwalk/utils.py
+++ b/catwalk/utils.py
@@ -13,6 +13,8 @@
 import sqlalchemy
 import csv
 import postgres_copy
+from itertools import product
+import re
 
 
 def split_s3_path(path):
@@ -199,3 +201,61 @@
     ])
     f.seek(0)
     postgres_copy.copy_from(f, type(db_objects[0]), db_engine, format='csv')
+
+
+# Two methods for identifying and grouping categorical columns
+def bag_of_cats(feature_config):
+    """
+    Parse a feature config to create regex patterns to match
+    categorical columns. Note that this assumes there's no
+    column name truncation
+    """
+    cats_regex = []
+    for fg in feature_config:
+        prefix = fg['prefix']
+        groups = fg['groups']
+        intervals = fg['intervals']
+        cats = fg.get('categoricals', [])
+        for cat in cats:
+            col = cat['column']
+            metrics = cat['metrics']
+
+            for group, interval, metric in product(
+                groups, intervals, metrics
+            ):
+                cats_regex.append(r'^%s_%s_%s_%s_(.*)_%s$' % (
+                    prefix, group, interval, col, metric
+                ))
+
+    return cats_regex
+
+
+# assumes no column name truncation!!
+def find_cats(matrix_cols, cats_regex, exclude_cols=None):
+    """
+    Assign matrix columns (by their numerical indices) to groups
+    of categoricals based on matching to a regex pattern
+    """
+
+    # be sure we exclude entity id, date, and label
+    if exclude_cols is None:
+        exclude_cols = ['entity_id', 'as_of_date', 'outcome']
+    feature_cols = [c for c in matrix_cols if c not in exclude_cols]
+
+    # We want the sets of numerical indices of columns that match our
+    # categorical patterns, so loop through the column names then through
+    # the patterns, checking each one for a match. Here, `cats_dict`
+    # will act as a collector to hold the matches associated with each
+    # pattern. Note that if a column matches two patterns, it will get
+    # assigned to the first categorical that matches, though this
+    # shouldn't happen if the regex is matching the full string...
+ cats_dict = {r:[] for r in cats_regex} + for i, fc in enumerate(feature_cols): + for regex in cats_regex: + m = re.match(regex, fc) + if m is not None: + cats_dict[regex].append(i) + break + + # collapse the dict into a list of lists to return + return [v for v in cats_dict.values() if len(v) > 0] From 4ae548312b12e446b969f532aa3ec749757598f1 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 01:42:41 +0000 Subject: [PATCH 02/13] pass feature_config to ModelTrainer tests --- tests/test_integration.py | 3 ++- tests/test_model_trainers.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index a64d26a..8edae4c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -77,7 +77,8 @@ def test_integration(): experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) predictor = Predictor( project_path, diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 60e61a3..598537b 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -57,7 +57,8 @@ def test_model_trainer(): experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix_store = InMemoryMatrixStore(matrix, metadata) model_ids = trainer.train_models( @@ -136,6 +137,7 @@ def test_model_trainer(): model_storage_engine=model_storage_engine, db_engine=engine, model_group_keys=['label_name', 'label_window'], + feature_config=[], replace=True ) new_model_ids = trainer.train_models( @@ -205,7 +207,8 @@ def test_n_jobs_not_new_model(): experiment_hash=None, model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ @@ -264,7 +267,8 @@ def test_retry_max(self): experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ @@ -309,7 +313,8 @@ def test_retry_recovery(self): experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ From fed9977ac5f343551bcc039625523c008ff50b53 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 02:25:24 +0000 Subject: [PATCH 03/13] use sklearn 0.18 interface remove the min_impurity_decrease parameter from DecisionTreeClassifier call to conform to sklearn 0.18 --- catwalk/estimators/classifiers.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py index 876bd66..a4fb2c8 100644 --- a/catwalk/estimators/classifiers.py +++ b/catwalk/estimators/classifiers.py @@ -102,7 +102,6 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, - min_impurity_decrease=0., min_impurity_split=None, class_weight=None, 
presort=False): @@ -117,7 +116,6 @@ def __init__(self, self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.presort = presort @@ -127,8 +125,7 @@ def __init__(self, criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - class_weight=class_weight, presort=presort + min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort ) self.pipeline = Pipeline([ @@ -190,8 +187,8 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False, warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_decrease=0., - min_impurity_split=None, class_weight=None, presort=False): + min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=None, + class_weight=None, presort=False): # set up the base estimator as a CatInATreeClassifier() self.base_estimator = CatInATreeClassifier( @@ -199,8 +196,7 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - class_weight=class_weight, presort=presort + min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort ) # Call the super-class's constructor @@ -233,7 +229,6 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.class_weight = class_weight self.presort = presort From a775f625ac326eab613d53a11837eaf2cfbc1766 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 02:31:30 +0000 Subject: [PATCH 04/13] min_impurity_split cannot be None in sklearn 0.18 --- catwalk/estimators/classifiers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py index a4fb2c8..ba54f08 100644 --- a/catwalk/estimators/classifiers.py +++ b/catwalk/estimators/classifiers.py @@ -102,7 +102,7 @@ def __init__(self, min_samples_leaf=1, min_weight_fraction_leaf=0., max_leaf_nodes=None, - min_impurity_split=None, + min_impurity_split=1e-07, class_weight=None, presort=False): @@ -187,7 +187,7 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False, warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, - min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=None, + min_weight_fraction_leaf=0., 
max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False): # set up the base estimator as a CatInATreeClassifier() From f1d9fb695152761d1924ba4b464fb1a669b01749 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 19 Oct 2017 18:24:53 +0000 Subject: [PATCH 05/13] new utils tests --- tests/test_utils.py | 74 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index aecf051..6adc735 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,12 +1,14 @@ from catwalk.utils import filename_friendly_hash, \ save_experiment_and_get_hash, \ - sort_predictions_and_labels + sort_predictions_and_labels, \ + bag_of_cats, find_cats from catwalk.db import ensure_db from sqlalchemy import create_engine import testing.postgresql import datetime import logging import re +import pandas as pd def test_filename_friendly_hash(): @@ -104,3 +106,73 @@ def test_sort_predictions_and_labels(): ) assert sorted_predictions == (0.6, 0.5, 0.5, 0.4) assert sorted_labels == (True, False, True, False) + +def test_bag_of_cats(): + feature_config = [ + { + 'prefix': 'first', + 'aggregates': [ + {'quantity': 'a1', 'metrics': ['min', 'max']} + ], + 'categoricals': [ + {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']}, + {'column': 'c2', 'choices': ['up', 'down'], 'metrics': ['sum', 'max']} + ], + 'intervals': ['1y', '5y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'second', + 'categoricals': [ + {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']}, + {'column': 'c4', 'choices': ['three', 'four'], 'metrics': ['max']} + ], + 'intervals': ['1y', '10y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'third', + 'aggregates': [ + {'quantity': 'a2', 'metrics': ['min', 'max']} + ], + 'intervals': ['6month'], + 'groups': ['entity_id'] + } + ] + + cat_regex = set(bag_of_cats(feature_config)) + + assert cat_regex == set([ + r'^first_entity_id_1y_c1_(.*)_min$', r'^first_entity_id_5y_c1_(.*)_min$', + r'^first_entity_id_1y_c2_(.*)_sum$', r'^first_entity_id_1y_c2_(.*)_max$', + r'^first_entity_id_5y_c2_(.*)_sum$', r'^first_entity_id_5y_c2_(.*)_max$', + r'^second_entity_id_1y_c3_(.*)_sum$', r'^second_entity_id_10y_c3_(.*)_sum$', + r'^second_entity_id_1y_c4_(.*)_max$', r'^second_entity_id_10y_c4_(.*)_max$' + ]) + +def test_find_cats(): + cat_regex = [r'^first_entity_id_1y_c1_(.*)_min$', r'^second_entity_id_10y_c3_(.*)_sum$'] + df = pd.DataFrame({ + 'entity_id': [1,2,3,4], + 'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 'outcome': [0,1,0,0] + }) + # ensure column order + df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + + cat_cols = find_cats(df.columns.values, cat_regex) + + assert cat_cols == [[0, 1, 2], [6, 7]] From 8d3633ddc03c7939c7d5e60e0e5d563672d9ee3c Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Thu, 
19 Oct 2017 18:44:52 +0000 Subject: [PATCH 06/13] fix subsetting bug --- catwalk/estimators/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py index fe9448f..4a6a10e 100644 --- a/catwalk/estimators/transformers.py +++ b/catwalk/estimators/transformers.py @@ -153,7 +153,7 @@ def fit(self, X, y=None): # this will be a mixed list of column indices for non-categoricals # and lists of indices for categorics - distinct_features = features + self.categoricals + distinct_features = list(non_cats) + self.categoricals self.max_features_ = self._infer_max_features(len(distinct_features)) if self.max_features_ > len(distinct_features): From 0562d19234edf9e0c0412bbe59d7baae17ce0982 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 03:28:12 +0000 Subject: [PATCH 07/13] unit tests and random_state fixes --- catwalk/estimators/classifiers.py | 30 +++++++-- catwalk/estimators/transformers.py | 16 ++--- tests/test_estimators.py | 103 ++++++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 16 deletions(-) diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py index ba54f08..ad17284 100644 --- a/catwalk/estimators/classifiers.py +++ b/catwalk/estimators/classifiers.py @@ -9,6 +9,11 @@ from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals +import numpy as np +import random + +MAX_INT = np.iinfo(np.int32).max + class ScaledLogisticRegression(BaseEstimator, ClassifierMixin): """ An in-place replacement for the scikit-learn's LogisticRegression. @@ -120,7 +125,9 @@ def __init__(self, self.class_weight = class_weight self.presort = presort - self.subset_cols = SubsetWithCategoricals(categoricals=categoricals, max_features=max_features) + self.subset_cols = SubsetWithCategoricals( + categoricals=categoricals, max_features=max_features, random_state=random_state + ) self.tree = DecisionTreeClassifier( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, @@ -135,6 +142,12 @@ def __init__(self, def fit(self, X, y): + # set the underlying random states before fitting + # doing this here rather than in the constructor because self.random_state might + # have been modified by an ensemble method + self.pipeline.named_steps['subset_cols'].set_params(random_state=self.random_state) + self.pipeline.named_steps['tree'].set_params(random_state=self.random_state) + self.pipeline.fit(X, y) self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_ @@ -190,13 +203,18 @@ def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False): + # if isinstance(random_state, int): + # random.seed(random_state) + # elif isinstance(random_state, np.random.RandomState): + # random.seed(random_state.randint(MAX_INT)) + # set up the base estimator as a CatInATreeClassifier() self.base_estimator = CatInATreeClassifier( - categoricals=categoricals, max_features=max_features_tree, random_state=random_state, - criterion=criterion, splitter=splitter, max_depth=max_depth, - min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, max_leaf_nodes=max_leaf_nodes, - min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort 
+ categoricals=categoricals, max_features=max_features_tree, criterion=criterion, + splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, + max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split, + class_weight=class_weight, presort=presort ) # Call the super-class's constructor diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py index 4a6a10e..5eb2f5f 100644 --- a/catwalk/estimators/transformers.py +++ b/catwalk/estimators/transformers.py @@ -10,6 +10,8 @@ from sklearn.utils import check_array from sklearn.utils.validation import FLOAT_DTYPES +MAX_INT = np.iinfo(np.int32).max + def flatten_list(l): """ Simple utility to flatten a list down to one dimension even if the list @@ -121,15 +123,8 @@ class SubsetWithCategoricals(BaseEstimator, TransformerMixin): def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True): self.max_features = max_features self.categoricals = categoricals + self.random_state = random_state self.copy = copy - if isinstance(random_state, int): - random.seed(random_state) - elif isinstance(random_state, np.random.RandomState): - # feels like a bit of a hack, but np doesn't seem to handle - # lists of mixed types so well, so using random.sample() - # and pretty sure this should be deterministic if a RandomState - # object is passed - random.seed(random_state.get_state()[1].sum()) def _infer_max_features(self, num_features): if isinstance(self.max_features, float): @@ -146,6 +141,11 @@ def _infer_max_features(self, num_features): raise ValueError('Invalid value for max_features: %s' % self.max_features) def fit(self, X, y=None): + if isinstance(self.random_state, int): + random.seed(self.random_state) + elif isinstance(self.random_state, np.random.RandomState): + random.seed(self.random_state.randint(MAX_INT)) + features = list(range(X.shape[1])) all_cats = set(flatten_list(self.categoricals)) diff --git a/tests/test_estimators.py b/tests/test_estimators.py index 47dea4b..86ddc2b 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -1,11 +1,14 @@ import numpy as np +import pandas as pd import warnings import pytest -from catwalk.estimators.transformers import CutOff -from catwalk.estimators.classifiers import ScaledLogisticRegression +from catwalk.estimators.transformers import CutOff, \ + SubsetWithCategoricals, flatten_list +from catwalk.estimators.classifiers import ScaledLogisticRegression, \ + CatInATreeClassifier, CatInAForestClassifier from sklearn import linear_model @@ -74,3 +77,99 @@ def test_dsapp_lr(data): pipeline.fit(data['X_train'], data['y_train']) assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test'])) + +def test_flatten_list(): + assert flatten_list([1, [2,3], [4, [5]], [], 6]) == [1,2,3,4,5,6] + assert flatten_list([]) == [] + assert flatten_list([1,2,3]) == [1,2,3] + assert flatten_list([[1,2]]) == [1,2] + +def test_subset_with_categoricals(): + df = pd.DataFrame({ + 'entity_id': [1,2,3,4], + 'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 
'outcome': [0,1,0,0] + }) + # ensure column order + df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + + # random seed 0 + sc = SubsetWithCategoricals( + categoricals=[[0, 1, 2], [6, 7]], + random_state=0 + ) + + samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values) + + assert np.all(samp == np.array([ + [ 0., 1., 0., 1., 0.], + [ 1., 0., 0., 1., 0.], + [ 0., 0., 1., 0., 1.], + [ 0., 0., 0., 1., 0.] + ])) + assert sc.max_features_ == 2 + assert sc.subset_indices == [0, 1, 2, 6, 7] + + # random seed 1 + sc = SubsetWithCategoricals( + categoricals=[[0, 1, 2], [6, 7]], + random_state=1 + ) + + samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values) + + assert np.all(samp == np.array([ + [ 12., 3.], + [ 7., 1.], + [ 0., 4.], + [ 2., 1.] + ])) + assert sc.max_features_ == 2 + assert sc.subset_indices == [3,4] + +def test_cat_in_a_tree(data): + # just for the purposes of testing, assuming several of the columns are categoricals + categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]] + + clf = CatInATreeClassifier(categoricals=categoricals, max_features=7, random_state=12345) + clf.fit(data['X_train'], data['y_train']) + + assert clf.max_features_ == 7 + assert clf.subset_indices == [0, 7, 8, 9, 10, 11, 12, 16, 19, 21, 27] + + pred = clf.predict_proba(data['X_test']) + assert len(pred) == len(data['y_test']) + # specific to the breast cancer data... + assert round(sum([p[1] for p in pred])) == 102 + + +def test_cat_in_a_forest(data): + # just for the purposes of testing, assuming several of the columns are categoricals + categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]] + + clf = CatInAForestClassifier(categoricals=categoricals, max_features_tree=7, n_estimators=3, random_state=12345) + clf.fit(data['X_train'], data['y_train']) + + assert clf.estimators_[0].max_features_ == 7 + assert clf.estimators_[0].subset_indices == [0, 1, 6, 12, 13, 14, 18, 22, 23, 24, 25] + assert clf.estimators_[1].subset_indices == [0, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 21] + assert clf.estimators_[2].subset_indices == [0, 2, 3, 4, 12, 13, 14, 15, 27, 28] + + pred = clf.predict_proba(data['X_test']) + assert len(pred) == len(data['y_test']) + # specific to the breast cancer data... 
+ # even with + assert round(sum([p[1] for p in pred])) == 108 From 931f3b31700dd3f30f9009f35f2a7dbdbb22f0df Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 04:04:03 +0000 Subject: [PATCH 08/13] model trainers unit test --- tests/test_model_trainers.py | 113 +++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 598537b..5c3532f 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -182,6 +182,119 @@ def test_model_trainer(): sorted([model_id for model_id in new_model_ids]) +def test_model_trainer_categoricals(): + with testing.postgresql.Postgresql() as postgresql: + engine = create_engine(postgresql.url()) + ensure_db(engine) + + grid_config = { + 'catwalk.estimators.classifiers.CatInAForestClassifier': { + 'max_features_tree': [3], + 'n_estimators': [3], + 'random_state': [2193] + } + } + + with mock_s3(): + s3_conn = boto3.resource('s3') + s3_conn.create_bucket(Bucket='econ-dev') + + feature_config = [ + { + 'prefix': 'first', + 'aggregates': [ + {'quantity': 'a1', 'metrics': ['sum']}, + {'quantity': 'a2', 'metrics': ['max']} + ], + 'categoricals': [ + {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']} + ], + 'intervals': ['1y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'second', + 'aggregates': [ + {'quantity': 'a3', 'metrics': ['sum']} + ], + 'categoricals': [ + {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']} + ], + 'intervals': ['10y'], + 'groups': ['entity_id'] + } + ] + + # create training set + matrix = pandas.DataFrame.from_dict({ + 'entity_id': [1,2,3,4], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 'outcome': [0,1,0,0] + }) + # ensure column order + matrix = matrix[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + metadata = { + 'beginning_of_time': datetime.date(2012, 12, 20), + 'end_time': datetime.date(2016, 12, 20), + 'label_name': 'outcome', + 'label_window': '1y', + 'metta-uuid': '1234', + 'feature_names': ['first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum' + ], + 'indices': ['entity_id'], + } + project_path = 'econ-dev/inspections' + model_storage_engine = S3ModelStorageEngine(s3_conn, project_path) + trainer = ModelTrainer( + project_path=project_path, + experiment_hash=None, + model_storage_engine=model_storage_engine, + db_engine=engine, + model_group_keys=['label_name', 'label_window'], + feature_config=feature_config + ) + matrix_store = InMemoryMatrixStore(matrix, metadata) + model_ids = trainer.train_models( + grid_config=grid_config, + misc_db_parameters=dict(), + matrix_store=matrix_store + ) + + # assert categoricals were properly detected and passed to model + records = [ 
+ row for row in + engine.execute('select model_hash from results.models') + ] + + cache_keys = [ + model_cache_key(project_path, model_row[0], s3_conn) + for model_row in records + ] + + model_pickles = [ + pickle.loads(cache_key.get()['Body'].read()) + for cache_key in cache_keys + ] + + assert model_pickles[0].categoricals == [[0, 1, 2], [6, 7]] + + def test_n_jobs_not_new_model(): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { From 105d999bc0016f8d77b10800ce561b6259689f0e Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 04:23:15 +0000 Subject: [PATCH 09/13] debug model trainers test --- tests/test_model_trainers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 5c3532f..3599b3b 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -183,7 +183,11 @@ def test_model_trainer(): def test_model_trainer_categoricals(): - with testing.postgresql.Postgresql() as postgresql: + # DELETE ME + pgpath = '/usr/lib/postgresql/9.6/bin/' + # DELETE ME + with testing.postgresql.Postgresql(initdb=pgpath+'initdb', postgres=pgpath+'postgres') as postgresql: +# with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) @@ -239,7 +243,7 @@ def test_model_trainer_categoricals(): 'outcome': [0,1,0,0] }) # ensure column order - matrix = matrix[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + matrix = matrix[['entity_id', 'first_entity_id_1y_c1_top_min', 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', From 8b92dc84c60b65e11afcdc7e9be76f28227664fd Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 04:32:06 +0000 Subject: [PATCH 10/13] sort test result --- tests/test_model_trainers.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 3599b3b..0b83302 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -183,11 +183,7 @@ def test_model_trainer(): def test_model_trainer_categoricals(): - # DELETE ME - pgpath = '/usr/lib/postgresql/9.6/bin/' - # DELETE ME - with testing.postgresql.Postgresql(initdb=pgpath+'initdb', postgres=pgpath+'postgres') as postgresql: -# with testing.postgresql.Postgresql() as postgresql: + with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) @@ -296,7 +292,7 @@ def test_model_trainer_categoricals(): for cache_key in cache_keys ] - assert model_pickles[0].categoricals == [[0, 1, 2], [6, 7]] + assert sorted([sorted(c) for c in model_pickles[0].categoricals]) == [[0, 1, 2], [6, 7]] def test_n_jobs_not_new_model(): From d607cd3852a2279a2f93eafd6fe0e8f8b009d0c6 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Fri, 20 Oct 2017 18:37:42 +0000 Subject: [PATCH 11/13] sort test_find_cats result --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6adc735..3307ca6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -175,4 +175,4 @@ def test_find_cats(): cat_cols = find_cats(df.columns.values, cat_regex) - assert cat_cols == [[0, 1, 2], [6, 7]] + assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [6, 7]] From 904b33150052ab6ace5ee68c0e387be765a0ec1d Mon Sep 17 
00:00:00 2001 From: Kit Rodolfa Date: Sat, 21 Oct 2017 01:52:39 +0000 Subject: [PATCH 12/13] handle imputed flags for non-categorical columns where imputation was performed, ensure the flag and underlying column always come together for models that respect categoricals (in the future, we may want to consider passing these separately but for the purposes here we just add them in with the categoricals) --- catwalk/utils.py | 7 +++++++ tests/test_utils.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/catwalk/utils.py b/catwalk/utils.py index 5f25288..1dcee1d 100644 --- a/catwalk/utils.py +++ b/catwalk/utils.py @@ -242,6 +242,13 @@ def find_cats(matrix_cols, cats_regex, exclude_cols=None): exclude_cols = ['entity_id', 'as_of_date', 'outcome'] feature_cols = [c for c in matrix_cols if c not in exclude_cols] + # add in regex to make sure imputed flags always come along with + # their reference columns + imp_regex = [ + r'^%s(_imp)?$' % col[:-4] for col in matrix_cols if col[-4:] == '_imp' + ] + cats_regex += imp_regex + # We want the sets of numberical indices of columns that match our # categorical patterns, so loop trough the column names then through # the patterns, checking each one for a match. Here, `cats_dict` diff --git a/tests/test_utils.py b/tests/test_utils.py index 3307ca6..6018caa 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -151,6 +151,7 @@ def test_bag_of_cats(): ]) def test_find_cats(): + # test with just categoricals cat_regex = [r'^first_entity_id_1y_c1_(.*)_min$', r'^second_entity_id_10y_c3_(.*)_sum$'] df = pd.DataFrame({ 'entity_id': [1,2,3,4], @@ -176,3 +177,10 @@ def test_find_cats(): cat_cols = find_cats(df.columns.values, cat_regex) assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [6, 7]] + + # test with categoricals and imputed flags + df['first_entity_id_1y_a1_sum_imp'] = [0,0,0,1] + df['second_entity_id_10y_a3_sum_imp'] = [0,1,0,1] + + cat_cols = find_cats(df.columns.values, cat_regex) + assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [3,8], [5,9], [6, 7]] From 8a72cbfcbc8a0d9d82291358fe701ee2672c91c1 Mon Sep 17 00:00:00 2001 From: Kit Rodolfa Date: Sat, 21 Oct 2017 02:00:31 +0000 Subject: [PATCH 13/13] add comments --- catwalk/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/catwalk/utils.py b/catwalk/utils.py index 1dcee1d..c4e59a5 100644 --- a/catwalk/utils.py +++ b/catwalk/utils.py @@ -235,6 +235,10 @@ def find_cats(matrix_cols, cats_regex, exclude_cols=None): """ Assign matrix columns (by their numerical indices) to groups of categoricals based on matching to a regex pattern + + Note that groupings of imputed columns along with their + underlying columns will be included in the returned result + as well. """ # be sure we exclude entity id, date, and label @@ -244,6 +248,8 @@ def find_cats(matrix_cols, cats_regex, exclude_cols=None): # add in regex to make sure imputed flags always come along with # their reference columns + # TODO: maybe return these as a separate list to allow models to + # treat them differently than categoricals. imp_regex = [ r'^%s(_imp)?$' % col[:-4] for col in matrix_cols if col[-4:] == '_imp' ]
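
Taken together, the pieces in this series are meant to be used in sequence: `bag_of_cats` turns a feature config into regex patterns, `find_cats` groups the matching matrix columns (and, after PATCH 12, imputation-flag `_imp` pairs) by their indices among the feature columns, and that list is passed to the new classifiers as their `categoricals` argument, which is what `ModelTrainer._train` does for `CatInATreeClassifier` and `CatInAForestClassifier`. A minimal sketch of that flow outside the trainer is below; the toy feature config, matrix, and parameter values are illustrative assumptions, not code from these patches.

    import pandas as pd

    from catwalk.estimators.classifiers import CatInAForestClassifier
    from catwalk.utils import bag_of_cats, find_cats

    # hypothetical feature config: one plain aggregate (a1) and one categorical (c1)
    feature_config = [{
        'prefix': 'first',
        'groups': ['entity_id'],
        'intervals': ['1y'],
        'aggregates': [{'quantity': 'a1', 'metrics': ['sum']}],
        'categoricals': [{'column': 'c1', 'choices': ['top', 'bottom'], 'metrics': ['min']}],
    }]

    # hypothetical training matrix using the column naming the regexes expect;
    # fix the column order explicitly (as the tests do) so the indices below are stable
    train = pd.DataFrame({
        'entity_id': [1, 2, 3, 4],
        'first_entity_id_1y_c1_top_min': [0, 1, 0, 0],
        'first_entity_id_1y_c1_bottom_min': [1, 0, 1, 0],
        'first_entity_id_1y_a1_sum': [12, 7, 0, 2],
        'outcome': [0, 1, 0, 1],
    })
    train = train[['entity_id', 'first_entity_id_1y_c1_top_min',
                   'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_a1_sum',
                   'outcome']]

    # build regexes from the config, then group the matching feature columns by index;
    # entity_id / as_of_date / outcome are excluded, so indices refer to feature columns only
    cats_regex = bag_of_cats(feature_config)
    categoricals = find_cats(train.columns.values, cats_regex)  # [[0, 1]] with this column order

    X = train.drop(['entity_id', 'outcome'], axis=1)
    y = train['outcome']

    # each bagged tree does its own feature subsetting, keeping every categorical group intact
    clf = CatInAForestClassifier(
        categoricals=categoricals,
        max_features_tree='sqrt',
        n_estimators=5,
        random_state=1234,
    )
    clf.fit(X, y)
    scores = clf.predict_proba(X)[:, 1]

Note that the indices produced by `find_cats` are positions among the feature columns only, which is why the id and label columns are dropped before `fit`.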