Data lab #8

Open · wants to merge 31 commits into master

Commits (31)
4655af1  Class Design + Ensemble (anuragkapale, Jul 20, 2019)
289805c  Refactor (anuragkapale, Jul 25, 2019)
23ddedf  Add Benchmarking script (anuragkapale, Jul 25, 2019)
fadf0ab  Add estimators (anuragkapale, Jul 25, 2019)
1f800e1  Fixed the tests (anuragkapale, Aug 29, 2019)
a7b66c3  Use hyper-opt for search (anuragkapale, Aug 30, 2019)
27ed4b8  Added code for primitives (anuragkapale, Sep 2, 2019)
1e7c94f  Fetch multiple trials from hyperopt (anuragkapale, Sep 3, 2019)
c5158ce  Fix regression hparamspace (anuragkapale, Sep 3, 2019)
6dd78ae  Resolve fziling datasets (anuragkapale, Sep 4, 2019)
d6a6060  Before shifting to pandas (anuragkapale, Sep 4, 2019)
8d0dc6b  Shift to pandas, add 2nd order and target encoding (anuragkapale, Sep 5, 2019)
e10541d  Use Tabular Data (anuragkapale, Sep 5, 2019)
2e97035  Save changes (anuragkapale, Sep 5, 2019)
4c7c0fb  Fix the prep pipeline (anuragkapale, Sep 15, 2019)
08f4a4d  Added global config/fixed label encoder (anuragkapale, Sep 16, 2019)
cdd2ba7  Split to classifier and regressor (anuragkapale, Sep 17, 2019)
516515b  Refactor Config (anuragkapale, Sep 17, 2019)
c400a2f  Fix config init related bug (anuragkapale, Sep 17, 2019)
b35d9be  Diverse Ensembles (anuragkapale, Sep 18, 2019)
1731d77  CV for stacking and proba stacking (anuragkapale, Sep 18, 2019)
e614bfd  hparam update (anuragkapale, Sep 19, 2019)
951e1d1  Add blind dataset in stacking (anuragkapale, Sep 19, 2019)
5aed127  Refactor with AutoPipe (anuragkapale, Sep 20, 2019)
4749d5c  Fix higher order primitives (anuragkapale, Sep 20, 2019)
f4df7fd  Add params to preprocessor (anuragkapale, Sep 20, 2019)
d5b7f9c  2 rounds search (anuragkapale, Sep 20, 2019)
6d25537  Select best preprocessing settings (anuragkapale, Sep 20, 2019)
8e0068b  prep param space update (anuragkapale, Sep 20, 2019)
c29f159  Address review comments (anuragkapale, Sep 25, 2019)
7e651e6  Fix Indent (anuragkapale, Sep 26, 2019)
2 changes: 2 additions & 0 deletions autokaggle/__init__.py
@@ -0,0 +1,2 @@
from autokaggle.auto_ml import Classifier, Regressor
from autokaggle.ensemblers import *
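
These two re-exports define the package's public surface, so callers can import the top-level API directly rather than reaching into autokaggle.auto_ml:

    from autokaggle import Classifier, Regressor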
635 changes: 635 additions & 0 deletions autokaggle/auto_ml.py

Large diffs are not rendered by default.

351 changes: 351 additions & 0 deletions autokaggle/config.py
@@ -0,0 +1,351 @@
from sklearn.base import BaseEstimator
from autokaggle.utils import rand_temp_folder_generator, ensure_dir
import hyperopt
from hyperopt import hp
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
import numpy as np


class Config:
    """ Configuration for various AutoML components.

    Defines the common configuration of the different AutoML components. It
    is shared between the AutoKaggle, AutoPipe, Preprocessor and ensembling
    classes.

    # Arguments
        path: String. OS path for storing temporary model parameters.
        verbose: Bool. Defines the verbosity of the logging.
        time_limit: Int. Time budget for performing the search and fit
            pipeline.
        use_ensembling: Bool. Defines whether to use an ensemble of models.
        num_estimators_ensemble: Int. Maximum number of estimators to be used
            in an ensemble.
        ensemble_strategy: String. Strategy used to ensemble models.
        ensemble_method: String. Aggregation method used if ensemble_strategy
            is set to 'ranked_ensembling'.
        random_ensemble: Bool. Whether the ensembling estimators are picked
            randomly.
        diverse_ensemble: Bool. Whether estimators from different families
            are picked.
        ensembling_search_iter: Int. Number of search iterations for the
            ensembling hyper-parameter search.
        search_algo: String. Search strategy for hyper-parameter search.
        search_iter: Int. Number of iterations used for hyper-parameter
            search.
        cv_folds: Int. Number of cross-validation folds.
        subsample_ratio: Float. Fraction of the data subsampled for
            hyper-parameter search.
        data_info: list(String). Lists the datatype of each feature column.
        stack_probabilities: Bool. Whether to use class probabilities in
            ensembling.
        upsample_classes: Bool. Whether to upsample under-represented
            classes.
        num_p_hparams: Int. Number of preprocessor search spaces.
    """

    def __init__(self, path=None, verbose=True, time_limit=None,
                 use_ensembling=True, num_estimators_ensemble=50,
                 ensemble_strategy='stacking', ensemble_method='max_voting',
                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
                 random_ensemble=False, diverse_ensemble=True,
                 stack_probabilities=False, data_info=None,
                 upsample_classes=False, ensembling_search_iter=10,
                 search_algo='random', num_p_hparams=10):
        self.verbose = verbose
        self.path = path if path is not None else rand_temp_folder_generator()
        ensure_dir(self.path)
        if self.verbose:
            print('Path:', self.path)
        self.time_limit = time_limit
        self.objective = None
        self.use_ensembling = use_ensembling
        self.hparams = None
        self.num_estimators_ensemble = num_estimators_ensemble
        self.ensemble_strategy = ensemble_strategy
        self.ensemble_method = ensemble_method
        self.random_ensemble = random_ensemble
        self.search_iter = search_iter
        self.cv_folds = cv_folds
        self.subsample_ratio = subsample_ratio
        self.resampling_strategy = 'auto'
        self.random_state = 1001
        self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest',
                                      'adaboost']
        # self.classification_models = ['knn', 'lgbm', 'random_forest',]
        self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest',
                                  'adaboost', 'catboost']
        self.diverse_ensemble = diverse_ensemble
        self.stack_probabilities = stack_probabilities
        self.data_info = data_info
        self.upsample_classes = upsample_classes
        self.ensembling_search_iter = ensembling_search_iter
        self.search_algo = hyperopt.rand.suggest if search_algo == 'random' \
            else hyperopt.tpe.suggest
        self.num_p_hparams = num_p_hparams

Review comment: Add doc string.

    def update(self, options):
        """ Update config attributes from the given options dict.

        Only attributes that already exist on the instance are overwritten;
        unrecognized keys are ignored.
        """
        for k, v in options.items():
            if hasattr(self, k):
                setattr(self, k, v)
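
# Example usage of Config (an illustrative sketch, not part of the module
# itself; the attribute names match the constructor above):
#
#     cfg = Config(verbose=False, cv_folds=5)
#     cfg.update({'search_iter': 200, 'use_ensembling': False})
#
# update() only overwrites attributes that already exist on the instance,
# so unrecognized keys in the dict are silently ignored.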


KNN_CLASSIFIER_PARAMS = {
    'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]),
    'weights': hp.choice('weight_knn', ['uniform', 'distance']),
    'metric': hp.choice('metric_knn',
                        ["euclidean", "manhattan", "chebyshev", "minkowski"]),
    'p': hp.choice('p_knn', range(1, 3)),
}

SVM_CLASSIFIER_PARAMS = {
    'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)),
    'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']),
    'degree': hp.choice('degree_svm', range(2, 6)),
    'gamma': hp.loguniform('gamma_svm', np.log(3e-5), np.log(8)),
    'max_iter': 50000,
}

RANDOM_FOREST_CLASSIFIER_PARAMS = {
    'criterion': hp.choice('criterion_rf', ['entropy', 'gini']),
    'max_features': hp.uniform('max_features_rf', 0, 1.0),
    'n_estimators': hp.choice('n_estimators_rf', [100, 50]),
    'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 20)),
    'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)),
}

LGBM_CLASSIFIER_PARAMS = {
    'boosting_type': 'gbdt',
    'min_split_gain': 0.1,
    'subsample': 0.8,
    'num_leaves': 80,
    'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8),
    'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)),
    'max_depth': hp.choice('max_depth_lgbm', range(5, 10)),
    'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)),
    'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2),
                                   high=np.log(2)),
}

ADABOOST_CLASSIFIER_PARAMS = {
    'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']),
    'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)),
    'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2),
                                   high=np.log(2)),
}

CATBOOST_CLASSIFIER_PARAMS = {
    'iterations': hp.choice('iterations_catboost', [5, 10]),
    'depth': hp.choice('depth_catboost', range(4, 11)),
    'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3),
                                   high=np.log(1)),
    'loss_function': hp.choice('loss_function_catboost',
                               ['Logloss', 'CrossEntropy']),
    'verbose': True,
    'leaf_estimation_iterations': 10,
    'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3))
}

EXTRA_TREES_REGRESSOR_PARAMS = {
    'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]),
    'criterion': hp.choice('criterion_extra_trees',
                           ['mse', 'friedman_mse', 'mae']),
    'max_features': hp.uniform('max_features_extra_trees', 0, 1.0),
    'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 20)),
    'min_samples_split': hp.choice('min_samples_split_extra_trees',
                                   range(2, 20)),
    'min_impurity_decrease': 0.0,
    'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]),
}

RIDGE_REGRESSOR_PARAMS = {
    'fit_intercept': True,
    # hp.loguniform takes its bounds in log space, as elsewhere in this file
    'tol': hp.loguniform('tol_ridge', np.log(1e-5), np.log(1e-1)),
    'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10))
}

RANDOM_FOREST_REGRESSOR_PARAMS = {
    'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']),
    'max_features': hp.uniform('max_features_rf', 0.1, 1.0),
    'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]),
    'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 10)),
    'min_samples_split': hp.choice('min_samples_split_rf', range(2, 10)),
    'bootstrap': hp.choice('bootstrap_rf', [True, False]),
}

LGBM_REGRESSOR_PARAMS = {
    'boosting_type': 'gbdt',
    'min_split_gain': 0.1,
    'subsample': 0.8,
    'num_leaves': 80,
    'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8),
    'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)),
    'max_depth': hp.choice('max_depth_lgbm', range(5, 10)),
    'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)),
    'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5),
                                   high=np.log(1)),
}

ADABOOST_REGRESSOR_PARAMS = {
    'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]),
    'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)),
    'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2),
                                   high=np.log(2)),
    # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)),
}

CATBOOST_REGRESSOR_PARAMS = {
    'iterations': 2,
    'depth': hp.choice('depth_catboost', range(4, 10)),
    'learning_rate': 1,
    'loss_function': 'RMSE',
    'verbose': True
}

REGRESSION_HPARAM_SPACE = {
    'extratree': {
        'model': ExtraTreesRegressor,
        'param': EXTRA_TREES_REGRESSOR_PARAMS
    },
    'ridge': {
        'model': Ridge,
        'param': RIDGE_REGRESSOR_PARAMS
    },
    'random_forest': {
        'model': RandomForestRegressor,
        'param': RANDOM_FOREST_REGRESSOR_PARAMS
    },
    'lgbm': {
        'model': LGBMRegressor,
        'param': LGBM_REGRESSOR_PARAMS
    },
    'adaboost': {
        'model': AdaBoostRegressor,
        'param': ADABOOST_REGRESSOR_PARAMS
    },
    'catboost': {
        'model': CatBoostRegressor,
        'param': CATBOOST_REGRESSOR_PARAMS
    }
}

CLASSIFICATION_HPARAM_SPACE = {
    'knn': {
        'model': KNeighborsClassifier,
        'param': KNN_CLASSIFIER_PARAMS
    },
    'svm': {
        'model': SVC,
        'param': SVM_CLASSIFIER_PARAMS
    },
    'random_forest': {
        'model': RandomForestClassifier,
        'param': RANDOM_FOREST_CLASSIFIER_PARAMS
    },
    'lgbm': {
        'model': LGBMClassifier,
        'param': LGBM_CLASSIFIER_PARAMS
    },
    'adaboost': {
        'model': AdaBoostClassifier,
        'param': ADABOOST_CLASSIFIER_PARAMS
    },
    'catboost': {
        'model': CatBoostClassifier,
        'param': CATBOOST_CLASSIFIER_PARAMS
    }
}
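
# Sketch of how a single {model, param} entry can drive a hyperopt search
# (illustrative only; the actual search loop lives elsewhere in autokaggle,
# and x_train/y_train are assumed to be supplied by the caller):
#
#     from hyperopt import fmin, Trials
#     from sklearn.model_selection import cross_val_score
#
#     entry = CLASSIFICATION_HPARAM_SPACE['lgbm']
#
#     def objective(params):
#         model = entry['model'](**params)
#         return -cross_val_score(model, x_train, y_train, cv=3).mean()
#
#     best = fmin(objective, entry['param'], algo=hyperopt.rand.suggest,
#                 max_evals=10, trials=Trials())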

CLASSIFICATION_BASE_HPARAM_SPACE = {
    'knn': {
        'model': KNeighborsClassifier,
        'param': {}
    },
    'svm': {
        'model': SVC,
        'param': {}
    },
    'random_forest': {
        'model': RandomForestClassifier,
        'param': {}
    },
    'lgbm': {
        'model': LGBMClassifier,
        'param': {}
    },
    'adaboost': {
        'model': AdaBoostClassifier,
        'param': {}
    },
    'catboost': {
        'model': CatBoostClassifier,
        'param': {}
    }
}

REGRESSION_BASE_HPARAM_SPACE = {
    'extratree': {
        'model': ExtraTreesRegressor,
        'param': {}
    },
    'ridge': {
        'model': Ridge,
        'param': {}
    },
    'random_forest': {
        'model': RandomForestRegressor,
        'param': {}
    },
    'lgbm': {
        'model': LGBMRegressor,
        'param': {}
    },
    'adaboost': {
        'model': AdaBoostRegressor,
        'param': {}
    },
    'catboost': {
        'model': CatBoostRegressor,
        'param': {}
    }
}

REGRESSION_PREP_HPARAM_SPACE = {
    'cat_encoding': hp.choice('cat_enc',
                              ['count', 'target+count', 'target+label',
                               'label']),
    'scaling': hp.choice('scaling', [True, False]),
    'log_transform': hp.choice('log_transform', [True, False]),
    'power_transform': hp.choice('power_transform', [True, False]),
    'pca': hp.choice('pca', [True, False]),
    'binning': hp.choice('binning', [True, False]),
    'add_time_offset': hp.choice('add_time_offset', [True, False]),
    'add_time_diff': hp.choice('add_time_diff', [True, False]),
    # 'cat_num_strategy': hp.choice('cat_num_strategy',
    #                               ['mean', 'std', 'max', 'min', None]),
    # 'cat_cat_strategy': hp.choice('cat_cat_strategy',
    #                               ['count', 'nunique', None]),
    'imputation_strategy': hp.choice('imputation_strategy',
                                     ['most_frequent', 'zero']),
    'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01),
    'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01)
}

CLASSIFICATION_PREP_HPARAM_SPACE = {
    'cat_encoding': hp.choice('cat_enc',
                              ['target', 'count', 'target+count',
                               'target+label']),
    'scaling': hp.choice('scaling', [True, False]),
    'log_transform': hp.choice('log_transform', [True, False]),
    'power_transform': hp.choice('power_transform', [True, False]),
    'pca': hp.choice('pca', [True, False]),
    'binning': hp.choice('binning', [True, False]),
    'add_time_offset': hp.choice('add_time_offset', [True, False]),
    'add_time_diff': hp.choice('add_time_diff', [True, False]),
    # 'cat_num_strategy': hp.choice('cat_num_strategy',
    #                               ['mean', 'std', 'max', 'min', None]),
    # 'cat_cat_strategy': hp.choice('cat_cat_strategy',
    #                               ['count', 'nunique', None]),
    'imputation_strategy': hp.choice('imputation_strategy',
                                     ['most_frequent', 'zero']),
    'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01),
    'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01)
}
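
# A concrete preprocessing configuration can be drawn from either space with
# hyperopt's stochastic sampler (an illustrative sketch):
#
#     from hyperopt.pyll import stochastic
#
#     prep = stochastic.sample(CLASSIFICATION_PREP_HPARAM_SPACE)
#     # e.g. {'cat_encoding': 'target+count', 'scaling': True, ...}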