Commit 60d3eb4: Merges branch warm_start into master

ClimbsRocks committed Jul 9, 2017
2 parents 8f1a290 + e13c917
Showing 7 changed files with 62 additions and 26 deletions.
6 changes: 3 additions & 3 deletions .travis.yml
@@ -12,12 +12,12 @@ python:

 # Command to install dependencies
 install:
-- docker pull climbsrocks/auto_ml_tests:0.0.3
+- docker pull climbsrocks/auto_ml_tests:0.0.5
 - pip install coveralls

 # Command to run tests
-script: docker run -v $PWD:/tmp/working -w=/tmp/working --rm -it climbsrocks/auto_ml_tests:0.0.3 python setup.py nosetests
+script: docker run -v $PWD:/tmp/working -w=/tmp/working --rm -it climbsrocks/auto_ml_tests:0.0.5 python setup.py nosetests

 after_success:
-- docker cp climbsrocks/auto_ml_tests:0.0.3:/.coverage
+- docker cp climbsrocks/auto_ml_tests:0.0.5:/.coverage
 - coveralls
4 changes: 4 additions & 0 deletions auto_ml/predictor.py
@@ -1427,6 +1427,10 @@ def score_uncertainty(self, X, y, advanced_scoring=True, verbose=2):
         return score


+    def transform_only(self, X):
+        return self.transformation_pipeline.transform(X)
+
+
     def save(self, file_name='auto_ml_saved_pipeline.dill', verbose=True):

         def save_one_step(pipeline_step, used_deep_learning):
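For orientation, here is a minimal usage sketch of the new method (the toy DataFrame and column_descriptions below are placeholders for illustration, not part of this commit):

import pandas as pd
from auto_ml import Predictor

# Toy training data, purely for illustration.
df_train = pd.DataFrame({'sqft': [1000, 1500, 2000, 2500],
                         'price': [199, 299, 399, 499]})
column_descriptions = {'price': 'output'}

ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
ml_predictor.train(df_train)

# New in this commit: run only the fitted transformation_pipeline,
# returning the engineered feature matrix rather than predictions.
X_transformed = ml_predictor.transform_only(df_train)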
9 changes: 9 additions & 0 deletions auto_ml/utils.py
@@ -165,3 +165,12 @@ def score_uncertainty(self, X):
         return self.steps[-1][-1].score_uncertainty(Xt)


+    @if_delegate_has_method(delegate='_final_estimator')
+    def transform_only(self, X):
+        Xt = X
+        for name, transform in self.steps[:-1]:
+            if transform is not None:
+                Xt = transform.transform(Xt)
+        return self.steps[-1][-1].transform_only(Xt)
+
+
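The decorator above is scikit-learn's duck-typing helper for meta-estimators. A rough standalone sketch of its behavior (the classes here are invented for illustration; recent scikit-learn releases replace if_delegate_has_method with available_if):

from sklearn.utils.metaestimators import if_delegate_has_method

class FinalStep:
    def transform_only(self, X):
        return X

class Wrapper:
    def __init__(self, final_estimator):
        self._final_estimator = final_estimator

    # The method only passes hasattr() checks when the delegate
    # (self._final_estimator) also defines transform_only.
    @if_delegate_has_method(delegate='_final_estimator')
    def transform_only(self, X):
        return self._final_estimator.transform_only(X)

print(hasattr(Wrapper(FinalStep()), 'transform_only'))  # True
print(hasattr(Wrapper(object()), 'transform_only'))     # False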
46 changes: 36 additions & 10 deletions auto_ml/utils_model_training.py
@@ -1,6 +1,7 @@
 from collections import Iterable
 from copy import deepcopy
 import os
+import random

 import numpy as np
 import pandas as pd
@@ -88,6 +89,21 @@ def fit(self, X, y):
             early_stopping = EarlyStopping(monitor='loss', patience=25, verbose=1)
             self.model.fit(X_fit, y, callbacks=[early_stopping])

+        elif self.model_name[:4] == 'LGBM':
+
+            X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)
+
+            if self.type_of_estimator == 'regressor':
+                eval_metric = 'rmse'
+            elif self.type_of_estimator == 'classifier':
+                if len(set(y_test)) > 2:
+                    eval_metric = 'multi_logloss'
+                else:
+                    eval_metric = 'binary_logloss'
+
+
+            self.model.fit(X_fit, y, eval_set=[(X_test, y_test)], early_stopping_rounds=50, eval_metric=eval_metric, eval_names=['random_holdout_set_from_training_data'])
+
         elif self.model_name[:16] == 'GradientBoosting':
             if scipy.sparse.issparse(X_fit):
                 X_fit = X_fit.todense()
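A standalone sketch of the early-stopping pattern this hunk adds, on synthetic data (note that LightGBM releases after this commit moved early_stopping_rounds out of fit() and into callbacks):

import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic regression data, purely for illustration.
X = np.random.rand(1000, 10)
y = 3 * X[:, 0] + np.random.rand(1000)

# Hold out 15% of the training data solely to drive early stopping,
# mirroring the train_test_split call above.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size=0.15)

model = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.05, num_leaves=8)
model.fit(X_fit, y_fit,
          eval_set=[(X_holdout, y_holdout)],
          eval_metric='rmse',
          early_stopping_rounds=50)
# Training stops once holdout rmse fails to improve for 50 consecutive
# rounds, typically well short of the 2000-tree budget.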
@@ -121,7 +137,7 @@ def fit(self, X, y):
                         best_model = deepcopy(self.model)
                     else:
                         num_worse_rounds += 1
-
+                    print('[' + str(num_iter) + '] random_holdout_set_from_training_data\'s score is: ' + str(round(val_loss, 3)))
                     if num_worse_rounds >= patience:
                         break
         except KeyboardInterrupt:
@@ -299,7 +315,10 @@ def predict_proba(self, X, verbose=False):
             X = X.todense()

         try:
-            predictions = self.model.predict_proba(X)
+            if self.model_name[:4] == 'LGBM':
+                predictions = self.model.predict_proba(X, num_iteration=self.model.best_iteration)
+            else:
+                predictions = self.model.predict_proba(X)

         except AttributeError as e:
             try:
@@ -347,19 +366,22 @@ def predict(self, X, verbose=False):
         else:
             X_predict = X

-        prediction = self.model.predict(X_predict)
+        if self.model_name[:4] == 'LGBM':
+            predictions = self.model.predict(X_predict, num_iteration=self.model.best_iteration)
+        else:
+            predictions = self.model.predict(X_predict)
         # Handle cases of getting a prediction for a single item.
         # It makes a cleaner interface to get just the single prediction back, rather than a list with the prediction hidden inside.

-        if isinstance(prediction, np.ndarray):
-            prediction = prediction.tolist()
-        if isinstance(prediction, float) or isinstance(prediction, int) or isinstance(prediction, str):
-            return prediction
+        if isinstance(predictions, np.ndarray):
+            predictions = predictions.tolist()
+        if isinstance(predictions, float) or isinstance(predictions, int) or isinstance(predictions, str):
+            return predictions

-        if len(prediction) == 1:
-            return prediction[0]
+        if len(predictions) == 1:
+            return predictions[0]
         else:
-            return prediction
+            return predictions

     # transform is initially designed to be used with feature_learning
     def transform(self, X):
@@ -369,6 +391,10 @@ def transform(self, X):
             X = scipy.sparse.hstack([X, predicted_features], format='csr')
         return X

+    # Allows the user to get the fully transformed data
+    def transform_only(self, X):
+        return X
+
     def predict_uncertainty(self, X):
         if self.uncertainty_model is None:
             print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
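The num_iteration argument threaded through predict and predict_proba above is what makes the large n_estimators budget safe: inference uses only the trees up to the early-stopping point. Continuing the sketch from the fit() hunk (the attribute is best_iteration in the LightGBM of this era, and best_iteration_ on the sklearn wrapper in newer releases):

# Reusing `model` and the holdout split from the earlier sketch.
best_iter = getattr(model, 'best_iteration', None) or model.best_iteration_
predictions = model.predict(X_holdout, num_iteration=best_iter)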
8 changes: 4 additions & 4 deletions auto_ml/utils_models.py
@@ -80,8 +80,8 @@ def get_model_from_name(model_name, training_params=None):
         'AdaBoostRegressor': {'n_estimators': 10},
         'XGBRegressor': {'nthread':-1, 'n_estimators': 200},
         'XGBClassifier': {'nthread':-1, 'n_estimators': 200},
-        'LGBMRegressor': {},
-        'LGBMClassifier': {},
+        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 8, 'lambda_l2': 0.001},
+        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 8, 'lambda_l2': 0.001},
         'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
         'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2}
     }
@@ -454,7 +454,7 @@ def get_search_params(model_name):

     , 'LGBMClassifier': {
         # 'max_bin': [25, 50, 100, 200, 250, 300, 400, 500, 750, 1000]
-        'num_leaves': [10, 20, 30, 40, 50, 200]
+        'num_leaves': [2, 4, 7, 10, 15, 20, 25, 30, 35, 40, 50, 125, 200]
         , 'colsample_bytree': [0.7, 0.9, 1.0]
         , 'subsample': [0.7, 0.9, 1.0]
         # , 'subsample_freq': [0.3, 0.5, 0.7, 0.9, 1.0]
@@ -466,7 +466,7 @@

     , 'LGBMRegressor': {
         # 'max_bin': [25, 50, 100, 200, 250, 300, 400, 500, 750, 1000]
-        'num_leaves': [10, 20, 30, 40, 50, 200]
+        'num_leaves': [2, 4, 7, 10, 15, 20, 25, 30, 35, 40, 50, 125, 200]
         , 'colsample_bytree': [0.7, 0.9, 1.0]
         , 'subsample': [0.7, 0.9, 1.0]
         # , 'subsample_freq': [0.3, 0.5, 0.7, 0.9, 1.0]
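auto_ml feeds these grids into its hyperparameter search internally; as a rough standalone sketch of what searching the widened num_leaves range looks like (synthetic data, grid values copied from above):

import lightgbm as lgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Synthetic binary-classification data, purely for illustration.
X = np.random.rand(500, 10)
y = (X[:, 0] > 0.5).astype(int)

param_distributions = {
    'num_leaves': [2, 4, 7, 10, 15, 20, 25, 30, 35, 40, 50, 125, 200],
    'colsample_bytree': [0.7, 0.9, 1.0],
    'subsample': [0.7, 0.9, 1.0],
}
search = RandomizedSearchCV(lgb.LGBMClassifier(), param_distributions, n_iter=10, cv=3)
search.fit(X, y)
print(search.best_params_)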
11 changes: 3 additions & 8 deletions tests/Dockerfile
@@ -8,11 +8,6 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list &
     cd /usr/local/src && \
     pip install tensorflow && \

-    # Microsoft's LightGBM
-    cd /usr/local/src && git clone --recursive --depth 1 https://github.com/Microsoft/LightGBM && \
-    cd LightGBM && mkdir build && cd build && cmake .. && make -j $(nproc) && \
-    cd /usr/local/src/LightGBM/python-package && python setup.py install && \
-
     # XGBoost
     cd /usr/local/src && mkdir xgboost && cd xgboost && \
     git clone --depth 1 --recursive https://github.com/dmlc/xgboost.git && cd xgboost && \
@@ -21,9 +16,9 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list &
     apt-get autoremove -y && apt-get clean

 RUN pip install --upgrade pip
-RUN pip install --upgrade numpy dill h5py scikit-learn scipy python-dateutil pandas pathos keras coveralls nose
+RUN pip install --upgrade numpy dill h5py scikit-learn scipy python-dateutil pandas pathos keras coveralls nose lightgbm tabulate imblearn

 # To update this image and upload it:
 # docker build -t testdocker .
-# docker tag testdocker climbsrocks/auto_ml_tests:0.0.4
-# docker push testdocker climbsrocks/auto_ml_tests:0.0.4
+# docker tag testdocker climbsrocks/auto_ml_tests:0.0.6
+# docker push climbsrocks/auto_ml_tests:0.0.6
4 changes: 3 additions & 1 deletion tests/classifiers.py
@@ -42,6 +42,8 @@ def optimize_final_model_classification(model_name=None):

     if model_name == 'DeepLearningClassifier':
         lower_bound = -0.235
+    if model_name == 'LGBMClassifier':
+        lower_bound = -0.221

     assert lower_bound < test_score < -0.17

@@ -429,7 +431,7 @@ def feature_learning_categorical_ensembling_getting_single_predictions_classific
     if model_name == 'GradientBoostingClassifier' or model_name is None:
         lower_bound = -0.25
     if model_name == 'LGBMClassifier':
-        lower_bound = -0.221
+        lower_bound = -0.23
     if model_name == 'XGBClassifier':
         lower_bound = -0.25

