From 126e598ee9e9a7b61e96fd62161bc815e3b85eca Mon Sep 17 00:00:00 2001
From: Roopkishor singh
Date: Mon, 28 Aug 2017 22:08:44 +0530
Subject: [PATCH] Add files via upload

---
 necessary_code.ipynb                     | 135 +++++++++++++
 xgboost_training_tuning_importance.ipynb | 231 +++++++++++++++++++++++
 2 files changed, 366 insertions(+)
 create mode 100644 necessary_code.ipynb
 create mode 100644 xgboost_training_tuning_importance.ipynb

diff --git a/necessary_code.ipynb b/necessary_code.ipynb
new file mode 100644
index 0000000..2f27023
--- /dev/null
+++ b/necessary_code.ipynb
@@ -0,0 +1,135 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from pandas import DataFrame, Series\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from scipy import stats\n",
+    "\n",
+    "%matplotlib inline\n",
+    "from IPython.core.interactiveshell import InteractiveShell  # show all consecutive outputs, not only the last\n",
+    "InteractiveShell.ast_node_interactivity = \"all\"\n",
+    "\n",
+    "# Silence warnings so the notebook output stays readable\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "from collections import defaultdict\n",
+    "import json\n",
+    "\n",
+    "import scipy as sp\n",
+    "\n",
+    "from matplotlib import rcParams\n",
+    "import matplotlib.cm as cm\n",
+    "import matplotlib as mpl\n",
+    "\n",
+    "# colorbrewer2 Dark2 qualitative color table\n",
+    "dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n",
+    "                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n",
+    "                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n",
+    "                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n",
+    "                (0.4, 0.6509803921568628, 0.11764705882352941),\n",
+    "                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n",
+    "                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]\n",
+    "\n",
+    "rcParams['figure.figsize'] = (10, 6)\n",
+    "rcParams['figure.dpi'] = 150\n",
+    "# 'axes.color_cycle' was removed in matplotlib 2.0; 'axes.prop_cycle' works on 1.5+\n",
+    "rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)\n",
+    "rcParams['lines.linewidth'] = 2\n",
+    "rcParams['axes.facecolor'] = 'white'\n",
+    "rcParams['font.size'] = 14\n",
+    "rcParams['patch.edgecolor'] = 'white'\n",
+    "rcParams['patch.facecolor'] = dark2_colors[0]\n",
+    "rcParams['font.family'] = 'StixGeneral'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Count missing values per column (axis=0 applies the function column-wise)\n",
+    "def num_missing(x):\n",
+    "    return x.isnull().sum()\n",
+    "\n",
+    "train_data.apply(num_missing, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Impute missing values with the group-wise median (grouped by 'Stock_ID')\n",
+    "def fill_na(data):\n",
+    "    for c in data.columns[data.isnull().any()]:\n",
+    "        data[c] = data.groupby('Stock_ID')[c].transform(lambda x: x.fillna(x.median()))\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Count-encode a variable: for each row of compute_df, return how often its\n",
+    "# value of var_name occurs in count_df (0 if the value was never seen)\n",
+    "def getCountVar(compute_df, count_df, var_name):\n",
+    "    grouped_df = count_df.groupby(var_name)\n",
+    "    count_dict = {}\n",
+    "    for name, group in grouped_df:\n",
+    "        count_dict[name] = group.shape[0]\n",
+    "\n",
+    "    count_list = []\n",
+    "    for index, row in compute_df.iterrows():\n",
+    "        name = row[var_name]\n",
+    "        count_list.append(count_dict.get(name, 0))\n",
+    "    return count_list"
+   ]
+  },
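+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Minimal usage sketch for the helpers above (not in the original notebook);\n",
+    "# this toy train_data and its 'Stock_ID'/'price' columns are hypothetical.\n",
+    "train_data = DataFrame({'Stock_ID': [1, 1, 2, 2, 2],\n",
+    "                        'price': [10.0, np.nan, 7.0, np.nan, 9.0]})\n",
+    "train_data = fill_na(train_data)  # NaNs -> per-Stock_ID median\n",
+    "train_data['Stock_ID_count'] = getCountVar(train_data, train_data, 'Stock_ID')\n",
+    "train_data.apply(num_missing, axis=0)  # should now report 0 for every column"
+   ]
+  },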
"execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/xgboost_training_tuning_importance.ipynb b/xgboost_training_tuning_importance.ipynb new file mode 100644 index 0000000..e219007 --- /dev/null +++ b/xgboost_training_tuning_importance.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'xgboost training, parameter tuning and feature selection'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"xgboost training, parameter tuning and feature selection\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'params' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mxg1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mXGBRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjective\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"reg:linear\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mparams_g\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"n_estimators\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mgrid_search\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxg1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"neg_mean_squared_error\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreg_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mregistered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'registered'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'params' is not defined" + ] + } + ], + "source": [ + "#change to XGBClassifier if classify needed\n", + "\n", + "\"\"\"class xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, \n", + "n_estimators=100, silent=True, objective='reg:linear', booster='gbtree', n_jobs=1,\n", + "nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,\n", + "colsample_bylevel=1,reg_alpha=0, reg_lambda=1, scale_pos_weight=1, 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# change XGBRegressor to XGBClassifier if classification is needed\n",
+    "\n",
+    "\"\"\"class xgboost.XGBRegressor(max_depth=3, learning_rate=0.1,\n",
+    "n_estimators=100, silent=True, objective='reg:linear', booster='gbtree', n_jobs=1,\n",
+    "nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,\n",
+    "colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,\n",
+    "random_state=0, seed=None, missing=None)\"\"\"\n",
+    "\n",
+    "\"\"\"using only the sklearn wrapper and GridSearchCV tuning, in two steps\"\"\"\n",
+    "# step 1: search the parameter grid\n",
+    "from xgboost.sklearn import XGBRegressor\n",
+    "from sklearn.grid_search import GridSearchCV  # sklearn.model_selection on newer sklearn\n",
+    "xg1 = XGBRegressor(objective=\"reg:linear\")\n",
+    "params_g = {\"n_estimators\": [50, 500, 1000]}\n",
+    "grid_search = GridSearchCV(xg1, param_grid=params_g, scoring=\"neg_mean_squared_error\")\n",
+    "grid_search.fit(reg_train, registered['registered'])\n",
+    "\n",
+    "print grid_search.best_params_\n",
+    "\n",
+    "# step 2: refit with the best parameters and predict\n",
+    "\"\"\"fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True)\"\"\"\n",
+    "newparams = grid_search.best_params_\n",
+    "xg1 = XGBRegressor(**newparams)\n",
+    "# early stopping in the sklearn wrapper needs a held-out eval_set\n",
+    "# ('val' stands for any validation split)\n",
+    "xg1.fit(train[predictors], train['Outcome'],\n",
+    "        eval_set=[(val[predictors], val['Outcome'])], early_stopping_rounds=300)\n",
+    "pred = xg1.predict(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"use GridSearchCV with the sklearn wrapper to get the parameters, then use xgb.cv\n",
+    "to get num_boost_round, and then use xgb.train to train the model\"\"\"\n",
+    "\n",
+    "# part 1: grid search for the hyperparameters\n",
+    "import xgboost as xgb\n",
+    "from xgboost.sklearn import XGBRegressor\n",
+    "from sklearn.grid_search import GridSearchCV\n",
+    "xg1 = XGBRegressor(objective=\"reg:linear\")\n",
+    "params_g = {\"n_estimators\": [50, 500, 1000]}\n",
+    "grid_search = GridSearchCV(xg1, param_grid=params_g, scoring=\"neg_mean_squared_error\")\n",
+    "grid_search.fit(reg_train, registered['registered'])\n",
+    "\n",
+    "print grid_search.best_params_\n",
+    "\n",
+    "# part 2: fix the tuned parameters from part 1, let xgb.cv pick the round count\n",
+    "\"\"\"xgboost.cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False,\n",
+    "folds=None, metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,\n",
+    "fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=True)\"\"\"\n",
+    "best_param = {}  # fill with the tuned parameters from part 1\n",
+    "dtrain = xgb.DMatrix(casual[col].values, label=casual['logcasual'].values)\n",
+    "pa = xgb.cv(best_param, dtrain, num_boost_round=500, nfold=5, metrics=\"rmse\", early_stopping_rounds=30)\n",
+    "plt.plot(pa['test-rmse-mean'], label='test')\n",
+    "plt.plot(pa['train-rmse-mean'], label='train')\n",
+    "plt.legend()\n",
+    "plt.show()\n",
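+    "\n",
+    "# hedged sketch (not in the original, which hard-coded num_boost = 169):\n",
+    "# derive the round count from the cv result; with early_stopping_rounds,\n",
+    "# xgb.cv returns the history truncated near the best round, so len(pa)\n",
+    "# is a reasonable choice\n",
+    "num_boost = len(pa)\n",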
+    "\n",
+    "# part 3: train the final booster with the chosen parameters and round count\n",
+    "\"\"\"xgboost.train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,\n",
+    "evals_result=None, verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None)\"\"\"\n",
+    "best_parm = {'objective': \"reg:linear\",\n",
+    "             \"eta\": 0.05,\n",
+    "             \"max_depth\": 8,\n",
+    "             \"subsample\": 1,\n",
+    "             \"colsample_bytree\": 1,\n",
+    "             'gamma': 1.0,\n",
+    "             'min_child_weight': 5,\n",
+    "             \"silent\": 1,\n",
+    "             \"seed\": 1301,\n",
+    "             \"eval_metric\": 'rmse'\n",
+    "             }\n",
+    "\n",
+    "print(\"Train an XGBoost model with cross-val data\")\n",
+    "dtrain = xgb.DMatrix(x_train, y_train)\n",
+    "dvalid = xgb.DMatrix(x_test, y_test)\n",
+    "watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n",
+    "gbm = xgb.train(best_parm, dtrain, num_boost, evals=watchlist, early_stopping_rounds=300, verbose_eval=True)\n",
+    "\n",
+    "# after validating, retrain on the complete data; num_boost was already\n",
+    "# chosen by cv, so no early stopping here\n",
+    "print(\"Train an XGBoost model\")\n",
+    "dtrain = xgb.DMatrix(train.drop(['timestamp', 'Outcome'], axis=1), train['Outcome'])\n",
+    "watchlist = [(dtrain, 'train')]\n",
+    "gbm = xgb.train(best_parm, dtrain, num_boost, evals=watchlist, verbose_eval=True)\n",
+    "# predict on the same feature columns the model was trained on\n",
+    "pred1 = gbm.predict(xgb.DMatrix(test[train.drop(['timestamp', 'Outcome'], axis=1).columns]))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import xgboost as xgb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# getting feature importance, option 1: dump a feature map and read fscores\n",
+    "import operator\n",
+    "\n",
+    "def create_feature_map(features):\n",
+    "    outfile = open('xgb.fmap', 'w')\n",
+    "    for i, feat in enumerate(features):\n",
+    "        outfile.write('{0}\\t{1}\\tq\\n'.format(i, feat))\n",
+    "    outfile.close()\n",
+    "\n",
+    "create_feature_map(col)  # col: the list of feature names used in training\n",
+    "# model: the trained Booster (e.g. gbm from the training cell above)\n",
+    "model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)\n",
+    "importance = model.get_fscore(fmap='xgb.fmap')\n",
+    "importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)\n",
+    "imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])\n",
+    "imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()\n",
+    "imp_df.to_csv(\"imp_feat.txt\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# getting feature importance, option 2: built-in plot on a trained booster\n",
+    "\"\"\"xgboost.plot_importance(booster, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance',\n",
+    "xlabel='F score', ylabel='Features', importance_type='weight', max_num_features=None, grid=True, **kwargs)\"\"\"\n",
+    "xgb.plot_importance(gbm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [conda env:py27]",
+   "language": "python",
+   "name": "conda-env-py27-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}