From 126e598ee9e9a7b61e96fd62161bc815e3b85eca Mon Sep 17 00:00:00 2001
From: Roopkishor singh
Date: Mon, 28 Aug 2017 22:08:44 +0530
Subject: [PATCH] Add files via upload

---
 necessary_code.ipynb                     | 135 +++++++++++++
 xgboost_training_tuning_importance.ipynb | 231 +++++++++++++++++++++++
 2 files changed, 366 insertions(+)
 create mode 100644 necessary_code.ipynb
 create mode 100644 xgboost_training_tuning_importance.ipynb

diff --git a/necessary_code.ipynb b/necessary_code.ipynb
new file mode 100644
index 0000000..2f27023
--- /dev/null
+++ b/necessary_code.ipynb
@@ -0,0 +1,135 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from pandas import DataFrame, Series\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from scipy import stats\n",
+    "\n",
+    "%matplotlib inline\n",
+    "from IPython.core.interactiveshell import InteractiveShell  # show all consecutive outputs, not only the last\n",
+    "InteractiveShell.ast_node_interactivity = \"all\"\n",
+    "\n",
+    "# Silence warnings so the notebook output stays readable\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "from collections import defaultdict\n",
+    "import json\n",
+    "\n",
+    "import scipy as sp\n",
+    "\n",
+    "from matplotlib import rcParams\n",
+    "import matplotlib.cm as cm\n",
+    "import matplotlib as mpl\n",
+    "\n",
+    "# colorbrewer2 Dark2 qualitative color table\n",
+    "dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n",
+    "                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n",
+    "                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n",
+    "                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n",
+    "                (0.4, 0.6509803921568628, 0.11764705882352941),\n",
+    "                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n",
+    "                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]\n",
+    "\n",
+    "rcParams['figure.figsize'] = (10, 6)\n",
+    "rcParams['figure.dpi'] = 150\n",
+    "# 'axes.color_cycle' was removed in matplotlib 2.0; 'axes.prop_cycle' works on 1.5+\n",
+    "rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)\n",
+    "rcParams['lines.linewidth'] = 2\n",
+    "rcParams['axes.facecolor'] = 'white'\n",
+    "rcParams['font.size'] = 14\n",
+    "rcParams['patch.edgecolor'] = 'white'\n",
+    "rcParams['patch.facecolor'] = dark2_colors[0]\n",
+    "rcParams['font.family'] = 'StixGeneral'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Count missing values per column (axis=0 applies the function column-wise)\n",
+    "def num_missing(x):\n",
+    "    return x.isnull().sum()\n",
+    "\n",
+    "train_data.apply(num_missing, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Impute missing values with the group-wise median (grouped by 'Stock_ID')\n",
+    "def fill_na(data):\n",
+    "    for c in data.columns[data.isnull().any()]:\n",
+    "        data[c] = data.groupby('Stock_ID')[c].transform(lambda x: x.fillna(x.median()))\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Count-encode a variable: for each row of compute_df, return how often its\n",
+    "# value of var_name occurs in count_df (0 if the value was never seen)\n",
+    "def getCountVar(compute_df, count_df, var_name):\n",
+    "    grouped_df = count_df.groupby(var_name)\n",
+    "    count_dict = {}\n",
+    "    for name, group in grouped_df:\n",
+    "        count_dict[name] = group.shape[0]\n",
+    "\n",
+    "    count_list = []\n",
+    "    for index, row in compute_df.iterrows():\n",
+    "        name = row[var_name]\n",
+    "        count_list.append(count_dict.get(name, 0))\n",
+    "    return count_list"
+   ]
+  },
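+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Minimal usage sketch for the helpers above (not in the original notebook);\n",
+    "# this toy train_data and its 'Stock_ID'/'price' columns are hypothetical.\n",
+    "train_data = DataFrame({'Stock_ID': [1, 1, 2, 2, 2],\n",
+    "                        'price': [10.0, np.nan, 7.0, np.nan, 9.0]})\n",
+    "train_data = fill_na(train_data)  # NaNs -> per-Stock_ID median\n",
+    "train_data['Stock_ID_count'] = getCountVar(train_data, train_data, 'Stock_ID')\n",
+    "train_data.apply(num_missing, axis=0)  # should now report 0 for every column"
+   ]
+  },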
"execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/xgboost_training_tuning_importance.ipynb b/xgboost_training_tuning_importance.ipynb new file mode 100644 index 0000000..e219007 --- /dev/null +++ b/xgboost_training_tuning_importance.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'xgboost training, parameter tuning and feature selection'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"xgboost training, parameter tuning and feature selection\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'params' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mxg1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mXGBRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjective\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"reg:linear\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mparams_g\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"n_estimators\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mgrid_search\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxg1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"neg_mean_squared_error\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreg_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mregistered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'registered'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'params' is not defined" + ] + } + ], + "source": [ + "#change to XGBClassifier if classify needed\n", + "\n", + "\"\"\"class xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, \n", + "n_estimators=100, silent=True, objective='reg:linear', booster='gbtree', n_jobs=1,\n", + "nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,\n", + "colsample_bylevel=1,reg_alpha=0, reg_lambda=1, scale_pos_weight=1, 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# change XGBRegressor to XGBClassifier if classification is needed\n",
+    "\n",
+    "\"\"\"class xgboost.XGBRegressor(max_depth=3, learning_rate=0.1,\n",
+    "n_estimators=100, silent=True, objective='reg:linear', booster='gbtree', n_jobs=1,\n",
+    "nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,\n",
+    "colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,\n",
+    "random_state=0, seed=None, missing=None)\"\"\"\n",
+    "\n",
+    "\"\"\"using only the sklearn wrapper and GridSearchCV tuning, in two steps\"\"\"\n",
+    "# step 1: search the parameter grid\n",
+    "from xgboost.sklearn import XGBRegressor\n",
+    "from sklearn.grid_search import GridSearchCV  # sklearn.model_selection on newer sklearn\n",
+    "xg1 = XGBRegressor(objective=\"reg:linear\")\n",
+    "params_g = {\"n_estimators\": [50, 500, 1000]}\n",
+    "grid_search = GridSearchCV(xg1, param_grid=params_g, scoring=\"neg_mean_squared_error\")\n",
+    "grid_search.fit(reg_train, registered['registered'])\n",
+    "\n",
+    "print grid_search.best_params_\n",
+    "\n",
+    "# step 2: refit with the best parameters and predict\n",
+    "\"\"\"fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True)\"\"\"\n",
+    "newparams = grid_search.best_params_\n",
+    "xg1 = XGBRegressor(**newparams)\n",
+    "# early stopping in the sklearn wrapper needs a held-out eval_set\n",
+    "# ('val' stands for any validation split)\n",
+    "xg1.fit(train[predictors], train['Outcome'],\n",
+    "        eval_set=[(val[predictors], val['Outcome'])], early_stopping_rounds=300)\n",
+    "pred = xg1.predict(test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"use GridSearchCV with the sklearn wrapper to get the parameters, then use xgb.cv\n",
+    "to get num_boost_round, and then use xgb.train to train the model\"\"\"\n",
+    "\n",
+    "# part 1: grid search for the hyperparameters\n",
+    "import xgboost as xgb\n",
+    "from xgboost.sklearn import XGBRegressor\n",
+    "from sklearn.grid_search import GridSearchCV\n",
+    "xg1 = XGBRegressor(objective=\"reg:linear\")\n",
+    "params_g = {\"n_estimators\": [50, 500, 1000]}\n",
+    "grid_search = GridSearchCV(xg1, param_grid=params_g, scoring=\"neg_mean_squared_error\")\n",
+    "grid_search.fit(reg_train, registered['registered'])\n",
+    "\n",
+    "print grid_search.best_params_\n",
+    "\n",
+    "# part 2: fix the tuned parameters from part 1, let xgb.cv pick the round count\n",
+    "\"\"\"xgboost.cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False,\n",
+    "folds=None, metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,\n",
+    "fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=True)\"\"\"\n",
+    "best_param = {}  # fill with the tuned parameters from part 1\n",
+    "dtrain = xgb.DMatrix(casual[col].values, label=casual['logcasual'].values)\n",
+    "pa = xgb.cv(best_param, dtrain, num_boost_round=500, nfold=5, metrics=\"rmse\", early_stopping_rounds=30)\n",
+    "plt.plot(pa['test-rmse-mean'], label='test')\n",
+    "plt.plot(pa['train-rmse-mean'], label='train')\n",
+    "plt.legend()\n",
+    "plt.show()\n",
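+    "\n",
+    "# hedged sketch (not in the original, which hard-coded num_boost = 169):\n",
+    "# derive the round count from the cv result; with early_stopping_rounds,\n",
+    "# xgb.cv returns the history truncated near the best round, so len(pa)\n",
+    "# is a reasonable choice\n",
+    "num_boost = len(pa)\n",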
+    "\n",
+    "# part 3: train the final booster with the chosen parameters and round count\n",
+    "\"\"\"xgboost.train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,\n",
+    "evals_result=None, verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None)\"\"\"\n",
+    "best_parm = {'objective': \"reg:linear\",\n",
+    "             \"eta\": 0.05,\n",
+    "             \"max_depth\": 8,\n",
+    "             \"subsample\": 1,\n",
+    "             \"colsample_bytree\": 1,\n",
+    "             'gamma': 1.0,\n",
+    "             'min_child_weight': 5,\n",
+    "             \"silent\": 1,\n",
+    "             \"seed\": 1301,\n",
+    "             \"eval_metric\": 'rmse'\n",
+    "             }\n",
+    "\n",
+    "print(\"Train an XGBoost model with cross-val data\")\n",
+    "dtrain = xgb.DMatrix(x_train, y_train)\n",
+    "dvalid = xgb.DMatrix(x_test, y_test)\n",
+    "watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n",
+    "gbm = xgb.train(best_parm, dtrain, num_boost, evals=watchlist, early_stopping_rounds=300, verbose_eval=True)\n",
+    "\n",
+    "# after validating, retrain on the complete data; num_boost was already\n",
+    "# chosen by cv, so no early stopping here\n",
+    "print(\"Train an XGBoost model\")\n",
+    "dtrain = xgb.DMatrix(train.drop(['timestamp', 'Outcome'], axis=1), train['Outcome'])\n",
+    "watchlist = [(dtrain, 'train')]\n",
+    "gbm = xgb.train(best_parm, dtrain, num_boost, evals=watchlist, verbose_eval=True)\n",
+    "# predict on the same feature columns the model was trained on\n",
+    "pred1 = gbm.predict(xgb.DMatrix(test[train.drop(['timestamp', 'Outcome'], axis=1).columns]))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import xgboost as xgb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# getting feature importance, option 1: dump a feature map and read fscores\n",
+    "import operator\n",
+    "\n",
+    "def create_feature_map(features):\n",
+    "    outfile = open('xgb.fmap', 'w')\n",
+    "    for i, feat in enumerate(features):\n",
+    "        outfile.write('{0}\\t{1}\\tq\\n'.format(i, feat))\n",
+    "    outfile.close()\n",
+    "\n",
+    "create_feature_map(col)  # col: the list of feature names used in training\n",
+    "# model: the trained Booster (e.g. gbm from the training cell above)\n",
+    "model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)\n",
+    "importance = model.get_fscore(fmap='xgb.fmap')\n",
+    "importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)\n",
+    "imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])\n",
+    "imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()\n",
+    "imp_df.to_csv(\"imp_feat.txt\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# getting feature importance, option 2: built-in plot on a trained booster\n",
+    "\"\"\"xgboost.plot_importance(booster, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance',\n",
+    "xlabel='F score', ylabel='Features', importance_type='weight', max_num_features=None, grid=True, **kwargs)\"\"\"\n",
+    "xgb.plot_importance(gbm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [conda env:py27]",
+   "language": "python",
+   "name": "conda-env-py27-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}