Commit

Add files via upload
rks-ds authored Aug 28, 2017
1 parent 6280ff7 commit 126e598
Showing 2 changed files with 366 additions and 0 deletions.
135 changes: 135 additions & 0 deletions necessary_code.ipynb
@@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd \n",
"from pandas import DataFrame, Series\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"\n",
"%matplotlib inline\n",
"from IPython.core.interactiveshell import InteractiveShell #Show all consecutive outputs\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n",
"\n",
"#Removes all unnecessary warnings by Python\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"from collections import defaultdict\n",
"import json\n",
"\n",
"import scipy as sp\n",
"\n",
"from matplotlib import rcParams\n",
"import matplotlib.cm as cm\n",
"import matplotlib as mpl\n",
"\n",
"#colorbrewer2 Dark2 qualitative color table\n",
"dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n",
" (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n",
" (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n",
" (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n",
" (0.4, 0.6509803921568628, 0.11764705882352941),\n",
" (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n",
" (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]\n",
"\n",
"rcParams['figure.figsize'] = (10, 6)\n",
"rcParams['figure.dpi'] = 150\n",
"rcParams['axes.color_cycle'] = dark2_colors\n",
"rcParams['lines.linewidth'] = 2\n",
"rcParams['axes.facecolor'] = 'white'\n",
"rcParams['font.size'] = 14\n",
"rcParams['patch.edgecolor'] = 'white'\n",
"rcParams['patch.facecolor'] = dark2_colors[0]\n",
"rcParams['font.family'] = 'StixGeneral'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def num_missing(x):\n",
" return sum(x.isnull())\n",
"train_data.apply(num_missing, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#imputing median groupwise\n",
"def fill_na(data):\n",
" for c in data.columns[data.isnull().any()]:\n",
" data[c] = data.groupby('Stock_ID').transform(lambda x: x.fillna(x.median()))[c]\n",
" return data"
]
},
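{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#hedged usage sketch for fill_na: 'train_data' and the 'Stock_ID' column are assumed\n",
"#to exist, matching the hypothetical DataFrame used in the cells above\n",
"train_data = fill_na(train_data)\n",
"train_data.apply(num_missing, axis=0)  #re-check missing counts after imputation"
]
},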
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def getCountVar(compute_df, count_df, var_name):\n",
" grouped_df = count_df.groupby(var_name)\n",
" count_dict = {}\n",
" for name, group in grouped_df:\n",
" count_dict[name] = group.shape[0]\n",
"\n",
" count_list = []\n",
" for index, row in compute_df.iterrows():\n",
" name = row[var_name]\n",
" count_list.append(count_dict.get(name, 0))\n",
" return count_list"
]
},
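{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#hedged usage sketch for getCountVar: train_df, test_df and the 'Stock_ID' column are\n",
"#assumed names, not from the original notebook\n",
"train_df['Stock_ID_count'] = getCountVar(train_df, train_df, 'Stock_ID')\n",
"test_df['Stock_ID_count'] = getCountVar(test_df, train_df, 'Stock_ID')"
]
},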
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
231 changes: 231 additions & 0 deletions xgboost_training_tuning_importance.ipynb
@@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'xgboost training, parameter tuning and feature selection'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"xgboost training, parameter tuning and feature selection\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'params' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-d093e34651df>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mxg1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mXGBRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjective\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"reg:linear\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mparams_g\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"n_estimators\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mgrid_search\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxg1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"neg_mean_squared_error\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreg_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mregistered\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'registered'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'params' is not defined"
]
}
],
"source": [
"#change to XGBClassifier if classify needed\n",
"\n",
"\"\"\"class xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, \n",
"n_estimators=100, silent=True, objective='reg:linear', booster='gbtree', n_jobs=1,\n",
"nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,\n",
"colsample_bylevel=1,reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, \n",
"random_state=0, seed=None, missing=None)\"\"\"\n",
"\n",
"\"\"\"using only the python wrapper and grid search cv tuning\"\"\"\n",
"from xgboost.sklearn import XGBRegressor\n",
"from sklearn.grid_search import GridSearchCV\n",
"xg1=XGBRegressor(objective=\"reg:linear\")\n",
"params_g={\"n_estimators\":[50,500,1000]}\n",
"grid_search=GridSearchCV(xg1,param_grid=params_g,scoring=\"neg_mean_squared_error\")\n",
"grid_search.fit(reg_train,registered['registered'])\n",
"\n",
"print gridsearch.best_params_\n",
"#use it in two parts\n",
"\n",
"#use those best parameters to fit and predict a model\n",
"\"\"\"fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True)¶\"\"\"\n",
"newparams={}\n",
"xg1=XGBRegressor(new_params)\n",
"xg1.fit(train[predictors],train[Outcome],early_stopping_rounds=300)\n",
"pred=xg1.predict(test)"
]
},
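{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#hedged sketch of the XGBClassifier variant mentioned above; clf_train and clf_target\n",
"#are assumed feature/label objects, not from the original notebook\n",
"from xgboost.sklearn import XGBClassifier\n",
"from sklearn.model_selection import GridSearchCV\n",
"xgc = XGBClassifier(objective='binary:logistic')\n",
"grid_search_c = GridSearchCV(xgc, param_grid={'n_estimators': [50, 500, 1000]}, scoring='accuracy')\n",
"grid_search_c.fit(clf_train, clf_target)\n",
"print(grid_search_c.best_params_)"
]
},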
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\"\"\"use the gridsearchcv with wrapper to get the parameters than use the xgbcv to get num_boost_rounds\n",
"and then use the xgb.train to train the model\"\"\"\n",
"\n",
"#part_1\n",
"import xgboost as xgb\n",
"from xgboost.sklearn import XGBRegressor\n",
"from sklearn.grid_search import GridSearchCV\n",
"xg1=XGBRegressor(objective=\"reg:linear\")\n",
"params_g={\"n_estimators\":[50,500,1000]}\n",
"grid_search=GridSearchCV(xg1,param_grid=params_g,scoring=\"neg_mean_squared_error\")\n",
"grid_search.fit(reg_train,registered['registered'])\n",
"\n",
"print gridsearch.best_params_\n",
"\n",
"#part 2\n",
"#take this from above\n",
"\"\"\"xgboost.cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False,\n",
"folds=None, metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,\n",
"fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=Truebbba)\"\"\"\n",
"best_param={}\n",
"dtrain=xgb.DMatrix(casual[col].values, label=casual['logcasual'].values)\n",
"pa=xgb.cv(best_param,dtrain,num_boost_round=500,nfold=5,metrics=\"rmse\",early_stopping_rounds=30)\n",
"plt.plot(pa['test-rmse-mean'])\n",
"plt.plot(pa['train-rmse-mean'])\n",
"plt.legend()\n",
"plt.show()\n",
"\n",
"#part3\n",
"\"\"\"xgboost.train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,\n",
"evals_result=None, verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None)\"\"\"\n",
"num_boost=169\n",
"best_parm={'objective':\"reg:linear\",\n",
" \"eta\": 0.05,\n",
" \"max_depth\": 8,\n",
" \"subsample\": 1,\n",
" \"colsample_bytree\": 1,\n",
" 'gamma':1.0,\n",
" 'min_child_weight':5,\n",
" \"silent\": 1,\n",
" \"seed\": 1301,\n",
" \"eval_metric\": 'rmse'\n",
" }\n",
"\n",
"print(\"Train a XGBoost model with cross val data\")\n",
"dtrain = xgb.DMatrix(x_train, y_train)\n",
"dvalid = xgb.DMatrix(x_test, y_test)\n",
"watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n",
"gbm = xgb.train(parms, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=300, verbose_eval=True)\n",
"\n",
"#after this train this on complete data\n",
"print(\"Train a XGBoost model\")\n",
"dtrain = xgb.DMatrix(train.drop(['timestamp', 'Outcome'], axis=1), train['Outcome'])\n",
"watchlist = [(dtrain, 'train')]\n",
"gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=300, verbose_eval=True)\n",
"#predict\n",
"pred1=gbm.predict(xgb.DMatrix(test[train.columns]))\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import xgboost as xgb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#getting feature importance 1\n",
"def create_feature_map(features):\n",
" outfile = open('xgb.fmap', 'w')\n",
" for i, feat in enumerate(features):\n",
" outfile.write('{0}\\t{1}\\tq\\n'.format(i,feat))\n",
" outfile.close()\n",
" \n",
"create_feature_map(col)\n",
"model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)\n",
"importance = model.get_fscore(fmap='xgb.fmap')\n",
"importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)\n",
"imp_df = pd.DataFrame(importance, columns=['feature','fscore'])\n",
"imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()\n",
"imp_df.to_csv(\"imp_feat.txt\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'algo' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-3f3efd24b96b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \"\"\"xgboost.plot_importance(booster, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance',\n\u001b[1;32m 3\u001b[0m xlabel='F score', ylabel='Features', importance_type='weight', max_num_features=None, grid=True, **kwargs)\"\"\"\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_importance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malgo\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'algo' is not defined"
]
}
],
"source": [
"#getting feature importance 2\n",
"\"\"\"xgboost.plot_importance(booster, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance',\n",
"xlabel='F score', ylabel='Features', importance_type='weight', max_num_features=None, grid=True, **kwargs)\"\"\"\n",
"xgb.plot_importance(algo)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:py27]",
"language": "python",
"name": "conda-env-py27-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
