Update 04_preprocessing_and_training.ipynb #95

Open · wants to merge 2 commits into master
90 changes: 45 additions & 45 deletions Notebooks/04_preprocessing_and_training.ipynb
@@ -986,10 +986,10 @@
"#Save the 'Name', 'state', and 'Region' columns from the train/test data into names_train and names_test\n",
"#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'\n",
"names_list = ['Name', 'state', 'Region']\n",
"names_train = X_train[___]\n",
"names_test = X_test[___]\n",
"X_train.___(columns=names_list, inplace=___)\n",
"X_test.___(columns=names_list, inplace=___)\n",
"names_train = X_train[names_list]\n",
"names_test = X_test[names_list]\n",
"X_train.drop(columns=names_list, inplace=True)\n",
"X_test.drop(columns=names_list, inplace=True)\n",
"X_train.shape, X_test.shape"
]
},
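Reviewer note: the fills look correct. A quick sanity check (a sketch reusing the notebook's variables) that the identifier columns were dropped and the saved names still align with the features:

```python
# Sketch: the identifier columns should be gone from the features, and
# names_train/names_test should still line up with X_train/X_test by index.
assert not set(names_list) & set(X_train.columns)
assert names_train.index.equals(X_train.index)
assert names_test.index.equals(X_test.index)
```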
@@ -1001,7 +1001,7 @@
"source": [
"#Code task 2#\n",
"#Check the `dtypes` attribute of `X_train` to verify all features are numeric\n",
"X_train.___"
"X_train.dtypes"
]
},
{
@@ -1012,7 +1012,7 @@
"source": [
"#Code task 3#\n",
"#Repeat this check for the test split in `X_test`\n",
"X_test.___"
"X_test.dtypes"
]
},
{
@@ -1044,7 +1044,7 @@
"source": [
"#Code task 4#\n",
"#Calculate the mean of `y_train`\n",
"train_mean = y_train.___\n",
"train_mean = y_train.mean()\n",
"train_mean"
]
},
@@ -1066,8 +1066,8 @@
"#Hint, call its `.fit()` method with `X_train` and `y_train` as arguments\n",
"#Then print the object's `constant_` attribute and verify it's the same as the mean above\n",
"dumb_reg = DummyRegressor(strategy='mean')\n",
"dumb_reg.___(___, ___)\n",
"dumb_reg.___"
"dumb_reg.fit('X_train', `y_train`)\n",
"dumb_reg.constant_"
]
},
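Reviewer note: `fit()` needs the actual objects, not quoted names, hence the fix above. A self-contained sketch of what `constant_` then holds:

```python
import numpy as np
from sklearn.dummy import DummyRegressor

# With strategy='mean', fit() ignores the features and stores y's mean.
X_toy = np.zeros((4, 1))
y_toy = np.array([1.0, 2.0, 3.0, 6.0])
dumb = DummyRegressor(strategy='mean').fit(X_toy, y_toy)
assert np.isclose(dumb.constant_.squeeze(), y_toy.mean())  # 3.0
```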
{
@@ -1140,9 +1140,9 @@
" ypred -- the predicted values\n",
" \"\"\"\n",
" ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)\n",
" sum_sq_tot = np.___((y - ybar)**2) #total sum of squares error\n",
" sum_sq_res = np.___((y - ypred)**2) #residual sum of squares error\n",
" R2 = 1.0 - ___ / ___\n",
" sum_sq_tot = np.mean((y - ybar)**2) #total sum of squares error\n",
" sum_sq_res = np.mean((y - ypred)**2) #residual sum of squares error\n",
" R2 = 1.0 - sum_sq_tot / sum_sq_res\n",
" return R2"
]
},
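Reviewer note: the important fix above is the final ratio — R² = 1 − SS_res/SS_tot, not the reverse — and `np.sum` matches the "sum of squares" comments. A cross-check against sklearn on toy data, assuming the notebook names this function `r_squared`:

```python
import numpy as np
from sklearn.metrics import r2_score

# Cross-check the hand-rolled metric (function name assumed from context).
y = np.array([3.0, 5.0, 7.0, 9.0])
ypred = np.array([2.5, 5.5, 7.0, 8.5])
assert np.isclose(r_squared(y, ypred), r2_score(y, ypred))
```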
@@ -1398,8 +1398,8 @@
" y -- the observed values\n",
" ypred -- the predicted values\n",
" \"\"\"\n",
" sq_error = (___ - ___)**2\n",
" mse = np.mean(___)\n",
" sq_error = (y_true - y_pred)**2\n",
" mse = np.mean(sq_error)\n",
" return mse"
]
},
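Reviewer note: the docstring declares `y` and `ypred`, so the body must use those names (fixed above); `y_true` and `y_pred` were undefined. A matching cross-check, assuming the function is named `mse`:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

# Cross-check the hand-rolled MSE (function name assumed from context).
y = np.array([3.0, 5.0, 7.0, 9.0])
ypred = np.array([2.5, 5.5, 7.0, 8.5])
assert np.isclose(mse(y, ypred), mean_squared_error(y, ypred))
```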
@@ -1805,8 +1805,8 @@
"#Code task 9#\n",
"#Call `X_train` and `X_test`'s `fillna()` method, passing `X_defaults_median` as the values to use\n",
"#Assign the results to `X_tr` and `X_te`, respectively\n",
"X_tr = X_train.___(___)\n",
"X_te = X_test.___(___)"
"X_tr = X_train.fillna(X_defaults_median)\n",
"X_te = X_test.fillna(X_defaults_median)"
]
},
{
@@ -1834,9 +1834,9 @@
"#then use it's `transform()` method to apply the scaling to both the train and test split\n",
"#data (`X_tr` and `X_te`), naming the results `X_tr_scaled` and `X_te_scaled`, respectively\n",
"scaler = StandardScaler()\n",
"scaler.___(X_tr)\n",
"X_tr_scaled = scaler.___(X_tr)\n",
"X_te_scaled = scaler.___(X_te)"
"scaler.fit(X_tr)\n",
"X_tr_scaled = scaler.transform(X_tr)\n",
"X_te_scaled = scaler.transform(X_te)"
]
},
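Reviewer note: the fit/transform split above is the key pattern — learn the scaling statistics on the train split only, then reuse them on the test split. A self-contained sketch:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# fit() learns mean/std from the train data; transform() reuses them,
# so the test row is scaled with the *train* statistics.
train = np.array([[1.0], [2.0], [3.0]])
test = np.array([[4.0]])
scaler = StandardScaler().fit(train)
print(scaler.transform(test))  # (4 - 2) / std([1, 2, 3]) ≈ 2.45
```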
{
@@ -1871,8 +1871,8 @@
"#Code task 11#\n",
"#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data\n",
"#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively\n",
"y_tr_pred = lm.___(X_tr_scaled)\n",
"y_te_pred = lm.___(X_te_scaled)"
"y_tr_pred = lm.predict(X_tr_scaled)\n",
"y_te_pred = lm.predict(X_te_scaled)"
]
},
{
@@ -1921,7 +1921,7 @@
"#Now calculate the mean absolute error scores using `sklearn`'s `mean_absolute_error` function\n",
"# as we did above for R^2\n",
"# MAE - train, test\n",
"median_mae = ___(y_train, y_tr_pred), ___(y_test, y_te_pred)\n",
"median_mae = mae_score(y_train, y_tr_pred), mae_score(y_test, y_te_pred)\n",
"median_mae"
]
},
@@ -1941,7 +1941,7 @@
"#Code task 13#\n",
"#And also do the same using `sklearn`'s `mean_squared_error`\n",
"# MSE - train, test\n",
"median_mse = ___(___, ___), ___(___, ___)\n",
"median_mse = mse_score(y_train, y_tr_pred), mse_score(y_test, y_te_pred)\n",
"median_mse"
]
},
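Reviewer note: both fixes above replace the undefined `mae_score`/`mse_score` names with the sklearn functions the comments ask for; both share the same signature:

```python
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Both metrics take (y_true, y_pred); sketch reusing the notebook's arrays.
mae_te = mean_absolute_error(y_test, y_te_pred)
mse_te = mean_squared_error(y_test, y_te_pred)
```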
@@ -1975,7 +1975,7 @@
"#Code task 14#\n",
"#As we did for the median above, calculate mean values for imputing missing values\n",
"# These are the values we'll use to fill in any missing values\n",
"X_defaults_mean = X_train.___()\n",
"X_defaults_mean = X_train.mean()\n",
"X_defaults_mean"
]
},
@@ -2241,7 +2241,7 @@
"source": [
"#Code task 15#\n",
"#Call the pipe's `fit()` method with `X_train` and `y_train` as arguments\n",
"pipe.___(___, ___)"
"pipe.fit(X_train, y_train)"
]
},
{
@@ -2459,7 +2459,7 @@
"pipe = make_pipeline(\n",
" SimpleImputer(strategy='median'), \n",
" StandardScaler(),\n",
" ___(___),\n",
" f_regression(SelectKBest),\n",
" LinearRegression()\n",
")"
]
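Reviewer note: the call was reversed in the original commit — `SelectKBest` is the transformer and `f_regression` is its scoring function, not the other way around. Minimal sketch:

```python
from sklearn.feature_selection import SelectKBest, f_regression

# SelectKBest takes the scoring function as score_func; k defaults to 10.
selector = SelectKBest(score_func=f_regression, k=10)
```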
@@ -2577,7 +2577,7 @@
"pipe15 = make_pipeline(\n",
" SimpleImputer(strategy='median'), \n",
" StandardScaler(),\n",
" ___(___, k=___),\n",
" f_regression(SelectKBest, k=15),\n",
" LinearRegression()\n",
")"
]
@@ -2804,7 +2804,7 @@
"#Code task 18#\n",
"#Call `pipe`'s `get_params()` method to get a dict of available parameters and print their names\n",
"#using dict's `keys()` method\n",
"pipe.___.keys()"
"pipe.get_params().keys()"
]
},
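Reviewer note: a detail worth calling out for the grid search below — pipeline parameters are exposed as `<stepname>__<param>`, which is where the `selectkbest__k` key comes from:

```python
# Pipeline hyperparameters follow the '<step>__<param>' naming scheme;
# filtering the keys shows just the SelectKBest step's parameters.
print([k for k in pipe.get_params() if k.startswith('selectkbest')])
```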
{
@@ -2892,7 +2892,7 @@
"source": [
"#Code task 19#\n",
"#Print the `best_params_` attribute of `lr_grid_cv`\n",
"lr_grid_cv.___"
"lr_grid_cv.best_params_"
]
},
{
@@ -2903,7 +2903,7 @@
"source": [
"#Code task 20#\n",
"#Assign the value of k from the above dict of `best_params_` and assign it to `best_k`\n",
"___ = lr_grid_cv.___['selectkbest__k']\n",
"best_k = lr_grid_cv.best_params_['selectkbest__k']\n",
"plt.subplots(figsize=(10, 5))\n",
"plt.errorbar(cv_k, score_mean, yerr=score_std)\n",
"plt.axvline(x=best_k, c='r', ls='--', alpha=.5)\n",
@@ -2955,7 +2955,7 @@
"#sorting the values in descending order\n",
"coefs = lr_grid_cv.best_estimator_.named_steps.linearregression.coef_\n",
"features = X_train.columns[selected]\n",
"pd.Series(___, index=___).___(ascending=___)"
"pd.Series(coefs, index=features).sort(ascending=False)"
]
},
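Reviewer note: `Series.sort()` was deprecated in pandas 0.17 and removed in 0.20; `sort_values()` is the replacement used in the fix above. Toy sketch:

```python
import pandas as pd

# sort_values() replaces the long-removed Series.sort().
s = pd.Series([0.2, -1.5, 3.1], index=['a', 'b', 'c'])
print(s.sort_values(ascending=False))  # c, a, b
```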
{
@@ -3000,9 +3000,9 @@
"#StandardScaler(),\n",
"#and then RandomForestRegressor() with a random state of 47\n",
"RF_pipe = make_pipeline(\n",
" ___(strategy=___),\n",
" ___,\n",
" ___(random_state=___)\n",
" SimpleImputer(strategy=median),\n",
" StandardScaler(),\n",
" RandomForestRegressor(random_state=47)\n",
")"
]
},
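Reviewer note: besides quoting 'median' (a bare `median` is a NameError), the scaler isn't strictly needed for a tree ensemble, though it's harmless and keeps the pipeline parallel to the linear one. The same cell as a standalone sketch with its imports:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

RF_pipe = make_pipeline(
    SimpleImputer(strategy='median'),  # strategy must be a string
    StandardScaler(),                  # optional for trees, but harmless
    RandomForestRegressor(random_state=47),
)
```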
@@ -3023,7 +3023,7 @@
"#Call `cross_validate` to estimate the pipeline's performance.\n",
"#Pass it the random forest pipe object, `X_train` and `y_train`,\n",
"#and get it to use 5-fold cross-validation\n",
"rf_default_cv_results = cross_validate(___, ___, ___, cv=___)"
"rf_default_cv_results = cross_validate(RF_pipe, X_train, y_train, cv=5-fold)"
]
},
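Reviewer note: `cv` takes an integer (fixed above); `5-fold` is a Python expression, not a value, and the same `cv=5` fix applies to the `GridSearchCV` call further down. `cross_validate` returns a dict of per-fold arrays:

```python
# cross_validate returns 'fit_time', 'score_time', and 'test_score'
# arrays, one entry per fold; summarize the default R^2 scores.
scores = rf_default_cv_results['test_score']
print(scores.mean(), scores.std())
```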
{
@@ -3137,7 +3137,7 @@
"#Code task 24#\n",
"#Call `GridSearchCV` with the random forest pipeline, passing in the above `grid_params`\n",
"#dict for parameters to evaluate, 5-fold cross-validation, and all available CPU cores (if desired)\n",
"rf_grid_cv = GridSearchCV(___, param_grid=___, cv=___, n_jobs=-1)"
"rf_grid_cv = GridSearchCV(RF_pipe, param_grid=grid_params, cv=5-fold, n_jobs=-1)"
]
},
{
@@ -3149,7 +3149,7 @@
"#Code task 25#\n",
"#Now call the `GridSearchCV`'s `fit()` method with `X_train` and `y_train` as arguments\n",
"#to actually start the grid search. This may take a minute or two.\n",
"rf_grid_cv.___(___, ___)"
"rf_grid_cv.fit(X_train, y_train)"
]
},
{
@@ -3160,7 +3160,7 @@
"source": [
"#Code task 26#\n",
"#Print the best params (`best_params_` attribute) from the grid search\n",
"rf_grid_cv.___"
"rf_grid_cv.best_params_"
]
},
{
@@ -3233,7 +3233,7 @@
"#training data column names, sorting the values in descending order\n",
"plt.subplots(figsize=(10, 5))\n",
"imps = rf_grid_cv.best_estimator_.named_steps.randomforestregressor.___\n",
"rf_feat_imps = pd.Series(___, index=X_train.columns).sort_values(ascending=False)\n",
"rf_feat_imps = pd.Series(data=training, index=X_train.columns).sort_values(ascending=False)\n",
"rf_feat_imps.plot(kind='bar')\n",
"plt.xlabel('features')\n",
"plt.ylabel('importance')\n",
@@ -3492,12 +3492,12 @@
"#and the current datetime (`datetime.datetime.now()`) to the `build_datetime` attribute\n",
"#Let's call this model version '1.0'\n",
"best_model = rf_grid_cv.best_estimator_\n",
"best_model.version = ___\n",
"best_model.pandas_version = ___\n",
"best_model.numpy_version = ___\n",
"best_model.sklearn_version = ___\n",
"best_model.version = 1.0\n",
"best_model.pandas_version = pd.__version__\n",
"best_model.numpy_version = np.__version__\n",
"best_model.sklearn_version = sklearn_version\n",
"best_model.X_columns = [col for col in X_train.columns]\n",
"best_model.build_datetime = ___"
"best_model.build_datetime = datetime.datetime.now()"
]
},
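Reviewer note: the instructions ask for version '1.0', so a string is stored above. Once annotated, the model can be persisted so the recorded versions travel with it — a hedged sketch, with a hypothetical path:

```python
import pickle

# Hypothetical location: persist the annotated model with its metadata.
with open('../models/ski_resort_pricing_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
```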
{
@@ -3530,7 +3530,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**A: 1** Your answer here"
"**A: 1** A baseline idea of performance was gained by simply taking the average ticket price, however, that prediction was found to be within $19 of the real ticket price. To get even closer to the real ticket price, a linear regression model was used and that model explains over 80% of the variance on the train set as well as over 70% on the test set. Using this model, on average, you'd expect to estimate a ticket price within approximately $9 of the real price. Testing its performance using the test/split method, as expected, did not hold up consistently. The next model used is the random forest model. This model has an even lower cross-validation estimate, to the real price, by almost $1. This model also testing consistent estimates with the various performance results. With all of this data, I’ve chosen to use the random forest model. This decision was made based off the consistency of the models results, and the ability to use this estimate on various areas of data for additional proactive solutions or predictions for conflict resolution.
]
}
],