diff --git a/.github/workflows/workshop_ci.yml b/.github/workflows/workshop_ci.yml
index 09c0ea72..922fa9f1 100644
--- a/.github/workflows/workshop_ci.yml
+++ b/.github/workflows/workshop_ci.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install az ml & set default values for AML
         run: | #setup: provide group, workspace and location
           az extension add -n ml -y --version 2.2.1
-          az configure --defaults group=azureml workspace=ws01ent location=westus2   
+          az configure --defaults group=mlops-rg-910166 workspace=aml910166 location=westus2
       - name: run training and model validation
         run: |
          az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml
diff --git a/.github/workflows/workshop_unit_test.yml b/.github/workflows/workshop_unit_test.yml
index 3c1382c3..1e9f4423 100644
--- a/.github/workflows/workshop_unit_test.yml
+++ b/.github/workflows/workshop_unit_test.yml
@@ -13,7 +13,7 @@ jobs:
   unit-test:
     runs-on: ubuntu-latest
     steps:
-      - name: Check out repository code
+      - name: Check out repository code
         uses: actions/checkout@v3
       - name: Setup python
         uses: actions/setup-python@v2
@@ -31,7 +31,7 @@ jobs:
       - name: Install AZ ML and tools
         run: | # SETUP line 34 to point to your own AML workspace
           az extension add -n ml -y --version 2.2.1
-          az configure --defaults group=azureml workspace=ws01ent location=westus2   
+          az configure --defaults group=mlops-rg-910166 workspace=aml910166 location=westus2   
       - name: Run Feature Engineering
         uses: ./.github/actions/aml-job-create
         with:
diff --git a/src/workshop/core/scoring/deployment.yml b/src/workshop/core/scoring/deployment.yml
index 29c3500c..8f54df02 100644
--- a/src/workshop/core/scoring/deployment.yml
+++ b/src/workshop/core/scoring/deployment.yml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
 name: green
-endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
+endpoint_name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
 model: azureml:nyc_fare_prediction:1
 code_configuration:
   code: ./
diff --git a/src/workshop/core/scoring/endpoint.yml b/src/workshop/core/scoring/endpoint.yml
index 611e0721..c3d5b249 100644
--- a/src/workshop/core/scoring/endpoint.yml
+++ b/src/workshop/core/scoring/endpoint.yml
@@ -1,3 +1,3 @@
 $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
-name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
+name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
 auth_mode: key
diff --git a/src/workshop/core/training/.amlignore b/src/workshop/core/training/.amlignore
new file mode 100644
index 00000000..0621f9fc
--- /dev/null
+++ b/src/workshop/core/training/.amlignore
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+        
+.ipynb_aml_checkpoints/ 
+*.amltmp 
+*.amltemp
\ No newline at end of file
diff --git a/src/workshop/core/training/.amlignore.amltmp b/src/workshop/core/training/.amlignore.amltmp
new file mode 100644
index 00000000..0621f9fc
--- /dev/null
+++ b/src/workshop/core/training/.amlignore.amltmp
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove. 
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+        
+.ipynb_aml_checkpoints/ 
+*.amltmp 
+*.amltemp
\ No newline at end of file
diff --git a/src/workshop/core/training/conda_ml_training.yml b/src/workshop/core/training/conda_ml_training.yml
index 3e26a9f2..8a873bf7 100644
--- a/src/workshop/core/training/conda_ml_training.yml
+++ b/src/workshop/core/training/conda_ml_training.yml
@@ -8,4 +8,4 @@ dependencies:
     - azureml-sdk==1.38.0
     - azureml-mlflow==1.38.0
     - pandas==1.3.5
-    - scikit-learn==1.0.2
\ No newline at end of file
+    - scikit-learn==1.0.2
\ No newline at end of file
diff --git a/src/workshop/core/training/conda_ml_training.yml.amltmp b/src/workshop/core/training/conda_ml_training.yml.amltmp
new file mode 100644
index 00000000..8a873bf7
--- /dev/null
+++ b/src/workshop/core/training/conda_ml_training.yml.amltmp
@@ -0,0 +1,11 @@
+name: ml-training
+channels:
+  - conda-forge
+dependencies:
+  - python=3.8
+  - pip=21.3.1
+  - pip:
+    - azureml-sdk==1.38.0
+    - azureml-mlflow==1.38.0
+    - pandas==1.3.5
+    - scikit-learn==1.0.2
\ No newline at end of file
diff --git a/src/workshop/core/training/ml_training.py b/src/workshop/core/training/ml_training.py
index 6f59dcdd..93b504a8 100644
--- a/src/workshop/core/training/ml_training.py
+++ b/src/workshop/core/training/ml_training.py
@@ -13,7 +13,7 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
-from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
+from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
 import joblib
 def parse_args():
     # arg parser
@@ -43,7 +43,7 @@ def createClassModel(algo_name, catg, nums):
         #---------------------------------------------
         #setup: Update alpha value
         #---------------------------------------------
-        model = Ridge(alpha=100000)  #setup
+        model = Ridge(alpha=100)  #setup
     elif algo_name == 'random_forest':
         model = RandomForestRegressor()
     else:
diff --git a/src/workshop/core/training/ml_training.py.amltmp b/src/workshop/core/training/ml_training.py.amltmp
new file mode 100644
index 00000000..93b504a8
--- /dev/null
+++ b/src/workshop/core/training/ml_training.py.amltmp
@@ -0,0 +1,103 @@
+import pandas as pd
+import numpy as np
+import os
+import argparse
+import mlflow
+import mlflow.sklearn
+from azureml.core import Run, Dataset,Datastore, Workspace
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.compose import ColumnTransformer
+from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
+import joblib
+def parse_args():
+    # Build the CLI parser for the training script; every option below has a default value.
+    parser = argparse.ArgumentParser()
+    # --prep_data and --input_file_name are joined by main() into the input path; main() writes the model under --model_folder.
+    parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder")
+    parser.add_argument("--model_folder", type=str,default="data", help="Path of model ouput folder, default to local folder")
+    parser.add_argument("--input_file_name", type=str, default="final_df.parquet")
+    parser.add_argument("--run_mode", type=str, default="local")
+    # --run_mode: main() logs metrics and the model to MLflow only when this equals 'remote'.
+
+    # parse the command line with the options defined above
+    args = parser.parse_args()
+
+    # hand the parsed namespace back to the caller
+    return args
+
+
+def createClassModel(algo_name, catg, nums):
+    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])
+
+    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+
+    preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])
+
+    if algo_name == 'linear_regression':
+        #---------------------------------------------
+        #setup: Update alpha value
+        #---------------------------------------------
+        model = Ridge(alpha=100)  #setup
+    elif algo_name == 'random_forest':
+        model = RandomForestRegressor()
+    else:
+        pass
+    
+    ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)])
+    
+    return ModelPipeline
+
+def main(args):
+    # Trains the linear_regression pipeline on the prepped parquet file, saves it with joblib and reports R2/MAPE/RMSE.
+    # read in data
+    final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name))
+    catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"]
+    num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]
+    label = ["totalAmount"]
+    # make sure categorical columns are strings
+    final_df[catg_cols] = final_df[catg_cols].astype("str")
+
+    # split data: 80% train / 20% test, fixed random_state so the split is repeatable
+    X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)
+
+    # make sure the model output folder exists (only linear_regression is trained below)
+    os.makedirs(args.model_folder, exist_ok=True)
+
+    algorithmname = "linear_regression"
+    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline
+    fitPipeline.fit(X_train, y_train.values.ravel())                   # fit pipeline
+
+    y_pred = fitPipeline.predict(X_test)                               # score with fitted pipeline
+
+    # Evaluate on the held-out test split
+    r2 = r2_score(y_test, y_pred)
+    mape = mean_absolute_percentage_error(y_test, y_pred)
+    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+    
+    # persist the fitted pipeline as <model_folder>/<algorithmname>.joblib
+    joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib")
+        
+    print("Training finished!. Metrics:")
+    print(f"R2_{algorithmname}", r2)
+    print(f"MAPE_{algorithmname}", mape)
+    print(f"RMSE_{algorithmname}", rmse)
+    print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!")
+    # MLflow logging is skipped unless the script runs with run_mode == 'remote'
+    if args.run_mode == 'remote':
+        mlflow.log_metric(f"R2_{algorithmname}", r2)
+        mlflow.log_metric(f"MAPE_{algorithmname}", mape)
+        mlflow.log_metric(f"RMSE_{algorithmname}", rmse)
+        mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model")
+
+# run script
+if __name__ == "__main__":
+    # parse args
+    args = parse_args()
+    # run main function
+    main(args)
\ No newline at end of file
diff --git a/src/workshop/data/linear_regression.joblib b/src/workshop/data/linear_regression.joblib
index d6bd0590..3e776213 100644
Binary files a/src/workshop/data/linear_regression.joblib and b/src/workshop/data/linear_regression.joblib differ
diff --git a/src/workshop/notebooks/taxi-tutorial.ipynb b/src/workshop/notebooks/taxi-tutorial.ipynb
index 41795d69..01fe859f 100644
--- a/src/workshop/notebooks/taxi-tutorial.ipynb
+++ b/src/workshop/notebooks/taxi-tutorial.ipynb
@@ -1,2898 +1,654 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Tutorial: Build a regression model with Open Datasets\n",
-    "\n",
-    "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from azureml.opendatasets import NycTlcGreen\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from datetime import datetime\n",
-    "from dateutil.relativedelta import relativedelta\n",
-    "\n",
-    "pd.options.mode.chained_assignment = None"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download Data\n",
-    "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n",
-    "\n",
-    "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [
+  "cells": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpviwf6gni\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=1\\part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp6e1co7l5\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=2\\part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd5lgxojh\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=3\\part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpela340gr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=4\\part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpe79pzv2_\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=5\\part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpyxyv_8h4\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=6\\part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp498a1aem\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=7\\part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpuhi_se7a\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=8\\part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd7id7xon\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=9\\part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp3he0z_qe\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=10\\part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1sa8wuxl\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=11\\part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1e7uekhr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=12\\part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n"
-     ]
+      "cell_type": "markdown",
+      "source": [
+        "# Tutorial: Build a regression model with Open Datasets\n",
+        "\n",
+        "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn."
+      ],
+      "metadata": {}
     },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>vendorID</th>\n",
-       "      <th>lpepPickupDatetime</th>\n",
-       "      <th>lpepDropoffDatetime</th>\n",
-       "      <th>passengerCount</th>\n",
-       "      <th>tripDistance</th>\n",
-       "      <th>puLocationId</th>\n",
-       "      <th>doLocationId</th>\n",
-       "      <th>pickupLongitude</th>\n",
-       "      <th>pickupLatitude</th>\n",
-       "      <th>dropoffLongitude</th>\n",
-       "      <th>...</th>\n",
-       "      <th>paymentType</th>\n",
-       "      <th>fareAmount</th>\n",
-       "      <th>extra</th>\n",
-       "      <th>mtaTax</th>\n",
-       "      <th>improvementSurcharge</th>\n",
-       "      <th>tipAmount</th>\n",
-       "      <th>tollsAmount</th>\n",
-       "      <th>ehailFee</th>\n",
-       "      <th>totalAmount</th>\n",
-       "      <th>tripType</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1379860</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-14 06:39:00</td>\n",
-       "      <td>2016-01-14 06:44:55</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.23</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.911827</td>\n",
-       "      <td>40.775372</td>\n",
-       "      <td>-73.899635</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>6.5</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7.30</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>377548</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 06:22:01</td>\n",
-       "      <td>2016-01-01 06:27:14</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0.91</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.962044</td>\n",
-       "      <td>40.709797</td>\n",
-       "      <td>-73.946716</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>5.5</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>6.30</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>473976</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-08 20:55:49</td>\n",
-       "      <td>2016-01-08 21:05:50</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3.42</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.904823</td>\n",
-       "      <td>40.741776</td>\n",
-       "      <td>-73.878815</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>11.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>12.80</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1246683</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-15 08:27:41</td>\n",
-       "      <td>2016-01-15 08:41:05</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3.99</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.911484</td>\n",
-       "      <td>40.854698</td>\n",
-       "      <td>-73.881821</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>15.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>15.80</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1152261</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-09 04:35:21</td>\n",
-       "      <td>2016-01-09 04:41:02</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.98</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.921776</td>\n",
-       "      <td>40.767071</td>\n",
-       "      <td>-73.933136</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.70</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>8.00</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>998273</th>\n",
-       "      <td>1</td>\n",
-       "      <td>2016-12-24 22:03:25</td>\n",
-       "      <td>2016-12-24 22:17:16</td>\n",
-       "      <td>1</td>\n",
-       "      <td>5.30</td>\n",
-       "      <td>74</td>\n",
-       "      <td>235</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>16.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>17.80</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>857200</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-03 20:33:53</td>\n",
-       "      <td>2016-12-03 20:53:51</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4.81</td>\n",
-       "      <td>83</td>\n",
-       "      <td>258</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>18.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>3.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>22.80</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>607768</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-18 16:17:54</td>\n",
-       "      <td>2016-12-18 16:33:13</td>\n",
-       "      <td>3</td>\n",
-       "      <td>2.02</td>\n",
-       "      <td>95</td>\n",
-       "      <td>56</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>11.5</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>12.30</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>78687</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-06 09:24:43</td>\n",
-       "      <td>2016-12-06 09:41:09</td>\n",
-       "      <td>1</td>\n",
-       "      <td>9.51</td>\n",
-       "      <td>66</td>\n",
-       "      <td>11</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>2</td>\n",
-       "      <td>27.5</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>28.30</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>141672</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-14 16:12:34</td>\n",
-       "      <td>2016-12-14 16:15:11</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.51</td>\n",
-       "      <td>255</td>\n",
-       "      <td>256</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.5</td>\n",
-       "      <td>0.3</td>\n",
-       "      <td>1.45</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7.25</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>24000 rows × 23 columns</p>\n",
-       "</div>"
+      "cell_type": "code",
+      "source": [
+        "from azureml.opendatasets import NycTlcGreen\n",
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
+        "\n",
+        "pd.options.mode.chained_assignment = None"
       ],
-      "text/plain": [
-       "         vendorID  lpepPickupDatetime lpepDropoffDatetime  passengerCount  \\\n",
-       "1379860         2 2016-01-14 06:39:00 2016-01-14 06:44:55               1   \n",
-       "377548          2 2016-01-01 06:22:01 2016-01-01 06:27:14               5   \n",
-       "473976          2 2016-01-08 20:55:49 2016-01-08 21:05:50               6   \n",
-       "1246683         2 2016-01-15 08:27:41 2016-01-15 08:41:05               1   \n",
-       "1152261         2 2016-01-09 04:35:21 2016-01-09 04:41:02               1   \n",
-       "...           ...                 ...                 ...             ...   \n",
-       "998273          1 2016-12-24 22:03:25 2016-12-24 22:17:16               1   \n",
-       "857200          2 2016-12-03 20:33:53 2016-12-03 20:53:51               1   \n",
-       "607768          2 2016-12-18 16:17:54 2016-12-18 16:33:13               3   \n",
-       "78687           2 2016-12-06 09:24:43 2016-12-06 09:41:09               1   \n",
-       "141672          2 2016-12-14 16:12:34 2016-12-14 16:15:11               1   \n",
-       "\n",
-       "         tripDistance puLocationId doLocationId  pickupLongitude  \\\n",
-       "1379860          1.23         None         None       -73.911827   \n",
-       "377548           0.91         None         None       -73.962044   \n",
-       "473976           3.42         None         None       -73.904823   \n",
-       "1246683          3.99         None         None       -73.911484   \n",
-       "1152261          0.98         None         None       -73.921776   \n",
-       "...               ...          ...          ...              ...   \n",
-       "998273           5.30           74          235              NaN   \n",
-       "857200           4.81           83          258              NaN   \n",
-       "607768           2.02           95           56              NaN   \n",
-       "78687            9.51           66           11              NaN   \n",
-       "141672           0.51          255          256              NaN   \n",
-       "\n",
-       "         pickupLatitude  dropoffLongitude  ...  paymentType  fareAmount extra  \\\n",
-       "1379860       40.775372        -73.899635  ...            2         6.5   0.0   \n",
-       "377548        40.709797        -73.946716  ...            2         5.5   0.0   \n",
-       "473976        40.741776        -73.878815  ...            2        11.5   0.5   \n",
-       "1246683       40.854698        -73.881821  ...            2        15.0   0.0   \n",
-       "1152261       40.767071        -73.933136  ...            1         6.0   0.5   \n",
-       "...                 ...               ...  ...          ...         ...   ...   \n",
-       "998273              NaN               NaN  ...            2        16.5   0.5   \n",
-       "857200              NaN               NaN  ...            1        18.5   0.5   \n",
-       "607768              NaN               NaN  ...            2        11.5   0.0   \n",
-       "78687               NaN               NaN  ...            2        27.5   0.0   \n",
-       "141672              NaN               NaN  ...            1         4.0   1.0   \n",
-       "\n",
-       "         mtaTax  improvementSurcharge  tipAmount  tollsAmount ehailFee  \\\n",
-       "1379860     0.5                   0.3       0.00          0.0      NaN   \n",
-       "377548      0.5                   0.3       0.00          0.0      NaN   \n",
-       "473976      0.5                   0.3       0.00          0.0      NaN   \n",
-       "1246683     0.5                   0.3       0.00          0.0      NaN   \n",
-       "1152261     0.5                   0.3       0.70          0.0      NaN   \n",
-       "...         ...                   ...        ...          ...      ...   \n",
-       "998273      0.5                   0.3       0.00          0.0      NaN   \n",
-       "857200      0.5                   0.3       3.00          0.0      NaN   \n",
-       "607768      0.5                   0.3       0.00          0.0      NaN   \n",
-       "78687       0.5                   0.3       0.00          0.0      NaN   \n",
-       "141672      0.5                   0.3       1.45          0.0      NaN   \n",
-       "\n",
-       "         totalAmount  tripType  \n",
-       "1379860         7.30       1.0  \n",
-       "377548          6.30       1.0  \n",
-       "473976         12.80       1.0  \n",
-       "1246683        15.80       1.0  \n",
-       "1152261         8.00       1.0  \n",
-       "...              ...       ...  \n",
-       "998273         17.80       1.0  \n",
-       "857200         22.80       1.0  \n",
-       "607768         12.30       1.0  \n",
-       "78687          28.30       1.0  \n",
-       "141672          7.25       1.0  \n",
-       "\n",
-       "[24000 rows x 23 columns]"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
-    "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
-    "\n",
-    "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n",
-    "        .to_pandas_dataframe().sample(2000) for x in range(12)])\n",
-    "green_taxi_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sin and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. Use the apply() function on the dataframe to interatively apply this function to each row in the dataframe."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": "<frozen importlib._bootstrap>:219: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. Expected 56 from C header, got 64 from PyObject\n"
+        }
+      ],
+      "execution_count": 1,
+      "metadata": {
+        "gather": {
+          "logged": 1681193718753
+        }
+      }
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>vendorID</th>\n",
-       "      <th>lpepPickupDatetime</th>\n",
-       "      <th>lpepDropoffDatetime</th>\n",
-       "      <th>passengerCount</th>\n",
-       "      <th>tripDistance</th>\n",
-       "      <th>puLocationId</th>\n",
-       "      <th>doLocationId</th>\n",
-       "      <th>pickupLongitude</th>\n",
-       "      <th>pickupLatitude</th>\n",
-       "      <th>dropoffLongitude</th>\n",
-       "      <th>...</th>\n",
-       "      <th>tripType</th>\n",
-       "      <th>month_num</th>\n",
-       "      <th>day_of_month</th>\n",
-       "      <th>day_of_week</th>\n",
-       "      <th>hour_of_day</th>\n",
-       "      <th>country_code</th>\n",
-       "      <th>hr_sin</th>\n",
-       "      <th>hr_cos</th>\n",
-       "      <th>dy_sin</th>\n",
-       "      <th>dy_cos</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1379860</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-14 06:39:00</td>\n",
-       "      <td>2016-01-14 06:44:55</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.23</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.911827</td>\n",
-       "      <td>40.775372</td>\n",
-       "      <td>-73.899635</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>14</td>\n",
-       "      <td>3</td>\n",
-       "      <td>6</td>\n",
-       "      <td>US</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>6.123234e-17</td>\n",
-       "      <td>0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>377548</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 06:22:01</td>\n",
-       "      <td>2016-01-01 06:27:14</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0.91</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.962044</td>\n",
-       "      <td>40.709797</td>\n",
-       "      <td>-73.946716</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>6</td>\n",
-       "      <td>US</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>6.123234e-17</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>473976</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-08 20:55:49</td>\n",
-       "      <td>2016-01-08 21:05:50</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3.42</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.904823</td>\n",
-       "      <td>40.741776</td>\n",
-       "      <td>-73.878815</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>8</td>\n",
-       "      <td>4</td>\n",
-       "      <td>20</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>5.000000e-01</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1246683</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-15 08:27:41</td>\n",
-       "      <td>2016-01-15 08:41:05</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3.99</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.911484</td>\n",
-       "      <td>40.854698</td>\n",
-       "      <td>-73.881821</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>15</td>\n",
-       "      <td>4</td>\n",
-       "      <td>8</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.866025</td>\n",
-       "      <td>-5.000000e-01</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1152261</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-09 04:35:21</td>\n",
-       "      <td>2016-01-09 04:41:02</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.98</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-73.921776</td>\n",
-       "      <td>40.767071</td>\n",
-       "      <td>-73.933136</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>9</td>\n",
-       "      <td>5</td>\n",
-       "      <td>4</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.866025</td>\n",
-       "      <td>5.000000e-01</td>\n",
-       "      <td>-0.974928</td>\n",
-       "      <td>-0.222521</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>998273</th>\n",
-       "      <td>1</td>\n",
-       "      <td>2016-12-24 22:03:25</td>\n",
-       "      <td>2016-12-24 22:17:16</td>\n",
-       "      <td>1</td>\n",
-       "      <td>5.30</td>\n",
-       "      <td>74</td>\n",
-       "      <td>235</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>12</td>\n",
-       "      <td>24</td>\n",
-       "      <td>5</td>\n",
-       "      <td>22</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.500000</td>\n",
-       "      <td>8.660254e-01</td>\n",
-       "      <td>-0.974928</td>\n",
-       "      <td>-0.222521</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>857200</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-03 20:33:53</td>\n",
-       "      <td>2016-12-03 20:53:51</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4.81</td>\n",
-       "      <td>83</td>\n",
-       "      <td>258</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>12</td>\n",
-       "      <td>3</td>\n",
-       "      <td>5</td>\n",
-       "      <td>20</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>5.000000e-01</td>\n",
-       "      <td>-0.974928</td>\n",
-       "      <td>-0.222521</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>607768</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-18 16:17:54</td>\n",
-       "      <td>2016-12-18 16:33:13</td>\n",
-       "      <td>3</td>\n",
-       "      <td>2.02</td>\n",
-       "      <td>95</td>\n",
-       "      <td>56</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>12</td>\n",
-       "      <td>18</td>\n",
-       "      <td>6</td>\n",
-       "      <td>16</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>-5.000000e-01</td>\n",
-       "      <td>-0.781831</td>\n",
-       "      <td>0.623490</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>78687</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-06 09:24:43</td>\n",
-       "      <td>2016-12-06 09:41:09</td>\n",
-       "      <td>1</td>\n",
-       "      <td>9.51</td>\n",
-       "      <td>66</td>\n",
-       "      <td>11</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>12</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1</td>\n",
-       "      <td>9</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.707107</td>\n",
-       "      <td>-7.071068e-01</td>\n",
-       "      <td>0.781831</td>\n",
-       "      <td>0.623490</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>141672</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-14 16:12:34</td>\n",
-       "      <td>2016-12-14 16:15:11</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.51</td>\n",
-       "      <td>255</td>\n",
-       "      <td>256</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>12</td>\n",
-       "      <td>14</td>\n",
-       "      <td>2</td>\n",
-       "      <td>16</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>-5.000000e-01</td>\n",
-       "      <td>0.974928</td>\n",
-       "      <td>-0.222521</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>24000 rows × 32 columns</p>\n",
-       "</div>"
+      "cell_type": "markdown",
+      "source": [
+        "## Download Data\n",
+        "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n",
+        "\n",
+        "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern."
       ],
-      "text/plain": [
-       "         vendorID  lpepPickupDatetime lpepDropoffDatetime  passengerCount  \\\n",
-       "1379860         2 2016-01-14 06:39:00 2016-01-14 06:44:55               1   \n",
-       "377548          2 2016-01-01 06:22:01 2016-01-01 06:27:14               5   \n",
-       "473976          2 2016-01-08 20:55:49 2016-01-08 21:05:50               6   \n",
-       "1246683         2 2016-01-15 08:27:41 2016-01-15 08:41:05               1   \n",
-       "1152261         2 2016-01-09 04:35:21 2016-01-09 04:41:02               1   \n",
-       "...           ...                 ...                 ...             ...   \n",
-       "998273          1 2016-12-24 22:03:25 2016-12-24 22:17:16               1   \n",
-       "857200          2 2016-12-03 20:33:53 2016-12-03 20:53:51               1   \n",
-       "607768          2 2016-12-18 16:17:54 2016-12-18 16:33:13               3   \n",
-       "78687           2 2016-12-06 09:24:43 2016-12-06 09:41:09               1   \n",
-       "141672          2 2016-12-14 16:12:34 2016-12-14 16:15:11               1   \n",
-       "\n",
-       "         tripDistance puLocationId doLocationId  pickupLongitude  \\\n",
-       "1379860          1.23         None         None       -73.911827   \n",
-       "377548           0.91         None         None       -73.962044   \n",
-       "473976           3.42         None         None       -73.904823   \n",
-       "1246683          3.99         None         None       -73.911484   \n",
-       "1152261          0.98         None         None       -73.921776   \n",
-       "...               ...          ...          ...              ...   \n",
-       "998273           5.30           74          235              NaN   \n",
-       "857200           4.81           83          258              NaN   \n",
-       "607768           2.02           95           56              NaN   \n",
-       "78687            9.51           66           11              NaN   \n",
-       "141672           0.51          255          256              NaN   \n",
-       "\n",
-       "         pickupLatitude  dropoffLongitude  ...  tripType  month_num  \\\n",
-       "1379860       40.775372        -73.899635  ...       1.0          1   \n",
-       "377548        40.709797        -73.946716  ...       1.0          1   \n",
-       "473976        40.741776        -73.878815  ...       1.0          1   \n",
-       "1246683       40.854698        -73.881821  ...       1.0          1   \n",
-       "1152261       40.767071        -73.933136  ...       1.0          1   \n",
-       "...                 ...               ...  ...       ...        ...   \n",
-       "998273              NaN               NaN  ...       1.0         12   \n",
-       "857200              NaN               NaN  ...       1.0         12   \n",
-       "607768              NaN               NaN  ...       1.0         12   \n",
-       "78687               NaN               NaN  ...       1.0         12   \n",
-       "141672              NaN               NaN  ...       1.0         12   \n",
-       "\n",
-       "        day_of_month  day_of_week  hour_of_day  country_code    hr_sin  \\\n",
-       "1379860           14            3            6            US  1.000000   \n",
-       "377548             1            4            6            US  1.000000   \n",
-       "473976             8            4           20            US -0.866025   \n",
-       "1246683           15            4            8            US  0.866025   \n",
-       "1152261            9            5            4            US  0.866025   \n",
-       "...              ...          ...          ...           ...       ...   \n",
-       "998273            24            5           22            US -0.500000   \n",
-       "857200             3            5           20            US -0.866025   \n",
-       "607768            18            6           16            US -0.866025   \n",
-       "78687              6            1            9            US  0.707107   \n",
-       "141672            14            2           16            US -0.866025   \n",
-       "\n",
-       "               hr_cos    dy_sin    dy_cos  \n",
-       "1379860  6.123234e-17  0.433884 -0.900969  \n",
-       "377548   6.123234e-17 -0.433884 -0.900969  \n",
-       "473976   5.000000e-01 -0.433884 -0.900969  \n",
-       "1246683 -5.000000e-01 -0.433884 -0.900969  \n",
-       "1152261  5.000000e-01 -0.974928 -0.222521  \n",
-       "...               ...       ...       ...  \n",
-       "998273   8.660254e-01 -0.974928 -0.222521  \n",
-       "857200   5.000000e-01 -0.974928 -0.222521  \n",
-       "607768  -5.000000e-01 -0.781831  0.623490  \n",
-       "78687   -7.071068e-01  0.781831  0.623490  \n",
-       "141672  -5.000000e-01  0.974928 -0.222521  \n",
-       "\n",
-       "[24000 rows x 32 columns]"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "def build_time_features(vector):\n",
-    "    pickup_datetime = vector[0]\n",
-    "    month_num = pickup_datetime.month\n",
-    "    day_of_month = pickup_datetime.day\n",
-    "    day_of_week = pickup_datetime.weekday()\n",
-    "    hour_of_day = pickup_datetime.hour\n",
-    "    country_code = \"US\"\n",
-    "    hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n",
-    "    hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n",
-    "    dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n",
-    "    dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n",
-    "    \n",
-    "    return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n",
-    "\n",
-    "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n",
-    "green_taxi_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [
+      "metadata": {}
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>vendorID</th>\n",
-       "      <th>lpepPickupDatetime</th>\n",
-       "      <th>passengerCount</th>\n",
-       "      <th>tripDistance</th>\n",
-       "      <th>pickupLongitude</th>\n",
-       "      <th>pickupLatitude</th>\n",
-       "      <th>dropoffLongitude</th>\n",
-       "      <th>dropoffLatitude</th>\n",
-       "      <th>totalAmount</th>\n",
-       "      <th>month_num</th>\n",
-       "      <th>day_of_month</th>\n",
-       "      <th>day_of_week</th>\n",
-       "      <th>hour_of_day</th>\n",
-       "      <th>country_code</th>\n",
-       "      <th>hr_sin</th>\n",
-       "      <th>hr_cos</th>\n",
-       "      <th>dy_sin</th>\n",
-       "      <th>dy_cos</th>\n",
-       "      <th>datetime</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1379860</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-14 06:39:00</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.23</td>\n",
-       "      <td>-73.911827</td>\n",
-       "      <td>40.775372</td>\n",
-       "      <td>-73.899635</td>\n",
-       "      <td>40.768333</td>\n",
-       "      <td>7.3</td>\n",
-       "      <td>1</td>\n",
-       "      <td>14</td>\n",
-       "      <td>3</td>\n",
-       "      <td>6</td>\n",
-       "      <td>US</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>6.123234e-17</td>\n",
-       "      <td>0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-14</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>377548</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 06:22:01</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0.91</td>\n",
-       "      <td>-73.962044</td>\n",
-       "      <td>40.709797</td>\n",
-       "      <td>-73.946716</td>\n",
-       "      <td>40.706902</td>\n",
-       "      <td>6.3</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>6</td>\n",
-       "      <td>US</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>6.123234e-17</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>473976</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-08 20:55:49</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3.42</td>\n",
-       "      <td>-73.904823</td>\n",
-       "      <td>40.741776</td>\n",
-       "      <td>-73.878815</td>\n",
-       "      <td>40.717625</td>\n",
-       "      <td>12.8</td>\n",
-       "      <td>1</td>\n",
-       "      <td>8</td>\n",
-       "      <td>4</td>\n",
-       "      <td>20</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>5.000000e-01</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-08</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1246683</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-15 08:27:41</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3.99</td>\n",
-       "      <td>-73.911484</td>\n",
-       "      <td>40.854698</td>\n",
-       "      <td>-73.881821</td>\n",
-       "      <td>40.882130</td>\n",
-       "      <td>15.8</td>\n",
-       "      <td>1</td>\n",
-       "      <td>15</td>\n",
-       "      <td>4</td>\n",
-       "      <td>8</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.866025</td>\n",
-       "      <td>-5.000000e-01</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1152261</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-09 04:35:21</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.98</td>\n",
-       "      <td>-73.921776</td>\n",
-       "      <td>40.767071</td>\n",
-       "      <td>-73.933136</td>\n",
-       "      <td>40.774567</td>\n",
-       "      <td>8.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>9</td>\n",
-       "      <td>5</td>\n",
-       "      <td>4</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.866025</td>\n",
-       "      <td>5.000000e-01</td>\n",
-       "      <td>-0.974928</td>\n",
-       "      <td>-0.222521</td>\n",
-       "      <td>2016-01-09</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
+      "cell_type": "code",
+      "source": [
+        "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
+        "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
+        "\n",
+        "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n",
+        "        .to_pandas_dataframe().sample(2000) for x in range(12)])\n",
+        "green_taxi_df"
       ],
-      "text/plain": [
-       "         vendorID  lpepPickupDatetime  passengerCount  tripDistance  \\\n",
-       "1379860         2 2016-01-14 06:39:00               1          1.23   \n",
-       "377548          2 2016-01-01 06:22:01               5          0.91   \n",
-       "473976          2 2016-01-08 20:55:49               6          3.42   \n",
-       "1246683         2 2016-01-15 08:27:41               1          3.99   \n",
-       "1152261         2 2016-01-09 04:35:21               1          0.98   \n",
-       "\n",
-       "         pickupLongitude  pickupLatitude  dropoffLongitude  dropoffLatitude  \\\n",
-       "1379860       -73.911827       40.775372        -73.899635        40.768333   \n",
-       "377548        -73.962044       40.709797        -73.946716        40.706902   \n",
-       "473976        -73.904823       40.741776        -73.878815        40.717625   \n",
-       "1246683       -73.911484       40.854698        -73.881821        40.882130   \n",
-       "1152261       -73.921776       40.767071        -73.933136        40.774567   \n",
-       "\n",
-       "         totalAmount  month_num  day_of_month  day_of_week  hour_of_day  \\\n",
-       "1379860          7.3          1            14            3            6   \n",
-       "377548           6.3          1             1            4            6   \n",
-       "473976          12.8          1             8            4           20   \n",
-       "1246683         15.8          1            15            4            8   \n",
-       "1152261          8.0          1             9            5            4   \n",
-       "\n",
-       "        country_code    hr_sin        hr_cos    dy_sin    dy_cos   datetime  \n",
-       "1379860           US  1.000000  6.123234e-17  0.433884 -0.900969 2016-01-14  \n",
-       "377548            US  1.000000  6.123234e-17 -0.433884 -0.900969 2016-01-01  \n",
-       "473976            US -0.866025  5.000000e-01 -0.433884 -0.900969 2016-01-08  \n",
-       "1246683           US  0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-15  \n",
-       "1152261           US  0.866025  5.000000e-01 -0.974928 -0.222521 2016-01-09  "
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
-    "                     \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n",
-    "                     \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n",
-    "\n",
-    "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n",
-    "\n",
-    "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n",
-    "green_taxi_df.head(5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Enrich with Holiday Data\n",
-    "\n",
-    "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
-   "outputs": [
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "[Info] read from /tmp/tmpm9erjg7h/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=1/part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n[Info] read from /tmp/tmp9svrtdy0/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=2/part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n[Info] read from /tmp/tmp2h8fjxvw/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=3/part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n[Info] read from /tmp/tmpfvo7iz0i/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=4/part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n[Info] read from /tmp/tmpjlfw4v7s/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=5/part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n[Info] read from /tmp/tmpycf1ze5d/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=6/part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n[Info] read from /tmp/tmp5z2yg073/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=7/part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n[Info] read from /tmp/tmpu_jpgy_x/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=8/part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n[Info] read from /tmp/tmpvlj9g932/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=9/part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n[Info] read from 
/tmp/tmp21b4rgp5/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=10/part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n[Info] read from /tmp/tmpfomnswl0/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=11/part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n[Info] read from /tmp/tmp35xg_y0t/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=12/part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n"
+        },
+        {
+          "output_type": "execute_result",
+          "execution_count": 2,
+          "data": {
+            "text/plain": "         vendorID  lpepPickupDatetime lpepDropoffDatetime  passengerCount  \\\n1312085         2 2016-01-03 11:10:13 2016-01-03 11:14:13               1   \n109916          2 2016-01-19 08:11:09 2016-01-19 08:16:29               1   \n25029           2 2016-01-02 11:47:40 2016-01-02 11:52:29               1   \n629848          2 2016-01-17 18:31:30 2016-01-17 18:42:32               1   \n139651          2 2016-01-23 00:00:17 2016-01-23 00:05:10               1   \n...           ...                 ...                 ...             ...   \n44592           1 2016-12-05 08:14:48 2016-12-05 08:39:17               1   \n731527          2 2016-12-24 00:07:40 2016-12-24 00:10:19               1   \n501002          1 2016-12-18 05:47:22 2016-12-18 06:10:34               1   \n700564          2 2016-12-23 12:49:47 2016-12-23 13:00:52               1   \n646881          2 2016-12-22 00:01:44 2016-12-22 00:26:41               1   \n\n         tripDistance puLocationId doLocationId  pickupLongitude  \\\n1312085          0.83         None         None       -73.939774   \n109916           0.85         None         None       -73.925629   \n25029            0.81         None         None       -73.973312   \n629848           2.21         None         None       -73.928474   \n139651           0.60         None         None       -73.953415   \n...               ...          ...          ...              ...   \n44592            3.70           49           71              NaN   \n731527           0.47          255          255              NaN   \n501002           8.40          116           79              NaN   \n700564           2.63          166          236              NaN   \n646881           4.77           37           40              NaN   \n\n         pickupLatitude  dropoffLongitude  ...  paymentType  fareAmount extra  \\\n1312085       40.679844        -73.930649  ...            
2         5.0   0.0   \n109916        40.761787        -73.937866  ...            2         5.5   0.0   \n25029         40.689678        -73.984985  ...            1         5.0   0.0   \n629848        40.687298        -73.940605  ...            2         9.5   0.0   \n139651        40.706947        -73.948738  ...            2         5.0   0.5   \n...                 ...               ...  ...          ...         ...   ...   \n44592               NaN               NaN  ...            2        17.5   0.0   \n731527              NaN               NaN  ...            1         4.0   0.5   \n501002              NaN               NaN  ...            1        27.0   0.5   \n700564              NaN               NaN  ...            1        10.5   0.0   \n646881              NaN               NaN  ...            1        18.5   0.5   \n\n         mtaTax  improvementSurcharge  tipAmount  tollsAmount ehailFee  \\\n1312085     0.5                   0.3       0.00          0.0      NaN   \n109916      0.5                   0.3       0.00          0.0      NaN   \n25029       0.5                   0.3       1.16          0.0      NaN   \n629848      0.5                   0.3       0.00          0.0      NaN   \n139651      0.5                   0.3       0.00          0.0      NaN   \n...         ...                   ...        ...          ...      ...   \n44592       0.5                   0.3       0.00          0.0      NaN   \n731527      0.5                   0.3       1.06          0.0      NaN   \n501002      0.5                   0.3       5.65          0.0      NaN   \n700564      0.5                   0.3       2.00          0.0      NaN   \n646881      0.5                   0.3       2.97          0.0      NaN   \n\n         totalAmount  tripType  \n1312085         5.80       1.0  \n109916          6.30       1.0  \n25029           6.96       1.0  \n629848         10.30       1.0  \n139651          6.30       1.0  \n...              ...       ...  
\n44592          18.30       1.0  \n731527          6.36       1.0  \n501002         33.95       1.0  \n700564         13.30       1.0  \n646881         24.72       1.0  \n\n[24000 rows x 23 columns]",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>vendorID</th>\n      <th>lpepPickupDatetime</th>\n      <th>lpepDropoffDatetime</th>\n      <th>passengerCount</th>\n      <th>tripDistance</th>\n      <th>puLocationId</th>\n      <th>doLocationId</th>\n      <th>pickupLongitude</th>\n      <th>pickupLatitude</th>\n      <th>dropoffLongitude</th>\n      <th>...</th>\n      <th>paymentType</th>\n      <th>fareAmount</th>\n      <th>extra</th>\n      <th>mtaTax</th>\n      <th>improvementSurcharge</th>\n      <th>tipAmount</th>\n      <th>tollsAmount</th>\n      <th>ehailFee</th>\n      <th>totalAmount</th>\n      <th>tripType</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1312085</th>\n      <td>2</td>\n      <td>2016-01-03 11:10:13</td>\n      <td>2016-01-03 11:14:13</td>\n      <td>1</td>\n      <td>0.83</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.939774</td>\n      <td>40.679844</td>\n      <td>-73.930649</td>\n      <td>...</td>\n      <td>2</td>\n      <td>5.0</td>\n      <td>0.0</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>0.00</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>5.80</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>109916</th>\n      <td>2</td>\n      <td>2016-01-19 08:11:09</td>\n      <td>2016-01-19 08:16:29</td>\n      <td>1</td>\n      <td>0.85</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.925629</td>\n      <td>40.761787</td>\n      <td>-73.937866</td>\n      <td>...</td>\n      <td>2</td>\n      <td>5.5</td>\n      <td>0.0</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>0.00</td>\n      <td>0.0</td>\n      <td>NaN</td>\n   
   <td>6.30</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>25029</th>\n      <td>2</td>\n      <td>2016-01-02 11:47:40</td>\n      <td>2016-01-02 11:52:29</td>\n      <td>1</td>\n      <td>0.81</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.973312</td>\n      <td>40.689678</td>\n      <td>-73.984985</td>\n      <td>...</td>\n      <td>1</td>\n      <td>5.0</td>\n      <td>0.0</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>1.16</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>6.96</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>629848</th>\n      <td>2</td>\n      <td>2016-01-17 18:31:30</td>\n      <td>2016-01-17 18:42:32</td>\n      <td>1</td>\n      <td>2.21</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.928474</td>\n      <td>40.687298</td>\n      <td>-73.940605</td>\n      <td>...</td>\n      <td>2</td>\n      <td>9.5</td>\n      <td>0.0</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>0.00</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>10.30</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>139651</th>\n      <td>2</td>\n      <td>2016-01-23 00:00:17</td>\n      <td>2016-01-23 00:05:10</td>\n      <td>1</td>\n      <td>0.60</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.953415</td>\n      <td>40.706947</td>\n      <td>-73.948738</td>\n      <td>...</td>\n      <td>2</td>\n      <td>5.0</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>0.00</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>6.30</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      
<td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>44592</th>\n      <td>1</td>\n      <td>2016-12-05 08:14:48</td>\n      <td>2016-12-05 08:39:17</td>\n      <td>1</td>\n      <td>3.70</td>\n      <td>49</td>\n      <td>71</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>2</td>\n      <td>17.5</td>\n      <td>0.0</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>0.00</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>18.30</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>731527</th>\n      <td>2</td>\n      <td>2016-12-24 00:07:40</td>\n      <td>2016-12-24 00:10:19</td>\n      <td>1</td>\n      <td>0.47</td>\n      <td>255</td>\n      <td>255</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1</td>\n      <td>4.0</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>1.06</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>6.36</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>501002</th>\n      <td>1</td>\n      <td>2016-12-18 05:47:22</td>\n      <td>2016-12-18 06:10:34</td>\n      <td>1</td>\n      <td>8.40</td>\n      <td>116</td>\n      <td>79</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1</td>\n      <td>27.0</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>5.65</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>33.95</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>700564</th>\n      <td>2</td>\n      <td>2016-12-23 12:49:47</td>\n      <td>2016-12-23 13:00:52</td>\n      <td>1</td>\n      <td>2.63</td>\n      <td>166</td>\n      <td>236</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1</td>\n      <td>10.5</td>\n      <td>0.0</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>2.00</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      
<td>13.30</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>646881</th>\n      <td>2</td>\n      <td>2016-12-22 00:01:44</td>\n      <td>2016-12-22 00:26:41</td>\n      <td>1</td>\n      <td>4.77</td>\n      <td>37</td>\n      <td>40</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1</td>\n      <td>18.5</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.3</td>\n      <td>2.97</td>\n      <td>0.0</td>\n      <td>NaN</td>\n      <td>24.72</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>24000 rows × 23 columns</p>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 2,
+      "metadata": {
+        "gather": {
+          "logged": 1681193755843
+        }
+      }
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpya4i60qp\\https%3A\\%2Fazureopendatastorage.azurefd.net\\holidaydatacontainer\\Processed\\part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n"
-     ]
+      "cell_type": "markdown",
+      "source": [
+        "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sine and cosine transformations to capture the cyclical nature of the variable, which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. Use the apply() function on the dataframe to iteratively apply this function to each row in the dataframe."
+      ],
+      "metadata": {}
     },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>countryOrRegion</th>\n",
-       "      <th>holidayName</th>\n",
-       "      <th>normalizeHolidayName</th>\n",
-       "      <th>isPaidTimeOff</th>\n",
-       "      <th>countryRegionCode</th>\n",
-       "      <th>date</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>19375</th>\n",
-       "      <td>Argentina</td>\n",
-       "      <td>Año Nuevo [New Year's Day]</td>\n",
-       "      <td>Año Nuevo [New Year's Day]</td>\n",
-       "      <td>None</td>\n",
-       "      <td>AR</td>\n",
-       "      <td>2008-01-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19376</th>\n",
-       "      <td>Australia</td>\n",
-       "      <td>New Year's Day</td>\n",
-       "      <td>New Year's Day</td>\n",
-       "      <td>None</td>\n",
-       "      <td>AU</td>\n",
-       "      <td>2008-01-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19377</th>\n",
-       "      <td>Austria</td>\n",
-       "      <td>Neujahr</td>\n",
-       "      <td>Neujahr</td>\n",
-       "      <td>None</td>\n",
-       "      <td>AT</td>\n",
-       "      <td>2008-01-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19378</th>\n",
-       "      <td>Belarus</td>\n",
-       "      <td>Новый год</td>\n",
-       "      <td>Новый год</td>\n",
-       "      <td>None</td>\n",
-       "      <td>BY</td>\n",
-       "      <td>2008-01-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19379</th>\n",
-       "      <td>Belgium</td>\n",
-       "      <td>Nieuwjaarsdag</td>\n",
-       "      <td>Nieuwjaarsdag</td>\n",
-       "      <td>None</td>\n",
-       "      <td>BE</td>\n",
-       "      <td>2008-01-01</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
+      "cell_type": "code",
+      "source": [
+        "def build_time_features(vector):\n",
+        "    pickup_datetime = vector[0]\n",
+        "    month_num = pickup_datetime.month\n",
+        "    day_of_month = pickup_datetime.day\n",
+        "    day_of_week = pickup_datetime.weekday()\n",
+        "    hour_of_day = pickup_datetime.hour\n",
+        "    country_code = \"US\"\n",
+        "    hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n",
+        "    hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n",
+        "    dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n",
+        "    dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n",
+        "    \n",
+        "    return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n",
+        "\n",
+        "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n",
+        "green_taxi_df"
       ],
-      "text/plain": [
-       "      countryOrRegion                 holidayName        normalizeHolidayName  \\\n",
-       "19375       Argentina  Año Nuevo [New Year's Day]  Año Nuevo [New Year's Day]   \n",
-       "19376       Australia              New Year's Day              New Year's Day   \n",
-       "19377         Austria                     Neujahr                     Neujahr   \n",
-       "19378         Belarus                   Новый год                   Новый год   \n",
-       "19379         Belgium               Nieuwjaarsdag               Nieuwjaarsdag   \n",
-       "\n",
-       "      isPaidTimeOff countryRegionCode       date  \n",
-       "19375          None                AR 2008-01-01  \n",
-       "19376          None                AU 2008-01-01  \n",
-       "19377          None                AT 2008-01-01  \n",
-       "19378          None                BY 2008-01-01  \n",
-       "19379          None                BE 2008-01-01  "
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from azureml.opendatasets import PublicHolidays\n",
-    "\n",
-    "# call default constructor to download full dataset\n",
-    "holidays_df = PublicHolidays().to_pandas_dataframe()\n",
-    "holidays_df.head(5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\\\"US\\\"`. Preview the data to verify that they were merged correctly."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 3,
+          "data": {
+            "text/plain": "         vendorID  lpepPickupDatetime lpepDropoffDatetime  passengerCount  \\\n1312085         2 2016-01-03 11:10:13 2016-01-03 11:14:13               1   \n109916          2 2016-01-19 08:11:09 2016-01-19 08:16:29               1   \n25029           2 2016-01-02 11:47:40 2016-01-02 11:52:29               1   \n629848          2 2016-01-17 18:31:30 2016-01-17 18:42:32               1   \n139651          2 2016-01-23 00:00:17 2016-01-23 00:05:10               1   \n...           ...                 ...                 ...             ...   \n44592           1 2016-12-05 08:14:48 2016-12-05 08:39:17               1   \n731527          2 2016-12-24 00:07:40 2016-12-24 00:10:19               1   \n501002          1 2016-12-18 05:47:22 2016-12-18 06:10:34               1   \n700564          2 2016-12-23 12:49:47 2016-12-23 13:00:52               1   \n646881          2 2016-12-22 00:01:44 2016-12-22 00:26:41               1   \n\n         tripDistance puLocationId doLocationId  pickupLongitude  \\\n1312085          0.83         None         None       -73.939774   \n109916           0.85         None         None       -73.925629   \n25029            0.81         None         None       -73.973312   \n629848           2.21         None         None       -73.928474   \n139651           0.60         None         None       -73.953415   \n...               ...          ...          ...              ...   \n44592            3.70           49           71              NaN   \n731527           0.47          255          255              NaN   \n501002           8.40          116           79              NaN   \n700564           2.63          166          236              NaN   \n646881           4.77           37           40              NaN   \n\n         pickupLatitude  dropoffLongitude  ...  tripType  month_num  \\\n1312085       40.679844        -73.930649  ...       1.0          1   \n109916        40.761787        -73.937866  ...       
1.0          1   \n25029         40.689678        -73.984985  ...       1.0          1   \n629848        40.687298        -73.940605  ...       1.0          1   \n139651        40.706947        -73.948738  ...       1.0          1   \n...                 ...               ...  ...       ...        ...   \n44592               NaN               NaN  ...       1.0         12   \n731527              NaN               NaN  ...       1.0         12   \n501002              NaN               NaN  ...       1.0         12   \n700564              NaN               NaN  ...       1.0         12   \n646881              NaN               NaN  ...       1.0         12   \n\n        day_of_month  day_of_week  hour_of_day  country_code        hr_sin  \\\n1312085            3            6           11            US  2.588190e-01   \n109916            19            1            8            US  8.660254e-01   \n25029              2            5           11            US  2.588190e-01   \n629848            17            6           18            US -1.000000e+00   \n139651            23            5            0            US  0.000000e+00   \n...              ...          ...          ...           ...           ...   \n44592              5            0            8            US  8.660254e-01   \n731527            24            5            0            US  0.000000e+00   \n501002            18            6            5            US  9.659258e-01   \n700564            23            4           12            US  1.224647e-16   \n646881            22            3            0            US  0.000000e+00   \n\n               hr_cos    dy_sin    dy_cos  \n1312085 -9.659258e-01 -0.781831  0.623490  \n109916  -5.000000e-01  0.781831  0.623490  \n25029   -9.659258e-01 -0.974928 -0.222521  \n629848  -1.836970e-16 -0.781831  0.623490  \n139651   1.000000e+00 -0.974928 -0.222521  \n...               ...       ...       ...  
\n44592   -5.000000e-01  0.000000  1.000000  \n731527   1.000000e+00 -0.974928 -0.222521  \n501002   2.588190e-01 -0.781831  0.623490  \n700564  -1.000000e+00 -0.433884 -0.900969  \n646881   1.000000e+00  0.433884 -0.900969  \n\n[24000 rows x 32 columns]",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>vendorID</th>\n      <th>lpepPickupDatetime</th>\n      <th>lpepDropoffDatetime</th>\n      <th>passengerCount</th>\n      <th>tripDistance</th>\n      <th>puLocationId</th>\n      <th>doLocationId</th>\n      <th>pickupLongitude</th>\n      <th>pickupLatitude</th>\n      <th>dropoffLongitude</th>\n      <th>...</th>\n      <th>tripType</th>\n      <th>month_num</th>\n      <th>day_of_month</th>\n      <th>day_of_week</th>\n      <th>hour_of_day</th>\n      <th>country_code</th>\n      <th>hr_sin</th>\n      <th>hr_cos</th>\n      <th>dy_sin</th>\n      <th>dy_cos</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1312085</th>\n      <td>2</td>\n      <td>2016-01-03 11:10:13</td>\n      <td>2016-01-03 11:14:13</td>\n      <td>1</td>\n      <td>0.83</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.939774</td>\n      <td>40.679844</td>\n      <td>-73.930649</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>1</td>\n      <td>3</td>\n      <td>6</td>\n      <td>11</td>\n      <td>US</td>\n      <td>2.588190e-01</td>\n      <td>-9.659258e-01</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n    </tr>\n    <tr>\n      <th>109916</th>\n      <td>2</td>\n      <td>2016-01-19 08:11:09</td>\n      <td>2016-01-19 08:16:29</td>\n      <td>1</td>\n      <td>0.85</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.925629</td>\n      <td>40.761787</td>\n      <td>-73.937866</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>1</td>\n      <td>19</td>\n      <td>1</td>\n      <td>8</td>\n      <td>US</td>\n      <td>8.660254e-01</td>\n      
<td>-5.000000e-01</td>\n      <td>0.781831</td>\n      <td>0.623490</td>\n    </tr>\n    <tr>\n      <th>25029</th>\n      <td>2</td>\n      <td>2016-01-02 11:47:40</td>\n      <td>2016-01-02 11:52:29</td>\n      <td>1</td>\n      <td>0.81</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.973312</td>\n      <td>40.689678</td>\n      <td>-73.984985</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>1</td>\n      <td>2</td>\n      <td>5</td>\n      <td>11</td>\n      <td>US</td>\n      <td>2.588190e-01</td>\n      <td>-9.659258e-01</td>\n      <td>-0.974928</td>\n      <td>-0.222521</td>\n    </tr>\n    <tr>\n      <th>629848</th>\n      <td>2</td>\n      <td>2016-01-17 18:31:30</td>\n      <td>2016-01-17 18:42:32</td>\n      <td>1</td>\n      <td>2.21</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.928474</td>\n      <td>40.687298</td>\n      <td>-73.940605</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>1</td>\n      <td>17</td>\n      <td>6</td>\n      <td>18</td>\n      <td>US</td>\n      <td>-1.000000e+00</td>\n      <td>-1.836970e-16</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n    </tr>\n    <tr>\n      <th>139651</th>\n      <td>2</td>\n      <td>2016-01-23 00:00:17</td>\n      <td>2016-01-23 00:05:10</td>\n      <td>1</td>\n      <td>0.60</td>\n      <td>None</td>\n      <td>None</td>\n      <td>-73.953415</td>\n      <td>40.706947</td>\n      <td>-73.948738</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>1</td>\n      <td>23</td>\n      <td>5</td>\n      <td>0</td>\n      <td>US</td>\n      <td>0.000000e+00</td>\n      <td>1.000000e+00</td>\n      <td>-0.974928</td>\n      <td>-0.222521</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      
<td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>44592</th>\n      <td>1</td>\n      <td>2016-12-05 08:14:48</td>\n      <td>2016-12-05 08:39:17</td>\n      <td>1</td>\n      <td>3.70</td>\n      <td>49</td>\n      <td>71</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>12</td>\n      <td>5</td>\n      <td>0</td>\n      <td>8</td>\n      <td>US</td>\n      <td>8.660254e-01</td>\n      <td>-5.000000e-01</td>\n      <td>0.000000</td>\n      <td>1.000000</td>\n    </tr>\n    <tr>\n      <th>731527</th>\n      <td>2</td>\n      <td>2016-12-24 00:07:40</td>\n      <td>2016-12-24 00:10:19</td>\n      <td>1</td>\n      <td>0.47</td>\n      <td>255</td>\n      <td>255</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>12</td>\n      <td>24</td>\n      <td>5</td>\n      <td>0</td>\n      <td>US</td>\n      <td>0.000000e+00</td>\n      <td>1.000000e+00</td>\n      <td>-0.974928</td>\n      <td>-0.222521</td>\n    </tr>\n    <tr>\n      <th>501002</th>\n      <td>1</td>\n      <td>2016-12-18 05:47:22</td>\n      <td>2016-12-18 06:10:34</td>\n      <td>1</td>\n      <td>8.40</td>\n      <td>116</td>\n      <td>79</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>12</td>\n      <td>18</td>\n      <td>6</td>\n      <td>5</td>\n      <td>US</td>\n      <td>9.659258e-01</td>\n      <td>2.588190e-01</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n    </tr>\n    <tr>\n      <th>700564</th>\n      <td>2</td>\n      <td>2016-12-23 12:49:47</td>\n      <td>2016-12-23 13:00:52</td>\n      <td>1</td>\n      <td>2.63</td>\n      <td>166</td>\n      <td>236</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      
<td>1.0</td>\n      <td>12</td>\n      <td>23</td>\n      <td>4</td>\n      <td>12</td>\n      <td>US</td>\n      <td>1.224647e-16</td>\n      <td>-1.000000e+00</td>\n      <td>-0.433884</td>\n      <td>-0.900969</td>\n    </tr>\n    <tr>\n      <th>646881</th>\n      <td>2</td>\n      <td>2016-12-22 00:01:44</td>\n      <td>2016-12-22 00:26:41</td>\n      <td>1</td>\n      <td>4.77</td>\n      <td>37</td>\n      <td>40</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>...</td>\n      <td>1.0</td>\n      <td>12</td>\n      <td>22</td>\n      <td>3</td>\n      <td>0</td>\n      <td>US</td>\n      <td>0.000000e+00</td>\n      <td>1.000000e+00</td>\n      <td>0.433884</td>\n      <td>-0.900969</td>\n    </tr>\n  </tbody>\n</table>\n<p>24000 rows × 32 columns</p>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 3,
+      "metadata": {
+        "gather": {
+          "logged": 1681193760347
+        }
+      }
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>vendorID</th>\n",
-       "      <th>lpepPickupDatetime</th>\n",
-       "      <th>passengerCount</th>\n",
-       "      <th>tripDistance</th>\n",
-       "      <th>pickupLongitude</th>\n",
-       "      <th>pickupLatitude</th>\n",
-       "      <th>dropoffLongitude</th>\n",
-       "      <th>dropoffLatitude</th>\n",
-       "      <th>totalAmount</th>\n",
-       "      <th>month_num</th>\n",
-       "      <th>...</th>\n",
-       "      <th>day_of_week</th>\n",
-       "      <th>hour_of_day</th>\n",
-       "      <th>country_code</th>\n",
-       "      <th>hr_sin</th>\n",
-       "      <th>hr_cos</th>\n",
-       "      <th>dy_sin</th>\n",
-       "      <th>dy_cos</th>\n",
-       "      <th>datetime</th>\n",
-       "      <th>normalizeHolidayName</th>\n",
-       "      <th>isPaidTimeOff</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 06:22:01</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0.91</td>\n",
-       "      <td>-73.962044</td>\n",
-       "      <td>40.709797</td>\n",
-       "      <td>-73.946716</td>\n",
-       "      <td>40.706902</td>\n",
-       "      <td>6.30</td>\n",
-       "      <td>1</td>\n",
-       "      <td>...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>6</td>\n",
-       "      <td>US</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>6.123234e-17</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-01</td>\n",
-       "      <td>New Year's Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 06:14:43</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2.44</td>\n",
-       "      <td>-73.993576</td>\n",
-       "      <td>40.681519</td>\n",
-       "      <td>-73.999596</td>\n",
-       "      <td>40.655930</td>\n",
-       "      <td>10.30</td>\n",
-       "      <td>1</td>\n",
-       "      <td>...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>6</td>\n",
-       "      <td>US</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>6.123234e-17</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-01</td>\n",
-       "      <td>New Year's Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>27</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 16:06:33</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4.57</td>\n",
-       "      <td>-73.962509</td>\n",
-       "      <td>40.687862</td>\n",
-       "      <td>-73.981361</td>\n",
-       "      <td>40.732758</td>\n",
-       "      <td>22.25</td>\n",
-       "      <td>1</td>\n",
-       "      <td>...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>16</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>-5.000000e-01</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-01</td>\n",
-       "      <td>New Year's Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>44</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-18 11:46:27</td>\n",
-       "      <td>1</td>\n",
-       "      <td>16.10</td>\n",
-       "      <td>-73.925522</td>\n",
-       "      <td>40.827877</td>\n",
-       "      <td>-73.934982</td>\n",
-       "      <td>40.681278</td>\n",
-       "      <td>50.30</td>\n",
-       "      <td>1</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>11</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.258819</td>\n",
-       "      <td>-9.659258e-01</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>2016-01-18</td>\n",
-       "      <td>Martin Luther King Jr. Day</td>\n",
-       "      <td>None</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>45</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-01-01 10:41:39</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3.33</td>\n",
-       "      <td>-73.962891</td>\n",
-       "      <td>40.711971</td>\n",
-       "      <td>-73.918060</td>\n",
-       "      <td>40.736832</td>\n",
-       "      <td>12.80</td>\n",
-       "      <td>1</td>\n",
-       "      <td>...</td>\n",
-       "      <td>4</td>\n",
-       "      <td>10</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.500000</td>\n",
-       "      <td>-8.660254e-01</td>\n",
-       "      <td>-0.433884</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>2016-01-01</td>\n",
-       "      <td>New Year's Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23868</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-25 00:21:23</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2.36</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>12.30</td>\n",
-       "      <td>12</td>\n",
-       "      <td>...</td>\n",
-       "      <td>6</td>\n",
-       "      <td>0</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>1.000000e+00</td>\n",
-       "      <td>-0.781831</td>\n",
-       "      <td>0.623490</td>\n",
-       "      <td>2016-12-25</td>\n",
-       "      <td>Christmas Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23892</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-25 14:05:48</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.05</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>12.30</td>\n",
-       "      <td>12</td>\n",
-       "      <td>...</td>\n",
-       "      <td>6</td>\n",
-       "      <td>14</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.500000</td>\n",
-       "      <td>-8.660254e-01</td>\n",
-       "      <td>-0.781831</td>\n",
-       "      <td>0.623490</td>\n",
-       "      <td>2016-12-25</td>\n",
-       "      <td>Christmas Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23942</th>\n",
-       "      <td>1</td>\n",
-       "      <td>2016-12-26 01:43:57</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.80</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7.55</td>\n",
-       "      <td>12</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.258819</td>\n",
-       "      <td>9.659258e-01</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>2016-12-26</td>\n",
-       "      <td>Christmas Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23978</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-26 03:38:33</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.55</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>8.30</td>\n",
-       "      <td>12</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>US</td>\n",
-       "      <td>0.707107</td>\n",
-       "      <td>7.071068e-01</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>2016-12-26</td>\n",
-       "      <td>Christmas Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23985</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016-12-26 22:12:18</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3.77</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>16.25</td>\n",
-       "      <td>12</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>22</td>\n",
-       "      <td>US</td>\n",
-       "      <td>-0.500000</td>\n",
-       "      <td>8.660254e-01</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>2016-12-26</td>\n",
-       "      <td>Christmas Day</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>673 rows × 21 columns</p>\n",
-       "</div>"
+      "cell_type": "markdown",
+      "source": [
+        "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity."
       ],
-      "text/plain": [
-       "       vendorID  lpepPickupDatetime  passengerCount  tripDistance  \\\n",
-       "1             2 2016-01-01 06:22:01               5          0.91   \n",
-       "25            2 2016-01-01 06:14:43               1          2.44   \n",
-       "27            2 2016-01-01 16:06:33               1          4.57   \n",
-       "44            2 2016-01-18 11:46:27               1         16.10   \n",
-       "45            2 2016-01-01 10:41:39               1          3.33   \n",
-       "...         ...                 ...             ...           ...   \n",
-       "23868         2 2016-12-25 00:21:23               1          2.36   \n",
-       "23892         2 2016-12-25 14:05:48               1          1.05   \n",
-       "23942         1 2016-12-26 01:43:57               1          0.80   \n",
-       "23978         2 2016-12-26 03:38:33               1          1.55   \n",
-       "23985         2 2016-12-26 22:12:18               1          3.77   \n",
-       "\n",
-       "       pickupLongitude  pickupLatitude  dropoffLongitude  dropoffLatitude  \\\n",
-       "1           -73.962044       40.709797        -73.946716        40.706902   \n",
-       "25          -73.993576       40.681519        -73.999596        40.655930   \n",
-       "27          -73.962509       40.687862        -73.981361        40.732758   \n",
-       "44          -73.925522       40.827877        -73.934982        40.681278   \n",
-       "45          -73.962891       40.711971        -73.918060        40.736832   \n",
-       "...                ...             ...               ...              ...   \n",
-       "23868              NaN             NaN               NaN              NaN   \n",
-       "23892              NaN             NaN               NaN              NaN   \n",
-       "23942              NaN             NaN               NaN              NaN   \n",
-       "23978              NaN             NaN               NaN              NaN   \n",
-       "23985              NaN             NaN               NaN              NaN   \n",
-       "\n",
-       "       totalAmount  month_num  ...  day_of_week  hour_of_day  country_code  \\\n",
-       "1             6.30          1  ...            4            6            US   \n",
-       "25           10.30          1  ...            4            6            US   \n",
-       "27           22.25          1  ...            4           16            US   \n",
-       "44           50.30          1  ...            0           11            US   \n",
-       "45           12.80          1  ...            4           10            US   \n",
-       "...            ...        ...  ...          ...          ...           ...   \n",
-       "23868        12.30         12  ...            6            0            US   \n",
-       "23892        12.30         12  ...            6           14            US   \n",
-       "23942         7.55         12  ...            0            1            US   \n",
-       "23978         8.30         12  ...            0            3            US   \n",
-       "23985        16.25         12  ...            0           22            US   \n",
-       "\n",
-       "         hr_sin        hr_cos    dy_sin    dy_cos   datetime  \\\n",
-       "1      1.000000  6.123234e-17 -0.433884 -0.900969 2016-01-01   \n",
-       "25     1.000000  6.123234e-17 -0.433884 -0.900969 2016-01-01   \n",
-       "27    -0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-01   \n",
-       "44     0.258819 -9.659258e-01  0.000000  1.000000 2016-01-18   \n",
-       "45     0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01   \n",
-       "...         ...           ...       ...       ...        ...   \n",
-       "23868  0.000000  1.000000e+00 -0.781831  0.623490 2016-12-25   \n",
-       "23892 -0.500000 -8.660254e-01 -0.781831  0.623490 2016-12-25   \n",
-       "23942  0.258819  9.659258e-01  0.000000  1.000000 2016-12-26   \n",
-       "23978  0.707107  7.071068e-01  0.000000  1.000000 2016-12-26   \n",
-       "23985 -0.500000  8.660254e-01  0.000000  1.000000 2016-12-26   \n",
-       "\n",
-       "             normalizeHolidayName isPaidTimeOff  \n",
-       "1                  New Year's Day          True  \n",
-       "25                 New Year's Day          True  \n",
-       "27                 New Year's Day          True  \n",
-       "44     Martin Luther King Jr. Day          None  \n",
-       "45                 New Year's Day          True  \n",
-       "...                           ...           ...  \n",
-       "23868               Christmas Day          True  \n",
-       "23892               Christmas Day          True  \n",
-       "23942               Christmas Day          True  \n",
-       "23978               Christmas Day          True  \n",
-       "23985               Christmas Day          True  \n",
-       "\n",
-       "[673 rows x 21 columns]"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n",
-    "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n",
-    "\n",
-    "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n",
-    "\n",
-    "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n",
-    "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Enrich with weather data\n",
-    "\n",
-    "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
-   "outputs": [
+      "metadata": {}
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n",
-      "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n"
-     ]
-    }
-   ],
-   "source": [
-    "from azureml.opendatasets import NoaaIsdWeather\n",
-    "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
-    "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
-    "\n",
-    "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n",
-    "        .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "source": [
+        "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
+        "                     \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n",
+        "                     \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n",
+        "\n",
+        "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n",
+        "\n",
+        "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n",
+        "green_taxi_df.head(5)"
+      ],
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 4,
+          "data": {
+            "text/plain": "         vendorID  lpepPickupDatetime  passengerCount  tripDistance  \\\n1312085         2 2016-01-03 11:10:13               1          0.83   \n109916          2 2016-01-19 08:11:09               1          0.85   \n25029           2 2016-01-02 11:47:40               1          0.81   \n629848          2 2016-01-17 18:31:30               1          2.21   \n139651          2 2016-01-23 00:00:17               1          0.60   \n\n         pickupLongitude  pickupLatitude  dropoffLongitude  dropoffLatitude  \\\n1312085       -73.939774       40.679844        -73.930649        40.674252   \n109916        -73.925629       40.761787        -73.937866        40.766113   \n25029         -73.973312       40.689678        -73.984985        40.688690   \n629848        -73.928474       40.687298        -73.940605        40.674679   \n139651        -73.953415       40.706947        -73.948738        40.711098   \n\n         totalAmount  month_num  day_of_month  day_of_week  hour_of_day  \\\n1312085         5.80          1             3            6           11   \n109916          6.30          1            19            1            8   \n25029           6.96          1             2            5           11   \n629848         10.30          1            17            6           18   \n139651          6.30          1            23            5            0   \n\n        country_code    hr_sin        hr_cos    dy_sin    dy_cos   datetime  \n1312085           US  0.258819 -9.659258e-01 -0.781831  0.623490 2016-01-03  \n109916            US  0.866025 -5.000000e-01  0.781831  0.623490 2016-01-19  \n25029             US  0.258819 -9.659258e-01 -0.974928 -0.222521 2016-01-02  \n629848            US -1.000000 -1.836970e-16 -0.781831  0.623490 2016-01-17  \n139651            US  0.000000  1.000000e+00 -0.974928 -0.222521 2016-01-23  ",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>vendorID</th>\n      <th>lpepPickupDatetime</th>\n      <th>passengerCount</th>\n      <th>tripDistance</th>\n      <th>pickupLongitude</th>\n      <th>pickupLatitude</th>\n      <th>dropoffLongitude</th>\n      <th>dropoffLatitude</th>\n      <th>totalAmount</th>\n      <th>month_num</th>\n      <th>day_of_month</th>\n      <th>day_of_week</th>\n      <th>hour_of_day</th>\n      <th>country_code</th>\n      <th>hr_sin</th>\n      <th>hr_cos</th>\n      <th>dy_sin</th>\n      <th>dy_cos</th>\n      <th>datetime</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1312085</th>\n      <td>2</td>\n      <td>2016-01-03 11:10:13</td>\n      <td>1</td>\n      <td>0.83</td>\n      <td>-73.939774</td>\n      <td>40.679844</td>\n      <td>-73.930649</td>\n      <td>40.674252</td>\n      <td>5.80</td>\n      <td>1</td>\n      <td>3</td>\n      <td>6</td>\n      <td>11</td>\n      <td>US</td>\n      <td>0.258819</td>\n      <td>-9.659258e-01</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n      <td>2016-01-03</td>\n    </tr>\n    <tr>\n      <th>109916</th>\n      <td>2</td>\n      <td>2016-01-19 08:11:09</td>\n      <td>1</td>\n      <td>0.85</td>\n      <td>-73.925629</td>\n      <td>40.761787</td>\n      <td>-73.937866</td>\n      <td>40.766113</td>\n      <td>6.30</td>\n      <td>1</td>\n      <td>19</td>\n      <td>1</td>\n      <td>8</td>\n      <td>US</td>\n      <td>0.866025</td>\n      <td>-5.000000e-01</td>\n      <td>0.781831</td>\n      <td>0.623490</td>\n      <td>2016-01-19</td>\n    </tr>\n    <tr>\n      <th>25029</th>\n      <td>2</td>\n      
<td>2016-01-02 11:47:40</td>\n      <td>1</td>\n      <td>0.81</td>\n      <td>-73.973312</td>\n      <td>40.689678</td>\n      <td>-73.984985</td>\n      <td>40.688690</td>\n      <td>6.96</td>\n      <td>1</td>\n      <td>2</td>\n      <td>5</td>\n      <td>11</td>\n      <td>US</td>\n      <td>0.258819</td>\n      <td>-9.659258e-01</td>\n      <td>-0.974928</td>\n      <td>-0.222521</td>\n      <td>2016-01-02</td>\n    </tr>\n    <tr>\n      <th>629848</th>\n      <td>2</td>\n      <td>2016-01-17 18:31:30</td>\n      <td>1</td>\n      <td>2.21</td>\n      <td>-73.928474</td>\n      <td>40.687298</td>\n      <td>-73.940605</td>\n      <td>40.674679</td>\n      <td>10.30</td>\n      <td>1</td>\n      <td>17</td>\n      <td>6</td>\n      <td>18</td>\n      <td>US</td>\n      <td>-1.000000</td>\n      <td>-1.836970e-16</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n      <td>2016-01-17</td>\n    </tr>\n    <tr>\n      <th>139651</th>\n      <td>2</td>\n      <td>2016-01-23 00:00:17</td>\n      <td>1</td>\n      <td>0.60</td>\n      <td>-73.953415</td>\n      <td>40.706947</td>\n      <td>-73.948738</td>\n      <td>40.711098</td>\n      <td>6.30</td>\n      <td>1</td>\n      <td>23</td>\n      <td>5</td>\n      <td>0</td>\n      <td>US</td>\n      <td>0.000000</td>\n      <td>1.000000e+00</td>\n      <td>-0.974928</td>\n      <td>-0.222521</td>\n      <td>2016-01-23</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 4,
+      "metadata": {
+        "gather": {
+          "logged": 1681193760703
+        }
+      }
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>wban</th>\n",
-       "      <th>latitude</th>\n",
-       "      <th>temperature</th>\n",
-       "      <th>usaf</th>\n",
-       "      <th>datetime</th>\n",
-       "      <th>longitude</th>\n",
-       "      <th>precipDepth</th>\n",
-       "      <th>precipTime</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>204647</th>\n",
-       "      <td>14732</td>\n",
-       "      <td>40.783</td>\n",
-       "      <td>2.8</td>\n",
-       "      <td>725030</td>\n",
-       "      <td>2016-01-02 03:00:00</td>\n",
-       "      <td>-73.867</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>204670</th>\n",
-       "      <td>14732</td>\n",
-       "      <td>40.779</td>\n",
-       "      <td>-4.4</td>\n",
-       "      <td>725030</td>\n",
-       "      <td>2016-01-22 13:51:00</td>\n",
-       "      <td>-73.880</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>204694</th>\n",
-       "      <td>14732</td>\n",
-       "      <td>40.779</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>725030</td>\n",
-       "      <td>2016-01-08 02:51:00</td>\n",
-       "      <td>-73.880</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>204701</th>\n",
-       "      <td>14732</td>\n",
-       "      <td>40.779</td>\n",
-       "      <td>-1.1</td>\n",
-       "      <td>725030</td>\n",
-       "      <td>2016-01-04 15:51:00</td>\n",
-       "      <td>-73.880</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>204715</th>\n",
-       "      <td>14732</td>\n",
-       "      <td>40.779</td>\n",
-       "      <td>4.4</td>\n",
-       "      <td>725030</td>\n",
-       "      <td>2016-01-01 21:51:00</td>\n",
-       "      <td>-73.880</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1248471</th>\n",
-       "      <td>94728</td>\n",
-       "      <td>40.789</td>\n",
-       "      <td>4.4</td>\n",
-       "      <td>725053</td>\n",
-       "      <td>2016-12-23 13:51:00</td>\n",
-       "      <td>-73.967</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1248555</th>\n",
-       "      <td>94728</td>\n",
-       "      <td>40.789</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>725053</td>\n",
-       "      <td>2016-12-12 13:51:00</td>\n",
-       "      <td>-73.967</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1248580</th>\n",
-       "      <td>94728</td>\n",
-       "      <td>40.789</td>\n",
-       "      <td>3.9</td>\n",
-       "      <td>725053</td>\n",
-       "      <td>2016-12-18 07:01:00</td>\n",
-       "      <td>-73.967</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1248597</th>\n",
-       "      <td>94728</td>\n",
-       "      <td>40.789</td>\n",
-       "      <td>7.8</td>\n",
-       "      <td>725053</td>\n",
-       "      <td>2016-12-25 00:51:00</td>\n",
-       "      <td>-73.967</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1248600</th>\n",
-       "      <td>94728</td>\n",
-       "      <td>40.789</td>\n",
-       "      <td>-2.8</td>\n",
-       "      <td>725053</td>\n",
-       "      <td>2016-12-17 11:10:00</td>\n",
-       "      <td>-73.967</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>1.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>55683 rows × 8 columns</p>\n",
-       "</div>"
+      "cell_type": "markdown",
+      "source": [
+        "### Enrich with Holiday Data\n",
+        "\n",
+        "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format."
       ],
-      "text/plain": [
-       "          wban  latitude  temperature    usaf            datetime  longitude  \\\n",
-       "204647   14732    40.783          2.8  725030 2016-01-02 03:00:00    -73.867   \n",
-       "204670   14732    40.779         -4.4  725030 2016-01-22 13:51:00    -73.880   \n",
-       "204694   14732    40.779          5.0  725030 2016-01-08 02:51:00    -73.880   \n",
-       "204701   14732    40.779         -1.1  725030 2016-01-04 15:51:00    -73.880   \n",
-       "204715   14732    40.779          4.4  725030 2016-01-01 21:51:00    -73.880   \n",
-       "...        ...       ...          ...     ...                 ...        ...   \n",
-       "1248471  94728    40.789          4.4  725053 2016-12-23 13:51:00    -73.967   \n",
-       "1248555  94728    40.789          5.0  725053 2016-12-12 13:51:00    -73.967   \n",
-       "1248580  94728    40.789          3.9  725053 2016-12-18 07:01:00    -73.967   \n",
-       "1248597  94728    40.789          7.8  725053 2016-12-25 00:51:00    -73.967   \n",
-       "1248600  94728    40.789         -2.8  725053 2016-12-17 11:10:00    -73.967   \n",
-       "\n",
-       "         precipDepth  precipTime  \n",
-       "204647           NaN         NaN  \n",
-       "204670           0.0         1.0  \n",
-       "204694           0.0         1.0  \n",
-       "204701           0.0         1.0  \n",
-       "204715           0.0         1.0  \n",
-       "...              ...         ...  \n",
-       "1248471          0.0         1.0  \n",
-       "1248555          0.0         1.0  \n",
-       "1248580          NaN         NaN  \n",
-       "1248597          0.0         1.0  \n",
-       "1248600          5.0         1.0  \n",
-       "\n",
-       "[55683 rows x 8 columns]"
-      ]
-     },
-     "execution_count": 36,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "weather_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n",
-    "\n",
-    "\n",
-    "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For`temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [
+      "metadata": {}
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>precipTime</th>\n",
-       "      <th>temperature</th>\n",
-       "      <th>precipDepth</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>datetime</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2016-01-01</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>5.197345</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-02</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>2.567857</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-03</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>3.846429</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-04</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>0.123894</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-05</th>\n",
-       "      <td>6.0</td>\n",
-       "      <td>-7.206250</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-06</th>\n",
-       "      <td>6.0</td>\n",
-       "      <td>-0.896396</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-07</th>\n",
-       "      <td>6.0</td>\n",
-       "      <td>3.180645</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-08</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>4.384091</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-09</th>\n",
-       "      <td>6.0</td>\n",
-       "      <td>6.710274</td>\n",
-       "      <td>3.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016-01-10</th>\n",
-       "      <td>24.0</td>\n",
-       "      <td>10.943655</td>\n",
-       "      <td>254.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
+      "cell_type": "code",
+      "source": [
+        "from azureml.opendatasets import PublicHolidays\n",
+        "\n",
+        "# call default constructor to download full dataset\n",
+        "holidays_df = PublicHolidays().to_pandas_dataframe()\n",
+        "holidays_df.head(5)"
       ],
-      "text/plain": [
-       "            precipTime  temperature  precipDepth\n",
-       "datetime                                        \n",
-       "2016-01-01         1.0     5.197345          0.0\n",
-       "2016-01-02         1.0     2.567857          0.0\n",
-       "2016-01-03         1.0     3.846429          0.0\n",
-       "2016-01-04         1.0     0.123894          0.0\n",
-       "2016-01-05         6.0    -7.206250          0.0\n",
-       "2016-01-06         6.0    -0.896396          0.0\n",
-       "2016-01-07         6.0     3.180645          0.0\n",
-       "2016-01-08         1.0     4.384091          0.0\n",
-       "2016-01-09         6.0     6.710274          3.0\n",
-       "2016-01-10        24.0    10.943655        254.0"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n",
-    "\n",
-    "# group by datetime\n",
-    "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n",
-    "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n",
-    "weather_df_grouped.head(10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Cleanse data\n",
-    "\n",
-    "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "[Info] read from /tmp/tmpiw_16gzx/https%3A/%2Fazureopendatastorage.azurefd.net/holidaydatacontainer/Processed/part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n"
+        },
+        {
+          "output_type": "execute_result",
+          "execution_count": 5,
+          "data": {
+            "text/plain": "      countryOrRegion                 holidayName        normalizeHolidayName  \\\n19375       Argentina  Año Nuevo [New Year's Day]  Año Nuevo [New Year's Day]   \n19376       Australia              New Year's Day              New Year's Day   \n19377         Austria                     Neujahr                     Neujahr   \n19378         Belarus                   Новый год                   Новый год   \n19379         Belgium               Nieuwjaarsdag               Nieuwjaarsdag   \n\n      isPaidTimeOff countryRegionCode       date  \n19375          None                AR 2008-01-01  \n19376          None                AU 2008-01-01  \n19377          None                AT 2008-01-01  \n19378          None                BY 2008-01-01  \n19379          None                BE 2008-01-01  ",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>countryOrRegion</th>\n      <th>holidayName</th>\n      <th>normalizeHolidayName</th>\n      <th>isPaidTimeOff</th>\n      <th>countryRegionCode</th>\n      <th>date</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>19375</th>\n      <td>Argentina</td>\n      <td>Año Nuevo [New Year's Day]</td>\n      <td>Año Nuevo [New Year's Day]</td>\n      <td>None</td>\n      <td>AR</td>\n      <td>2008-01-01</td>\n    </tr>\n    <tr>\n      <th>19376</th>\n      <td>Australia</td>\n      <td>New Year's Day</td>\n      <td>New Year's Day</td>\n      <td>None</td>\n      <td>AU</td>\n      <td>2008-01-01</td>\n    </tr>\n    <tr>\n      <th>19377</th>\n      <td>Austria</td>\n      <td>Neujahr</td>\n      <td>Neujahr</td>\n      <td>None</td>\n      <td>AT</td>\n      <td>2008-01-01</td>\n    </tr>\n    <tr>\n      <th>19378</th>\n      <td>Belarus</td>\n      <td>Новый год</td>\n      <td>Новый год</td>\n      <td>None</td>\n      <td>BY</td>\n      <td>2008-01-01</td>\n    </tr>\n    <tr>\n      <th>19379</th>\n      <td>Belgium</td>\n      <td>Nieuwjaarsdag</td>\n      <td>Nieuwjaarsdag</td>\n      <td>None</td>\n      <td>BE</td>\n      <td>2008-01-01</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 5,
+      "metadata": {
+        "gather": {
+          "logged": 1681193761164
+        }
+      }
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>vendorID</th>\n",
-       "      <th>passengerCount</th>\n",
-       "      <th>tripDistance</th>\n",
-       "      <th>pickupLongitude</th>\n",
-       "      <th>pickupLatitude</th>\n",
-       "      <th>dropoffLongitude</th>\n",
-       "      <th>dropoffLatitude</th>\n",
-       "      <th>totalAmount</th>\n",
-       "      <th>month_num</th>\n",
-       "      <th>day_of_month</th>\n",
-       "      <th>day_of_week</th>\n",
-       "      <th>hour_of_day</th>\n",
-       "      <th>hr_sin</th>\n",
-       "      <th>hr_cos</th>\n",
-       "      <th>dy_sin</th>\n",
-       "      <th>dy_cos</th>\n",
-       "      <th>precipTime</th>\n",
-       "      <th>temperature</th>\n",
-       "      <th>precipDepth</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>12000.000000</td>\n",
-       "      <td>12000.000000</td>\n",
-       "      <td>12000.000000</td>\n",
-       "      <td>12000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>2.400000e+04</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "      <td>24000.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>mean</th>\n",
-       "      <td>1.789667</td>\n",
-       "      <td>1.355292</td>\n",
-       "      <td>2.830398</td>\n",
-       "      <td>-73.814393</td>\n",
-       "      <td>40.678791</td>\n",
-       "      <td>-73.837019</td>\n",
-       "      <td>40.690729</td>\n",
-       "      <td>14.668251</td>\n",
-       "      <td>6.500000</td>\n",
-       "      <td>15.068750</td>\n",
-       "      <td>3.247792</td>\n",
-       "      <td>13.582875</td>\n",
-       "      <td>-0.239687</td>\n",
-       "      <td>-1.510585e-02</td>\n",
-       "      <td>-0.079292</td>\n",
-       "      <td>-0.059630</td>\n",
-       "      <td>13.318667</td>\n",
-       "      <td>13.878272</td>\n",
-       "      <td>1037.956292</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>std</th>\n",
-       "      <td>0.407554</td>\n",
-       "      <td>1.020018</td>\n",
-       "      <td>3.118302</td>\n",
-       "      <td>3.016385</td>\n",
-       "      <td>1.663152</td>\n",
-       "      <td>2.698609</td>\n",
-       "      <td>1.488032</td>\n",
-       "      <td>11.738532</td>\n",
-       "      <td>3.452124</td>\n",
-       "      <td>8.477555</td>\n",
-       "      <td>1.951209</td>\n",
-       "      <td>6.708372</td>\n",
-       "      <td>0.667528</td>\n",
-       "      <td>7.048175e-01</td>\n",
-       "      <td>0.714457</td>\n",
-       "      <td>0.692640</td>\n",
-       "      <td>10.333162</td>\n",
-       "      <td>9.484443</td>\n",
-       "      <td>2788.844868</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>min</th>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-74.164825</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-75.186440</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-200.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-1.000000</td>\n",
-       "      <td>-1.000000e+00</td>\n",
-       "      <td>-0.974928</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>-13.379464</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25%</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.040000</td>\n",
-       "      <td>-73.961370</td>\n",
-       "      <td>40.693539</td>\n",
-       "      <td>-73.967514</td>\n",
-       "      <td>40.695128</td>\n",
-       "      <td>7.880000</td>\n",
-       "      <td>3.750000</td>\n",
-       "      <td>8.000000</td>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>9.000000</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>-7.071068e-01</td>\n",
-       "      <td>-0.781831</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>6.620773</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>50%</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.840000</td>\n",
-       "      <td>-73.947132</td>\n",
-       "      <td>40.745928</td>\n",
-       "      <td>-73.945869</td>\n",
-       "      <td>40.745914</td>\n",
-       "      <td>11.300000</td>\n",
-       "      <td>6.500000</td>\n",
-       "      <td>15.000000</td>\n",
-       "      <td>3.000000</td>\n",
-       "      <td>15.000000</td>\n",
-       "      <td>-0.500000</td>\n",
-       "      <td>-1.836970e-16</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-0.222521</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>13.108323</td>\n",
-       "      <td>10.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>75%</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>3.500000</td>\n",
-       "      <td>-73.919638</td>\n",
-       "      <td>40.802049</td>\n",
-       "      <td>-73.913059</td>\n",
-       "      <td>40.791076</td>\n",
-       "      <td>17.750000</td>\n",
-       "      <td>9.250000</td>\n",
-       "      <td>22.000000</td>\n",
-       "      <td>5.000000</td>\n",
-       "      <td>19.000000</td>\n",
-       "      <td>0.258819</td>\n",
-       "      <td>7.071068e-01</td>\n",
-       "      <td>0.781831</td>\n",
-       "      <td>0.623490</td>\n",
-       "      <td>24.000000</td>\n",
-       "      <td>22.944737</td>\n",
-       "      <td>127.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>max</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>7.000000</td>\n",
-       "      <td>106.680000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>41.081047</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>41.081055</td>\n",
-       "      <td>450.000000</td>\n",
-       "      <td>12.000000</td>\n",
-       "      <td>30.000000</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>23.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000e+00</td>\n",
-       "      <td>0.974928</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>24.000000</td>\n",
-       "      <td>31.303665</td>\n",
-       "      <td>9999.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
+      "cell_type": "markdown",
+      "source": [
+        "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\"US\"`. Preview the data to verify that they were merged correctly."
       ],
-      "text/plain": [
-       "           vendorID  passengerCount  tripDistance  pickupLongitude  \\\n",
-       "count  24000.000000    24000.000000  24000.000000     12000.000000   \n",
-       "mean       1.789667        1.355292      2.830398       -73.814393   \n",
-       "std        0.407554        1.020018      3.118302         3.016385   \n",
-       "min        1.000000        0.000000      0.000000       -74.164825   \n",
-       "25%        2.000000        1.000000      1.040000       -73.961370   \n",
-       "50%        2.000000        1.000000      1.840000       -73.947132   \n",
-       "75%        2.000000        1.000000      3.500000       -73.919638   \n",
-       "max        2.000000        7.000000    106.680000         0.000000   \n",
-       "\n",
-       "       pickupLatitude  dropoffLongitude  dropoffLatitude   totalAmount  \\\n",
-       "count    12000.000000      12000.000000     12000.000000  24000.000000   \n",
-       "mean        40.678791        -73.837019        40.690729     14.668251   \n",
-       "std          1.663152          2.698609         1.488032     11.738532   \n",
-       "min          0.000000        -75.186440         0.000000   -200.000000   \n",
-       "25%         40.693539        -73.967514        40.695128      7.880000   \n",
-       "50%         40.745928        -73.945869        40.745914     11.300000   \n",
-       "75%         40.802049        -73.913059        40.791076     17.750000   \n",
-       "max         41.081047          0.000000        41.081055    450.000000   \n",
-       "\n",
-       "          month_num  day_of_month   day_of_week   hour_of_day        hr_sin  \\\n",
-       "count  24000.000000  24000.000000  24000.000000  24000.000000  24000.000000   \n",
-       "mean       6.500000     15.068750      3.247792     13.582875     -0.239687   \n",
-       "std        3.452124      8.477555      1.951209      6.708372      0.667528   \n",
-       "min        1.000000      1.000000      0.000000      0.000000     -1.000000   \n",
-       "25%        3.750000      8.000000      2.000000      9.000000     -0.866025   \n",
-       "50%        6.500000     15.000000      3.000000     15.000000     -0.500000   \n",
-       "75%        9.250000     22.000000      5.000000     19.000000      0.258819   \n",
-       "max       12.000000     30.000000      6.000000     23.000000      1.000000   \n",
-       "\n",
-       "             hr_cos        dy_sin        dy_cos    precipTime   temperature  \\\n",
-       "count  2.400000e+04  24000.000000  24000.000000  24000.000000  24000.000000   \n",
-       "mean  -1.510585e-02     -0.079292     -0.059630     13.318667     13.878272   \n",
-       "std    7.048175e-01      0.714457      0.692640     10.333162      9.484443   \n",
-       "min   -1.000000e+00     -0.974928     -0.900969      1.000000    -13.379464   \n",
-       "25%   -7.071068e-01     -0.781831     -0.900969      6.000000      6.620773   \n",
-       "50%   -1.836970e-16      0.000000     -0.222521      6.000000     13.108323   \n",
-       "75%    7.071068e-01      0.781831      0.623490     24.000000     22.944737   \n",
-       "max    1.000000e+00      0.974928      1.000000     24.000000     31.303665   \n",
-       "\n",
-       "        precipDepth  \n",
-       "count  24000.000000  \n",
-       "mean    1037.956292  \n",
-       "std     2788.844868  \n",
-       "min        0.000000  \n",
-       "25%        0.000000  \n",
-       "50%       10.000000  \n",
-       "75%      127.000000  \n",
-       "max     9999.000000  "
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n",
-    "taxi_holidays_weather_df.describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n",
-    "\n",
-    "Filter out these anomolies using query functions, and then remove the last few columns unnecesary for training.\n",
-    "\n",
-    "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is ran."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n",
-    "                                           pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n",
-    "                                           tripDistance>0 and tripDistance<75 and \\\n",
-    "                                           passengerCount>0 and passengerCount<100 and \\\n",
-    "                                           totalAmount>0\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
-   "outputs": [
+      "metadata": {}
+    },
     {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>vendorID</th>\n",
-       "      <th>passengerCount</th>\n",
-       "      <th>tripDistance</th>\n",
-       "      <th>pickupLongitude</th>\n",
-       "      <th>pickupLatitude</th>\n",
-       "      <th>dropoffLongitude</th>\n",
-       "      <th>dropoffLatitude</th>\n",
-       "      <th>totalAmount</th>\n",
-       "      <th>month_num</th>\n",
-       "      <th>day_of_month</th>\n",
-       "      <th>day_of_week</th>\n",
-       "      <th>hour_of_day</th>\n",
-       "      <th>hr_sin</th>\n",
-       "      <th>hr_cos</th>\n",
-       "      <th>dy_sin</th>\n",
-       "      <th>dy_cos</th>\n",
-       "      <th>precipTime</th>\n",
-       "      <th>temperature</th>\n",
-       "      <th>precipDepth</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>1.176300e+04</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "      <td>11763.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>mean</th>\n",
-       "      <td>1.790190</td>\n",
-       "      <td>1.369294</td>\n",
-       "      <td>2.841407</td>\n",
-       "      <td>-73.937911</td>\n",
-       "      <td>40.746224</td>\n",
-       "      <td>-73.910901</td>\n",
-       "      <td>40.730818</td>\n",
-       "      <td>14.557917</td>\n",
-       "      <td>3.501318</td>\n",
-       "      <td>14.929270</td>\n",
-       "      <td>3.252317</td>\n",
-       "      <td>13.538553</td>\n",
-       "      <td>-0.236544</td>\n",
-       "      <td>-2.265927e-03</td>\n",
-       "      <td>-0.070226</td>\n",
-       "      <td>-0.059059</td>\n",
-       "      <td>11.993964</td>\n",
-       "      <td>10.288261</td>\n",
-       "      <td>192.179546</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>std</th>\n",
-       "      <td>0.407191</td>\n",
-       "      <td>1.041634</td>\n",
-       "      <td>2.829864</td>\n",
-       "      <td>0.041121</td>\n",
-       "      <td>0.056818</td>\n",
-       "      <td>1.364114</td>\n",
-       "      <td>0.753468</td>\n",
-       "      <td>9.989165</td>\n",
-       "      <td>1.707350</td>\n",
-       "      <td>8.475793</td>\n",
-       "      <td>1.948127</td>\n",
-       "      <td>6.778012</td>\n",
-       "      <td>0.668812</td>\n",
-       "      <td>7.048492e-01</td>\n",
-       "      <td>0.718871</td>\n",
-       "      <td>0.689122</td>\n",
-       "      <td>10.114775</td>\n",
-       "      <td>8.530011</td>\n",
-       "      <td>1223.101074</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>min</th>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>0.010000</td>\n",
-       "      <td>-74.035194</td>\n",
-       "      <td>40.572906</td>\n",
-       "      <td>-74.183029</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.010000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-1.000000</td>\n",
-       "      <td>-1.000000e+00</td>\n",
-       "      <td>-0.974928</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>-13.379464</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25%</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.090000</td>\n",
-       "      <td>-73.961601</td>\n",
-       "      <td>40.693594</td>\n",
-       "      <td>-73.967793</td>\n",
-       "      <td>40.695440</td>\n",
-       "      <td>8.160000</td>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>8.000000</td>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>9.000000</td>\n",
-       "      <td>-0.866025</td>\n",
-       "      <td>-7.071068e-01</td>\n",
-       "      <td>-0.781831</td>\n",
-       "      <td>-0.900969</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>3.504580</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>50%</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.900000</td>\n",
-       "      <td>-73.947517</td>\n",
-       "      <td>40.745842</td>\n",
-       "      <td>-73.946243</td>\n",
-       "      <td>40.745789</td>\n",
-       "      <td>11.300000</td>\n",
-       "      <td>4.000000</td>\n",
-       "      <td>15.000000</td>\n",
-       "      <td>3.000000</td>\n",
-       "      <td>15.000000</td>\n",
-       "      <td>-0.500000</td>\n",
-       "      <td>-1.836970e-16</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>-0.222521</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>10.468276</td>\n",
-       "      <td>3.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>75%</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>3.530000</td>\n",
-       "      <td>-73.920509</td>\n",
-       "      <td>40.801752</td>\n",
-       "      <td>-73.913807</td>\n",
-       "      <td>40.789942</td>\n",
-       "      <td>17.380000</td>\n",
-       "      <td>5.000000</td>\n",
-       "      <td>22.000000</td>\n",
-       "      <td>5.000000</td>\n",
-       "      <td>19.000000</td>\n",
-       "      <td>0.258819</td>\n",
-       "      <td>7.071068e-01</td>\n",
-       "      <td>0.781831</td>\n",
-       "      <td>0.623490</td>\n",
-       "      <td>24.000000</td>\n",
-       "      <td>16.966923</td>\n",
-       "      <td>41.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>max</th>\n",
-       "      <td>2.000000</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>38.850000</td>\n",
-       "      <td>-73.738899</td>\n",
-       "      <td>40.879982</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>41.073185</td>\n",
-       "      <td>123.800000</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>30.000000</td>\n",
-       "      <td>6.000000</td>\n",
-       "      <td>23.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000e+00</td>\n",
-       "      <td>0.974928</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>24.000000</td>\n",
-       "      <td>26.524107</td>\n",
-       "      <td>9999.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
+      "cell_type": "code",
+      "source": [
+        "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n",
+        "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n",
+        "\n",
+        "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n",
+        "\n",
+        "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n",
+        "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]"
       ],
-      "text/plain": [
-       "           vendorID  passengerCount  tripDistance  pickupLongitude  \\\n",
-       "count  11763.000000    11763.000000  11763.000000     11763.000000   \n",
-       "mean       1.790190        1.369294      2.841407       -73.937911   \n",
-       "std        0.407191        1.041634      2.829864         0.041121   \n",
-       "min        1.000000        1.000000      0.010000       -74.035194   \n",
-       "25%        2.000000        1.000000      1.090000       -73.961601   \n",
-       "50%        2.000000        1.000000      1.900000       -73.947517   \n",
-       "75%        2.000000        1.000000      3.530000       -73.920509   \n",
-       "max        2.000000        6.000000     38.850000       -73.738899   \n",
-       "\n",
-       "       pickupLatitude  dropoffLongitude  dropoffLatitude   totalAmount  \\\n",
-       "count    11763.000000      11763.000000     11763.000000  11763.000000   \n",
-       "mean        40.746224        -73.910901        40.730818     14.557917   \n",
-       "std          0.056818          1.364114         0.753468      9.989165   \n",
-       "min         40.572906        -74.183029         0.000000      0.010000   \n",
-       "25%         40.693594        -73.967793        40.695440      8.160000   \n",
-       "50%         40.745842        -73.946243        40.745789     11.300000   \n",
-       "75%         40.801752        -73.913807        40.789942     17.380000   \n",
-       "max         40.879982          0.000000        41.073185    123.800000   \n",
-       "\n",
-       "          month_num  day_of_month   day_of_week   hour_of_day        hr_sin  \\\n",
-       "count  11763.000000  11763.000000  11763.000000  11763.000000  11763.000000   \n",
-       "mean       3.501318     14.929270      3.252317     13.538553     -0.236544   \n",
-       "std        1.707350      8.475793      1.948127      6.778012      0.668812   \n",
-       "min        1.000000      1.000000      0.000000      0.000000     -1.000000   \n",
-       "25%        2.000000      8.000000      2.000000      9.000000     -0.866025   \n",
-       "50%        4.000000     15.000000      3.000000     15.000000     -0.500000   \n",
-       "75%        5.000000     22.000000      5.000000     19.000000      0.258819   \n",
-       "max        6.000000     30.000000      6.000000     23.000000      1.000000   \n",
-       "\n",
-       "             hr_cos        dy_sin        dy_cos    precipTime   temperature  \\\n",
-       "count  1.176300e+04  11763.000000  11763.000000  11763.000000  11763.000000   \n",
-       "mean  -2.265927e-03     -0.070226     -0.059059     11.993964     10.288261   \n",
-       "std    7.048492e-01      0.718871      0.689122     10.114775      8.530011   \n",
-       "min   -1.000000e+00     -0.974928     -0.900969      1.000000    -13.379464   \n",
-       "25%   -7.071068e-01     -0.781831     -0.900969      1.000000      3.504580   \n",
-       "50%   -1.836970e-16      0.000000     -0.222521      6.000000     10.468276   \n",
-       "75%    7.071068e-01      0.781831      0.623490     24.000000     16.966923   \n",
-       "max    1.000000e+00      0.974928      1.000000     24.000000     26.524107   \n",
-       "\n",
-       "        precipDepth  \n",
-       "count  11763.000000  \n",
-       "mean     192.179546  \n",
-       "std     1223.101074  \n",
-       "min        0.000000  \n",
-       "25%        0.000000  \n",
-       "50%        3.000000  \n",
-       "75%       41.000000  \n",
-       "max     9999.000000  "
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "final_df.describe()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Train a model\n",
-    "\n",
-    "The data is ready to train a machine learning model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.linear_model import LinearRegression\n",
-    "from sklearn.linear_model import RidgeCV\n",
-    "from sklearn.linear_model import Ridge\n",
-    "from sklearn.ensemble import RandomForestRegressor\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.pipeline import Pipeline\n",
-    "from sklearn.preprocessing import OneHotEncoder\n",
-    "from sklearn.impute import SimpleImputer\n",
-    "from sklearn.compose import ColumnTransformer\n",
-    "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Training Function\n",
-    "\n",
-    "Define a function that can be used to create a model pipeline that can be trained and then used for scoring. This pipeline has 2 steps: preprocessing and model training.\n",
-    "\n",
-    "<b>Preprocessing Stages:</b>\n",
-    "The preprocessing step of the pipeline also has 2 stages, one for numerical features and one for categorical features.\n",
-    "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in the these fields, future data that is scored may and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly for the categorical variables, let's have the null values filled with \"MISSING\". Additionally to the categorical variables, these will need to be one hot encoded, so we will include that step in our pipeline.\n",
-    "\n",
-    "<b>Model Training Stage:</b>\n",
-    "An input parameter will determine which type of model of train. Let's test out a linear regression and random forest model to start. \n",
-    "\n",
-    "The two steps are put together into the pipeline which is what the function is returning."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def createClassModel(algo_name, catg, nums):\n",
-    "  numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n",
-    "  \n",
-    "  categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
-    "  \n",
-    "  preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n",
-    "  \n",
-    "  if algo_name == 'linear_regression':\n",
-    "    model=Ridge(alpha=100)\n",
-    "  elif algo_name == 'random_forest':\n",
-    "    model = RandomForestRegressor()\n",
-    "  else:\n",
-    "    pass\n",
-    "  ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), (\"model\", model)])\n",
-    "  return ModelPipeline"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in our processing step. `num_cols` is a list of the numerical variables that will be transformed in our processing step. Let's define the target column as `label` so it can be used in future steps as well."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n",
-    "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n",
-    "label = [\"totalAmount\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The training is ready to begin, but first, let's make sure that the categorical variables are strings in our dataframe to ensure no errors in our pipeline. \n",
-    "\n",
-    "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random number generator, so that your train-test splits are deterministic.\n",
-    "\n",
-    "The training will happen in the for loop so that both algorithms can be tested. The createClassModel funtion is called to retreive the pipeline that can then be trained using the training dataset. \n",
-    "\n",
-    "Once trained, the test dataset is then ran through the model to test the model's performance. Using various functions from sklearn.metrics, the R2 score, MAPE, and RMSE can be used to measure model performance."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "metadata": {},
-   "outputs": [
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 6,
+          "data": {
+            "text/plain": "       vendorID  lpepPickupDatetime  passengerCount  tripDistance  \\\n10            2 2016-01-01 06:10:47               1          1.01   \n12            2 2016-01-01 20:35:55               1          3.81   \n20            2 2016-01-01 14:29:02               1          1.60   \n30            2 2016-01-01 22:38:33               1          0.60   \n73            2 2016-01-18 14:45:12               1          1.02   \n...         ...                 ...             ...           ...   \n23721         2 2016-12-26 19:17:20               1          1.00   \n23843         2 2016-12-25 22:11:04               1          0.65   \n23852         2 2016-12-26 23:48:40               1          2.84   \n23858         2 2016-12-25 02:19:58               3          6.83   \n23917         1 2016-12-25 16:57:15               1          2.00   \n\n       pickupLongitude  pickupLatitude  dropoffLongitude  dropoffLatitude  \\\n10          -73.937195       40.679676        -73.922226        40.680149   \n12          -73.881638       40.767544        -73.917046        40.769688   \n20          -73.950226       40.678459        -73.958611        40.698792   \n30          -73.890671       40.746601        -73.896980        40.745064   \n73          -73.945190       40.792698        -73.935822        40.796143   \n...                ...             ...               ...              ...   \n23721              NaN             NaN               NaN              NaN   \n23843              NaN             NaN               NaN              NaN   \n23852              NaN             NaN               NaN              NaN   \n23858              NaN             NaN               NaN              NaN   \n23917              NaN             NaN               NaN              NaN   \n\n       totalAmount  month_num  ...  day_of_week  hour_of_day  country_code  \\\n10            5.80          1  ...            
4            6            US   \n12           16.80          1  ...            4           20            US   \n20           14.80          1  ...            4           14            US   \n30            5.30          1  ...            4           22            US   \n73            6.30          1  ...            0           14            US   \n...            ...        ...  ...          ...          ...           ...   \n23721         7.30         12  ...            0           19            US   \n23843         5.30         12  ...            6           22            US   \n23852        12.30         12  ...            0           23            US   \n23858        27.30         12  ...            6            2            US   \n23917        11.15         12  ...            6           16            US   \n\n         hr_sin        hr_cos    dy_sin    dy_cos   datetime  \\\n10     1.000000  6.123234e-17 -0.433884 -0.900969 2016-01-01   \n12    -0.866025  5.000000e-01 -0.433884 -0.900969 2016-01-01   \n20    -0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01   \n30    -0.500000  8.660254e-01 -0.433884 -0.900969 2016-01-01   \n73    -0.500000 -8.660254e-01  0.000000  1.000000 2016-01-18   \n...         ...           ...       ...       ...        ...   \n23721 -0.965926  2.588190e-01  0.000000  1.000000 2016-12-26   \n23843 -0.500000  8.660254e-01 -0.781831  0.623490 2016-12-25   \n23852 -0.258819  9.659258e-01  0.000000  1.000000 2016-12-26   \n23858  0.500000  8.660254e-01 -0.781831  0.623490 2016-12-25   \n23917 -0.866025 -5.000000e-01 -0.781831  0.623490 2016-12-25   \n\n             normalizeHolidayName isPaidTimeOff  \n10                 New Year's Day          True  \n12                 New Year's Day          True  \n20                 New Year's Day          True  \n30                 New Year's Day          True  \n73     Martin Luther King Jr. Day          None  \n...                           ...           ...  
\n23721               Christmas Day          True  \n23843               Christmas Day          True  \n23852               Christmas Day          True  \n23858               Christmas Day          True  \n23917               Christmas Day          True  \n\n[611 rows x 21 columns]",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>vendorID</th>\n      <th>lpepPickupDatetime</th>\n      <th>passengerCount</th>\n      <th>tripDistance</th>\n      <th>pickupLongitude</th>\n      <th>pickupLatitude</th>\n      <th>dropoffLongitude</th>\n      <th>dropoffLatitude</th>\n      <th>totalAmount</th>\n      <th>month_num</th>\n      <th>...</th>\n      <th>day_of_week</th>\n      <th>hour_of_day</th>\n      <th>country_code</th>\n      <th>hr_sin</th>\n      <th>hr_cos</th>\n      <th>dy_sin</th>\n      <th>dy_cos</th>\n      <th>datetime</th>\n      <th>normalizeHolidayName</th>\n      <th>isPaidTimeOff</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>10</th>\n      <td>2</td>\n      <td>2016-01-01 06:10:47</td>\n      <td>1</td>\n      <td>1.01</td>\n      <td>-73.937195</td>\n      <td>40.679676</td>\n      <td>-73.922226</td>\n      <td>40.680149</td>\n      <td>5.80</td>\n      <td>1</td>\n      <td>...</td>\n      <td>4</td>\n      <td>6</td>\n      <td>US</td>\n      <td>1.000000</td>\n      <td>6.123234e-17</td>\n      <td>-0.433884</td>\n      <td>-0.900969</td>\n      <td>2016-01-01</td>\n      <td>New Year's Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>2</td>\n      <td>2016-01-01 20:35:55</td>\n      <td>1</td>\n      <td>3.81</td>\n      <td>-73.881638</td>\n      <td>40.767544</td>\n      <td>-73.917046</td>\n      <td>40.769688</td>\n      <td>16.80</td>\n      <td>1</td>\n      <td>...</td>\n      <td>4</td>\n      <td>20</td>\n      <td>US</td>\n      <td>-0.866025</td>\n      <td>5.000000e-01</td>\n      <td>-0.433884</td>\n      <td>-0.900969</td>\n 
     <td>2016-01-01</td>\n      <td>New Year's Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>2</td>\n      <td>2016-01-01 14:29:02</td>\n      <td>1</td>\n      <td>1.60</td>\n      <td>-73.950226</td>\n      <td>40.678459</td>\n      <td>-73.958611</td>\n      <td>40.698792</td>\n      <td>14.80</td>\n      <td>1</td>\n      <td>...</td>\n      <td>4</td>\n      <td>14</td>\n      <td>US</td>\n      <td>-0.500000</td>\n      <td>-8.660254e-01</td>\n      <td>-0.433884</td>\n      <td>-0.900969</td>\n      <td>2016-01-01</td>\n      <td>New Year's Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>30</th>\n      <td>2</td>\n      <td>2016-01-01 22:38:33</td>\n      <td>1</td>\n      <td>0.60</td>\n      <td>-73.890671</td>\n      <td>40.746601</td>\n      <td>-73.896980</td>\n      <td>40.745064</td>\n      <td>5.30</td>\n      <td>1</td>\n      <td>...</td>\n      <td>4</td>\n      <td>22</td>\n      <td>US</td>\n      <td>-0.500000</td>\n      <td>8.660254e-01</td>\n      <td>-0.433884</td>\n      <td>-0.900969</td>\n      <td>2016-01-01</td>\n      <td>New Year's Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>73</th>\n      <td>2</td>\n      <td>2016-01-18 14:45:12</td>\n      <td>1</td>\n      <td>1.02</td>\n      <td>-73.945190</td>\n      <td>40.792698</td>\n      <td>-73.935822</td>\n      <td>40.796143</td>\n      <td>6.30</td>\n      <td>1</td>\n      <td>...</td>\n      <td>0</td>\n      <td>14</td>\n      <td>US</td>\n      <td>-0.500000</td>\n      <td>-8.660254e-01</td>\n      <td>0.000000</td>\n      <td>1.000000</td>\n      <td>2016-01-18</td>\n      <td>Martin Luther King Jr. 
Day</td>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>23721</th>\n      <td>2</td>\n      <td>2016-12-26 19:17:20</td>\n      <td>1</td>\n      <td>1.00</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>7.30</td>\n      <td>12</td>\n      <td>...</td>\n      <td>0</td>\n      <td>19</td>\n      <td>US</td>\n      <td>-0.965926</td>\n      <td>2.588190e-01</td>\n      <td>0.000000</td>\n      <td>1.000000</td>\n      <td>2016-12-26</td>\n      <td>Christmas Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>23843</th>\n      <td>2</td>\n      <td>2016-12-25 22:11:04</td>\n      <td>1</td>\n      <td>0.65</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>5.30</td>\n      <td>12</td>\n      <td>...</td>\n      <td>6</td>\n      <td>22</td>\n      <td>US</td>\n      <td>-0.500000</td>\n      <td>8.660254e-01</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n      <td>2016-12-25</td>\n      <td>Christmas Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>23852</th>\n      <td>2</td>\n      <td>2016-12-26 23:48:40</td>\n      <td>1</td>\n      <td>2.84</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>12.30</td>\n      <td>12</td>\n      <td>...</td>\n      <td>0</td>\n      <td>23</td>\n      <td>US</td>\n      <td>-0.258819</td>\n      <td>9.659258e-01</td>\n      <td>0.000000</td>\n      <td>1.000000</td>\n      <td>2016-12-26</td>\n      
<td>Christmas Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>23858</th>\n      <td>2</td>\n      <td>2016-12-25 02:19:58</td>\n      <td>3</td>\n      <td>6.83</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>27.30</td>\n      <td>12</td>\n      <td>...</td>\n      <td>6</td>\n      <td>2</td>\n      <td>US</td>\n      <td>0.500000</td>\n      <td>8.660254e-01</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n      <td>2016-12-25</td>\n      <td>Christmas Day</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>23917</th>\n      <td>1</td>\n      <td>2016-12-25 16:57:15</td>\n      <td>1</td>\n      <td>2.00</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>11.15</td>\n      <td>12</td>\n      <td>...</td>\n      <td>6</td>\n      <td>16</td>\n      <td>US</td>\n      <td>-0.866025</td>\n      <td>-5.000000e-01</td>\n      <td>-0.781831</td>\n      <td>0.623490</td>\n      <td>2016-12-25</td>\n      <td>Christmas Day</td>\n      <td>True</td>\n    </tr>\n  </tbody>\n</table>\n<p>611 rows × 21 columns</p>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 6,
+      "metadata": {
+        "gather": {
+          "logged": 1681193761522
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Enrich with weather data\n",
+        "\n",
+        "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data, downloading one month at a time. Additionally, specify the `cols` parameter with a list of strings to limit the columns that are downloaded. This is a very large dataset containing surface weather data from all over the world, so before appending each month, use the dataframe's `query()` function to keep only the rows whose latitude/longitude fall near NYC. The `temperature==temperature` clause in the query also drops rows with a missing temperature, because NaN never compares equal to itself. This ensures that `weather_df` doesn't get too large."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from azureml.opendatasets import NoaaIsdWeather\n",
+        "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
+        "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
+        "\n",
+        "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n",
+        "        .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n[Info] read from 
/tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n[Info] read from 
/tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n[Info] read from 
/tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n[Info] read from 
/tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n[Info] read from 
/tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n[Info] read from 
/tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n[Info] read from 
/tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n[Info] read from 
/tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n[Info] read from 
/tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n[Info] read from 
/tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n[Info] read from 
/tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n"
+        }
+      ],
+      "execution_count": 7,
+      "metadata": {
+        "gather": {
+          "logged": 1681193827810
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "weather_df"
+      ],
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 8,
+          "data": {
+            "text/plain": "         temperature  precipTime  latitude  longitude            datetime  \\\n204647           2.8         NaN    40.783    -73.867 2016-01-02 03:00:00   \n204670          -4.4         1.0    40.779    -73.880 2016-01-22 13:51:00   \n204694           5.0         1.0    40.779    -73.880 2016-01-08 02:51:00   \n204701          -1.1         1.0    40.779    -73.880 2016-01-04 15:51:00   \n204715           4.4         1.0    40.779    -73.880 2016-01-01 21:51:00   \n...              ...         ...       ...        ...                 ...   \n1248471          4.4         1.0    40.789    -73.967 2016-12-23 13:51:00   \n1248555          5.0         1.0    40.789    -73.967 2016-12-12 13:51:00   \n1248580          3.9         NaN    40.789    -73.967 2016-12-18 07:01:00   \n1248597          7.8         1.0    40.789    -73.967 2016-12-25 00:51:00   \n1248600         -2.8         1.0    40.789    -73.967 2016-12-17 11:10:00   \n\n          wban  precipDepth    usaf  \n204647   14732          NaN  725030  \n204670   14732          0.0  725030  \n204694   14732          0.0  725030  \n204701   14732          0.0  725030  \n204715   14732          0.0  725030  \n...        ...          ...     ...  \n1248471  94728          0.0  725053  \n1248555  94728          0.0  725053  \n1248580  94728          NaN  725053  \n1248597  94728          0.0  725053  \n1248600  94728          5.0  725053  \n\n[55683 rows x 8 columns]",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>temperature</th>\n      <th>precipTime</th>\n      <th>latitude</th>\n      <th>longitude</th>\n      <th>datetime</th>\n      <th>wban</th>\n      <th>precipDepth</th>\n      <th>usaf</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>204647</th>\n      <td>2.8</td>\n      <td>NaN</td>\n      <td>40.783</td>\n      <td>-73.867</td>\n      <td>2016-01-02 03:00:00</td>\n      <td>14732</td>\n      <td>NaN</td>\n      <td>725030</td>\n    </tr>\n    <tr>\n      <th>204670</th>\n      <td>-4.4</td>\n      <td>1.0</td>\n      <td>40.779</td>\n      <td>-73.880</td>\n      <td>2016-01-22 13:51:00</td>\n      <td>14732</td>\n      <td>0.0</td>\n      <td>725030</td>\n    </tr>\n    <tr>\n      <th>204694</th>\n      <td>5.0</td>\n      <td>1.0</td>\n      <td>40.779</td>\n      <td>-73.880</td>\n      <td>2016-01-08 02:51:00</td>\n      <td>14732</td>\n      <td>0.0</td>\n      <td>725030</td>\n    </tr>\n    <tr>\n      <th>204701</th>\n      <td>-1.1</td>\n      <td>1.0</td>\n      <td>40.779</td>\n      <td>-73.880</td>\n      <td>2016-01-04 15:51:00</td>\n      <td>14732</td>\n      <td>0.0</td>\n      <td>725030</td>\n    </tr>\n    <tr>\n      <th>204715</th>\n      <td>4.4</td>\n      <td>1.0</td>\n      <td>40.779</td>\n      <td>-73.880</td>\n      <td>2016-01-01 21:51:00</td>\n      <td>14732</td>\n      <td>0.0</td>\n      <td>725030</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      
<th>1248471</th>\n      <td>4.4</td>\n      <td>1.0</td>\n      <td>40.789</td>\n      <td>-73.967</td>\n      <td>2016-12-23 13:51:00</td>\n      <td>94728</td>\n      <td>0.0</td>\n      <td>725053</td>\n    </tr>\n    <tr>\n      <th>1248555</th>\n      <td>5.0</td>\n      <td>1.0</td>\n      <td>40.789</td>\n      <td>-73.967</td>\n      <td>2016-12-12 13:51:00</td>\n      <td>94728</td>\n      <td>0.0</td>\n      <td>725053</td>\n    </tr>\n    <tr>\n      <th>1248580</th>\n      <td>3.9</td>\n      <td>NaN</td>\n      <td>40.789</td>\n      <td>-73.967</td>\n      <td>2016-12-18 07:01:00</td>\n      <td>94728</td>\n      <td>NaN</td>\n      <td>725053</td>\n    </tr>\n    <tr>\n      <th>1248597</th>\n      <td>7.8</td>\n      <td>1.0</td>\n      <td>40.789</td>\n      <td>-73.967</td>\n      <td>2016-12-25 00:51:00</td>\n      <td>94728</td>\n      <td>0.0</td>\n      <td>725053</td>\n    </tr>\n    <tr>\n      <th>1248600</th>\n      <td>-2.8</td>\n      <td>1.0</td>\n      <td>40.789</td>\n      <td>-73.967</td>\n      <td>2016-12-17 11:10:00</td>\n      <td>94728</td>\n      <td>5.0</td>\n      <td>725053</td>\n    </tr>\n  </tbody>\n</table>\n<p>55683 rows × 8 columns</p>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 8,
+      "metadata": {
+        "gather": {
+          "logged": 1681193828162
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n",
+        "\n",
+        "\n",
+        "Next, group the weather data to have daily aggregated weather values. Define a dict `aggregations` that specifies how to aggregate each field at a daily level. For `temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n",
+        "\n",
+        "# group by datetime\n",
+        "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n",
+        "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n",
+        "weather_df_grouped.head(10)"
+      ],
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 9,
+          "data": {
+            "text/plain": "            precipTime  temperature  precipDepth\ndatetime                                        \n2016-01-01         1.0     5.197345          0.0\n2016-01-02         1.0     2.567857          0.0\n2016-01-03         1.0     3.846429          0.0\n2016-01-04         1.0     0.123894          0.0\n2016-01-05         6.0    -7.206250          0.0\n2016-01-06         6.0    -0.896396          0.0\n2016-01-07         6.0     3.180645          0.0\n2016-01-08         1.0     4.384091          0.0\n2016-01-09         6.0     6.710274          3.0\n2016-01-10        24.0    10.943655        254.0",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>precipTime</th>\n      <th>temperature</th>\n      <th>precipDepth</th>\n    </tr>\n    <tr>\n      <th>datetime</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2016-01-01</th>\n      <td>1.0</td>\n      <td>5.197345</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-02</th>\n      <td>1.0</td>\n      <td>2.567857</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-03</th>\n      <td>1.0</td>\n      <td>3.846429</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-04</th>\n      <td>1.0</td>\n      <td>0.123894</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-05</th>\n      <td>6.0</td>\n      <td>-7.206250</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-06</th>\n      <td>6.0</td>\n      <td>-0.896396</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-07</th>\n      <td>6.0</td>\n      <td>3.180645</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-08</th>\n      <td>1.0</td>\n      <td>4.384091</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-09</th>\n      <td>6.0</td>\n      <td>6.710274</td>\n      <td>3.0</td>\n    </tr>\n    <tr>\n      <th>2016-01-10</th>\n      <td>24.0</td>\n      <td>10.943655</td>\n      <td>254.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 9,
+      "metadata": {
+        "gather": {
+          "logged": 1681193828979
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Cleanse data\n",
+        "\n",
+        "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n",
+        "taxi_holidays_weather_df.describe()"
+      ],
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 10,
+          "data": {
+            "text/plain": "           vendorID  passengerCount  tripDistance  pickupLongitude  \\\ncount  24000.000000    24000.000000  24000.000000     12000.000000   \nmean       1.793625        1.359458      2.798265       -73.746045   \nstd        0.404711        1.033421      2.976438         3.753491   \nmin        1.000000        0.000000      0.000000       -74.163818   \n25%        2.000000        1.000000      1.020000       -73.961123   \n50%        2.000000        1.000000      1.830000       -73.946201   \n75%        2.000000        1.000000      3.430000       -73.918732   \nmax        2.000000        8.000000     86.700000         0.000000   \n\n       pickupLatitude  dropoffLongitude  dropoffLatitude   totalAmount  \\\ncount    12000.000000      12000.000000     12000.000000  24000.000000   \nmean        40.641902        -73.812426        40.677156     14.603195   \nstd          2.069237          3.016449         1.663137     11.596075   \nmin          0.000000        -75.167496         0.000000    -83.900000   \n25%         40.694324        -73.968376        40.695145      7.880000   \n50%         40.746000        -73.945480        40.746264     11.300000   \n75%         40.801911        -73.912468        40.789734     17.300000   \nmax         41.015667          0.000000        41.085476    495.000000   \n\n          month_num  day_of_month   day_of_week   hour_of_day        hr_sin  \\\ncount  24000.000000  24000.000000  24000.000000  24000.000000  24000.000000   \nmean       6.500000     15.072875      3.236458     13.611000     -0.246484   \nstd        3.452124      8.475006      1.964295      6.682823      0.665381   \nmin        1.000000      1.000000      0.000000      0.000000     -1.000000   \n25%        3.750000      8.000000      2.000000      9.000000     -0.866025   \n50%        6.500000     15.000000      3.000000     15.000000     -0.500000   \n75%        9.250000     22.000000      5.000000     19.000000      0.258819   \nmax       
12.000000     30.000000      6.000000     23.000000      1.000000   \n\n             hr_cos        dy_sin        dy_cos    precipTime   temperature  \\\ncount  2.400000e+04  24000.000000  24000.000000  24000.000000  24000.000000   \nmean  -2.038304e-02     -0.085070     -0.050450     13.408667     13.876231   \nstd    7.043703e-01      0.713593      0.693574     10.330720      9.462154   \nmin   -1.000000e+00     -0.974928     -0.900969      1.000000    -13.379464   \n25%   -7.071068e-01     -0.781831     -0.900969      6.000000      6.591071   \n50%   -1.836970e-16      0.000000     -0.222521      6.000000     13.125893   \n75%    7.071068e-01      0.781831      0.623490     24.000000     22.944737   \nmax    1.000000e+00      0.974928      1.000000     24.000000     31.303665   \n\n        precipDepth  \ncount  24000.000000  \nmean    1075.977667  \nstd     2849.048787  \nmin        0.000000  \n25%        0.000000  \n50%       10.000000  \n75%      132.000000  \nmax     9999.000000  ",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>vendorID</th>\n      <th>passengerCount</th>\n      <th>tripDistance</th>\n      <th>pickupLongitude</th>\n      <th>pickupLatitude</th>\n      <th>dropoffLongitude</th>\n      <th>dropoffLatitude</th>\n      <th>totalAmount</th>\n      <th>month_num</th>\n      <th>day_of_month</th>\n      <th>day_of_week</th>\n      <th>hour_of_day</th>\n      <th>hr_sin</th>\n      <th>hr_cos</th>\n      <th>dy_sin</th>\n      <th>dy_cos</th>\n      <th>precipTime</th>\n      <th>temperature</th>\n      <th>precipDepth</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>12000.000000</td>\n      <td>12000.000000</td>\n      <td>12000.000000</td>\n      <td>12000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>2.400000e+04</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n      <td>24000.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>1.793625</td>\n      <td>1.359458</td>\n      <td>2.798265</td>\n      <td>-73.746045</td>\n      <td>40.641902</td>\n      <td>-73.812426</td>\n      <td>40.677156</td>\n      <td>14.603195</td>\n      <td>6.500000</td>\n      <td>15.072875</td>\n      <td>3.236458</td>\n      <td>13.611000</td>\n      <td>-0.246484</td>\n      <td>-2.038304e-02</td>\n      <td>-0.085070</td>\n      <td>-0.050450</td>\n      
<td>13.408667</td>\n      <td>13.876231</td>\n      <td>1075.977667</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>0.404711</td>\n      <td>1.033421</td>\n      <td>2.976438</td>\n      <td>3.753491</td>\n      <td>2.069237</td>\n      <td>3.016449</td>\n      <td>1.663137</td>\n      <td>11.596075</td>\n      <td>3.452124</td>\n      <td>8.475006</td>\n      <td>1.964295</td>\n      <td>6.682823</td>\n      <td>0.665381</td>\n      <td>7.043703e-01</td>\n      <td>0.713593</td>\n      <td>0.693574</td>\n      <td>10.330720</td>\n      <td>9.462154</td>\n      <td>2849.048787</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>1.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>-74.163818</td>\n      <td>0.000000</td>\n      <td>-75.167496</td>\n      <td>0.000000</td>\n      <td>-83.900000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000e+00</td>\n      <td>-0.974928</td>\n      <td>-0.900969</td>\n      <td>1.000000</td>\n      <td>-13.379464</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>2.000000</td>\n      <td>1.000000</td>\n      <td>1.020000</td>\n      <td>-73.961123</td>\n      <td>40.694324</td>\n      <td>-73.968376</td>\n      <td>40.695145</td>\n      <td>7.880000</td>\n      <td>3.750000</td>\n      <td>8.000000</td>\n      <td>2.000000</td>\n      <td>9.000000</td>\n      <td>-0.866025</td>\n      <td>-7.071068e-01</td>\n      <td>-0.781831</td>\n      <td>-0.900969</td>\n      <td>6.000000</td>\n      <td>6.591071</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>2.000000</td>\n      <td>1.000000</td>\n      <td>1.830000</td>\n      <td>-73.946201</td>\n      <td>40.746000</td>\n      <td>-73.945480</td>\n      <td>40.746264</td>\n      <td>11.300000</td>\n      <td>6.500000</td>\n      <td>15.000000</td>\n      <td>3.000000</td>\n      
<td>15.000000</td>\n      <td>-0.500000</td>\n      <td>-1.836970e-16</td>\n      <td>0.000000</td>\n      <td>-0.222521</td>\n      <td>6.000000</td>\n      <td>13.125893</td>\n      <td>10.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>2.000000</td>\n      <td>1.000000</td>\n      <td>3.430000</td>\n      <td>-73.918732</td>\n      <td>40.801911</td>\n      <td>-73.912468</td>\n      <td>40.789734</td>\n      <td>17.300000</td>\n      <td>9.250000</td>\n      <td>22.000000</td>\n      <td>5.000000</td>\n      <td>19.000000</td>\n      <td>0.258819</td>\n      <td>7.071068e-01</td>\n      <td>0.781831</td>\n      <td>0.623490</td>\n      <td>24.000000</td>\n      <td>22.944737</td>\n      <td>132.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>2.000000</td>\n      <td>8.000000</td>\n      <td>86.700000</td>\n      <td>0.000000</td>\n      <td>41.015667</td>\n      <td>0.000000</td>\n      <td>41.085476</td>\n      <td>495.000000</td>\n      <td>12.000000</td>\n      <td>30.000000</td>\n      <td>6.000000</td>\n      <td>23.000000</td>\n      <td>1.000000</td>\n      <td>1.000000e+00</td>\n      <td>0.974928</td>\n      <td>1.000000</td>\n      <td>24.000000</td>\n      <td>31.303665</td>\n      <td>9999.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 10,
+      "metadata": {
+        "gather": {
+          "logged": 1681193829356
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n",
+        "\n",
+        "Filter out these anomalies using query functions, and then remove the last few columns unnecessary for training.\n",
+        "\n",
+        "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is run."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n",
+        "                                           pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n",
+        "                                           tripDistance>0 and tripDistance<75 and \\\n",
+        "                                           passengerCount>0 and passengerCount<100 and \\\n",
+        "                                           totalAmount>0\")"
+      ],
+      "outputs": [],
+      "execution_count": 11,
+      "metadata": {
+        "gather": {
+          "logged": 1681193829696
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "final_df.describe()"
+      ],
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "execution_count": 12,
+          "data": {
+            "text/plain": "           vendorID  passengerCount  tripDistance  pickupLongitude  \\\ncount  11720.000000    11720.000000  11720.000000     11720.000000   \nmean       1.794710        1.354778      2.829323       -73.937425   \nstd        0.403931        1.025023      2.899549         0.041218   \nmin        1.000000        1.000000      0.010000       -74.074181   \n25%        2.000000        1.000000      1.060000       -73.961384   \n50%        2.000000        1.000000      1.880000       -73.946762   \n75%        2.000000        1.000000      3.490000       -73.919127   \nmax        2.000000        6.000000     52.800000       -73.744164   \n\n       pickupLatitude  dropoffLongitude  dropoffLatitude   totalAmount  \\\ncount    11720.000000      11720.000000     11720.000000  11720.000000   \nmean        40.746545        -73.879205        40.713237     14.597789   \nstd          0.056494          2.048760         1.130176     10.594542   \nmin         40.573597        -74.186638         0.000000      0.010000   \n25%         40.694648        -73.968775        40.695228      8.160000   \n50%         40.745876        -73.946011        40.746073     11.300000   \n75%         40.801327        -73.912937        40.788689     17.300000   \nmax         40.879837          0.000000        41.025719    223.890000   \n\n          month_num  day_of_month   day_of_week   hour_of_day        hr_sin  \\\ncount  11720.000000  11720.000000  11720.000000  11720.000000  11720.000000   \nmean       3.501024     14.890444      3.252645     13.621672     -0.244164   \nstd        1.707714      8.454712      1.967197      6.721303      0.666575   \nmin        1.000000      1.000000      0.000000      0.000000     -1.000000   \n25%        2.000000      8.000000      2.000000      9.000000     -0.866025   \n50%        4.000000     15.000000      4.000000     15.000000     -0.500000   \n75%        5.000000     22.000000      5.000000     19.000000      0.258819   \nmax        
6.000000     30.000000      6.000000     23.000000      1.000000   \n\n             hr_cos        dy_sin        dy_cos    precipTime   temperature  \\\ncount  1.172000e+04  11720.000000  11720.000000  11720.000000  11720.000000   \nmean  -1.142466e-02     -0.090539     -0.049453     12.066980     10.267549   \nstd    7.042813e-01      0.713570      0.693007     10.146518      8.484011   \nmin   -1.000000e+00     -0.974928     -0.900969      1.000000    -13.379464   \n25%   -7.071068e-01     -0.781831     -0.900969      1.000000      3.504580   \n50%   -1.836970e-16     -0.433884     -0.222521      6.000000     10.130357   \n75%    7.071068e-01      0.781831      0.623490     24.000000     17.239744   \nmax    1.000000e+00      0.974928      1.000000     24.000000     26.524107   \n\n        precipDepth  \ncount  11720.000000  \nmean     190.603498  \nstd     1215.018267  \nmin        0.000000  \n25%        0.000000  \n50%        3.000000  \n75%       41.000000  \nmax     9999.000000  ",
+            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>vendorID</th>\n      <th>passengerCount</th>\n      <th>tripDistance</th>\n      <th>pickupLongitude</th>\n      <th>pickupLatitude</th>\n      <th>dropoffLongitude</th>\n      <th>dropoffLatitude</th>\n      <th>totalAmount</th>\n      <th>month_num</th>\n      <th>day_of_month</th>\n      <th>day_of_week</th>\n      <th>hour_of_day</th>\n      <th>hr_sin</th>\n      <th>hr_cos</th>\n      <th>dy_sin</th>\n      <th>dy_cos</th>\n      <th>precipTime</th>\n      <th>temperature</th>\n      <th>precipDepth</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>1.172000e+04</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n      <td>11720.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>1.794710</td>\n      <td>1.354778</td>\n      <td>2.829323</td>\n      <td>-73.937425</td>\n      <td>40.746545</td>\n      <td>-73.879205</td>\n      <td>40.713237</td>\n      <td>14.597789</td>\n      <td>3.501024</td>\n      <td>14.890444</td>\n      <td>3.252645</td>\n      <td>13.621672</td>\n      <td>-0.244164</td>\n      <td>-1.142466e-02</td>\n      <td>-0.090539</td>\n      <td>-0.049453</td>\n      
<td>12.066980</td>\n      <td>10.267549</td>\n      <td>190.603498</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>0.403931</td>\n      <td>1.025023</td>\n      <td>2.899549</td>\n      <td>0.041218</td>\n      <td>0.056494</td>\n      <td>2.048760</td>\n      <td>1.130176</td>\n      <td>10.594542</td>\n      <td>1.707714</td>\n      <td>8.454712</td>\n      <td>1.967197</td>\n      <td>6.721303</td>\n      <td>0.666575</td>\n      <td>7.042813e-01</td>\n      <td>0.713570</td>\n      <td>0.693007</td>\n      <td>10.146518</td>\n      <td>8.484011</td>\n      <td>1215.018267</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>0.010000</td>\n      <td>-74.074181</td>\n      <td>40.573597</td>\n      <td>-74.186638</td>\n      <td>0.000000</td>\n      <td>0.010000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000e+00</td>\n      <td>-0.974928</td>\n      <td>-0.900969</td>\n      <td>1.000000</td>\n      <td>-13.379464</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>2.000000</td>\n      <td>1.000000</td>\n      <td>1.060000</td>\n      <td>-73.961384</td>\n      <td>40.694648</td>\n      <td>-73.968775</td>\n      <td>40.695228</td>\n      <td>8.160000</td>\n      <td>2.000000</td>\n      <td>8.000000</td>\n      <td>2.000000</td>\n      <td>9.000000</td>\n      <td>-0.866025</td>\n      <td>-7.071068e-01</td>\n      <td>-0.781831</td>\n      <td>-0.900969</td>\n      <td>1.000000</td>\n      <td>3.504580</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>2.000000</td>\n      <td>1.000000</td>\n      <td>1.880000</td>\n      <td>-73.946762</td>\n      <td>40.745876</td>\n      <td>-73.946011</td>\n      <td>40.746073</td>\n      <td>11.300000</td>\n      <td>4.000000</td>\n      <td>15.000000</td>\n      <td>4.000000</td>\n      
<td>15.000000</td>\n      <td>-0.500000</td>\n      <td>-1.836970e-16</td>\n      <td>-0.433884</td>\n      <td>-0.222521</td>\n      <td>6.000000</td>\n      <td>10.130357</td>\n      <td>3.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>2.000000</td>\n      <td>1.000000</td>\n      <td>3.490000</td>\n      <td>-73.919127</td>\n      <td>40.801327</td>\n      <td>-73.912937</td>\n      <td>40.788689</td>\n      <td>17.300000</td>\n      <td>5.000000</td>\n      <td>22.000000</td>\n      <td>5.000000</td>\n      <td>19.000000</td>\n      <td>0.258819</td>\n      <td>7.071068e-01</td>\n      <td>0.781831</td>\n      <td>0.623490</td>\n      <td>24.000000</td>\n      <td>17.239744</td>\n      <td>41.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>2.000000</td>\n      <td>6.000000</td>\n      <td>52.800000</td>\n      <td>-73.744164</td>\n      <td>40.879837</td>\n      <td>0.000000</td>\n      <td>41.025719</td>\n      <td>223.890000</td>\n      <td>6.000000</td>\n      <td>30.000000</td>\n      <td>6.000000</td>\n      <td>23.000000</td>\n      <td>1.000000</td>\n      <td>1.000000e+00</td>\n      <td>0.974928</td>\n      <td>1.000000</td>\n      <td>24.000000</td>\n      <td>26.524107</td>\n      <td>9999.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+          },
+          "metadata": {}
+        }
+      ],
+      "execution_count": 12,
+      "metadata": {
+        "gather": {
+          "logged": 1681193830079
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Train a model\n",
+        "\n",
+        "The data is ready to train a machine learning model."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from sklearn.linear_model import LinearRegression\n",
+        "from sklearn.linear_model import RidgeCV\n",
+        "from sklearn.linear_model import Ridge\n",
+        "from sklearn.ensemble import RandomForestRegressor\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "from sklearn.pipeline import Pipeline\n",
+        "from sklearn.preprocessing import OneHotEncoder\n",
+        "from sklearn.impute import SimpleImputer\n",
+        "from sklearn.compose import ColumnTransformer\n",
+        "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error"
+      ],
+      "outputs": [],
+      "execution_count": 13,
+      "metadata": {
+        "gather": {
+          "logged": 1681193830964
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Training Function\n",
+        "\n",
+        "Define a function that can be used to create a model pipeline that can be trained and then used for scoring. This pipeline has 2 steps: preprocessing and model training.\n",
+        "\n",
+        "<b>Preprocessing Stages:</b>\n",
+        "The preprocessing step of the pipeline also has 2 stages, one for numerical features and one for categorical features.\n",
+        "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in these fields, future data that is scored may, and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly, for the categorical variables, let's have the null values filled with \"MISSING\". Additionally, the categorical variables will need to be one-hot encoded, so we will include that step in our pipeline.\n",
+        "\n",
+        "<b>Model Training Stage:</b>\n",
+        "An input parameter will determine which type of model to train. Let's test out a linear regression and random forest model to start. \n",
+        "\n",
+        "The two steps are put together into the pipeline which is what the function is returning."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def createClassModel(algo_name, catg, nums):\n",
+        "  numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n",
+        "  \n",
+        "  categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
+        "  \n",
+        "  preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n",
+        "  \n",
+        "  if algo_name == 'linear_regression':\n",
+        "    model=Ridge(alpha=100)\n",
+        "  elif algo_name == 'random_forest':\n",
+        "    model = RandomForestRegressor()\n",
+        "  else:\n",
+        "    pass\n",
+        "  ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), (\"model\", model)])\n",
+        "  return ModelPipeline"
+      ],
+      "outputs": [],
+      "execution_count": 14,
+      "metadata": {
+        "gather": {
+          "logged": 1681193831335
+        }
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in our processing step. `num_cols` is a list of the numerical variables that will be transformed in our processing step. Let's define the target column as `label` so it can be used in future steps as well."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n",
+        "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n",
+        "label = [\"totalAmount\"]"
+      ],
+      "outputs": [],
+      "execution_count": 15,
+      "metadata": {
+        "gather": {
+          "logged": 1681193831647
+        }
+      }
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "linear_regression\n",
-      "R2: 0.8034971051723139\n",
-      "MAPE: 0.15888983234876766\n",
-      "RMSE: 4.606544019524053\n",
-      "\n",
-      "random_forest\n",
-      "R2: 0.8073017231520601\n",
-      "MAPE: 0.14715914748857337\n",
-      "RMSE: 4.5617309259357475\n",
-      "\n"
-     ]
+      "cell_type": "markdown",
+      "source": [
+        "The training is ready to begin, but first, let's make sure that the categorical variables are strings in our dataframe to ensure no errors in our pipeline. \n",
+        "\n",
+        "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the fraction of the data to allocate to testing. The `random_state` parameter sets the seed for the random number generator, so that your train-test splits are deterministic.\n",
+        "\n",
+        "The training will happen in the for loop so that both algorithms can be tested. The createClassModel function is called to retrieve the pipeline, which can then be trained using the training dataset. \n",
+        "\n",
+        "Once the pipeline is trained, the test dataset is run through it to evaluate the model's performance. Using various functions from sklearn.metrics, the R2 score, MAPE, and RMSE can be used to measure model performance."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# make sure categorical columns are strings\n",
+        "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n",
+        "\n",
+        "# split data\n",
+        "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n",
+        "\n",
+        "# test 2 algorithms\n",
+        "for algorithmname in [\"linear_regression\", 'random_forest']:\n",
+        "    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n",
+        "    fitPipeline.fit(X_train, y_train.values.ravel())                   # fit pipeline\n",
+        "\n",
+        "    y_pred = fitPipeline.predict(X_test)                               # score with fitted pipeline\n",
+        "\n",
+        "    # Evaluate\n",
+        "    r2 = r2_score(y_test, y_pred)\n",
+        "    mape = mean_absolute_percentage_error(y_test, y_pred)\n",
+        "    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
+        "\n",
+        "    print(algorithmname)\n",
+        "    print(\"R2:\", r2)\n",
+        "    print(\"MAPE:\", mape)\n",
+        "    print(\"RMSE:\", rmse)\n",
+        "    print()"
+      ],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": "linear_regression\nR2: 0.8939180427845623\nMAPE: 0.15217635144070302\nRMSE: 3.409148681526453\n\nrandom_forest\nR2: 0.8540936112427824\nMAPE: 0.15527304667688627\nRMSE: 3.998179929258663\n\n"
+        }
+      ],
+      "execution_count": 16,
+      "metadata": {
+        "gather": {
+          "logged": 1681193874528
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {
+        "jupyter": {
+          "source_hidden": false,
+          "outputs_hidden": false
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      }
+    }
+  ],
+  "metadata": {
+    "interpreter": {
+      "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754"
+    },
+    "kernelspec": {
+      "name": "python38-azureml",
+      "language": "python",
+      "display_name": "Python 3.8 - AzureML"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.8.10",
+      "mimetype": "text/x-python",
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "pygments_lexer": "ipython3",
+      "nbconvert_exporter": "python",
+      "file_extension": ".py"
+    },
+    "orig_nbformat": 4,
+    "microsoft": {
+      "ms_spell_check": {
+        "ms_spell_check_language": "en"
+      }
+    },
+    "kernel_info": {
+      "name": "python38-azureml"
+    },
+    "nteract": {
+      "version": "nteract-front-end@1.0.0"
     }
-   ],
-   "source": [
-    "# make sure categorical columns are strings\n",
-    "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n",
-    "\n",
-    "# split data\n",
-    "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n",
-    "\n",
-    "# test 2 algorithms\n",
-    "for algorithmname in [\"linear_regression\", 'random_forest']:\n",
-    "    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n",
-    "    fitPipeline.fit(X_train, y_train.values.ravel())                   # fit pipeine\n",
-    "\n",
-    "    y_pred = fitPipeline.predict(X_test)                               # score with fitted pipeline\n",
-    "\n",
-    "    # Evaluate\n",
-    "    r2 = r2_score(y_test, y_pred)\n",
-    "    mape = mean_absolute_percentage_error(y_test, y_pred)\n",
-    "    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
-    "\n",
-    "    print(algorithmname)\n",
-    "    print(\"R2:\", r2)\n",
-    "    print(\"MAPE:\", mape)\n",
-    "    print(\"RMSE:\", rmse)\n",
-    "    print()"
-   ]
-  }
- ],
- "metadata": {
-  "interpreter": {
-   "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754"
-  },
-  "kernelspec": {
-   "display_name": "Python 3.8.12 ('mlopsenv')",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
   },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+  "nbformat": 4,
+  "nbformat_minor": 2
+}
\ No newline at end of file