diff --git a/.github/workflows/workshop_ci.yml b/.github/workflows/workshop_ci.yml
index 09c0ea72..922fa9f1 100644
--- a/.github/workflows/workshop_ci.yml
+++ b/.github/workflows/workshop_ci.yml
@@ -32,7 +32,7 @@ jobs:
- name: Install az ml & set default values for AML
run: | #setup: provide group, workspace and location
az extension add -n ml -y --version 2.2.1
- az configure --defaults group=azureml workspace=ws01ent location=westus2
+ az configure --defaults group=mlops-rg-910166 workspace=aml910166 location=westus2
- name: run training and model validation
run: |
az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml
diff --git a/.github/workflows/workshop_unit_test.yml b/.github/workflows/workshop_unit_test.yml
index 3c1382c3..1e9f4423 100644
--- a/.github/workflows/workshop_unit_test.yml
+++ b/.github/workflows/workshop_unit_test.yml
@@ -13,7 +13,7 @@ jobs:
unit-test:
runs-on: ubuntu-latest
steps:
- - name: Check out repository code
+ - name: Check out repository code
uses: actions/checkout@v3
- name: Setup python
uses: actions/setup-python@v2
@@ -31,7 +31,7 @@ jobs:
- name: Install AZ ML and tools
run: | # SETUP line 34 to point to your own AML workspace
az extension add -n ml -y --version 2.2.1
- az configure --defaults group=azureml workspace=ws01ent location=westus2
+ az configure --defaults group=mlops-rg-910166 workspace=aml910166 location=westus2
- name: Run Feature Engineering
uses: ./.github/actions/aml-job-create
with:
diff --git a/src/workshop/core/scoring/deployment.yml b/src/workshop/core/scoring/deployment.yml
index 29c3500c..8f54df02 100644
--- a/src/workshop/core/scoring/deployment.yml
+++ b/src/workshop/core/scoring/deployment.yml
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: green
-endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
+endpoint_name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
model: azureml:nyc_fare_prediction:1
code_configuration:
code: ./
diff --git a/src/workshop/core/scoring/endpoint.yml b/src/workshop/core/scoring/endpoint.yml
index 611e0721..c3d5b249 100644
--- a/src/workshop/core/scoring/endpoint.yml
+++ b/src/workshop/core/scoring/endpoint.yml
@@ -1,3 +1,3 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
-name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
+name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
auth_mode: key
diff --git a/src/workshop/core/training/.amlignore b/src/workshop/core/training/.amlignore
new file mode 100644
index 00000000..0621f9fc
--- /dev/null
+++ b/src/workshop/core/training/.amlignore
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+.ipynb_aml_checkpoints/
+*.amltmp
+*.amltemp
\ No newline at end of file
diff --git a/src/workshop/core/training/.amlignore.amltmp b/src/workshop/core/training/.amlignore.amltmp
new file mode 100644
index 00000000..0621f9fc
--- /dev/null
+++ b/src/workshop/core/training/.amlignore.amltmp
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+.ipynb_aml_checkpoints/
+*.amltmp
+*.amltemp
\ No newline at end of file
diff --git a/src/workshop/core/training/conda_ml_training.yml b/src/workshop/core/training/conda_ml_training.yml
index 3e26a9f2..8a873bf7 100644
--- a/src/workshop/core/training/conda_ml_training.yml
+++ b/src/workshop/core/training/conda_ml_training.yml
@@ -8,4 +8,4 @@ dependencies:
- azureml-sdk==1.38.0
- azureml-mlflow==1.38.0
- pandas==1.3.5
- - scikit-learn==1.0.2
\ No newline at end of file
+ - scikit-learn==1.0.2  # pinned like the other deps; the committed joblib model is tied to the sklearn version
\ No newline at end of file
diff --git a/src/workshop/core/training/conda_ml_training.yml.amltmp b/src/workshop/core/training/conda_ml_training.yml.amltmp
new file mode 100644
index 00000000..8a873bf7
--- /dev/null
+++ b/src/workshop/core/training/conda_ml_training.yml.amltmp
@@ -0,0 +1,11 @@
+name: ml-training
+channels:
+ - conda-forge
+dependencies:
+ - python=3.8
+ - pip=21.3.1
+ - pip:
+ - azureml-sdk==1.38.0
+ - azureml-mlflow==1.38.0
+ - pandas==1.3.5
+ - scikit-learn
\ No newline at end of file
diff --git a/src/workshop/core/training/ml_training.py b/src/workshop/core/training/ml_training.py
index 6f59dcdd..93b504a8 100644
--- a/src/workshop/core/training/ml_training.py
+++ b/src/workshop/core/training/ml_training.py
@@ -13,7 +13,7 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
-from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
+from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
import joblib
def parse_args():
# arg parser
@@ -43,7 +43,7 @@ def createClassModel(algo_name, catg, nums):
#---------------------------------------------
#setup: Update alpha value
#---------------------------------------------
- model = Ridge(alpha=100000) #setup
+ model = Ridge(alpha=100) #setup
elif algo_name == 'random_forest':
model = RandomForestRegressor()
else:
diff --git a/src/workshop/core/training/ml_training.py.amltmp b/src/workshop/core/training/ml_training.py.amltmp
new file mode 100644
index 00000000..93b504a8
--- /dev/null
+++ b/src/workshop/core/training/ml_training.py.amltmp
@@ -0,0 +1,103 @@
+import pandas as pd
+import numpy as np
+import os
+import argparse
+import mlflow
+import mlflow.sklearn
+from azureml.core import Run, Dataset,Datastore, Workspace
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.compose import ColumnTransformer
+from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
+import joblib
+def parse_args():
+    """Parse the command-line options of the training script.
+
+    Returns the argparse namespace with the string options prep_data,
+    model_folder, input_file_name and run_mode (each has a default).
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--prep_data", default="data", type=str, help="Path to prepped data, default to local folder")
+    parser.add_argument("--model_folder", type=str,default="data", help="Path of model output folder, default to local folder")
+    parser.add_argument("--input_file_name", type=str, default="final_df.parquet")
+    parser.add_argument("--run_mode", type=str, default="local")
+
+    # no argument list is passed, so argparse reads the process command line
+    return parser.parse_args()
+
+
+def createClassModel(algo_name, catg, nums):
+    """Return an unfitted Pipeline (preprocessor + regressor) for algo_name ('linear_regression' or 'random_forest'); raises ValueError otherwise."""
+    # numeric columns (nums): missing values are replaced with 0
+    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])
+    # categorical columns (catg): missing values become "MISSING", then one-hot encoded; unseen categories are ignored
+    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+    preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])
+
+    if algo_name == 'linear_regression':
+        #---------------------------------------------
+        #setup: Update alpha value
+        #---------------------------------------------
+        model = Ridge(alpha=100) #setup
+    elif algo_name == 'random_forest':
+        model = RandomForestRegressor()
+    else:
+        # this branch used to be `pass`, leaving `model` unbound and failing below with UnboundLocalError
+        raise ValueError(f"Unsupported algo_name: {algo_name!r}")
+
+    return Pipeline(steps=[('preprocessor', preprocesser), ("model", model)])
+
+def main(args):
+    """Train the 'linear_regression' pipeline on the prepared parquet file, evaluate it on a 20% hold-out, save it with joblib and, when args.run_mode is 'remote', log metrics and model to MLflow."""
+    # read in data
+    final_df = pd.read_parquet(os.path.join(args.prep_data,args.input_file_name))
+    catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"]
+    num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]
+    label = ["totalAmount"]
+    # make sure categorical columns are strings
+    final_df[catg_cols] = final_df[catg_cols].astype("str")
+
+    # split data (fixed random_state keeps the split reproducible)
+    X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)
+
+    # make sure the output folder exists; only one algorithm is trained below
+    os.makedirs(args.model_folder, exist_ok=True)
+
+    algorithmname = "linear_regression"
+    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline
+    fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeline; ravel() flattens the one-column label frame
+
+    y_pred = fitPipeline.predict(X_test) # score with fitted pipeline
+
+    # Evaluate
+    r2 = r2_score(y_test, y_pred)
+    mape = mean_absolute_percentage_error(y_test, y_pred)
+    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+
+    # persist the fitted pipeline as <model_folder>/<algorithmname>.joblib
+    joblib.dump(fitPipeline,args.model_folder+"/"+algorithmname+".joblib")
+
+    print("Training finished!. Metrics:")
+    print(f"R2_{algorithmname}", r2)
+    print(f"MAPE_{algorithmname}", mape)
+    print(f"RMSE_{algorithmname}", rmse)
+    print("Model",args.model_folder+"/"+algorithmname+".joblib","saved!")
+    # MLflow logging happens only for remote runs
+    if args.run_mode == 'remote':
+        mlflow.log_metric(f"R2_{algorithmname}", r2)
+        mlflow.log_metric(f"MAPE_{algorithmname}", mape)
+        mlflow.log_metric(f"RMSE_{algorithmname}", rmse)
+        mlflow.sklearn.log_model(fitPipeline,f"{algorithmname}_model")
+
+# script entry point: runs only when the file is executed directly, not when imported
+if __name__ == "__main__":
+    # read the command-line options
+    args = parse_args()
+    # train, evaluate and save the model
+    main(args)
\ No newline at end of file
diff --git a/src/workshop/data/linear_regression.joblib b/src/workshop/data/linear_regression.joblib
index d6bd0590..3e776213 100644
Binary files a/src/workshop/data/linear_regression.joblib and b/src/workshop/data/linear_regression.joblib differ
diff --git a/src/workshop/notebooks/taxi-tutorial.ipynb b/src/workshop/notebooks/taxi-tutorial.ipynb
index 41795d69..01fe859f 100644
--- a/src/workshop/notebooks/taxi-tutorial.ipynb
+++ b/src/workshop/notebooks/taxi-tutorial.ipynb
@@ -1,2898 +1,654 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Tutorial: Build a regression model with Open Datasets\n",
- "\n",
- "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "from azureml.opendatasets import NycTlcGreen\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "from datetime import datetime\n",
- "from dateutil.relativedelta import relativedelta\n",
- "\n",
- "pd.options.mode.chained_assignment = None"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Download Data\n",
- "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n",
- "\n",
- "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
+ "cells": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpviwf6gni\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=1\\part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp6e1co7l5\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=2\\part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd5lgxojh\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=3\\part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpela340gr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=4\\part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpe79pzv2_\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=5\\part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpyxyv_8h4\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=6\\part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp498a1aem\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=7\\part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpuhi_se7a\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=8\\part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpd7id7xon\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=9\\part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp3he0z_qe\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=10\\part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1sa8wuxl\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=11\\part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp1e7uekhr\\https%3A\\%2Fazureopendatastorage.azurefd.net\\nyctlc\\green\\puYear=2016\\puMonth=12\\part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n"
- ]
+ "cell_type": "markdown",
+ "source": [
+ "# Tutorial: Build a regression model with Open Datasets\n",
+ "\n",
+ "In this tutorial, you leverage the convenience of Azure Open Datasets to create a regression model to predict NYC taxi fare prices. Easily download publicly available taxi, holiday and weather data to create a dataset that can train a regression model using sklearn."
+ ],
+ "metadata": {}
},
{
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " vendorID | \n",
- " lpepPickupDatetime | \n",
- " lpepDropoffDatetime | \n",
- " passengerCount | \n",
- " tripDistance | \n",
- " puLocationId | \n",
- " doLocationId | \n",
- " pickupLongitude | \n",
- " pickupLatitude | \n",
- " dropoffLongitude | \n",
- " ... | \n",
- " paymentType | \n",
- " fareAmount | \n",
- " extra | \n",
- " mtaTax | \n",
- " improvementSurcharge | \n",
- " tipAmount | \n",
- " tollsAmount | \n",
- " ehailFee | \n",
- " totalAmount | \n",
- " tripType | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1379860 | \n",
- " 2 | \n",
- " 2016-01-14 06:39:00 | \n",
- " 2016-01-14 06:44:55 | \n",
- " 1 | \n",
- " 1.23 | \n",
- " None | \n",
- " None | \n",
- " -73.911827 | \n",
- " 40.775372 | \n",
- " -73.899635 | \n",
- " ... | \n",
- " 2 | \n",
- " 6.5 | \n",
- " 0.0 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 7.30 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 377548 | \n",
- " 2 | \n",
- " 2016-01-01 06:22:01 | \n",
- " 2016-01-01 06:27:14 | \n",
- " 5 | \n",
- " 0.91 | \n",
- " None | \n",
- " None | \n",
- " -73.962044 | \n",
- " 40.709797 | \n",
- " -73.946716 | \n",
- " ... | \n",
- " 2 | \n",
- " 5.5 | \n",
- " 0.0 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 6.30 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 473976 | \n",
- " 2 | \n",
- " 2016-01-08 20:55:49 | \n",
- " 2016-01-08 21:05:50 | \n",
- " 6 | \n",
- " 3.42 | \n",
- " None | \n",
- " None | \n",
- " -73.904823 | \n",
- " 40.741776 | \n",
- " -73.878815 | \n",
- " ... | \n",
- " 2 | \n",
- " 11.5 | \n",
- " 0.5 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 12.80 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1246683 | \n",
- " 2 | \n",
- " 2016-01-15 08:27:41 | \n",
- " 2016-01-15 08:41:05 | \n",
- " 1 | \n",
- " 3.99 | \n",
- " None | \n",
- " None | \n",
- " -73.911484 | \n",
- " 40.854698 | \n",
- " -73.881821 | \n",
- " ... | \n",
- " 2 | \n",
- " 15.0 | \n",
- " 0.0 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 15.80 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1152261 | \n",
- " 2 | \n",
- " 2016-01-09 04:35:21 | \n",
- " 2016-01-09 04:41:02 | \n",
- " 1 | \n",
- " 0.98 | \n",
- " None | \n",
- " None | \n",
- " -73.921776 | \n",
- " 40.767071 | \n",
- " -73.933136 | \n",
- " ... | \n",
- " 1 | \n",
- " 6.0 | \n",
- " 0.5 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.70 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 8.00 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 998273 | \n",
- " 1 | \n",
- " 2016-12-24 22:03:25 | \n",
- " 2016-12-24 22:17:16 | \n",
- " 1 | \n",
- " 5.30 | \n",
- " 74 | \n",
- " 235 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 2 | \n",
- " 16.5 | \n",
- " 0.5 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 17.80 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 857200 | \n",
- " 2 | \n",
- " 2016-12-03 20:33:53 | \n",
- " 2016-12-03 20:53:51 | \n",
- " 1 | \n",
- " 4.81 | \n",
- " 83 | \n",
- " 258 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1 | \n",
- " 18.5 | \n",
- " 0.5 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 3.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 22.80 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 607768 | \n",
- " 2 | \n",
- " 2016-12-18 16:17:54 | \n",
- " 2016-12-18 16:33:13 | \n",
- " 3 | \n",
- " 2.02 | \n",
- " 95 | \n",
- " 56 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 2 | \n",
- " 11.5 | \n",
- " 0.0 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 12.30 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 78687 | \n",
- " 2 | \n",
- " 2016-12-06 09:24:43 | \n",
- " 2016-12-06 09:41:09 | \n",
- " 1 | \n",
- " 9.51 | \n",
- " 66 | \n",
- " 11 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 2 | \n",
- " 27.5 | \n",
- " 0.0 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 0.00 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 28.30 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 141672 | \n",
- " 2 | \n",
- " 2016-12-14 16:12:34 | \n",
- " 2016-12-14 16:15:11 | \n",
- " 1 | \n",
- " 0.51 | \n",
- " 255 | \n",
- " 256 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1 | \n",
- " 4.0 | \n",
- " 1.0 | \n",
- " 0.5 | \n",
- " 0.3 | \n",
- " 1.45 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " 7.25 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
24000 rows × 23 columns
\n",
- "
"
+ "cell_type": "code",
+ "source": [
+ "from azureml.opendatasets import NycTlcGreen\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from datetime import datetime\n",
+ "from dateutil.relativedelta import relativedelta\n",
+ "\n",
+ "pd.options.mode.chained_assignment = None"
],
- "text/plain": [
- " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n",
- "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n",
- "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n",
- "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n",
- "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n",
- "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n",
- "... ... ... ... ... \n",
- "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n",
- "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n",
- "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n",
- "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n",
- "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n",
- "\n",
- " tripDistance puLocationId doLocationId pickupLongitude \\\n",
- "1379860 1.23 None None -73.911827 \n",
- "377548 0.91 None None -73.962044 \n",
- "473976 3.42 None None -73.904823 \n",
- "1246683 3.99 None None -73.911484 \n",
- "1152261 0.98 None None -73.921776 \n",
- "... ... ... ... ... \n",
- "998273 5.30 74 235 NaN \n",
- "857200 4.81 83 258 NaN \n",
- "607768 2.02 95 56 NaN \n",
- "78687 9.51 66 11 NaN \n",
- "141672 0.51 255 256 NaN \n",
- "\n",
- " pickupLatitude dropoffLongitude ... paymentType fareAmount extra \\\n",
- "1379860 40.775372 -73.899635 ... 2 6.5 0.0 \n",
- "377548 40.709797 -73.946716 ... 2 5.5 0.0 \n",
- "473976 40.741776 -73.878815 ... 2 11.5 0.5 \n",
- "1246683 40.854698 -73.881821 ... 2 15.0 0.0 \n",
- "1152261 40.767071 -73.933136 ... 1 6.0 0.5 \n",
- "... ... ... ... ... ... ... \n",
- "998273 NaN NaN ... 2 16.5 0.5 \n",
- "857200 NaN NaN ... 1 18.5 0.5 \n",
- "607768 NaN NaN ... 2 11.5 0.0 \n",
- "78687 NaN NaN ... 2 27.5 0.0 \n",
- "141672 NaN NaN ... 1 4.0 1.0 \n",
- "\n",
- " mtaTax improvementSurcharge tipAmount tollsAmount ehailFee \\\n",
- "1379860 0.5 0.3 0.00 0.0 NaN \n",
- "377548 0.5 0.3 0.00 0.0 NaN \n",
- "473976 0.5 0.3 0.00 0.0 NaN \n",
- "1246683 0.5 0.3 0.00 0.0 NaN \n",
- "1152261 0.5 0.3 0.70 0.0 NaN \n",
- "... ... ... ... ... ... \n",
- "998273 0.5 0.3 0.00 0.0 NaN \n",
- "857200 0.5 0.3 3.00 0.0 NaN \n",
- "607768 0.5 0.3 0.00 0.0 NaN \n",
- "78687 0.5 0.3 0.00 0.0 NaN \n",
- "141672 0.5 0.3 1.45 0.0 NaN \n",
- "\n",
- " totalAmount tripType \n",
- "1379860 7.30 1.0 \n",
- "377548 6.30 1.0 \n",
- "473976 12.80 1.0 \n",
- "1246683 15.80 1.0 \n",
- "1152261 8.00 1.0 \n",
- "... ... ... \n",
- "998273 17.80 1.0 \n",
- "857200 22.80 1.0 \n",
- "607768 12.30 1.0 \n",
- "78687 28.30 1.0 \n",
- "141672 7.25 1.0 \n",
- "\n",
- "[24000 rows x 23 columns]"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
- "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
- "\n",
- "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n",
- " .to_pandas_dataframe().sample(2000) for x in range(12)])\n",
- "green_taxi_df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sin and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. Use the apply() function on the dataframe to interatively apply this function to each row in the dataframe."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": ":219: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. Expected 56 from C header, got 64 from PyObject\n"
+ }
+ ],
+ "execution_count": 1,
+ "metadata": {
+ "gather": {
+ "logged": 1681193718753
+ }
+ }
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " vendorID | \n",
- " lpepPickupDatetime | \n",
- " lpepDropoffDatetime | \n",
- " passengerCount | \n",
- " tripDistance | \n",
- " puLocationId | \n",
- " doLocationId | \n",
- " pickupLongitude | \n",
- " pickupLatitude | \n",
- " dropoffLongitude | \n",
- " ... | \n",
- " tripType | \n",
- " month_num | \n",
- " day_of_month | \n",
- " day_of_week | \n",
- " hour_of_day | \n",
- " country_code | \n",
- " hr_sin | \n",
- " hr_cos | \n",
- " dy_sin | \n",
- " dy_cos | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1379860 | \n",
- " 2 | \n",
- " 2016-01-14 06:39:00 | \n",
- " 2016-01-14 06:44:55 | \n",
- " 1 | \n",
- " 1.23 | \n",
- " None | \n",
- " None | \n",
- " -73.911827 | \n",
- " 40.775372 | \n",
- " -73.899635 | \n",
- " ... | \n",
- " 1.0 | \n",
- " 1 | \n",
- " 14 | \n",
- " 3 | \n",
- " 6 | \n",
- " US | \n",
- " 1.000000 | \n",
- " 6.123234e-17 | \n",
- " 0.433884 | \n",
- " -0.900969 | \n",
- "
\n",
- " \n",
- " 377548 | \n",
- " 2 | \n",
- " 2016-01-01 06:22:01 | \n",
- " 2016-01-01 06:27:14 | \n",
- " 5 | \n",
- " 0.91 | \n",
- " None | \n",
- " None | \n",
- " -73.962044 | \n",
- " 40.709797 | \n",
- " -73.946716 | \n",
- " ... | \n",
- " 1.0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 6 | \n",
- " US | \n",
- " 1.000000 | \n",
- " 6.123234e-17 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- "
\n",
- " \n",
- " 473976 | \n",
- " 2 | \n",
- " 2016-01-08 20:55:49 | \n",
- " 2016-01-08 21:05:50 | \n",
- " 6 | \n",
- " 3.42 | \n",
- " None | \n",
- " None | \n",
- " -73.904823 | \n",
- " 40.741776 | \n",
- " -73.878815 | \n",
- " ... | \n",
- " 1.0 | \n",
- " 1 | \n",
- " 8 | \n",
- " 4 | \n",
- " 20 | \n",
- " US | \n",
- " -0.866025 | \n",
- " 5.000000e-01 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- "
\n",
- " \n",
- " 1246683 | \n",
- " 2 | \n",
- " 2016-01-15 08:27:41 | \n",
- " 2016-01-15 08:41:05 | \n",
- " 1 | \n",
- " 3.99 | \n",
- " None | \n",
- " None | \n",
- " -73.911484 | \n",
- " 40.854698 | \n",
- " -73.881821 | \n",
- " ... | \n",
- " 1.0 | \n",
- " 1 | \n",
- " 15 | \n",
- " 4 | \n",
- " 8 | \n",
- " US | \n",
- " 0.866025 | \n",
- " -5.000000e-01 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- "
\n",
- " \n",
- " 1152261 | \n",
- " 2 | \n",
- " 2016-01-09 04:35:21 | \n",
- " 2016-01-09 04:41:02 | \n",
- " 1 | \n",
- " 0.98 | \n",
- " None | \n",
- " None | \n",
- " -73.921776 | \n",
- " 40.767071 | \n",
- " -73.933136 | \n",
- " ... | \n",
- " 1.0 | \n",
- " 1 | \n",
- " 9 | \n",
- " 5 | \n",
- " 4 | \n",
- " US | \n",
- " 0.866025 | \n",
- " 5.000000e-01 | \n",
- " -0.974928 | \n",
- " -0.222521 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 998273 | \n",
- " 1 | \n",
- " 2016-12-24 22:03:25 | \n",
- " 2016-12-24 22:17:16 | \n",
- " 1 | \n",
- " 5.30 | \n",
- " 74 | \n",
- " 235 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1.0 | \n",
- " 12 | \n",
- " 24 | \n",
- " 5 | \n",
- " 22 | \n",
- " US | \n",
- " -0.500000 | \n",
- " 8.660254e-01 | \n",
- " -0.974928 | \n",
- " -0.222521 | \n",
- "
\n",
- " \n",
- " 857200 | \n",
- " 2 | \n",
- " 2016-12-03 20:33:53 | \n",
- " 2016-12-03 20:53:51 | \n",
- " 1 | \n",
- " 4.81 | \n",
- " 83 | \n",
- " 258 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1.0 | \n",
- " 12 | \n",
- " 3 | \n",
- " 5 | \n",
- " 20 | \n",
- " US | \n",
- " -0.866025 | \n",
- " 5.000000e-01 | \n",
- " -0.974928 | \n",
- " -0.222521 | \n",
- "
\n",
- " \n",
- " 607768 | \n",
- " 2 | \n",
- " 2016-12-18 16:17:54 | \n",
- " 2016-12-18 16:33:13 | \n",
- " 3 | \n",
- " 2.02 | \n",
- " 95 | \n",
- " 56 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1.0 | \n",
- " 12 | \n",
- " 18 | \n",
- " 6 | \n",
- " 16 | \n",
- " US | \n",
- " -0.866025 | \n",
- " -5.000000e-01 | \n",
- " -0.781831 | \n",
- " 0.623490 | \n",
- "
\n",
- " \n",
- " 78687 | \n",
- " 2 | \n",
- " 2016-12-06 09:24:43 | \n",
- " 2016-12-06 09:41:09 | \n",
- " 1 | \n",
- " 9.51 | \n",
- " 66 | \n",
- " 11 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1.0 | \n",
- " 12 | \n",
- " 6 | \n",
- " 1 | \n",
- " 9 | \n",
- " US | \n",
- " 0.707107 | \n",
- " -7.071068e-01 | \n",
- " 0.781831 | \n",
- " 0.623490 | \n",
- "
\n",
- " \n",
- " 141672 | \n",
- " 2 | \n",
- " 2016-12-14 16:12:34 | \n",
- " 2016-12-14 16:15:11 | \n",
- " 1 | \n",
- " 0.51 | \n",
- " 255 | \n",
- " 256 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " ... | \n",
- " 1.0 | \n",
- " 12 | \n",
- " 14 | \n",
- " 2 | \n",
- " 16 | \n",
- " US | \n",
- " -0.866025 | \n",
- " -5.000000e-01 | \n",
- " 0.974928 | \n",
- " -0.222521 | \n",
- "
\n",
- " \n",
- "
\n",
- "
24000 rows × 32 columns
\n",
- "
"
+ "cell_type": "markdown",
+ "source": [
+ "## Download Data\n",
+ "Begin by downloading the NYC Taxi dataset from Azure Open Datasets. In non-Spark environments, Open Datasets only allows one month of data at a time with certain classes to avoid MemoryError with large datasets. To download 1 year of taxi data, we will fetch 2000 random samples from each month.\n",
+ "\n",
+ "Note: Open Datasets has mirroring classes for working in Spark where data size and memory are not a concern."
],
- "text/plain": [
- " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n",
- "1379860 2 2016-01-14 06:39:00 2016-01-14 06:44:55 1 \n",
- "377548 2 2016-01-01 06:22:01 2016-01-01 06:27:14 5 \n",
- "473976 2 2016-01-08 20:55:49 2016-01-08 21:05:50 6 \n",
- "1246683 2 2016-01-15 08:27:41 2016-01-15 08:41:05 1 \n",
- "1152261 2 2016-01-09 04:35:21 2016-01-09 04:41:02 1 \n",
- "... ... ... ... ... \n",
- "998273 1 2016-12-24 22:03:25 2016-12-24 22:17:16 1 \n",
- "857200 2 2016-12-03 20:33:53 2016-12-03 20:53:51 1 \n",
- "607768 2 2016-12-18 16:17:54 2016-12-18 16:33:13 3 \n",
- "78687 2 2016-12-06 09:24:43 2016-12-06 09:41:09 1 \n",
- "141672 2 2016-12-14 16:12:34 2016-12-14 16:15:11 1 \n",
- "\n",
- " tripDistance puLocationId doLocationId pickupLongitude \\\n",
- "1379860 1.23 None None -73.911827 \n",
- "377548 0.91 None None -73.962044 \n",
- "473976 3.42 None None -73.904823 \n",
- "1246683 3.99 None None -73.911484 \n",
- "1152261 0.98 None None -73.921776 \n",
- "... ... ... ... ... \n",
- "998273 5.30 74 235 NaN \n",
- "857200 4.81 83 258 NaN \n",
- "607768 2.02 95 56 NaN \n",
- "78687 9.51 66 11 NaN \n",
- "141672 0.51 255 256 NaN \n",
- "\n",
- " pickupLatitude dropoffLongitude ... tripType month_num \\\n",
- "1379860 40.775372 -73.899635 ... 1.0 1 \n",
- "377548 40.709797 -73.946716 ... 1.0 1 \n",
- "473976 40.741776 -73.878815 ... 1.0 1 \n",
- "1246683 40.854698 -73.881821 ... 1.0 1 \n",
- "1152261 40.767071 -73.933136 ... 1.0 1 \n",
- "... ... ... ... ... ... \n",
- "998273 NaN NaN ... 1.0 12 \n",
- "857200 NaN NaN ... 1.0 12 \n",
- "607768 NaN NaN ... 1.0 12 \n",
- "78687 NaN NaN ... 1.0 12 \n",
- "141672 NaN NaN ... 1.0 12 \n",
- "\n",
- " day_of_month day_of_week hour_of_day country_code hr_sin \\\n",
- "1379860 14 3 6 US 1.000000 \n",
- "377548 1 4 6 US 1.000000 \n",
- "473976 8 4 20 US -0.866025 \n",
- "1246683 15 4 8 US 0.866025 \n",
- "1152261 9 5 4 US 0.866025 \n",
- "... ... ... ... ... ... \n",
- "998273 24 5 22 US -0.500000 \n",
- "857200 3 5 20 US -0.866025 \n",
- "607768 18 6 16 US -0.866025 \n",
- "78687 6 1 9 US 0.707107 \n",
- "141672 14 2 16 US -0.866025 \n",
- "\n",
- " hr_cos dy_sin dy_cos \n",
- "1379860 6.123234e-17 0.433884 -0.900969 \n",
- "377548 6.123234e-17 -0.433884 -0.900969 \n",
- "473976 5.000000e-01 -0.433884 -0.900969 \n",
- "1246683 -5.000000e-01 -0.433884 -0.900969 \n",
- "1152261 5.000000e-01 -0.974928 -0.222521 \n",
- "... ... ... ... \n",
- "998273 8.660254e-01 -0.974928 -0.222521 \n",
- "857200 5.000000e-01 -0.974928 -0.222521 \n",
- "607768 -5.000000e-01 -0.781831 0.623490 \n",
- "78687 -7.071068e-01 0.781831 0.623490 \n",
- "141672 -5.000000e-01 0.974928 -0.222521 \n",
- "\n",
- "[24000 rows x 32 columns]"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "def build_time_features(vector):\n",
- " pickup_datetime = vector[0]\n",
- " month_num = pickup_datetime.month\n",
- " day_of_month = pickup_datetime.day\n",
- " day_of_week = pickup_datetime.weekday()\n",
- " hour_of_day = pickup_datetime.hour\n",
- " country_code = \"US\"\n",
- " hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n",
- " hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n",
- " dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n",
- " dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n",
- " \n",
- " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n",
- "\n",
- "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n",
- "green_taxi_df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
+ "metadata": {}
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " vendorID | \n",
- " lpepPickupDatetime | \n",
- " passengerCount | \n",
- " tripDistance | \n",
- " pickupLongitude | \n",
- " pickupLatitude | \n",
- " dropoffLongitude | \n",
- " dropoffLatitude | \n",
- " totalAmount | \n",
- " month_num | \n",
- " day_of_month | \n",
- " day_of_week | \n",
- " hour_of_day | \n",
- " country_code | \n",
- " hr_sin | \n",
- " hr_cos | \n",
- " dy_sin | \n",
- " dy_cos | \n",
- " datetime | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1379860 | \n",
- " 2 | \n",
- " 2016-01-14 06:39:00 | \n",
- " 1 | \n",
- " 1.23 | \n",
- " -73.911827 | \n",
- " 40.775372 | \n",
- " -73.899635 | \n",
- " 40.768333 | \n",
- " 7.3 | \n",
- " 1 | \n",
- " 14 | \n",
- " 3 | \n",
- " 6 | \n",
- " US | \n",
- " 1.000000 | \n",
- " 6.123234e-17 | \n",
- " 0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-14 | \n",
- "
\n",
- " \n",
- " 377548 | \n",
- " 2 | \n",
- " 2016-01-01 06:22:01 | \n",
- " 5 | \n",
- " 0.91 | \n",
- " -73.962044 | \n",
- " 40.709797 | \n",
- " -73.946716 | \n",
- " 40.706902 | \n",
- " 6.3 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 6 | \n",
- " US | \n",
- " 1.000000 | \n",
- " 6.123234e-17 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-01 | \n",
- "
\n",
- " \n",
- " 473976 | \n",
- " 2 | \n",
- " 2016-01-08 20:55:49 | \n",
- " 6 | \n",
- " 3.42 | \n",
- " -73.904823 | \n",
- " 40.741776 | \n",
- " -73.878815 | \n",
- " 40.717625 | \n",
- " 12.8 | \n",
- " 1 | \n",
- " 8 | \n",
- " 4 | \n",
- " 20 | \n",
- " US | \n",
- " -0.866025 | \n",
- " 5.000000e-01 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-08 | \n",
- "
\n",
- " \n",
- " 1246683 | \n",
- " 2 | \n",
- " 2016-01-15 08:27:41 | \n",
- " 1 | \n",
- " 3.99 | \n",
- " -73.911484 | \n",
- " 40.854698 | \n",
- " -73.881821 | \n",
- " 40.882130 | \n",
- " 15.8 | \n",
- " 1 | \n",
- " 15 | \n",
- " 4 | \n",
- " 8 | \n",
- " US | \n",
- " 0.866025 | \n",
- " -5.000000e-01 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-15 | \n",
- "
\n",
- " \n",
- " 1152261 | \n",
- " 2 | \n",
- " 2016-01-09 04:35:21 | \n",
- " 1 | \n",
- " 0.98 | \n",
- " -73.921776 | \n",
- " 40.767071 | \n",
- " -73.933136 | \n",
- " 40.774567 | \n",
- " 8.0 | \n",
- " 1 | \n",
- " 9 | \n",
- " 5 | \n",
- " 4 | \n",
- " US | \n",
- " 0.866025 | \n",
- " 5.000000e-01 | \n",
- " -0.974928 | \n",
- " -0.222521 | \n",
- " 2016-01-09 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
+ "cell_type": "code",
+ "source": [
+ "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
+ "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
+ "\n",
+ "green_taxi_df = pd.concat([NycTlcGreen(start + relativedelta(months=x), end + relativedelta(months=x)) \\\n",
+ " .to_pandas_dataframe().sample(2000) for x in range(12)])\n",
+ "green_taxi_df"
],
- "text/plain": [
- " vendorID lpepPickupDatetime passengerCount tripDistance \\\n",
- "1379860 2 2016-01-14 06:39:00 1 1.23 \n",
- "377548 2 2016-01-01 06:22:01 5 0.91 \n",
- "473976 2 2016-01-08 20:55:49 6 3.42 \n",
- "1246683 2 2016-01-15 08:27:41 1 3.99 \n",
- "1152261 2 2016-01-09 04:35:21 1 0.98 \n",
- "\n",
- " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n",
- "1379860 -73.911827 40.775372 -73.899635 40.768333 \n",
- "377548 -73.962044 40.709797 -73.946716 40.706902 \n",
- "473976 -73.904823 40.741776 -73.878815 40.717625 \n",
- "1246683 -73.911484 40.854698 -73.881821 40.882130 \n",
- "1152261 -73.921776 40.767071 -73.933136 40.774567 \n",
- "\n",
- " totalAmount month_num day_of_month day_of_week hour_of_day \\\n",
- "1379860 7.3 1 14 3 6 \n",
- "377548 6.3 1 1 4 6 \n",
- "473976 12.8 1 8 4 20 \n",
- "1246683 15.8 1 15 4 8 \n",
- "1152261 8.0 1 9 5 4 \n",
- "\n",
- " country_code hr_sin hr_cos dy_sin dy_cos datetime \n",
- "1379860 US 1.000000 6.123234e-17 0.433884 -0.900969 2016-01-14 \n",
- "377548 US 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n",
- "473976 US -0.866025 5.000000e-01 -0.433884 -0.900969 2016-01-08 \n",
- "1246683 US 0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-15 \n",
- "1152261 US 0.866025 5.000000e-01 -0.974928 -0.222521 2016-01-09 "
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
- " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n",
- " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n",
- "\n",
- "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n",
- "\n",
- "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n",
- "green_taxi_df.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Enrich with Holiday Data\n",
- "\n",
- "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "[Info] read from /tmp/tmpm9erjg7h/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=1/part-00119-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2689-1.c000.snappy.parquet\n[Info] read from /tmp/tmp9svrtdy0/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=2/part-00060-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2630-2.c000.snappy.parquet\n[Info] read from /tmp/tmp2h8fjxvw/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=3/part-00196-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2766-1.c000.snappy.parquet\n[Info] read from /tmp/tmpfvo7iz0i/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=4/part-00121-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2691-1.c000.snappy.parquet\n[Info] read from /tmp/tmpjlfw4v7s/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=5/part-00044-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2614-1.c000.snappy.parquet\n[Info] read from /tmp/tmpycf1ze5d/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=6/part-00108-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2678-1.c000.snappy.parquet\n[Info] read from /tmp/tmp5z2yg073/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=7/part-00020-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2590-2.c000.snappy.parquet\n[Info] read from /tmp/tmpu_jpgy_x/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=8/part-00172-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2742-2.c000.snappy.parquet\n[Info] read from /tmp/tmpvlj9g932/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=9/part-00076-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2646-1.c000.snappy.parquet\n[Info] read from 
/tmp/tmp21b4rgp5/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=10/part-00090-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2660-1.c000.snappy.parquet\n[Info] read from /tmp/tmpfomnswl0/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=11/part-00021-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2591-1.c000.snappy.parquet\n[Info] read from /tmp/tmp35xg_y0t/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/green/puYear=2016/puMonth=12/part-00116-tid-4753095944193949832-fee7e113-666d-4114-9fcb-bcd3046479f3-2686-1.c000.snappy.parquet\n"
+ },
+ {
+ "output_type": "execute_result",
+ "execution_count": 2,
+ "data": {
+ "text/plain": " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n1312085 2 2016-01-03 11:10:13 2016-01-03 11:14:13 1 \n109916 2 2016-01-19 08:11:09 2016-01-19 08:16:29 1 \n25029 2 2016-01-02 11:47:40 2016-01-02 11:52:29 1 \n629848 2 2016-01-17 18:31:30 2016-01-17 18:42:32 1 \n139651 2 2016-01-23 00:00:17 2016-01-23 00:05:10 1 \n... ... ... ... ... \n44592 1 2016-12-05 08:14:48 2016-12-05 08:39:17 1 \n731527 2 2016-12-24 00:07:40 2016-12-24 00:10:19 1 \n501002 1 2016-12-18 05:47:22 2016-12-18 06:10:34 1 \n700564 2 2016-12-23 12:49:47 2016-12-23 13:00:52 1 \n646881 2 2016-12-22 00:01:44 2016-12-22 00:26:41 1 \n\n tripDistance puLocationId doLocationId pickupLongitude \\\n1312085 0.83 None None -73.939774 \n109916 0.85 None None -73.925629 \n25029 0.81 None None -73.973312 \n629848 2.21 None None -73.928474 \n139651 0.60 None None -73.953415 \n... ... ... ... ... \n44592 3.70 49 71 NaN \n731527 0.47 255 255 NaN \n501002 8.40 116 79 NaN \n700564 2.63 166 236 NaN \n646881 4.77 37 40 NaN \n\n pickupLatitude dropoffLongitude ... paymentType fareAmount extra \\\n1312085 40.679844 -73.930649 ... 2 5.0 0.0 \n109916 40.761787 -73.937866 ... 2 5.5 0.0 \n25029 40.689678 -73.984985 ... 1 5.0 0.0 \n629848 40.687298 -73.940605 ... 2 9.5 0.0 \n139651 40.706947 -73.948738 ... 2 5.0 0.5 \n... ... ... ... ... ... ... \n44592 NaN NaN ... 2 17.5 0.0 \n731527 NaN NaN ... 1 4.0 0.5 \n501002 NaN NaN ... 1 27.0 0.5 \n700564 NaN NaN ... 1 10.5 0.0 \n646881 NaN NaN ... 1 18.5 0.5 \n\n mtaTax improvementSurcharge tipAmount tollsAmount ehailFee \\\n1312085 0.5 0.3 0.00 0.0 NaN \n109916 0.5 0.3 0.00 0.0 NaN \n25029 0.5 0.3 1.16 0.0 NaN \n629848 0.5 0.3 0.00 0.0 NaN \n139651 0.5 0.3 0.00 0.0 NaN \n... ... ... ... ... ... 
\n44592 0.5 0.3 0.00 0.0 NaN \n731527 0.5 0.3 1.06 0.0 NaN \n501002 0.5 0.3 5.65 0.0 NaN \n700564 0.5 0.3 2.00 0.0 NaN \n646881 0.5 0.3 2.97 0.0 NaN \n\n totalAmount tripType \n1312085 5.80 1.0 \n109916 6.30 1.0 \n25029 6.96 1.0 \n629848 10.30 1.0 \n139651 6.30 1.0 \n... ... ... \n44592 18.30 1.0 \n731527 6.36 1.0 \n501002 33.95 1.0 \n700564 13.30 1.0 \n646881 24.72 1.0 \n\n[24000 rows x 23 columns]",
+ "text/html": "\n\n
\n \n \n | \n vendorID | \n lpepPickupDatetime | \n lpepDropoffDatetime | \n passengerCount | \n tripDistance | \n puLocationId | \n doLocationId | \n pickupLongitude | \n pickupLatitude | \n dropoffLongitude | \n ... | \n paymentType | \n fareAmount | \n extra | \n mtaTax | \n improvementSurcharge | \n tipAmount | \n tollsAmount | \n ehailFee | \n totalAmount | \n tripType | \n
\n \n \n \n 1312085 | \n 2 | \n 2016-01-03 11:10:13 | \n 2016-01-03 11:14:13 | \n 1 | \n 0.83 | \n None | \n None | \n -73.939774 | \n 40.679844 | \n -73.930649 | \n ... | \n 2 | \n 5.0 | \n 0.0 | \n 0.5 | \n 0.3 | \n 0.00 | \n 0.0 | \n NaN | \n 5.80 | \n 1.0 | \n
\n \n 109916 | \n 2 | \n 2016-01-19 08:11:09 | \n 2016-01-19 08:16:29 | \n 1 | \n 0.85 | \n None | \n None | \n -73.925629 | \n 40.761787 | \n -73.937866 | \n ... | \n 2 | \n 5.5 | \n 0.0 | \n 0.5 | \n 0.3 | \n 0.00 | \n 0.0 | \n NaN | \n 6.30 | \n 1.0 | \n
\n \n 25029 | \n 2 | \n 2016-01-02 11:47:40 | \n 2016-01-02 11:52:29 | \n 1 | \n 0.81 | \n None | \n None | \n -73.973312 | \n 40.689678 | \n -73.984985 | \n ... | \n 1 | \n 5.0 | \n 0.0 | \n 0.5 | \n 0.3 | \n 1.16 | \n 0.0 | \n NaN | \n 6.96 | \n 1.0 | \n
\n \n 629848 | \n 2 | \n 2016-01-17 18:31:30 | \n 2016-01-17 18:42:32 | \n 1 | \n 2.21 | \n None | \n None | \n -73.928474 | \n 40.687298 | \n -73.940605 | \n ... | \n 2 | \n 9.5 | \n 0.0 | \n 0.5 | \n 0.3 | \n 0.00 | \n 0.0 | \n NaN | \n 10.30 | \n 1.0 | \n
\n \n 139651 | \n 2 | \n 2016-01-23 00:00:17 | \n 2016-01-23 00:05:10 | \n 1 | \n 0.60 | \n None | \n None | \n -73.953415 | \n 40.706947 | \n -73.948738 | \n ... | \n 2 | \n 5.0 | \n 0.5 | \n 0.5 | \n 0.3 | \n 0.00 | \n 0.0 | \n NaN | \n 6.30 | \n 1.0 | \n
\n \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n 44592 | \n 1 | \n 2016-12-05 08:14:48 | \n 2016-12-05 08:39:17 | \n 1 | \n 3.70 | \n 49 | \n 71 | \n NaN | \n NaN | \n NaN | \n ... | \n 2 | \n 17.5 | \n 0.0 | \n 0.5 | \n 0.3 | \n 0.00 | \n 0.0 | \n NaN | \n 18.30 | \n 1.0 | \n
\n \n 731527 | \n 2 | \n 2016-12-24 00:07:40 | \n 2016-12-24 00:10:19 | \n 1 | \n 0.47 | \n 255 | \n 255 | \n NaN | \n NaN | \n NaN | \n ... | \n 1 | \n 4.0 | \n 0.5 | \n 0.5 | \n 0.3 | \n 1.06 | \n 0.0 | \n NaN | \n 6.36 | \n 1.0 | \n
\n \n 501002 | \n 1 | \n 2016-12-18 05:47:22 | \n 2016-12-18 06:10:34 | \n 1 | \n 8.40 | \n 116 | \n 79 | \n NaN | \n NaN | \n NaN | \n ... | \n 1 | \n 27.0 | \n 0.5 | \n 0.5 | \n 0.3 | \n 5.65 | \n 0.0 | \n NaN | \n 33.95 | \n 1.0 | \n
\n \n 700564 | \n 2 | \n 2016-12-23 12:49:47 | \n 2016-12-23 13:00:52 | \n 1 | \n 2.63 | \n 166 | \n 236 | \n NaN | \n NaN | \n NaN | \n ... | \n 1 | \n 10.5 | \n 0.0 | \n 0.5 | \n 0.3 | \n 2.00 | \n 0.0 | \n NaN | \n 13.30 | \n 1.0 | \n
\n \n 646881 | \n 2 | \n 2016-12-22 00:01:44 | \n 2016-12-22 00:26:41 | \n 1 | \n 4.77 | \n 37 | \n 40 | \n NaN | \n NaN | \n NaN | \n ... | \n 1 | \n 18.5 | \n 0.5 | \n 0.5 | \n 0.3 | \n 2.97 | \n 0.0 | \n NaN | \n 24.72 | \n 1.0 | \n
\n \n
\n
24000 rows × 23 columns
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 2,
+ "metadata": {
+ "gather": {
+ "logged": 1681193755843
+ }
+ }
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpya4i60qp\\https%3A\\%2Fazureopendatastorage.azurefd.net\\holidaydatacontainer\\Processed\\part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n"
- ]
+ "cell_type": "markdown",
+ "source": [
+ "Now that the initial data is loaded, define a function to create various time-based features from the pickup datetime field. This will create new fields for the month number, day of month, day of week, and hour of day. From those, we calculate the sine and cosine transformations to capture the cyclical nature of the variable which will allow the model to factor in time-based seasonality. This function also adds a static feature for the country code to join the holiday data. Use the apply() function on the dataframe to iteratively apply this function to each row in the dataframe."
+ ],
+ "metadata": {}
},
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " countryOrRegion | \n",
- " holidayName | \n",
- " normalizeHolidayName | \n",
- " isPaidTimeOff | \n",
- " countryRegionCode | \n",
- " date | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 19375 | \n",
- " Argentina | \n",
- " Año Nuevo [New Year's Day] | \n",
- " Año Nuevo [New Year's Day] | \n",
- " None | \n",
- " AR | \n",
- " 2008-01-01 | \n",
- "
\n",
- " \n",
- " 19376 | \n",
- " Australia | \n",
- " New Year's Day | \n",
- " New Year's Day | \n",
- " None | \n",
- " AU | \n",
- " 2008-01-01 | \n",
- "
\n",
- " \n",
- " 19377 | \n",
- " Austria | \n",
- " Neujahr | \n",
- " Neujahr | \n",
- " None | \n",
- " AT | \n",
- " 2008-01-01 | \n",
- "
\n",
- " \n",
- " 19378 | \n",
- " Belarus | \n",
- " Новый год | \n",
- " Новый год | \n",
- " None | \n",
- " BY | \n",
- " 2008-01-01 | \n",
- "
\n",
- " \n",
- " 19379 | \n",
- " Belgium | \n",
- " Nieuwjaarsdag | \n",
- " Nieuwjaarsdag | \n",
- " None | \n",
- " BE | \n",
- " 2008-01-01 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
+ "cell_type": "code",
+ "source": [
+ "def build_time_features(vector):\n",
+ " pickup_datetime = vector[0]\n",
+ " month_num = pickup_datetime.month\n",
+ " day_of_month = pickup_datetime.day\n",
+ " day_of_week = pickup_datetime.weekday()\n",
+ " hour_of_day = pickup_datetime.hour\n",
+ " country_code = \"US\"\n",
+ " hr_sin = np.sin(hour_of_day*(2.*np.pi/24))\n",
+ " hr_cos = np.cos(hour_of_day*(2.*np.pi/24))\n",
+ " dy_sin = np.sin(day_of_week*(2.*np.pi/7))\n",
+ " dy_cos = np.cos(day_of_week*(2.*np.pi/7))\n",
+ " \n",
+ " return pd.Series((month_num, day_of_month, day_of_week, hour_of_day, country_code, hr_sin, hr_cos, dy_sin, dy_cos))\n",
+ "\n",
+ "green_taxi_df[[\"month_num\", \"day_of_month\",\"day_of_week\", \"hour_of_day\", \"country_code\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]] = green_taxi_df[[\"lpepPickupDatetime\"]].apply(build_time_features, axis=1)\n",
+ "green_taxi_df"
],
- "text/plain": [
- " countryOrRegion holidayName normalizeHolidayName \\\n",
- "19375 Argentina Año Nuevo [New Year's Day] Año Nuevo [New Year's Day] \n",
- "19376 Australia New Year's Day New Year's Day \n",
- "19377 Austria Neujahr Neujahr \n",
- "19378 Belarus Новый год Новый год \n",
- "19379 Belgium Nieuwjaarsdag Nieuwjaarsdag \n",
- "\n",
- " isPaidTimeOff countryRegionCode date \n",
- "19375 None AR 2008-01-01 \n",
- "19376 None AU 2008-01-01 \n",
- "19377 None AT 2008-01-01 \n",
- "19378 None BY 2008-01-01 \n",
- "19379 None BE 2008-01-01 "
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from azureml.opendatasets import PublicHolidays\n",
- "\n",
- "# call default constructor to download full dataset\n",
- "holidays_df = PublicHolidays().to_pandas_dataframe()\n",
- "holidays_df.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\\\"US\\\"`. Preview the data to verify that they were merged correctly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 3,
+ "data": {
+ "text/plain": " vendorID lpepPickupDatetime lpepDropoffDatetime passengerCount \\\n1312085 2 2016-01-03 11:10:13 2016-01-03 11:14:13 1 \n109916 2 2016-01-19 08:11:09 2016-01-19 08:16:29 1 \n25029 2 2016-01-02 11:47:40 2016-01-02 11:52:29 1 \n629848 2 2016-01-17 18:31:30 2016-01-17 18:42:32 1 \n139651 2 2016-01-23 00:00:17 2016-01-23 00:05:10 1 \n... ... ... ... ... \n44592 1 2016-12-05 08:14:48 2016-12-05 08:39:17 1 \n731527 2 2016-12-24 00:07:40 2016-12-24 00:10:19 1 \n501002 1 2016-12-18 05:47:22 2016-12-18 06:10:34 1 \n700564 2 2016-12-23 12:49:47 2016-12-23 13:00:52 1 \n646881 2 2016-12-22 00:01:44 2016-12-22 00:26:41 1 \n\n tripDistance puLocationId doLocationId pickupLongitude \\\n1312085 0.83 None None -73.939774 \n109916 0.85 None None -73.925629 \n25029 0.81 None None -73.973312 \n629848 2.21 None None -73.928474 \n139651 0.60 None None -73.953415 \n... ... ... ... ... \n44592 3.70 49 71 NaN \n731527 0.47 255 255 NaN \n501002 8.40 116 79 NaN \n700564 2.63 166 236 NaN \n646881 4.77 37 40 NaN \n\n pickupLatitude dropoffLongitude ... tripType month_num \\\n1312085 40.679844 -73.930649 ... 1.0 1 \n109916 40.761787 -73.937866 ... 1.0 1 \n25029 40.689678 -73.984985 ... 1.0 1 \n629848 40.687298 -73.940605 ... 1.0 1 \n139651 40.706947 -73.948738 ... 1.0 1 \n... ... ... ... ... ... \n44592 NaN NaN ... 1.0 12 \n731527 NaN NaN ... 1.0 12 \n501002 NaN NaN ... 1.0 12 \n700564 NaN NaN ... 1.0 12 \n646881 NaN NaN ... 1.0 12 \n\n day_of_month day_of_week hour_of_day country_code hr_sin \\\n1312085 3 6 11 US 2.588190e-01 \n109916 19 1 8 US 8.660254e-01 \n25029 2 5 11 US 2.588190e-01 \n629848 17 6 18 US -1.000000e+00 \n139651 23 5 0 US 0.000000e+00 \n... ... ... ... ... ... 
\n44592 5 0 8 US 8.660254e-01 \n731527 24 5 0 US 0.000000e+00 \n501002 18 6 5 US 9.659258e-01 \n700564 23 4 12 US 1.224647e-16 \n646881 22 3 0 US 0.000000e+00 \n\n hr_cos dy_sin dy_cos \n1312085 -9.659258e-01 -0.781831 0.623490 \n109916 -5.000000e-01 0.781831 0.623490 \n25029 -9.659258e-01 -0.974928 -0.222521 \n629848 -1.836970e-16 -0.781831 0.623490 \n139651 1.000000e+00 -0.974928 -0.222521 \n... ... ... ... \n44592 -5.000000e-01 0.000000 1.000000 \n731527 1.000000e+00 -0.974928 -0.222521 \n501002 2.588190e-01 -0.781831 0.623490 \n700564 -1.000000e+00 -0.433884 -0.900969 \n646881 1.000000e+00 0.433884 -0.900969 \n\n[24000 rows x 32 columns]",
+ "text/html": "\n\n
\n \n \n | \n vendorID | \n lpepPickupDatetime | \n lpepDropoffDatetime | \n passengerCount | \n tripDistance | \n puLocationId | \n doLocationId | \n pickupLongitude | \n pickupLatitude | \n dropoffLongitude | \n ... | \n tripType | \n month_num | \n day_of_month | \n day_of_week | \n hour_of_day | \n country_code | \n hr_sin | \n hr_cos | \n dy_sin | \n dy_cos | \n
\n \n \n \n 1312085 | \n 2 | \n 2016-01-03 11:10:13 | \n 2016-01-03 11:14:13 | \n 1 | \n 0.83 | \n None | \n None | \n -73.939774 | \n 40.679844 | \n -73.930649 | \n ... | \n 1.0 | \n 1 | \n 3 | \n 6 | \n 11 | \n US | \n 2.588190e-01 | \n -9.659258e-01 | \n -0.781831 | \n 0.623490 | \n
\n \n 109916 | \n 2 | \n 2016-01-19 08:11:09 | \n 2016-01-19 08:16:29 | \n 1 | \n 0.85 | \n None | \n None | \n -73.925629 | \n 40.761787 | \n -73.937866 | \n ... | \n 1.0 | \n 1 | \n 19 | \n 1 | \n 8 | \n US | \n 8.660254e-01 | \n -5.000000e-01 | \n 0.781831 | \n 0.623490 | \n
\n \n 25029 | \n 2 | \n 2016-01-02 11:47:40 | \n 2016-01-02 11:52:29 | \n 1 | \n 0.81 | \n None | \n None | \n -73.973312 | \n 40.689678 | \n -73.984985 | \n ... | \n 1.0 | \n 1 | \n 2 | \n 5 | \n 11 | \n US | \n 2.588190e-01 | \n -9.659258e-01 | \n -0.974928 | \n -0.222521 | \n
\n \n 629848 | \n 2 | \n 2016-01-17 18:31:30 | \n 2016-01-17 18:42:32 | \n 1 | \n 2.21 | \n None | \n None | \n -73.928474 | \n 40.687298 | \n -73.940605 | \n ... | \n 1.0 | \n 1 | \n 17 | \n 6 | \n 18 | \n US | \n -1.000000e+00 | \n -1.836970e-16 | \n -0.781831 | \n 0.623490 | \n
\n \n 139651 | \n 2 | \n 2016-01-23 00:00:17 | \n 2016-01-23 00:05:10 | \n 1 | \n 0.60 | \n None | \n None | \n -73.953415 | \n 40.706947 | \n -73.948738 | \n ... | \n 1.0 | \n 1 | \n 23 | \n 5 | \n 0 | \n US | \n 0.000000e+00 | \n 1.000000e+00 | \n -0.974928 | \n -0.222521 | \n
\n \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n 44592 | \n 1 | \n 2016-12-05 08:14:48 | \n 2016-12-05 08:39:17 | \n 1 | \n 3.70 | \n 49 | \n 71 | \n NaN | \n NaN | \n NaN | \n ... | \n 1.0 | \n 12 | \n 5 | \n 0 | \n 8 | \n US | \n 8.660254e-01 | \n -5.000000e-01 | \n 0.000000 | \n 1.000000 | \n
\n \n 731527 | \n 2 | \n 2016-12-24 00:07:40 | \n 2016-12-24 00:10:19 | \n 1 | \n 0.47 | \n 255 | \n 255 | \n NaN | \n NaN | \n NaN | \n ... | \n 1.0 | \n 12 | \n 24 | \n 5 | \n 0 | \n US | \n 0.000000e+00 | \n 1.000000e+00 | \n -0.974928 | \n -0.222521 | \n
\n \n 501002 | \n 1 | \n 2016-12-18 05:47:22 | \n 2016-12-18 06:10:34 | \n 1 | \n 8.40 | \n 116 | \n 79 | \n NaN | \n NaN | \n NaN | \n ... | \n 1.0 | \n 12 | \n 18 | \n 6 | \n 5 | \n US | \n 9.659258e-01 | \n 2.588190e-01 | \n -0.781831 | \n 0.623490 | \n
\n \n 700564 | \n 2 | \n 2016-12-23 12:49:47 | \n 2016-12-23 13:00:52 | \n 1 | \n 2.63 | \n 166 | \n 236 | \n NaN | \n NaN | \n NaN | \n ... | \n 1.0 | \n 12 | \n 23 | \n 4 | \n 12 | \n US | \n 1.224647e-16 | \n -1.000000e+00 | \n -0.433884 | \n -0.900969 | \n
\n \n 646881 | \n 2 | \n 2016-12-22 00:01:44 | \n 2016-12-22 00:26:41 | \n 1 | \n 4.77 | \n 37 | \n 40 | \n NaN | \n NaN | \n NaN | \n ... | \n 1.0 | \n 12 | \n 22 | \n 3 | \n 0 | \n US | \n 0.000000e+00 | \n 1.000000e+00 | \n 0.433884 | \n -0.900969 | \n
\n \n
\n
24000 rows × 32 columns
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 3,
+ "metadata": {
+ "gather": {
+ "logged": 1681193760347
+ }
+ }
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " vendorID | \n",
- " lpepPickupDatetime | \n",
- " passengerCount | \n",
- " tripDistance | \n",
- " pickupLongitude | \n",
- " pickupLatitude | \n",
- " dropoffLongitude | \n",
- " dropoffLatitude | \n",
- " totalAmount | \n",
- " month_num | \n",
- " ... | \n",
- " day_of_week | \n",
- " hour_of_day | \n",
- " country_code | \n",
- " hr_sin | \n",
- " hr_cos | \n",
- " dy_sin | \n",
- " dy_cos | \n",
- " datetime | \n",
- " normalizeHolidayName | \n",
- " isPaidTimeOff | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " 2 | \n",
- " 2016-01-01 06:22:01 | \n",
- " 5 | \n",
- " 0.91 | \n",
- " -73.962044 | \n",
- " 40.709797 | \n",
- " -73.946716 | \n",
- " 40.706902 | \n",
- " 6.30 | \n",
- " 1 | \n",
- " ... | \n",
- " 4 | \n",
- " 6 | \n",
- " US | \n",
- " 1.000000 | \n",
- " 6.123234e-17 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-01 | \n",
- " New Year's Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 25 | \n",
- " 2 | \n",
- " 2016-01-01 06:14:43 | \n",
- " 1 | \n",
- " 2.44 | \n",
- " -73.993576 | \n",
- " 40.681519 | \n",
- " -73.999596 | \n",
- " 40.655930 | \n",
- " 10.30 | \n",
- " 1 | \n",
- " ... | \n",
- " 4 | \n",
- " 6 | \n",
- " US | \n",
- " 1.000000 | \n",
- " 6.123234e-17 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-01 | \n",
- " New Year's Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 27 | \n",
- " 2 | \n",
- " 2016-01-01 16:06:33 | \n",
- " 1 | \n",
- " 4.57 | \n",
- " -73.962509 | \n",
- " 40.687862 | \n",
- " -73.981361 | \n",
- " 40.732758 | \n",
- " 22.25 | \n",
- " 1 | \n",
- " ... | \n",
- " 4 | \n",
- " 16 | \n",
- " US | \n",
- " -0.866025 | \n",
- " -5.000000e-01 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-01 | \n",
- " New Year's Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 44 | \n",
- " 2 | \n",
- " 2016-01-18 11:46:27 | \n",
- " 1 | \n",
- " 16.10 | \n",
- " -73.925522 | \n",
- " 40.827877 | \n",
- " -73.934982 | \n",
- " 40.681278 | \n",
- " 50.30 | \n",
- " 1 | \n",
- " ... | \n",
- " 0 | \n",
- " 11 | \n",
- " US | \n",
- " 0.258819 | \n",
- " -9.659258e-01 | \n",
- " 0.000000 | \n",
- " 1.000000 | \n",
- " 2016-01-18 | \n",
- " Martin Luther King Jr. Day | \n",
- " None | \n",
- "
\n",
- " \n",
- " 45 | \n",
- " 2 | \n",
- " 2016-01-01 10:41:39 | \n",
- " 1 | \n",
- " 3.33 | \n",
- " -73.962891 | \n",
- " 40.711971 | \n",
- " -73.918060 | \n",
- " 40.736832 | \n",
- " 12.80 | \n",
- " 1 | \n",
- " ... | \n",
- " 4 | \n",
- " 10 | \n",
- " US | \n",
- " 0.500000 | \n",
- " -8.660254e-01 | \n",
- " -0.433884 | \n",
- " -0.900969 | \n",
- " 2016-01-01 | \n",
- " New Year's Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 23868 | \n",
- " 2 | \n",
- " 2016-12-25 00:21:23 | \n",
- " 1 | \n",
- " 2.36 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 12.30 | \n",
- " 12 | \n",
- " ... | \n",
- " 6 | \n",
- " 0 | \n",
- " US | \n",
- " 0.000000 | \n",
- " 1.000000e+00 | \n",
- " -0.781831 | \n",
- " 0.623490 | \n",
- " 2016-12-25 | \n",
- " Christmas Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 23892 | \n",
- " 2 | \n",
- " 2016-12-25 14:05:48 | \n",
- " 1 | \n",
- " 1.05 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 12.30 | \n",
- " 12 | \n",
- " ... | \n",
- " 6 | \n",
- " 14 | \n",
- " US | \n",
- " -0.500000 | \n",
- " -8.660254e-01 | \n",
- " -0.781831 | \n",
- " 0.623490 | \n",
- " 2016-12-25 | \n",
- " Christmas Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 23942 | \n",
- " 1 | \n",
- " 2016-12-26 01:43:57 | \n",
- " 1 | \n",
- " 0.80 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 7.55 | \n",
- " 12 | \n",
- " ... | \n",
- " 0 | \n",
- " 1 | \n",
- " US | \n",
- " 0.258819 | \n",
- " 9.659258e-01 | \n",
- " 0.000000 | \n",
- " 1.000000 | \n",
- " 2016-12-26 | \n",
- " Christmas Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 23978 | \n",
- " 2 | \n",
- " 2016-12-26 03:38:33 | \n",
- " 1 | \n",
- " 1.55 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 8.30 | \n",
- " 12 | \n",
- " ... | \n",
- " 0 | \n",
- " 3 | \n",
- " US | \n",
- " 0.707107 | \n",
- " 7.071068e-01 | \n",
- " 0.000000 | \n",
- " 1.000000 | \n",
- " 2016-12-26 | \n",
- " Christmas Day | \n",
- " True | \n",
- "
\n",
- " \n",
- " 23985 | \n",
- " 2 | \n",
- " 2016-12-26 22:12:18 | \n",
- " 1 | \n",
- " 3.77 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 16.25 | \n",
- " 12 | \n",
- " ... | \n",
- " 0 | \n",
- " 22 | \n",
- " US | \n",
- " -0.500000 | \n",
- " 8.660254e-01 | \n",
- " 0.000000 | \n",
- " 1.000000 | \n",
- " 2016-12-26 | \n",
- " Christmas Day | \n",
- " True | \n",
- "
\n",
- " \n",
- "
\n",
- "
673 rows × 21 columns
\n",
- "
"
+ "cell_type": "markdown",
+ "source": [
+ "Remove some of the columns that you won't need for modeling or additional feature building. Rename the time field for pickup time, and additionally convert the time to midnight using `pandas.Series.dt.normalize`. This is done to all time features so that the datetime column can be later used as a key when joining datasets together at a daily level of granularity."
],
- "text/plain": [
- " vendorID lpepPickupDatetime passengerCount tripDistance \\\n",
- "1 2 2016-01-01 06:22:01 5 0.91 \n",
- "25 2 2016-01-01 06:14:43 1 2.44 \n",
- "27 2 2016-01-01 16:06:33 1 4.57 \n",
- "44 2 2016-01-18 11:46:27 1 16.10 \n",
- "45 2 2016-01-01 10:41:39 1 3.33 \n",
- "... ... ... ... ... \n",
- "23868 2 2016-12-25 00:21:23 1 2.36 \n",
- "23892 2 2016-12-25 14:05:48 1 1.05 \n",
- "23942 1 2016-12-26 01:43:57 1 0.80 \n",
- "23978 2 2016-12-26 03:38:33 1 1.55 \n",
- "23985 2 2016-12-26 22:12:18 1 3.77 \n",
- "\n",
- " pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n",
- "1 -73.962044 40.709797 -73.946716 40.706902 \n",
- "25 -73.993576 40.681519 -73.999596 40.655930 \n",
- "27 -73.962509 40.687862 -73.981361 40.732758 \n",
- "44 -73.925522 40.827877 -73.934982 40.681278 \n",
- "45 -73.962891 40.711971 -73.918060 40.736832 \n",
- "... ... ... ... ... \n",
- "23868 NaN NaN NaN NaN \n",
- "23892 NaN NaN NaN NaN \n",
- "23942 NaN NaN NaN NaN \n",
- "23978 NaN NaN NaN NaN \n",
- "23985 NaN NaN NaN NaN \n",
- "\n",
- " totalAmount month_num ... day_of_week hour_of_day country_code \\\n",
- "1 6.30 1 ... 4 6 US \n",
- "25 10.30 1 ... 4 6 US \n",
- "27 22.25 1 ... 4 16 US \n",
- "44 50.30 1 ... 0 11 US \n",
- "45 12.80 1 ... 4 10 US \n",
- "... ... ... ... ... ... ... \n",
- "23868 12.30 12 ... 6 0 US \n",
- "23892 12.30 12 ... 6 14 US \n",
- "23942 7.55 12 ... 0 1 US \n",
- "23978 8.30 12 ... 0 3 US \n",
- "23985 16.25 12 ... 0 22 US \n",
- "\n",
- " hr_sin hr_cos dy_sin dy_cos datetime \\\n",
- "1 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n",
- "25 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n",
- "27 -0.866025 -5.000000e-01 -0.433884 -0.900969 2016-01-01 \n",
- "44 0.258819 -9.659258e-01 0.000000 1.000000 2016-01-18 \n",
- "45 0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01 \n",
- "... ... ... ... ... ... \n",
- "23868 0.000000 1.000000e+00 -0.781831 0.623490 2016-12-25 \n",
- "23892 -0.500000 -8.660254e-01 -0.781831 0.623490 2016-12-25 \n",
- "23942 0.258819 9.659258e-01 0.000000 1.000000 2016-12-26 \n",
- "23978 0.707107 7.071068e-01 0.000000 1.000000 2016-12-26 \n",
- "23985 -0.500000 8.660254e-01 0.000000 1.000000 2016-12-26 \n",
- "\n",
- " normalizeHolidayName isPaidTimeOff \n",
- "1 New Year's Day True \n",
- "25 New Year's Day True \n",
- "27 New Year's Day True \n",
- "44 Martin Luther King Jr. Day None \n",
- "45 New Year's Day True \n",
- "... ... ... \n",
- "23868 Christmas Day True \n",
- "23892 Christmas Day True \n",
- "23942 Christmas Day True \n",
- "23978 Christmas Day True \n",
- "23985 Christmas Day True \n",
- "\n",
- "[673 rows x 21 columns]"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n",
- "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n",
- "\n",
- "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n",
- "\n",
- "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n",
- "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Enrich with weather data\n",
- "\n",
- "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
+ "metadata": {}
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpi7x_kizu\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=1\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpxdx6b0cq\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=2\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmp_lv3o8dm\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=3\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv8ysxuyi\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=4\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpo7yuruxk\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=5\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpv9q26rt7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=6\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpvu440niz\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=7\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmphcszahaa\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=8\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpu7hqa5k7\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=9\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpwmkuu32e\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=10\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmpc_6yjsyt\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=11\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n",
- "[Info] read from C:\\Users\\FKHOSH~1\\AppData\\Local\\Temp\\tmptth4kr9r\\https%3A\\%2Fazureopendatastorage.azurefd.net\\isdweatherdatacontainer\\ISDWeather\\year=2016\\month=12\\part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n"
- ]
- }
- ],
- "source": [
- "from azureml.opendatasets import NoaaIsdWeather\n",
- "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
- "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
- "\n",
- "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n",
- " .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [
+ "cell_type": "code",
+ "source": [
+ "columns_to_remove = [\"lpepDropoffDatetime\", \"puLocationId\", \"doLocationId\", \"extra\", \"mtaTax\",\n",
+ " \"improvementSurcharge\", \"tollsAmount\", \"ehailFee\", \"tripType\", \"rateCodeID\", \n",
+ " \"storeAndFwdFlag\", \"paymentType\", \"fareAmount\", \"tipAmount\"]\n",
+ "\n",
+ "green_taxi_df.drop(columns_to_remove, axis=1, inplace=True)\n",
+ "\n",
+ "green_taxi_df[\"datetime\"] = green_taxi_df[\"lpepPickupDatetime\"].dt.normalize()\n",
+ "green_taxi_df.head(5)"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 4,
+ "data": {
+ "text/plain": " vendorID lpepPickupDatetime passengerCount tripDistance \\\n1312085 2 2016-01-03 11:10:13 1 0.83 \n109916 2 2016-01-19 08:11:09 1 0.85 \n25029 2 2016-01-02 11:47:40 1 0.81 \n629848 2 2016-01-17 18:31:30 1 2.21 \n139651 2 2016-01-23 00:00:17 1 0.60 \n\n pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n1312085 -73.939774 40.679844 -73.930649 40.674252 \n109916 -73.925629 40.761787 -73.937866 40.766113 \n25029 -73.973312 40.689678 -73.984985 40.688690 \n629848 -73.928474 40.687298 -73.940605 40.674679 \n139651 -73.953415 40.706947 -73.948738 40.711098 \n\n totalAmount month_num day_of_month day_of_week hour_of_day \\\n1312085 5.80 1 3 6 11 \n109916 6.30 1 19 1 8 \n25029 6.96 1 2 5 11 \n629848 10.30 1 17 6 18 \n139651 6.30 1 23 5 0 \n\n country_code hr_sin hr_cos dy_sin dy_cos datetime \n1312085 US 0.258819 -9.659258e-01 -0.781831 0.623490 2016-01-03 \n109916 US 0.866025 -5.000000e-01 0.781831 0.623490 2016-01-19 \n25029 US 0.258819 -9.659258e-01 -0.974928 -0.222521 2016-01-02 \n629848 US -1.000000 -1.836970e-16 -0.781831 0.623490 2016-01-17 \n139651 US 0.000000 1.000000e+00 -0.974928 -0.222521 2016-01-23 ",
+ "text/html": "\n\n
\n \n \n | \n vendorID | \n lpepPickupDatetime | \n passengerCount | \n tripDistance | \n pickupLongitude | \n pickupLatitude | \n dropoffLongitude | \n dropoffLatitude | \n totalAmount | \n month_num | \n day_of_month | \n day_of_week | \n hour_of_day | \n country_code | \n hr_sin | \n hr_cos | \n dy_sin | \n dy_cos | \n datetime | \n
\n \n \n \n 1312085 | \n 2 | \n 2016-01-03 11:10:13 | \n 1 | \n 0.83 | \n -73.939774 | \n 40.679844 | \n -73.930649 | \n 40.674252 | \n 5.80 | \n 1 | \n 3 | \n 6 | \n 11 | \n US | \n 0.258819 | \n -9.659258e-01 | \n -0.781831 | \n 0.623490 | \n 2016-01-03 | \n
\n \n 109916 | \n 2 | \n 2016-01-19 08:11:09 | \n 1 | \n 0.85 | \n -73.925629 | \n 40.761787 | \n -73.937866 | \n 40.766113 | \n 6.30 | \n 1 | \n 19 | \n 1 | \n 8 | \n US | \n 0.866025 | \n -5.000000e-01 | \n 0.781831 | \n 0.623490 | \n 2016-01-19 | \n
\n \n 25029 | \n 2 | \n 2016-01-02 11:47:40 | \n 1 | \n 0.81 | \n -73.973312 | \n 40.689678 | \n -73.984985 | \n 40.688690 | \n 6.96 | \n 1 | \n 2 | \n 5 | \n 11 | \n US | \n 0.258819 | \n -9.659258e-01 | \n -0.974928 | \n -0.222521 | \n 2016-01-02 | \n
\n \n 629848 | \n 2 | \n 2016-01-17 18:31:30 | \n 1 | \n 2.21 | \n -73.928474 | \n 40.687298 | \n -73.940605 | \n 40.674679 | \n 10.30 | \n 1 | \n 17 | \n 6 | \n 18 | \n US | \n -1.000000 | \n -1.836970e-16 | \n -0.781831 | \n 0.623490 | \n 2016-01-17 | \n
\n \n 139651 | \n 2 | \n 2016-01-23 00:00:17 | \n 1 | \n 0.60 | \n -73.953415 | \n 40.706947 | \n -73.948738 | \n 40.711098 | \n 6.30 | \n 1 | \n 23 | \n 5 | \n 0 | \n US | \n 0.000000 | \n 1.000000e+00 | \n -0.974928 | \n -0.222521 | \n 2016-01-23 | \n
\n \n
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 4,
+ "metadata": {
+ "gather": {
+ "logged": 1681193760703
+ }
+ }
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " wban | \n",
- " latitude | \n",
- " temperature | \n",
- " usaf | \n",
- " datetime | \n",
- " longitude | \n",
- " precipDepth | \n",
- " precipTime | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 204647 | \n",
- " 14732 | \n",
- " 40.783 | \n",
- " 2.8 | \n",
- " 725030 | \n",
- " 2016-01-02 03:00:00 | \n",
- " -73.867 | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 204670 | \n",
- " 14732 | \n",
- " 40.779 | \n",
- " -4.4 | \n",
- " 725030 | \n",
- " 2016-01-22 13:51:00 | \n",
- " -73.880 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 204694 | \n",
- " 14732 | \n",
- " 40.779 | \n",
- " 5.0 | \n",
- " 725030 | \n",
- " 2016-01-08 02:51:00 | \n",
- " -73.880 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 204701 | \n",
- " 14732 | \n",
- " 40.779 | \n",
- " -1.1 | \n",
- " 725030 | \n",
- " 2016-01-04 15:51:00 | \n",
- " -73.880 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 204715 | \n",
- " 14732 | \n",
- " 40.779 | \n",
- " 4.4 | \n",
- " 725030 | \n",
- " 2016-01-01 21:51:00 | \n",
- " -73.880 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1248471 | \n",
- " 94728 | \n",
- " 40.789 | \n",
- " 4.4 | \n",
- " 725053 | \n",
- " 2016-12-23 13:51:00 | \n",
- " -73.967 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1248555 | \n",
- " 94728 | \n",
- " 40.789 | \n",
- " 5.0 | \n",
- " 725053 | \n",
- " 2016-12-12 13:51:00 | \n",
- " -73.967 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1248580 | \n",
- " 94728 | \n",
- " 40.789 | \n",
- " 3.9 | \n",
- " 725053 | \n",
- " 2016-12-18 07:01:00 | \n",
- " -73.967 | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 1248597 | \n",
- " 94728 | \n",
- " 40.789 | \n",
- " 7.8 | \n",
- " 725053 | \n",
- " 2016-12-25 00:51:00 | \n",
- " -73.967 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " 1248600 | \n",
- " 94728 | \n",
- " 40.789 | \n",
- " -2.8 | \n",
- " 725053 | \n",
- " 2016-12-17 11:10:00 | \n",
- " -73.967 | \n",
- " 5.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
55683 rows × 8 columns
\n",
- "
"
+ "cell_type": "markdown",
+ "source": [
+ "### Enrich with Holiday Data\n",
+ "\n",
+ "Now that the taxi data is downloaded and roughly prepared, add in holiday data as additional features. Holiday-specific features will assist model accuracy, as major holidays are times where taxi demand increases dramatically and supply becomes limited. The holiday dataset is relatively small, so fetch the full set by using the `PublicHolidays` class constructor with no parameters for filtering. Preview the data to check the format."
],
- "text/plain": [
- " wban latitude temperature usaf datetime longitude \\\n",
- "204647 14732 40.783 2.8 725030 2016-01-02 03:00:00 -73.867 \n",
- "204670 14732 40.779 -4.4 725030 2016-01-22 13:51:00 -73.880 \n",
- "204694 14732 40.779 5.0 725030 2016-01-08 02:51:00 -73.880 \n",
- "204701 14732 40.779 -1.1 725030 2016-01-04 15:51:00 -73.880 \n",
- "204715 14732 40.779 4.4 725030 2016-01-01 21:51:00 -73.880 \n",
- "... ... ... ... ... ... ... \n",
- "1248471 94728 40.789 4.4 725053 2016-12-23 13:51:00 -73.967 \n",
- "1248555 94728 40.789 5.0 725053 2016-12-12 13:51:00 -73.967 \n",
- "1248580 94728 40.789 3.9 725053 2016-12-18 07:01:00 -73.967 \n",
- "1248597 94728 40.789 7.8 725053 2016-12-25 00:51:00 -73.967 \n",
- "1248600 94728 40.789 -2.8 725053 2016-12-17 11:10:00 -73.967 \n",
- "\n",
- " precipDepth precipTime \n",
- "204647 NaN NaN \n",
- "204670 0.0 1.0 \n",
- "204694 0.0 1.0 \n",
- "204701 0.0 1.0 \n",
- "204715 0.0 1.0 \n",
- "... ... ... \n",
- "1248471 0.0 1.0 \n",
- "1248555 0.0 1.0 \n",
- "1248580 NaN NaN \n",
- "1248597 0.0 1.0 \n",
- "1248600 5.0 1.0 \n",
- "\n",
- "[55683 rows x 8 columns]"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "weather_df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n",
- "\n",
- "\n",
- "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For`temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
+ "metadata": {}
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " precipTime | \n",
- " temperature | \n",
- " precipDepth | \n",
- "
\n",
- " \n",
- " datetime | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 2016-01-01 | \n",
- " 1.0 | \n",
- " 5.197345 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-02 | \n",
- " 1.0 | \n",
- " 2.567857 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-03 | \n",
- " 1.0 | \n",
- " 3.846429 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-04 | \n",
- " 1.0 | \n",
- " 0.123894 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-05 | \n",
- " 6.0 | \n",
- " -7.206250 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-06 | \n",
- " 6.0 | \n",
- " -0.896396 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-07 | \n",
- " 6.0 | \n",
- " 3.180645 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-08 | \n",
- " 1.0 | \n",
- " 4.384091 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " 2016-01-09 | \n",
- " 6.0 | \n",
- " 6.710274 | \n",
- " 3.0 | \n",
- "
\n",
- " \n",
- " 2016-01-10 | \n",
- " 24.0 | \n",
- " 10.943655 | \n",
- " 254.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
+ "cell_type": "code",
+ "source": [
+ "from azureml.opendatasets import PublicHolidays\n",
+ "\n",
+ "# call default constructor to download full dataset\n",
+ "holidays_df = PublicHolidays().to_pandas_dataframe()\n",
+ "holidays_df.head(5)"
],
- "text/plain": [
- " precipTime temperature precipDepth\n",
- "datetime \n",
- "2016-01-01 1.0 5.197345 0.0\n",
- "2016-01-02 1.0 2.567857 0.0\n",
- "2016-01-03 1.0 3.846429 0.0\n",
- "2016-01-04 1.0 0.123894 0.0\n",
- "2016-01-05 6.0 -7.206250 0.0\n",
- "2016-01-06 6.0 -0.896396 0.0\n",
- "2016-01-07 6.0 3.180645 0.0\n",
- "2016-01-08 1.0 4.384091 0.0\n",
- "2016-01-09 6.0 6.710274 3.0\n",
- "2016-01-10 24.0 10.943655 254.0"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n",
- "\n",
- "# group by datetime\n",
- "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n",
- "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n",
- "weather_df_grouped.head(10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Cleanse data\n",
- "\n",
- "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "[Info] read from /tmp/tmpiw_16gzx/https%3A/%2Fazureopendatastorage.azurefd.net/holidaydatacontainer/Processed/part-00000-tid-8468414522853579044-35925ba8-a227-4b80-9c89-17065e7bf1db-649-c000.snappy.parquet\n"
+ },
+ {
+ "output_type": "execute_result",
+ "execution_count": 5,
+ "data": {
+ "text/plain": " countryOrRegion holidayName normalizeHolidayName \\\n19375 Argentina Año Nuevo [New Year's Day] Año Nuevo [New Year's Day] \n19376 Australia New Year's Day New Year's Day \n19377 Austria Neujahr Neujahr \n19378 Belarus Новый год Новый год \n19379 Belgium Nieuwjaarsdag Nieuwjaarsdag \n\n isPaidTimeOff countryRegionCode date \n19375 None AR 2008-01-01 \n19376 None AU 2008-01-01 \n19377 None AT 2008-01-01 \n19378 None BY 2008-01-01 \n19379 None BE 2008-01-01 ",
+ "text/html": "\n\n
\n \n \n | \n countryOrRegion | \n holidayName | \n normalizeHolidayName | \n isPaidTimeOff | \n countryRegionCode | \n date | \n
\n \n \n \n 19375 | \n Argentina | \n Año Nuevo [New Year's Day] | \n Año Nuevo [New Year's Day] | \n None | \n AR | \n 2008-01-01 | \n
\n \n 19376 | \n Australia | \n New Year's Day | \n New Year's Day | \n None | \n AU | \n 2008-01-01 | \n
\n \n 19377 | \n Austria | \n Neujahr | \n Neujahr | \n None | \n AT | \n 2008-01-01 | \n
\n \n 19378 | \n Belarus | \n Новый год | \n Новый год | \n None | \n BY | \n 2008-01-01 | \n
\n \n 19379 | \n Belgium | \n Nieuwjaarsdag | \n Nieuwjaarsdag | \n None | \n BE | \n 2008-01-01 | \n
\n \n
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 5,
+ "metadata": {
+ "gather": {
+ "logged": 1681193761164
+ }
+ }
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " vendorID | \n",
- " passengerCount | \n",
- " tripDistance | \n",
- " pickupLongitude | \n",
- " pickupLatitude | \n",
- " dropoffLongitude | \n",
- " dropoffLatitude | \n",
- " totalAmount | \n",
- " month_num | \n",
- " day_of_month | \n",
- " day_of_week | \n",
- " hour_of_day | \n",
- " hr_sin | \n",
- " hr_cos | \n",
- " dy_sin | \n",
- " dy_cos | \n",
- " precipTime | \n",
- " temperature | \n",
- " precipDepth | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 12000.000000 | \n",
- " 12000.000000 | \n",
- " 12000.000000 | \n",
- " 12000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 2.400000e+04 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- " 24000.000000 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " 1.789667 | \n",
- " 1.355292 | \n",
- " 2.830398 | \n",
- " -73.814393 | \n",
- " 40.678791 | \n",
- " -73.837019 | \n",
- " 40.690729 | \n",
- " 14.668251 | \n",
- " 6.500000 | \n",
- " 15.068750 | \n",
- " 3.247792 | \n",
- " 13.582875 | \n",
- " -0.239687 | \n",
- " -1.510585e-02 | \n",
- " -0.079292 | \n",
- " -0.059630 | \n",
- " 13.318667 | \n",
- " 13.878272 | \n",
- " 1037.956292 | \n",
- "
\n",
- " \n",
- " std | \n",
- " 0.407554 | \n",
- " 1.020018 | \n",
- " 3.118302 | \n",
- " 3.016385 | \n",
- " 1.663152 | \n",
- " 2.698609 | \n",
- " 1.488032 | \n",
- " 11.738532 | \n",
- " 3.452124 | \n",
- " 8.477555 | \n",
- " 1.951209 | \n",
- " 6.708372 | \n",
- " 0.667528 | \n",
- " 7.048175e-01 | \n",
- " 0.714457 | \n",
- " 0.692640 | \n",
- " 10.333162 | \n",
- " 9.484443 | \n",
- " 2788.844868 | \n",
- "
\n",
- " \n",
- " min | \n",
- " 1.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " -74.164825 | \n",
- " 0.000000 | \n",
- " -75.186440 | \n",
- " 0.000000 | \n",
- " -200.000000 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " -1.000000 | \n",
- " -1.000000e+00 | \n",
- " -0.974928 | \n",
- " -0.900969 | \n",
- " 1.000000 | \n",
- " -13.379464 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " 2.000000 | \n",
- " 1.000000 | \n",
- " 1.040000 | \n",
- " -73.961370 | \n",
- " 40.693539 | \n",
- " -73.967514 | \n",
- " 40.695128 | \n",
- " 7.880000 | \n",
- " 3.750000 | \n",
- " 8.000000 | \n",
- " 2.000000 | \n",
- " 9.000000 | \n",
- " -0.866025 | \n",
- " -7.071068e-01 | \n",
- " -0.781831 | \n",
- " -0.900969 | \n",
- " 6.000000 | \n",
- " 6.620773 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " 2.000000 | \n",
- " 1.000000 | \n",
- " 1.840000 | \n",
- " -73.947132 | \n",
- " 40.745928 | \n",
- " -73.945869 | \n",
- " 40.745914 | \n",
- " 11.300000 | \n",
- " 6.500000 | \n",
- " 15.000000 | \n",
- " 3.000000 | \n",
- " 15.000000 | \n",
- " -0.500000 | \n",
- " -1.836970e-16 | \n",
- " 0.000000 | \n",
- " -0.222521 | \n",
- " 6.000000 | \n",
- " 13.108323 | \n",
- " 10.000000 | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " 2.000000 | \n",
- " 1.000000 | \n",
- " 3.500000 | \n",
- " -73.919638 | \n",
- " 40.802049 | \n",
- " -73.913059 | \n",
- " 40.791076 | \n",
- " 17.750000 | \n",
- " 9.250000 | \n",
- " 22.000000 | \n",
- " 5.000000 | \n",
- " 19.000000 | \n",
- " 0.258819 | \n",
- " 7.071068e-01 | \n",
- " 0.781831 | \n",
- " 0.623490 | \n",
- " 24.000000 | \n",
- " 22.944737 | \n",
- " 127.000000 | \n",
- "
\n",
- " \n",
- " max | \n",
- " 2.000000 | \n",
- " 7.000000 | \n",
- " 106.680000 | \n",
- " 0.000000 | \n",
- " 41.081047 | \n",
- " 0.000000 | \n",
- " 41.081055 | \n",
- " 450.000000 | \n",
- " 12.000000 | \n",
- " 30.000000 | \n",
- " 6.000000 | \n",
- " 23.000000 | \n",
- " 1.000000 | \n",
- " 1.000000e+00 | \n",
- " 0.974928 | \n",
- " 1.000000 | \n",
- " 24.000000 | \n",
- " 31.303665 | \n",
- " 9999.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
+ "cell_type": "markdown",
+ "source": [
+ "Rename the `countryRegionCode` and `date` columns to match the respective field names from the taxi data, and also normalize the time so it can be used as a key. Next, join the holiday data with the taxi data by performing a left-join using the Pandas `merge()` function. This will preserve all records from `green_taxi_df`, but add in holiday data where it exists for the corresponding `datetime` and `country_code`, which in this case is always `\"US\"`. Preview the data to verify that they were merged correctly."
],
- "text/plain": [
- " vendorID passengerCount tripDistance pickupLongitude \\\n",
- "count 24000.000000 24000.000000 24000.000000 12000.000000 \n",
- "mean 1.789667 1.355292 2.830398 -73.814393 \n",
- "std 0.407554 1.020018 3.118302 3.016385 \n",
- "min 1.000000 0.000000 0.000000 -74.164825 \n",
- "25% 2.000000 1.000000 1.040000 -73.961370 \n",
- "50% 2.000000 1.000000 1.840000 -73.947132 \n",
- "75% 2.000000 1.000000 3.500000 -73.919638 \n",
- "max 2.000000 7.000000 106.680000 0.000000 \n",
- "\n",
- " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n",
- "count 12000.000000 12000.000000 12000.000000 24000.000000 \n",
- "mean 40.678791 -73.837019 40.690729 14.668251 \n",
- "std 1.663152 2.698609 1.488032 11.738532 \n",
- "min 0.000000 -75.186440 0.000000 -200.000000 \n",
- "25% 40.693539 -73.967514 40.695128 7.880000 \n",
- "50% 40.745928 -73.945869 40.745914 11.300000 \n",
- "75% 40.802049 -73.913059 40.791076 17.750000 \n",
- "max 41.081047 0.000000 41.081055 450.000000 \n",
- "\n",
- " month_num day_of_month day_of_week hour_of_day hr_sin \\\n",
- "count 24000.000000 24000.000000 24000.000000 24000.000000 24000.000000 \n",
- "mean 6.500000 15.068750 3.247792 13.582875 -0.239687 \n",
- "std 3.452124 8.477555 1.951209 6.708372 0.667528 \n",
- "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n",
- "25% 3.750000 8.000000 2.000000 9.000000 -0.866025 \n",
- "50% 6.500000 15.000000 3.000000 15.000000 -0.500000 \n",
- "75% 9.250000 22.000000 5.000000 19.000000 0.258819 \n",
- "max 12.000000 30.000000 6.000000 23.000000 1.000000 \n",
- "\n",
- " hr_cos dy_sin dy_cos precipTime temperature \\\n",
- "count 2.400000e+04 24000.000000 24000.000000 24000.000000 24000.000000 \n",
- "mean -1.510585e-02 -0.079292 -0.059630 13.318667 13.878272 \n",
- "std 7.048175e-01 0.714457 0.692640 10.333162 9.484443 \n",
- "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n",
- "25% -7.071068e-01 -0.781831 -0.900969 6.000000 6.620773 \n",
- "50% -1.836970e-16 0.000000 -0.222521 6.000000 13.108323 \n",
- "75% 7.071068e-01 0.781831 0.623490 24.000000 22.944737 \n",
- "max 1.000000e+00 0.974928 1.000000 24.000000 31.303665 \n",
- "\n",
- " precipDepth \n",
- "count 24000.000000 \n",
- "mean 1037.956292 \n",
- "std 2788.844868 \n",
- "min 0.000000 \n",
- "25% 0.000000 \n",
- "50% 10.000000 \n",
- "75% 127.000000 \n",
- "max 9999.000000 "
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n",
- "taxi_holidays_weather_df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n",
- "\n",
- "Filter out these anomolies using query functions, and then remove the last few columns unnecesary for training.\n",
- "\n",
- "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is ran."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n",
- " pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n",
- " tripDistance>0 and tripDistance<75 and \\\n",
- " passengerCount>0 and passengerCount<100 and \\\n",
- " totalAmount>0\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [
+ "metadata": {}
+ },
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " vendorID | \n",
- " passengerCount | \n",
- " tripDistance | \n",
- " pickupLongitude | \n",
- " pickupLatitude | \n",
- " dropoffLongitude | \n",
- " dropoffLatitude | \n",
- " totalAmount | \n",
- " month_num | \n",
- " day_of_month | \n",
- " day_of_week | \n",
- " hour_of_day | \n",
- " hr_sin | \n",
- " hr_cos | \n",
- " dy_sin | \n",
- " dy_cos | \n",
- " precipTime | \n",
- " temperature | \n",
- " precipDepth | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 1.176300e+04 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- " 11763.000000 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " 1.790190 | \n",
- " 1.369294 | \n",
- " 2.841407 | \n",
- " -73.937911 | \n",
- " 40.746224 | \n",
- " -73.910901 | \n",
- " 40.730818 | \n",
- " 14.557917 | \n",
- " 3.501318 | \n",
- " 14.929270 | \n",
- " 3.252317 | \n",
- " 13.538553 | \n",
- " -0.236544 | \n",
- " -2.265927e-03 | \n",
- " -0.070226 | \n",
- " -0.059059 | \n",
- " 11.993964 | \n",
- " 10.288261 | \n",
- " 192.179546 | \n",
- "
\n",
- " \n",
- " std | \n",
- " 0.407191 | \n",
- " 1.041634 | \n",
- " 2.829864 | \n",
- " 0.041121 | \n",
- " 0.056818 | \n",
- " 1.364114 | \n",
- " 0.753468 | \n",
- " 9.989165 | \n",
- " 1.707350 | \n",
- " 8.475793 | \n",
- " 1.948127 | \n",
- " 6.778012 | \n",
- " 0.668812 | \n",
- " 7.048492e-01 | \n",
- " 0.718871 | \n",
- " 0.689122 | \n",
- " 10.114775 | \n",
- " 8.530011 | \n",
- " 1223.101074 | \n",
- "
\n",
- " \n",
- " min | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0.010000 | \n",
- " -74.035194 | \n",
- " 40.572906 | \n",
- " -74.183029 | \n",
- " 0.000000 | \n",
- " 0.010000 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " -1.000000 | \n",
- " -1.000000e+00 | \n",
- " -0.974928 | \n",
- " -0.900969 | \n",
- " 1.000000 | \n",
- " -13.379464 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " 2.000000 | \n",
- " 1.000000 | \n",
- " 1.090000 | \n",
- " -73.961601 | \n",
- " 40.693594 | \n",
- " -73.967793 | \n",
- " 40.695440 | \n",
- " 8.160000 | \n",
- " 2.000000 | \n",
- " 8.000000 | \n",
- " 2.000000 | \n",
- " 9.000000 | \n",
- " -0.866025 | \n",
- " -7.071068e-01 | \n",
- " -0.781831 | \n",
- " -0.900969 | \n",
- " 1.000000 | \n",
- " 3.504580 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " 2.000000 | \n",
- " 1.000000 | \n",
- " 1.900000 | \n",
- " -73.947517 | \n",
- " 40.745842 | \n",
- " -73.946243 | \n",
- " 40.745789 | \n",
- " 11.300000 | \n",
- " 4.000000 | \n",
- " 15.000000 | \n",
- " 3.000000 | \n",
- " 15.000000 | \n",
- " -0.500000 | \n",
- " -1.836970e-16 | \n",
- " 0.000000 | \n",
- " -0.222521 | \n",
- " 6.000000 | \n",
- " 10.468276 | \n",
- " 3.000000 | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " 2.000000 | \n",
- " 1.000000 | \n",
- " 3.530000 | \n",
- " -73.920509 | \n",
- " 40.801752 | \n",
- " -73.913807 | \n",
- " 40.789942 | \n",
- " 17.380000 | \n",
- " 5.000000 | \n",
- " 22.000000 | \n",
- " 5.000000 | \n",
- " 19.000000 | \n",
- " 0.258819 | \n",
- " 7.071068e-01 | \n",
- " 0.781831 | \n",
- " 0.623490 | \n",
- " 24.000000 | \n",
- " 16.966923 | \n",
- " 41.000000 | \n",
- "
\n",
- " \n",
- " max | \n",
- " 2.000000 | \n",
- " 6.000000 | \n",
- " 38.850000 | \n",
- " -73.738899 | \n",
- " 40.879982 | \n",
- " 0.000000 | \n",
- " 41.073185 | \n",
- " 123.800000 | \n",
- " 6.000000 | \n",
- " 30.000000 | \n",
- " 6.000000 | \n",
- " 23.000000 | \n",
- " 1.000000 | \n",
- " 1.000000e+00 | \n",
- " 0.974928 | \n",
- " 1.000000 | \n",
- " 24.000000 | \n",
- " 26.524107 | \n",
- " 9999.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
+ "cell_type": "code",
+ "source": [
+ "holidays_df = holidays_df.rename(columns={\"countryRegionCode\": \"country_code\"})\n",
+ "holidays_df[\"datetime\"] = holidays_df[\"date\"].dt.normalize()\n",
+ "\n",
+ "holidays_df.drop([\"countryOrRegion\", \"holidayName\", \"date\"], axis=1, inplace=True)\n",
+ "\n",
+ "taxi_holidays_df = pd.merge(green_taxi_df, holidays_df, how=\"left\", on=[\"datetime\", \"country_code\"])\n",
+ "taxi_holidays_df[taxi_holidays_df[\"normalizeHolidayName\"].notnull()]"
],
- "text/plain": [
- " vendorID passengerCount tripDistance pickupLongitude \\\n",
- "count 11763.000000 11763.000000 11763.000000 11763.000000 \n",
- "mean 1.790190 1.369294 2.841407 -73.937911 \n",
- "std 0.407191 1.041634 2.829864 0.041121 \n",
- "min 1.000000 1.000000 0.010000 -74.035194 \n",
- "25% 2.000000 1.000000 1.090000 -73.961601 \n",
- "50% 2.000000 1.000000 1.900000 -73.947517 \n",
- "75% 2.000000 1.000000 3.530000 -73.920509 \n",
- "max 2.000000 6.000000 38.850000 -73.738899 \n",
- "\n",
- " pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\n",
- "count 11763.000000 11763.000000 11763.000000 11763.000000 \n",
- "mean 40.746224 -73.910901 40.730818 14.557917 \n",
- "std 0.056818 1.364114 0.753468 9.989165 \n",
- "min 40.572906 -74.183029 0.000000 0.010000 \n",
- "25% 40.693594 -73.967793 40.695440 8.160000 \n",
- "50% 40.745842 -73.946243 40.745789 11.300000 \n",
- "75% 40.801752 -73.913807 40.789942 17.380000 \n",
- "max 40.879982 0.000000 41.073185 123.800000 \n",
- "\n",
- " month_num day_of_month day_of_week hour_of_day hr_sin \\\n",
- "count 11763.000000 11763.000000 11763.000000 11763.000000 11763.000000 \n",
- "mean 3.501318 14.929270 3.252317 13.538553 -0.236544 \n",
- "std 1.707350 8.475793 1.948127 6.778012 0.668812 \n",
- "min 1.000000 1.000000 0.000000 0.000000 -1.000000 \n",
- "25% 2.000000 8.000000 2.000000 9.000000 -0.866025 \n",
- "50% 4.000000 15.000000 3.000000 15.000000 -0.500000 \n",
- "75% 5.000000 22.000000 5.000000 19.000000 0.258819 \n",
- "max 6.000000 30.000000 6.000000 23.000000 1.000000 \n",
- "\n",
- " hr_cos dy_sin dy_cos precipTime temperature \\\n",
- "count 1.176300e+04 11763.000000 11763.000000 11763.000000 11763.000000 \n",
- "mean -2.265927e-03 -0.070226 -0.059059 11.993964 10.288261 \n",
- "std 7.048492e-01 0.718871 0.689122 10.114775 8.530011 \n",
- "min -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n",
- "25% -7.071068e-01 -0.781831 -0.900969 1.000000 3.504580 \n",
- "50% -1.836970e-16 0.000000 -0.222521 6.000000 10.468276 \n",
- "75% 7.071068e-01 0.781831 0.623490 24.000000 16.966923 \n",
- "max 1.000000e+00 0.974928 1.000000 24.000000 26.524107 \n",
- "\n",
- " precipDepth \n",
- "count 11763.000000 \n",
- "mean 192.179546 \n",
- "std 1223.101074 \n",
- "min 0.000000 \n",
- "25% 0.000000 \n",
- "50% 3.000000 \n",
- "75% 41.000000 \n",
- "max 9999.000000 "
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "final_df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Train a model\n",
- "\n",
- "The data is ready to train a machine learning model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.linear_model import LinearRegression\n",
- "from sklearn.linear_model import RidgeCV\n",
- "from sklearn.linear_model import Ridge\n",
- "from sklearn.ensemble import RandomForestRegressor\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.pipeline import Pipeline\n",
- "from sklearn.preprocessing import OneHotEncoder\n",
- "from sklearn.impute import SimpleImputer\n",
- "from sklearn.compose import ColumnTransformer\n",
- "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Training Function\n",
- "\n",
- "Define a function that can be used to create a model pipeline that can be trained and then used for scoring. This pipeline has 2 steps: preprocessing and model training.\n",
- "\n",
- "Preprocessing Stages:\n",
- "The preprocessing step of the pipeline also has 2 stages, one for numerical features and one for categorical features.\n",
- "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in the these fields, future data that is scored may and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly for the categorical variables, let's have the null values filled with \"MISSING\". Additionally to the categorical variables, these will need to be one hot encoded, so we will include that step in our pipeline.\n",
- "\n",
- "Model Training Stage:\n",
- "An input parameter will determine which type of model of train. Let's test out a linear regression and random forest model to start. \n",
- "\n",
- "The two steps are put together into the pipeline which is what the function is returning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "def createClassModel(algo_name, catg, nums):\n",
- " numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n",
- " \n",
- " categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
- " \n",
- " preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n",
- " \n",
- " if algo_name == 'linear_regression':\n",
- " model=Ridge(alpha=100)\n",
- " elif algo_name == 'random_forest':\n",
- " model = RandomForestRegressor()\n",
- " else:\n",
- " pass\n",
- " ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), (\"model\", model)])\n",
- " return ModelPipeline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in our processing step. `num_cols` is a list of the numerical variables that will be transformed in our processing step. Let's define the target column as `label` so it can be used in future steps as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [],
- "source": [
- "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n",
- "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n",
- "label = [\"totalAmount\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The training is ready to begin, but first, let's make sure that the categorical variables are strings in our dataframe to ensure no errors in our pipeline. \n",
- "\n",
- "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random number generator, so that your train-test splits are deterministic.\n",
- "\n",
- "The training will happen in the for loop so that both algorithms can be tested. The createClassModel function is called to retrieve the pipeline that can then be trained using the training dataset. \n",
- "\n",
- "Once trained, the test dataset is then run through the model to test the model's performance. Using various functions from sklearn.metrics, the R2 score, MAPE, and RMSE can be used to measure model performance."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 6,
+ "data": {
+ "text/plain": " vendorID lpepPickupDatetime passengerCount tripDistance \\\n10 2 2016-01-01 06:10:47 1 1.01 \n12 2 2016-01-01 20:35:55 1 3.81 \n20 2 2016-01-01 14:29:02 1 1.60 \n30 2 2016-01-01 22:38:33 1 0.60 \n73 2 2016-01-18 14:45:12 1 1.02 \n... ... ... ... ... \n23721 2 2016-12-26 19:17:20 1 1.00 \n23843 2 2016-12-25 22:11:04 1 0.65 \n23852 2 2016-12-26 23:48:40 1 2.84 \n23858 2 2016-12-25 02:19:58 3 6.83 \n23917 1 2016-12-25 16:57:15 1 2.00 \n\n pickupLongitude pickupLatitude dropoffLongitude dropoffLatitude \\\n10 -73.937195 40.679676 -73.922226 40.680149 \n12 -73.881638 40.767544 -73.917046 40.769688 \n20 -73.950226 40.678459 -73.958611 40.698792 \n30 -73.890671 40.746601 -73.896980 40.745064 \n73 -73.945190 40.792698 -73.935822 40.796143 \n... ... ... ... ... \n23721 NaN NaN NaN NaN \n23843 NaN NaN NaN NaN \n23852 NaN NaN NaN NaN \n23858 NaN NaN NaN NaN \n23917 NaN NaN NaN NaN \n\n totalAmount month_num ... day_of_week hour_of_day country_code \\\n10 5.80 1 ... 4 6 US \n12 16.80 1 ... 4 20 US \n20 14.80 1 ... 4 14 US \n30 5.30 1 ... 4 22 US \n73 6.30 1 ... 0 14 US \n... ... ... ... ... ... ... \n23721 7.30 12 ... 0 19 US \n23843 5.30 12 ... 6 22 US \n23852 12.30 12 ... 0 23 US \n23858 27.30 12 ... 6 2 US \n23917 11.15 12 ... 6 16 US \n\n hr_sin hr_cos dy_sin dy_cos datetime \\\n10 1.000000 6.123234e-17 -0.433884 -0.900969 2016-01-01 \n12 -0.866025 5.000000e-01 -0.433884 -0.900969 2016-01-01 \n20 -0.500000 -8.660254e-01 -0.433884 -0.900969 2016-01-01 \n30 -0.500000 8.660254e-01 -0.433884 -0.900969 2016-01-01 \n73 -0.500000 -8.660254e-01 0.000000 1.000000 2016-01-18 \n... ... ... ... ... ... 
\n23721 -0.965926 2.588190e-01 0.000000 1.000000 2016-12-26 \n23843 -0.500000 8.660254e-01 -0.781831 0.623490 2016-12-25 \n23852 -0.258819 9.659258e-01 0.000000 1.000000 2016-12-26 \n23858 0.500000 8.660254e-01 -0.781831 0.623490 2016-12-25 \n23917 -0.866025 -5.000000e-01 -0.781831 0.623490 2016-12-25 \n\n normalizeHolidayName isPaidTimeOff \n10 New Year's Day True \n12 New Year's Day True \n20 New Year's Day True \n30 New Year's Day True \n73 Martin Luther King Jr. Day None \n... ... ... \n23721 Christmas Day True \n23843 Christmas Day True \n23852 Christmas Day True \n23858 Christmas Day True \n23917 Christmas Day True \n\n[611 rows x 21 columns]",
+ "text/html": "\n\n
\n \n \n | \n vendorID | \n lpepPickupDatetime | \n passengerCount | \n tripDistance | \n pickupLongitude | \n pickupLatitude | \n dropoffLongitude | \n dropoffLatitude | \n totalAmount | \n month_num | \n ... | \n day_of_week | \n hour_of_day | \n country_code | \n hr_sin | \n hr_cos | \n dy_sin | \n dy_cos | \n datetime | \n normalizeHolidayName | \n isPaidTimeOff | \n
\n \n \n \n 10 | \n 2 | \n 2016-01-01 06:10:47 | \n 1 | \n 1.01 | \n -73.937195 | \n 40.679676 | \n -73.922226 | \n 40.680149 | \n 5.80 | \n 1 | \n ... | \n 4 | \n 6 | \n US | \n 1.000000 | \n 6.123234e-17 | \n -0.433884 | \n -0.900969 | \n 2016-01-01 | \n New Year's Day | \n True | \n
\n \n 12 | \n 2 | \n 2016-01-01 20:35:55 | \n 1 | \n 3.81 | \n -73.881638 | \n 40.767544 | \n -73.917046 | \n 40.769688 | \n 16.80 | \n 1 | \n ... | \n 4 | \n 20 | \n US | \n -0.866025 | \n 5.000000e-01 | \n -0.433884 | \n -0.900969 | \n 2016-01-01 | \n New Year's Day | \n True | \n
\n \n 20 | \n 2 | \n 2016-01-01 14:29:02 | \n 1 | \n 1.60 | \n -73.950226 | \n 40.678459 | \n -73.958611 | \n 40.698792 | \n 14.80 | \n 1 | \n ... | \n 4 | \n 14 | \n US | \n -0.500000 | \n -8.660254e-01 | \n -0.433884 | \n -0.900969 | \n 2016-01-01 | \n New Year's Day | \n True | \n
\n \n 30 | \n 2 | \n 2016-01-01 22:38:33 | \n 1 | \n 0.60 | \n -73.890671 | \n 40.746601 | \n -73.896980 | \n 40.745064 | \n 5.30 | \n 1 | \n ... | \n 4 | \n 22 | \n US | \n -0.500000 | \n 8.660254e-01 | \n -0.433884 | \n -0.900969 | \n 2016-01-01 | \n New Year's Day | \n True | \n
\n \n 73 | \n 2 | \n 2016-01-18 14:45:12 | \n 1 | \n 1.02 | \n -73.945190 | \n 40.792698 | \n -73.935822 | \n 40.796143 | \n 6.30 | \n 1 | \n ... | \n 0 | \n 14 | \n US | \n -0.500000 | \n -8.660254e-01 | \n 0.000000 | \n 1.000000 | \n 2016-01-18 | \n Martin Luther King Jr. Day | \n None | \n
\n \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n 23721 | \n 2 | \n 2016-12-26 19:17:20 | \n 1 | \n 1.00 | \n NaN | \n NaN | \n NaN | \n NaN | \n 7.30 | \n 12 | \n ... | \n 0 | \n 19 | \n US | \n -0.965926 | \n 2.588190e-01 | \n 0.000000 | \n 1.000000 | \n 2016-12-26 | \n Christmas Day | \n True | \n
\n \n 23843 | \n 2 | \n 2016-12-25 22:11:04 | \n 1 | \n 0.65 | \n NaN | \n NaN | \n NaN | \n NaN | \n 5.30 | \n 12 | \n ... | \n 6 | \n 22 | \n US | \n -0.500000 | \n 8.660254e-01 | \n -0.781831 | \n 0.623490 | \n 2016-12-25 | \n Christmas Day | \n True | \n
\n \n 23852 | \n 2 | \n 2016-12-26 23:48:40 | \n 1 | \n 2.84 | \n NaN | \n NaN | \n NaN | \n NaN | \n 12.30 | \n 12 | \n ... | \n 0 | \n 23 | \n US | \n -0.258819 | \n 9.659258e-01 | \n 0.000000 | \n 1.000000 | \n 2016-12-26 | \n Christmas Day | \n True | \n
\n \n 23858 | \n 2 | \n 2016-12-25 02:19:58 | \n 3 | \n 6.83 | \n NaN | \n NaN | \n NaN | \n NaN | \n 27.30 | \n 12 | \n ... | \n 6 | \n 2 | \n US | \n 0.500000 | \n 8.660254e-01 | \n -0.781831 | \n 0.623490 | \n 2016-12-25 | \n Christmas Day | \n True | \n
\n \n 23917 | \n 1 | \n 2016-12-25 16:57:15 | \n 1 | \n 2.00 | \n NaN | \n NaN | \n NaN | \n NaN | \n 11.15 | \n 12 | \n ... | \n 6 | \n 16 | \n US | \n -0.866025 | \n -5.000000e-01 | \n -0.781831 | \n 0.623490 | \n 2016-12-25 | \n Christmas Day | \n True | \n
\n \n
\n
611 rows × 21 columns
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 6,
+ "metadata": {
+ "gather": {
+ "logged": 1681193761522
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Enrich with weather data\n",
+ "\n",
+ "Now NOAA surface weather data can be appended to the taxi and holiday data. Use a similar approach to fetch the weather data by downloading one month at a time iteratively. Additionally, specify the `cols` parameter with an array of strings to filter the columns to download. This is a very large dataset containing weather surface data from all over the world, so before appending each month, filter the lat/long fields to near NYC using the `query()` function on the dataframe. This will ensure the `weather_df` doesn't get too large."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from azureml.opendatasets import NoaaIsdWeather\n",
+ "start = datetime.strptime(\"1/1/2016\",\"%m/%d/%Y\")\n",
+ "end = datetime.strptime(\"1/31/2016\",\"%m/%d/%Y\")\n",
+ "\n",
+ "weather_df = pd.concat([NoaaIsdWeather(cols=[\"temperature\", \"precipTime\", \"precipDepth\"], start_date=start + relativedelta(months=x), end_date=end + relativedelta(months=x))\\\n",
+ " .to_pandas_dataframe().query(\"latitude>=40.53 and latitude<=40.88 and longitude>=-74.09 and longitude<=-73.72 and temperature==temperature\") for x in range(12)])"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-1.c000.snappy.parquet\n[Info] read from /tmp/tmpcav0ogcg/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=1/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-1.c000.snappy.parquet\n[Info] read from 
/tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-2.c000.snappy.parquet\n[Info] read from /tmp/tmpxnl2b_3o/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=2/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-2.c000.snappy.parquet\n[Info] read from 
/tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-3.c000.snappy.parquet\n[Info] read from /tmp/tmpa6giy_qd/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=3/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-3.c000.snappy.parquet\n[Info] read from 
/tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-4.c000.snappy.parquet\n[Info] read from /tmp/tmpger9x31f/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=4/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-4.c000.snappy.parquet\n[Info] read from 
/tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-5.c000.snappy.parquet\n[Info] read from /tmp/tmp20krwtl4/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=5/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-5.c000.snappy.parquet\n[Info] read from 
/tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-6.c000.snappy.parquet\n[Info] read from /tmp/tmpw67juif_/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=6/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-6.c000.snappy.parquet\n[Info] read from 
/tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-7.c000.snappy.parquet\n[Info] read from /tmp/tmp4bedjy2u/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=7/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-7.c000.snappy.parquet\n[Info] read from 
/tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-8.c000.snappy.parquet\n[Info] read from /tmp/tmp7urdqo5t/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=8/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-8.c000.snappy.parquet\n[Info] read from 
/tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-9.c000.snappy.parquet\n[Info] read from /tmp/tmptcaeisle/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=9/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-9.c000.snappy.parquet\n[Info] read from 
/tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-10.c000.snappy.parquet\n[Info] read from /tmp/tmp295irf24/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=10/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-10.c000.snappy.parquet\n[Info] read from 
/tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-11.c000.snappy.parquet\n[Info] read from /tmp/tmp555rpnhr/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=11/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-11.c000.snappy.parquet\n[Info] read from 
/tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00000-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2623-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00004-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2629-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00001-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2628-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00005-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2627-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00002-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2622-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00006-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2625-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00003-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2624-12.c000.snappy.parquet\n[Info] read from /tmp/tmp3c3qjt4b/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2016/month=12/part-00007-tid-738723883827836859-85e0759b-51fa-4430-84ec-cb8ab6a57033-2626-12.c000.snappy.parquet\n"
+ }
+ ],
+ "execution_count": 7,
+ "metadata": {
+ "gather": {
+ "logged": 1681193827810
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "weather_df"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 8,
+ "data": {
+ "text/plain": " temperature precipTime latitude longitude datetime \\\n204647 2.8 NaN 40.783 -73.867 2016-01-02 03:00:00 \n204670 -4.4 1.0 40.779 -73.880 2016-01-22 13:51:00 \n204694 5.0 1.0 40.779 -73.880 2016-01-08 02:51:00 \n204701 -1.1 1.0 40.779 -73.880 2016-01-04 15:51:00 \n204715 4.4 1.0 40.779 -73.880 2016-01-01 21:51:00 \n... ... ... ... ... ... \n1248471 4.4 1.0 40.789 -73.967 2016-12-23 13:51:00 \n1248555 5.0 1.0 40.789 -73.967 2016-12-12 13:51:00 \n1248580 3.9 NaN 40.789 -73.967 2016-12-18 07:01:00 \n1248597 7.8 1.0 40.789 -73.967 2016-12-25 00:51:00 \n1248600 -2.8 1.0 40.789 -73.967 2016-12-17 11:10:00 \n\n wban precipDepth usaf \n204647 14732 NaN 725030 \n204670 14732 0.0 725030 \n204694 14732 0.0 725030 \n204701 14732 0.0 725030 \n204715 14732 0.0 725030 \n... ... ... ... \n1248471 94728 0.0 725053 \n1248555 94728 0.0 725053 \n1248580 94728 NaN 725053 \n1248597 94728 0.0 725053 \n1248600 94728 5.0 725053 \n\n[55683 rows x 8 columns]",
+ "text/html": "\n\n
\n \n \n | \n temperature | \n precipTime | \n latitude | \n longitude | \n datetime | \n wban | \n precipDepth | \n usaf | \n
\n \n \n \n 204647 | \n 2.8 | \n NaN | \n 40.783 | \n -73.867 | \n 2016-01-02 03:00:00 | \n 14732 | \n NaN | \n 725030 | \n
\n \n 204670 | \n -4.4 | \n 1.0 | \n 40.779 | \n -73.880 | \n 2016-01-22 13:51:00 | \n 14732 | \n 0.0 | \n 725030 | \n
\n \n 204694 | \n 5.0 | \n 1.0 | \n 40.779 | \n -73.880 | \n 2016-01-08 02:51:00 | \n 14732 | \n 0.0 | \n 725030 | \n
\n \n 204701 | \n -1.1 | \n 1.0 | \n 40.779 | \n -73.880 | \n 2016-01-04 15:51:00 | \n 14732 | \n 0.0 | \n 725030 | \n
\n \n 204715 | \n 4.4 | \n 1.0 | \n 40.779 | \n -73.880 | \n 2016-01-01 21:51:00 | \n 14732 | \n 0.0 | \n 725030 | \n
\n \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n 1248471 | \n 4.4 | \n 1.0 | \n 40.789 | \n -73.967 | \n 2016-12-23 13:51:00 | \n 94728 | \n 0.0 | \n 725053 | \n
\n \n 1248555 | \n 5.0 | \n 1.0 | \n 40.789 | \n -73.967 | \n 2016-12-12 13:51:00 | \n 94728 | \n 0.0 | \n 725053 | \n
\n \n 1248580 | \n 3.9 | \n NaN | \n 40.789 | \n -73.967 | \n 2016-12-18 07:01:00 | \n 94728 | \n NaN | \n 725053 | \n
\n \n 1248597 | \n 7.8 | \n 1.0 | \n 40.789 | \n -73.967 | \n 2016-12-25 00:51:00 | \n 94728 | \n 0.0 | \n 725053 | \n
\n \n 1248600 | \n -2.8 | \n 1.0 | \n 40.789 | \n -73.967 | \n 2016-12-17 11:10:00 | \n 94728 | \n 5.0 | \n 725053 | \n
\n \n
\n
55683 rows × 8 columns
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 8,
+ "metadata": {
+ "gather": {
+ "logged": 1681193828162
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Again call `pandas.Series.dt.normalize` on the `datetime` field in the weather data so it matches the time key in `taxi_holidays_df`.\n",
+ "\n",
+ "\n",
+ "Next group the weather data to have daily aggregated weather values. Define a dict `aggregations` to define how to aggregate each field at a daily level. For `temperature` take the mean and for `precipTime` and `precipDepth` take the daily maximum. Use the `groupby()` function along with the aggregations to group the data. Preview the data to ensure there is one record per day."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "weather_df[\"datetime\"] = weather_df[\"datetime\"].dt.normalize()\n",
+ "\n",
+ "# group by datetime\n",
+ "aggregations = {\"precipTime\": \"max\", \"temperature\": \"mean\", \"precipDepth\": \"max\"}\n",
+ "weather_df_grouped = weather_df.groupby(\"datetime\").agg(aggregations)\n",
+ "weather_df_grouped.head(10)"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 9,
+ "data": {
+ "text/plain": " precipTime temperature precipDepth\ndatetime \n2016-01-01 1.0 5.197345 0.0\n2016-01-02 1.0 2.567857 0.0\n2016-01-03 1.0 3.846429 0.0\n2016-01-04 1.0 0.123894 0.0\n2016-01-05 6.0 -7.206250 0.0\n2016-01-06 6.0 -0.896396 0.0\n2016-01-07 6.0 3.180645 0.0\n2016-01-08 1.0 4.384091 0.0\n2016-01-09 6.0 6.710274 3.0\n2016-01-10 24.0 10.943655 254.0",
+ "text/html": "\n\n
\n \n \n | \n precipTime | \n temperature | \n precipDepth | \n
\n \n datetime | \n | \n | \n | \n
\n \n \n \n 2016-01-01 | \n 1.0 | \n 5.197345 | \n 0.0 | \n
\n \n 2016-01-02 | \n 1.0 | \n 2.567857 | \n 0.0 | \n
\n \n 2016-01-03 | \n 1.0 | \n 3.846429 | \n 0.0 | \n
\n \n 2016-01-04 | \n 1.0 | \n 0.123894 | \n 0.0 | \n
\n \n 2016-01-05 | \n 6.0 | \n -7.206250 | \n 0.0 | \n
\n \n 2016-01-06 | \n 6.0 | \n -0.896396 | \n 0.0 | \n
\n \n 2016-01-07 | \n 6.0 | \n 3.180645 | \n 0.0 | \n
\n \n 2016-01-08 | \n 1.0 | \n 4.384091 | \n 0.0 | \n
\n \n 2016-01-09 | \n 6.0 | \n 6.710274 | \n 3.0 | \n
\n \n 2016-01-10 | \n 24.0 | \n 10.943655 | \n 254.0 | \n
\n \n
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 9,
+ "metadata": {
+ "gather": {
+ "logged": 1681193828979
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Note: The examples in this tutorial merge data using Pandas functions and custom aggregations, but the Open Datasets SDK has classes designed to easily merge and enrich data sets. See the [notebook](https://github.com/Azure/OpenDatasetsNotebooks/blob/master/tutorials/data-join/04-nyc-taxi-join-weather-in-pandas.ipynb) for code examples of these design patterns."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Cleanse data\n",
+ "\n",
+ "Merge the existing taxi and holiday data with the new weather data. This time `datetime` is the only key, and again perform a left-join of the data. Run the `describe()` function on the new dataframe to see summary statistics for each field."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "taxi_holidays_weather_df = pd.merge(taxi_holidays_df, weather_df_grouped, how=\"left\", on=[\"datetime\"])\n",
+ "taxi_holidays_weather_df.describe()"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 10,
+ "data": {
+ "text/plain": " vendorID passengerCount tripDistance pickupLongitude \\\ncount 24000.000000 24000.000000 24000.000000 12000.000000 \nmean 1.793625 1.359458 2.798265 -73.746045 \nstd 0.404711 1.033421 2.976438 3.753491 \nmin 1.000000 0.000000 0.000000 -74.163818 \n25% 2.000000 1.000000 1.020000 -73.961123 \n50% 2.000000 1.000000 1.830000 -73.946201 \n75% 2.000000 1.000000 3.430000 -73.918732 \nmax 2.000000 8.000000 86.700000 0.000000 \n\n pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\ncount 12000.000000 12000.000000 12000.000000 24000.000000 \nmean 40.641902 -73.812426 40.677156 14.603195 \nstd 2.069237 3.016449 1.663137 11.596075 \nmin 0.000000 -75.167496 0.000000 -83.900000 \n25% 40.694324 -73.968376 40.695145 7.880000 \n50% 40.746000 -73.945480 40.746264 11.300000 \n75% 40.801911 -73.912468 40.789734 17.300000 \nmax 41.015667 0.000000 41.085476 495.000000 \n\n month_num day_of_month day_of_week hour_of_day hr_sin \\\ncount 24000.000000 24000.000000 24000.000000 24000.000000 24000.000000 \nmean 6.500000 15.072875 3.236458 13.611000 -0.246484 \nstd 3.452124 8.475006 1.964295 6.682823 0.665381 \nmin 1.000000 1.000000 0.000000 0.000000 -1.000000 \n25% 3.750000 8.000000 2.000000 9.000000 -0.866025 \n50% 6.500000 15.000000 3.000000 15.000000 -0.500000 \n75% 9.250000 22.000000 5.000000 19.000000 0.258819 \nmax 12.000000 30.000000 6.000000 23.000000 1.000000 \n\n hr_cos dy_sin dy_cos precipTime temperature \\\ncount 2.400000e+04 24000.000000 24000.000000 24000.000000 24000.000000 \nmean -2.038304e-02 -0.085070 -0.050450 13.408667 13.876231 \nstd 7.043703e-01 0.713593 0.693574 10.330720 9.462154 \nmin -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n25% -7.071068e-01 -0.781831 -0.900969 6.000000 6.591071 \n50% -1.836970e-16 0.000000 -0.222521 6.000000 13.125893 \n75% 7.071068e-01 0.781831 0.623490 24.000000 22.944737 \nmax 1.000000e+00 0.974928 1.000000 24.000000 31.303665 \n\n precipDepth \ncount 24000.000000 \nmean 1075.977667 \nstd 2849.048787 
\nmin 0.000000 \n25% 0.000000 \n50% 10.000000 \n75% 132.000000 \nmax 9999.000000 ",
+ "text/html": "\n\n
\n \n \n | \n vendorID | \n passengerCount | \n tripDistance | \n pickupLongitude | \n pickupLatitude | \n dropoffLongitude | \n dropoffLatitude | \n totalAmount | \n month_num | \n day_of_month | \n day_of_week | \n hour_of_day | \n hr_sin | \n hr_cos | \n dy_sin | \n dy_cos | \n precipTime | \n temperature | \n precipDepth | \n
\n \n \n \n count | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 12000.000000 | \n 12000.000000 | \n 12000.000000 | \n 12000.000000 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 2.400000e+04 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n 24000.000000 | \n
\n \n mean | \n 1.793625 | \n 1.359458 | \n 2.798265 | \n -73.746045 | \n 40.641902 | \n -73.812426 | \n 40.677156 | \n 14.603195 | \n 6.500000 | \n 15.072875 | \n 3.236458 | \n 13.611000 | \n -0.246484 | \n -2.038304e-02 | \n -0.085070 | \n -0.050450 | \n 13.408667 | \n 13.876231 | \n 1075.977667 | \n
\n \n std | \n 0.404711 | \n 1.033421 | \n 2.976438 | \n 3.753491 | \n 2.069237 | \n 3.016449 | \n 1.663137 | \n 11.596075 | \n 3.452124 | \n 8.475006 | \n 1.964295 | \n 6.682823 | \n 0.665381 | \n 7.043703e-01 | \n 0.713593 | \n 0.693574 | \n 10.330720 | \n 9.462154 | \n 2849.048787 | \n
\n \n min | \n 1.000000 | \n 0.000000 | \n 0.000000 | \n -74.163818 | \n 0.000000 | \n -75.167496 | \n 0.000000 | \n -83.900000 | \n 1.000000 | \n 1.000000 | \n 0.000000 | \n 0.000000 | \n -1.000000 | \n -1.000000e+00 | \n -0.974928 | \n -0.900969 | \n 1.000000 | \n -13.379464 | \n 0.000000 | \n
\n \n 25% | \n 2.000000 | \n 1.000000 | \n 1.020000 | \n -73.961123 | \n 40.694324 | \n -73.968376 | \n 40.695145 | \n 7.880000 | \n 3.750000 | \n 8.000000 | \n 2.000000 | \n 9.000000 | \n -0.866025 | \n -7.071068e-01 | \n -0.781831 | \n -0.900969 | \n 6.000000 | \n 6.591071 | \n 0.000000 | \n
\n \n 50% | \n 2.000000 | \n 1.000000 | \n 1.830000 | \n -73.946201 | \n 40.746000 | \n -73.945480 | \n 40.746264 | \n 11.300000 | \n 6.500000 | \n 15.000000 | \n 3.000000 | \n 15.000000 | \n -0.500000 | \n -1.836970e-16 | \n 0.000000 | \n -0.222521 | \n 6.000000 | \n 13.125893 | \n 10.000000 | \n
\n \n 75% | \n 2.000000 | \n 1.000000 | \n 3.430000 | \n -73.918732 | \n 40.801911 | \n -73.912468 | \n 40.789734 | \n 17.300000 | \n 9.250000 | \n 22.000000 | \n 5.000000 | \n 19.000000 | \n 0.258819 | \n 7.071068e-01 | \n 0.781831 | \n 0.623490 | \n 24.000000 | \n 22.944737 | \n 132.000000 | \n
\n \n max | \n 2.000000 | \n 8.000000 | \n 86.700000 | \n 0.000000 | \n 41.015667 | \n 0.000000 | \n 41.085476 | \n 495.000000 | \n 12.000000 | \n 30.000000 | \n 6.000000 | \n 23.000000 | \n 1.000000 | \n 1.000000e+00 | \n 0.974928 | \n 1.000000 | \n 24.000000 | \n 31.303665 | \n 9999.000000 | \n
\n \n
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 10,
+ "metadata": {
+ "gather": {
+ "logged": 1681193829356
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "From the summary statistics, you see that there are several fields that have outliers or values that will reduce model accuracy. First filter the lat/long fields to be within the same bounds you used for filtering weather data. The `tripDistance` field has some bad data, because the minimum value is negative. The `passengerCount` field has bad data as well, with the max value being 210 passengers. Lastly, the `totalAmount` field has negative values, which don't make sense in the context of our model.\n",
+ "\n",
+ "Filter out these anomalies using query functions, and then remove the last few columns unnecessary for training.\n",
+ "\n",
+ "Note: since a random sample of 2000 was taken for each month of the taxi data, the statistics may vary each time this is run."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "final_df = taxi_holidays_weather_df.query(\"pickupLatitude>=40.53 and pickupLatitude<=40.88 and \\\n",
+ " pickupLongitude>=-74.09 and pickupLongitude<=-73.72 and \\\n",
+ " tripDistance>0 and tripDistance<75 and \\\n",
+ " passengerCount>0 and passengerCount<100 and \\\n",
+ " totalAmount>0\")"
+ ],
+ "outputs": [],
+ "execution_count": 11,
+ "metadata": {
+ "gather": {
+ "logged": 1681193829696
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Call `describe()` again on the data to ensure cleansing worked as expected. The final data is prepared and cleansed, consisting of taxi, holiday, and weather data, and is ready to use for machine learning model training."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "final_df.describe()"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 12,
+ "data": {
+ "text/plain": " vendorID passengerCount tripDistance pickupLongitude \\\ncount 11720.000000 11720.000000 11720.000000 11720.000000 \nmean 1.794710 1.354778 2.829323 -73.937425 \nstd 0.403931 1.025023 2.899549 0.041218 \nmin 1.000000 1.000000 0.010000 -74.074181 \n25% 2.000000 1.000000 1.060000 -73.961384 \n50% 2.000000 1.000000 1.880000 -73.946762 \n75% 2.000000 1.000000 3.490000 -73.919127 \nmax 2.000000 6.000000 52.800000 -73.744164 \n\n pickupLatitude dropoffLongitude dropoffLatitude totalAmount \\\ncount 11720.000000 11720.000000 11720.000000 11720.000000 \nmean 40.746545 -73.879205 40.713237 14.597789 \nstd 0.056494 2.048760 1.130176 10.594542 \nmin 40.573597 -74.186638 0.000000 0.010000 \n25% 40.694648 -73.968775 40.695228 8.160000 \n50% 40.745876 -73.946011 40.746073 11.300000 \n75% 40.801327 -73.912937 40.788689 17.300000 \nmax 40.879837 0.000000 41.025719 223.890000 \n\n month_num day_of_month day_of_week hour_of_day hr_sin \\\ncount 11720.000000 11720.000000 11720.000000 11720.000000 11720.000000 \nmean 3.501024 14.890444 3.252645 13.621672 -0.244164 \nstd 1.707714 8.454712 1.967197 6.721303 0.666575 \nmin 1.000000 1.000000 0.000000 0.000000 -1.000000 \n25% 2.000000 8.000000 2.000000 9.000000 -0.866025 \n50% 4.000000 15.000000 4.000000 15.000000 -0.500000 \n75% 5.000000 22.000000 5.000000 19.000000 0.258819 \nmax 6.000000 30.000000 6.000000 23.000000 1.000000 \n\n hr_cos dy_sin dy_cos precipTime temperature \\\ncount 1.172000e+04 11720.000000 11720.000000 11720.000000 11720.000000 \nmean -1.142466e-02 -0.090539 -0.049453 12.066980 10.267549 \nstd 7.042813e-01 0.713570 0.693007 10.146518 8.484011 \nmin -1.000000e+00 -0.974928 -0.900969 1.000000 -13.379464 \n25% -7.071068e-01 -0.781831 -0.900969 1.000000 3.504580 \n50% -1.836970e-16 -0.433884 -0.222521 6.000000 10.130357 \n75% 7.071068e-01 0.781831 0.623490 24.000000 17.239744 \nmax 1.000000e+00 0.974928 1.000000 24.000000 26.524107 \n\n precipDepth \ncount 11720.000000 \nmean 190.603498 \nstd 1215.018267 
\nmin 0.000000 \n25% 0.000000 \n50% 3.000000 \n75% 41.000000 \nmax 9999.000000 ",
+ "text/html": "\n\n
\n \n \n | \n vendorID | \n passengerCount | \n tripDistance | \n pickupLongitude | \n pickupLatitude | \n dropoffLongitude | \n dropoffLatitude | \n totalAmount | \n month_num | \n day_of_month | \n day_of_week | \n hour_of_day | \n hr_sin | \n hr_cos | \n dy_sin | \n dy_cos | \n precipTime | \n temperature | \n precipDepth | \n
\n \n \n \n count | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 1.172000e+04 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n 11720.000000 | \n
\n \n mean | \n 1.794710 | \n 1.354778 | \n 2.829323 | \n -73.937425 | \n 40.746545 | \n -73.879205 | \n 40.713237 | \n 14.597789 | \n 3.501024 | \n 14.890444 | \n 3.252645 | \n 13.621672 | \n -0.244164 | \n -1.142466e-02 | \n -0.090539 | \n -0.049453 | \n 12.066980 | \n 10.267549 | \n 190.603498 | \n
\n \n std | \n 0.403931 | \n 1.025023 | \n 2.899549 | \n 0.041218 | \n 0.056494 | \n 2.048760 | \n 1.130176 | \n 10.594542 | \n 1.707714 | \n 8.454712 | \n 1.967197 | \n 6.721303 | \n 0.666575 | \n 7.042813e-01 | \n 0.713570 | \n 0.693007 | \n 10.146518 | \n 8.484011 | \n 1215.018267 | \n
\n \n min | \n 1.000000 | \n 1.000000 | \n 0.010000 | \n -74.074181 | \n 40.573597 | \n -74.186638 | \n 0.000000 | \n 0.010000 | \n 1.000000 | \n 1.000000 | \n 0.000000 | \n 0.000000 | \n -1.000000 | \n -1.000000e+00 | \n -0.974928 | \n -0.900969 | \n 1.000000 | \n -13.379464 | \n 0.000000 | \n
\n \n 25% | \n 2.000000 | \n 1.000000 | \n 1.060000 | \n -73.961384 | \n 40.694648 | \n -73.968775 | \n 40.695228 | \n 8.160000 | \n 2.000000 | \n 8.000000 | \n 2.000000 | \n 9.000000 | \n -0.866025 | \n -7.071068e-01 | \n -0.781831 | \n -0.900969 | \n 1.000000 | \n 3.504580 | \n 0.000000 | \n
\n \n 50% | \n 2.000000 | \n 1.000000 | \n 1.880000 | \n -73.946762 | \n 40.745876 | \n -73.946011 | \n 40.746073 | \n 11.300000 | \n 4.000000 | \n 15.000000 | \n 4.000000 | \n 15.000000 | \n -0.500000 | \n -1.836970e-16 | \n -0.433884 | \n -0.222521 | \n 6.000000 | \n 10.130357 | \n 3.000000 | \n
\n \n 75% | \n 2.000000 | \n 1.000000 | \n 3.490000 | \n -73.919127 | \n 40.801327 | \n -73.912937 | \n 40.788689 | \n 17.300000 | \n 5.000000 | \n 22.000000 | \n 5.000000 | \n 19.000000 | \n 0.258819 | \n 7.071068e-01 | \n 0.781831 | \n 0.623490 | \n 24.000000 | \n 17.239744 | \n 41.000000 | \n
\n \n max | \n 2.000000 | \n 6.000000 | \n 52.800000 | \n -73.744164 | \n 40.879837 | \n 0.000000 | \n 41.025719 | \n 223.890000 | \n 6.000000 | \n 30.000000 | \n 6.000000 | \n 23.000000 | \n 1.000000 | \n 1.000000e+00 | \n 0.974928 | \n 1.000000 | \n 24.000000 | \n 26.524107 | \n 9999.000000 | \n
\n \n
\n
"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 12,
+ "metadata": {
+ "gather": {
+ "logged": 1681193830079
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Train a model\n",
+ "\n",
+ "The data is ready to train a machine learning model."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.linear_model import RidgeCV\n",
+ "from sklearn.linear_model import Ridge\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error"
+ ],
+ "outputs": [],
+ "execution_count": 13,
+ "metadata": {
+ "gather": {
+ "logged": 1681193830964
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Training Function\n",
+ "\n",
+ "Define a function that can be used to create a model pipeline that can be trained and then used for scoring. This pipeline has 2 steps: preprocessing and model training.\n",
+ "\n",
+ "Preprocessing Stages:\n",
+ "The preprocessing step of the pipeline also has 2 stages, one for numerical features and one for categorical features.\n",
+ "For the numerical features, let's fill in any blanks with 0's. While the training data may not have any nulls in these fields, future data that is scored may, and this step will take care of those for us. Optionally, a scaler transformation could be added in this step as well. Similarly, for the categorical variables, let's have the null values filled with \"MISSING\". Additionally, the categorical variables need to be one-hot encoded, so we will include that step in our pipeline.\n",
+ "\n",
+ "Model Training Stage:\n",
+ "An input parameter will determine which type of model to train. Let's test out a linear regression and random forest model to start. \n",
+ "\n",
+ "The two steps are put together into the pipeline which is what the function is returning."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def createClassModel(algo_name, catg, nums):\n",
+ " numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])\n",
+ " \n",
+ " categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=\"MISSING\")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
+ " \n",
+ " preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])\n",
+ " \n",
+ " if algo_name == 'linear_regression':\n",
+ " model=Ridge(alpha=100)\n",
+ " elif algo_name == 'random_forest':\n",
+ " model = RandomForestRegressor()\n",
+ " else:\n",
+ " pass\n",
+ " ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), (\"model\", model)])\n",
+ " return ModelPipeline"
+ ],
+ "outputs": [],
+ "execution_count": 14,
+ "metadata": {
+ "gather": {
+ "logged": 1681193831335
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Let's define the arguments that will be passed to the function. `catg_cols` is a list of the categorical variables that will be transformed in our processing step. `num_cols` is a list of the numerical variables that will be transformed in our processing step. Let's define the target column as `label` so it can be used in future steps as well."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "catg_cols = [\"vendorID\", \"month_num\", \"day_of_month\", \"normalizeHolidayName\", \"isPaidTimeOff\"]\n",
+ "num_cols = [\"passengerCount\", \"tripDistance\", \"precipTime\", \"temperature\", \"precipDepth\", \"hr_sin\", \"hr_cos\", \"dy_sin\", \"dy_cos\"]\n",
+ "label = [\"totalAmount\"]"
+ ],
+ "outputs": [],
+ "execution_count": 15,
+ "metadata": {
+ "gather": {
+ "logged": 1681193831647
+ }
+ }
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "linear_regression\n",
- "R2: 0.8034971051723139\n",
- "MAPE: 0.15888983234876766\n",
- "RMSE: 4.606544019524053\n",
- "\n",
- "random_forest\n",
- "R2: 0.8073017231520601\n",
- "MAPE: 0.14715914748857337\n",
- "RMSE: 4.5617309259357475\n",
- "\n"
- ]
+ "cell_type": "markdown",
+ "source": [
+ "The training is ready to begin, but first, let's make sure that the categorical variables are strings in our dataframe to ensure no errors in our pipeline. \n",
+ "\n",
+ "Next, the data is split into training and test sets by using the `train_test_split()` function in the `scikit-learn` library. The `test_size` parameter determines the percentage of data to allocate to testing. The `random_state` parameter sets a seed to the random number generator, so that your train-test splits are deterministic.\n",
+ "\n",
+ "The training will happen in the for loop so that both algorithms can be tested. The `createClassModel` function is called to retrieve the pipeline, which can then be trained using the training dataset. \n",
+ "\n",
+ "Once the model is trained, the test dataset is run through it to test the model's performance. Using various functions from `sklearn.metrics`, the R2 score, MAPE, and RMSE can be used to measure model performance."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# make sure categorical columns are strings\n",
+ "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n",
+ "\n",
+ "# split data\n",
+ "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n",
+ "\n",
+ "# test 2 algorithms\n",
+ "for algorithmname in [\"linear_regression\", 'random_forest']:\n",
+ " fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n",
+ " fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine\n",
+ "\n",
+ " y_pred = fitPipeline.predict(X_test) # score with fitted pipeline\n",
+ "\n",
+ " # Evaluate\n",
+ " r2 = r2_score(y_test, y_pred)\n",
+ " mape = mean_absolute_percentage_error(y_test, y_pred)\n",
+ " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
+ "\n",
+ " print(algorithmname)\n",
+ " print(\"R2:\", r2)\n",
+ " print(\"MAPE:\", mape)\n",
+ " print(\"RMSE:\", rmse)\n",
+ " print()"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "linear_regression\nR2: 0.8939180427845623\nMAPE: 0.15217635144070302\nRMSE: 3.409148681526453\n\nrandom_forest\nR2: 0.8540936112427824\nMAPE: 0.15527304667688627\nRMSE: 3.998179929258663\n\n"
+ }
+ ],
+ "execution_count": 16,
+ "metadata": {
+ "gather": {
+ "logged": 1681193874528
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754"
+ },
+ "kernelspec": {
+ "name": "python38-azureml",
+ "language": "python",
+ "display_name": "Python 3.8 - AzureML"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.10",
+ "mimetype": "text/x-python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "pygments_lexer": "ipython3",
+ "nbconvert_exporter": "python",
+ "file_extension": ".py"
+ },
+ "orig_nbformat": 4,
+ "microsoft": {
+ "ms_spell_check": {
+ "ms_spell_check_language": "en"
+ }
+ },
+ "kernel_info": {
+ "name": "python38-azureml"
+ },
+ "nteract": {
+ "version": "nteract-front-end@1.0.0"
}
- ],
- "source": [
- "# make sure categorical columns are strings\n",
- "final_df[catg_cols] = final_df[catg_cols].astype(\"str\")\n",
- "\n",
- "# split data\n",
- "X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)\n",
- "\n",
- "# test 2 algorithms\n",
- "for algorithmname in [\"linear_regression\", 'random_forest']:\n",
- " fitPipeline = createClassModel(algorithmname, catg_cols, num_cols) # get pipeline\n",
- " fitPipeline.fit(X_train, y_train.values.ravel()) # fit pipeine\n",
- "\n",
- " y_pred = fitPipeline.predict(X_test) # score with fitted pipeline\n",
- "\n",
- " # Evaluate\n",
- " r2 = r2_score(y_test, y_pred)\n",
- " mape = mean_absolute_percentage_error(y_test, y_pred)\n",
- " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
- "\n",
- " print(algorithmname)\n",
- " print(\"R2:\", r2)\n",
- " print(\"MAPE:\", mape)\n",
- " print(\"RMSE:\", rmse)\n",
- " print()"
- ]
- }
- ],
- "metadata": {
- "interpreter": {
- "hash": "74e9702761b8f12846716a18132904990016d49f378e22e0e13a0e91318de754"
- },
- "kernelspec": {
- "display_name": "Python 3.8.12 ('mlopsenv')",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
},
- "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file