Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Harpreet dev #435

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/workshop_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- name: Install az ml & set default values for AML
run: | #setup: provide group, workspace and location
az extension add -n ml -y --version 2.2.1
az configure --defaults group=azureml workspace=ws01ent location=westus2
az configure --defaults group=mlops-rg-910166 workspace=910166 location=westus2
- name: run training and model validation
run: |
az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/workshop_unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
unit-test:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
- name: Harpreet-dev
uses: actions/checkout@v3
- name: Setup python
uses: actions/setup-python@v2
Expand All @@ -31,7 +31,7 @@ jobs:
- name: Install AZ ML and tools
run: | # SETUP line 34 to point to your own AML workspace
az extension add -n ml -y --version 2.2.1
az configure --defaults group=azureml workspace=ws01ent location=westus2
az configure --defaults group=mlops-rg-910166 workspace=aml910166 location=westus2
- name: Run Feature Engineering
uses: ./.github/actions/aml-job-create
with:
Expand Down
2 changes: 1 addition & 1 deletion src/workshop/core/scoring/deployment.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: green
endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
endpoint_name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
model: azureml:nyc_fare_prediction:1
code_configuration:
code: ./
Expand Down
2 changes: 1 addition & 1 deletion src/workshop/core/scoring/endpoint.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
auth_mode: key
6 changes: 6 additions & 0 deletions src/workshop/core/training/.amlignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots

.ipynb_aml_checkpoints/
*.amltmp
*.amltemp
6 changes: 6 additions & 0 deletions src/workshop/core/training/.amlignore.amltmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots

.ipynb_aml_checkpoints/
*.amltmp
*.amltemp
2 changes: 1 addition & 1 deletion src/workshop/core/training/conda_ml_training.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ dependencies:
- azureml-sdk==1.38.0
- azureml-mlflow==1.38.0
- pandas==1.3.5
- scikit-learn==1.0.2
- scikit-learn
11 changes: 11 additions & 0 deletions src/workshop/core/training/conda_ml_training.yml.amltmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: ml-training
channels:
- conda-forge
dependencies:
- python=3.8
- pip=21.3.1
- pip:
- azureml-sdk==1.38.0
- azureml-mlflow==1.38.0
- pandas==1.3.5
- scikit-learn
4 changes: 2 additions & 2 deletions src/workshop/core/training/ml_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
import joblib
def parse_args():
# arg parser
Expand Down Expand Up @@ -43,7 +43,7 @@ def createClassModel(algo_name, catg, nums):
#---------------------------------------------
#setup: Update alpha value
#---------------------------------------------
model = Ridge(alpha=100000) #setup
model = Ridge(alpha=100) #setup
elif algo_name == 'random_forest':
model = RandomForestRegressor()
else:
Expand Down
103 changes: 103 additions & 0 deletions src/workshop/core/training/ml_training.py.amltmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
import os
import argparse
import mlflow
import mlflow.sklearn
from azureml.core import Run, Dataset,Datastore, Workspace
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
import joblib
def parse_args(argv=None):
    """Parse command-line arguments for the training script.

    Args:
        argv: Optional list of argument strings. Defaults to None, in
            which case argparse falls back to sys.argv — existing callers
            that pass nothing are unaffected (backward compatible).

    Returns:
        argparse.Namespace with attributes prep_data, model_folder,
        input_file_name and run_mode.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--prep_data", default="data", type=str,
                        help="Path to prepped data, default to local folder")
    parser.add_argument("--model_folder", type=str, default="data",
                        help="Path of model output folder, default to local folder")
    parser.add_argument("--input_file_name", type=str, default="final_df.parquet")
    # 'local' (default) skips mlflow logging in main(); 'remote' enables it
    parser.add_argument("--run_mode", type=str, default="local")

    # parse args (argv=None -> sys.argv[1:], preserving original behavior)
    args = parser.parse_args(argv)

    # return args
    return args


def createClassModel(algo_name, catg, nums):
    """Build an sklearn Pipeline: preprocessing + a regression model.

    Args:
        algo_name: Either 'linear_regression' (Ridge) or 'random_forest'.
        catg: List of categorical column names (imputed, one-hot encoded).
        nums: List of numeric column names (imputed with constant 0).

    Returns:
        An unfitted sklearn Pipeline with steps 'preprocessor' and 'model'.

    Raises:
        ValueError: If algo_name is not a supported algorithm name.
    """
    # Numeric features: fill missing values with 0.
    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])

    # Categorical features: fill missing with a sentinel, then one-hot encode;
    # handle_unknown='ignore' keeps scoring from failing on unseen categories.
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])

    if algo_name == 'linear_regression':
        #---------------------------------------------
        #setup: Update alpha value
        #---------------------------------------------
        model = Ridge(alpha=100) #setup
    elif algo_name == 'random_forest':
        model = RandomForestRegressor()
    else:
        # Fail fast with a clear message. The original 'pass' left `model`
        # unbound, causing a confusing NameError on the Pipeline line below.
        raise ValueError(
            f"Unknown algo_name {algo_name!r}; "
            "expected 'linear_regression' or 'random_forest'"
        )

    ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)])

    return ModelPipeline

def main(args):
    """Train a fare-prediction regression model and persist it to disk.

    Reads the prepped parquet dataset from args.prep_data, trains a single
    regression pipeline, prints R2/MAPE/RMSE metrics, and saves the fitted
    pipeline under args.model_folder as '<algorithm>.joblib'. When
    args.run_mode == 'remote', metrics and the model are also logged to
    the active mlflow run.

    Args:
        args: argparse.Namespace from parse_args() with prep_data,
            model_folder, input_file_name and run_mode attributes.
    """
    # read in data
    final_df = pd.read_parquet(os.path.join(args.prep_data, args.input_file_name))
    catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"]
    num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]
    label = ["totalAmount"]
    # make sure categorical columns are strings so the OneHotEncoder
    # sees a uniform dtype regardless of how the parquet stored them
    final_df[catg_cols] = final_df[catg_cols].astype("str")

    # split data (fixed random_state for a reproducible evaluation split)
    X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)

    os.makedirs(args.model_folder, exist_ok=True)

    # NOTE: despite the original "test 2 algorithms" comment, only the
    # linear_regression (Ridge) pipeline is trained here.
    algorithmname = "linear_regression"
    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols)  # get pipeline
    fitPipeline.fit(X_train, y_train.values.ravel())  # ravel: sklearn expects a 1-D target

    y_pred = fitPipeline.predict(X_test)  # score with fitted pipeline

    # Evaluate
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Build the output path once with os.path.join (portable, and keeps
    # the dump and the print below consistent).
    model_path = os.path.join(args.model_folder, algorithmname + ".joblib")
    joblib.dump(fitPipeline, model_path)

    print("Training finished! Metrics:")
    print(f"R2_{algorithmname}", r2)
    print(f"MAPE_{algorithmname}", mape)
    print(f"RMSE_{algorithmname}", rmse)
    print("Model", model_path, "saved!")

    if args.run_mode == 'remote':
        mlflow.log_metric(f"R2_{algorithmname}", r2)
        mlflow.log_metric(f"MAPE_{algorithmname}", mape)
        mlflow.log_metric(f"RMSE_{algorithmname}", rmse)
        mlflow.sklearn.log_model(fitPipeline, f"{algorithmname}_model")

# Script entry point: parse CLI arguments, then run the training flow.
if __name__ == "__main__":
    main(parse_args())
Binary file modified src/workshop/data/linear_regression.joblib
Binary file not shown.
Loading