Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Harpreet dev #435

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/workshop_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- name: Install az ml & set default values for AML
run: | #setup: provide group, workspace and location
az extension add -n ml -y --version 2.2.1
az configure --defaults group=azureml workspace=ws01ent location=westus2
az configure --defaults group=mlops-rg-910166 workspace=910166 location=westus2
- name: run training and model validation
run: |
az ml job create -s -f src/workshop/core/pipelines/training_pipeline.yml
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/workshop_unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
unit-test:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
- name: Harpreet-dev
uses: actions/checkout@v3
- name: Setup python
uses: actions/setup-python@v2
Expand All @@ -31,7 +31,7 @@ jobs:
- name: Install AZ ML and tools
run: | # SETUP line 34 to point to your own AML workspace
az extension add -n ml -y --version 2.2.1
az configure --defaults group=azureml workspace=ws01ent location=westus2
az configure --defaults group=mlops-rg-910166 workspace=aml910166 location=westus2
- name: Run Feature Engineering
uses: ./.github/actions/aml-job-create
with:
Expand Down
2 changes: 1 addition & 1 deletion src/workshop/core/scoring/deployment.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: green
endpoint_name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
endpoint_name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name defined in endpoint.yml
model: azureml:nyc_fare_prediction:1
code_configuration:
code: ./
Expand Down
2 changes: 1 addition & 1 deletion src/workshop/core/scoring/endpoint.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: mlops-workshop-endpoint #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
name: mlops-workshop-endpoint34 #setup replace `mlops-workshop-endpoint` with your own endpoint name. It has to be globally unique
auth_mode: key
6 changes: 6 additions & 0 deletions src/workshop/core/training/.amlignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots

.ipynb_aml_checkpoints/
*.amltmp
*.amltemp
6 changes: 6 additions & 0 deletions src/workshop/core/training/.amlignore.amltmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots

.ipynb_aml_checkpoints/
*.amltmp
*.amltemp
2 changes: 1 addition & 1 deletion src/workshop/core/training/conda_ml_training.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ dependencies:
- azureml-sdk==1.38.0
- azureml-mlflow==1.38.0
- pandas==1.3.5
- scikit-learn==1.0.2
- scikit-learn
11 changes: 11 additions & 0 deletions src/workshop/core/training/conda_ml_training.yml.amltmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: ml-training
channels:
- conda-forge
dependencies:
- python=3.8
- pip=21.3.1
- pip:
- azureml-sdk==1.38.0
- azureml-mlflow==1.38.0
- pandas==1.3.5
- scikit-learn
4 changes: 2 additions & 2 deletions src/workshop/core/training/ml_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
import joblib
def parse_args():
# arg parser
Expand Down Expand Up @@ -43,7 +43,7 @@ def createClassModel(algo_name, catg, nums):
#---------------------------------------------
#setup: Update alpha value
#---------------------------------------------
model = Ridge(alpha=100000) #setup
model = Ridge(alpha=100) #setup
elif algo_name == 'random_forest':
model = RandomForestRegressor()
else:
Expand Down
103 changes: 103 additions & 0 deletions src/workshop/core/training/ml_training.py.amltmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
import os
import argparse
import mlflow
import mlflow.sklearn
from azureml.core import Run, Dataset,Datastore, Workspace
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score,mean_absolute_percentage_error, mean_squared_error
import joblib
def parse_args(argv=None):
    """Parse command-line arguments for the training script.

    Args:
        argv: Optional list of argument strings. Defaults to None, in
            which case argparse falls back to sys.argv — existing callers
            that pass nothing are unaffected (backward compatible).

    Returns:
        argparse.Namespace with attributes prep_data, model_folder,
        input_file_name and run_mode.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--prep_data", default="data", type=str,
                        help="Path to prepped data, default to local folder")
    parser.add_argument("--model_folder", type=str, default="data",
                        help="Path of model output folder, default to local folder")
    parser.add_argument("--input_file_name", type=str, default="final_df.parquet")
    # 'local' (default) skips mlflow logging in main(); 'remote' enables it
    parser.add_argument("--run_mode", type=str, default="local")

    # parse args (argv=None -> sys.argv[1:], preserving original behavior)
    args = parser.parse_args(argv)

    # return args
    return args


def createClassModel(algo_name, catg, nums):
    """Build an sklearn Pipeline: preprocessing + a regression model.

    Args:
        algo_name: Either 'linear_regression' (Ridge) or 'random_forest'.
        catg: List of categorical column names (imputed, one-hot encoded).
        nums: List of numeric column names (imputed with constant 0).

    Returns:
        An unfitted sklearn Pipeline with steps 'preprocessor' and 'model'.

    Raises:
        ValueError: If algo_name is not a supported algorithm name.
    """
    # Numeric features: fill missing values with 0.
    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])

    # Categorical features: fill missing with a sentinel, then one-hot encode;
    # handle_unknown='ignore' keeps scoring from failing on unseen categories.
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="MISSING")), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocesser = ColumnTransformer(transformers=[('num', numeric_transformer, nums), ('cat', categorical_transformer, catg)])

    if algo_name == 'linear_regression':
        #---------------------------------------------
        #setup: Update alpha value
        #---------------------------------------------
        model = Ridge(alpha=100) #setup
    elif algo_name == 'random_forest':
        model = RandomForestRegressor()
    else:
        # Fail fast with a clear message. The original 'pass' left `model`
        # unbound, causing a confusing NameError on the Pipeline line below.
        raise ValueError(
            f"Unknown algo_name {algo_name!r}; "
            "expected 'linear_regression' or 'random_forest'"
        )

    ModelPipeline = Pipeline(steps=[('preprocessor', preprocesser), ("model", model)])

    return ModelPipeline

def main(args):
    """Train a fare-prediction regression model and persist it to disk.

    Reads the prepped parquet dataset from args.prep_data, trains a single
    regression pipeline, prints R2/MAPE/RMSE metrics, and saves the fitted
    pipeline under args.model_folder as '<algorithm>.joblib'. When
    args.run_mode == 'remote', metrics and the model are also logged to
    the active mlflow run.

    Args:
        args: argparse.Namespace from parse_args() with prep_data,
            model_folder, input_file_name and run_mode attributes.
    """
    # read in data
    final_df = pd.read_parquet(os.path.join(args.prep_data, args.input_file_name))
    catg_cols = ["vendorID", "month_num", "day_of_month", "normalizeHolidayName", "isPaidTimeOff"]
    num_cols = ["passengerCount", "tripDistance", "precipTime", "temperature", "precipDepth", "hr_sin", "hr_cos", "dy_sin", "dy_cos"]
    label = ["totalAmount"]
    # make sure categorical columns are strings so the OneHotEncoder
    # sees a uniform dtype regardless of how the parquet stored them
    final_df[catg_cols] = final_df[catg_cols].astype("str")

    # split data (fixed random_state for a reproducible evaluation split)
    X_train, X_test, y_train, y_test = train_test_split(final_df.drop(label, axis=1), final_df[label], test_size=0.2, random_state=222)

    os.makedirs(args.model_folder, exist_ok=True)

    # NOTE: despite the original "test 2 algorithms" comment, only the
    # linear_regression (Ridge) pipeline is trained here.
    algorithmname = "linear_regression"
    fitPipeline = createClassModel(algorithmname, catg_cols, num_cols)  # get pipeline
    fitPipeline.fit(X_train, y_train.values.ravel())  # ravel: sklearn expects a 1-D target

    y_pred = fitPipeline.predict(X_test)  # score with fitted pipeline

    # Evaluate
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Build the output path once with os.path.join (portable, and keeps
    # the dump and the print below consistent).
    model_path = os.path.join(args.model_folder, algorithmname + ".joblib")
    joblib.dump(fitPipeline, model_path)

    print("Training finished! Metrics:")
    print(f"R2_{algorithmname}", r2)
    print(f"MAPE_{algorithmname}", mape)
    print(f"RMSE_{algorithmname}", rmse)
    print("Model", model_path, "saved!")

    if args.run_mode == 'remote':
        mlflow.log_metric(f"R2_{algorithmname}", r2)
        mlflow.log_metric(f"MAPE_{algorithmname}", mape)
        mlflow.log_metric(f"RMSE_{algorithmname}", rmse)
        mlflow.sklearn.log_model(fitPipeline, f"{algorithmname}_model")

# Script entry point: parse CLI arguments, then run the training flow.
if __name__ == "__main__":
    main(parse_args())
Binary file modified src/workshop/data/linear_regression.joblib
Binary file not shown.
Loading