-
Notifications
You must be signed in to change notification settings - Fork 257
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DataOps for LLMOps - AML Sample (#126)
The purpose of this PR is to add the first DataOps sample using AML. The PR introduces the following new artefacts: - DataOps folder: includes the scripts used by the CI/CD pipelines. - named_entity_recognition/data_pipelines folder: a sample data pipeline to prepare the dataset for the NER use case. - named_entity_recognition_data_aml_cd_workflow.yml: the GitHub CD pipeline under the .github/workflows folder. --------- Co-authored-by: Raihan Alam <[email protected]> Co-authored-by: mohanajuhi166 <[email protected]> Co-authored-by: Ritesh Modi <[email protected]> Co-authored-by: mohanajuhi166 <[email protected]>
- Loading branch information
1 parent
ae91738
commit e1e6dbd
Showing
13 changed files
with
988 additions
and
0 deletions.
There are no files selected for viewing
114 changes: 114 additions & 0 deletions
114
.github/workflows/named_entity_recognition_data_aml_cd_workflow.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Deploys the named-entity-recognition data pipeline to Azure ML, then creates
# the data store and registers the data assets described by the config file
# found under <config_path_root_dir>.
name: named_entity_recognition_data_aml_pipeline

on:
  # workflow_call allows reusable workflow that can be called by other workflows
  workflow_call:
    inputs:
      subscription_id:
        description: Azure subscription id
        type: string
        required: true
      resource_group_name:
        description: Azure resource group name
        type: string
        required: true
      workspace_name:
        description: Azure ML workspace name
        type: string
        required: true
      aml_env_name:
        description: Environment name
        type: string
        required: true
      config_path_root_dir:
        description: Root dir for config file
        type: string
        required: true
        default: "named_entity_recognition"

  # workflow_dispatch allows to run workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      subscription_id:
        description: Azure subscription id
        type: string
        required: true
      resource_group_name:
        description: Azure resource group name
        type: string
        required: true
      workspace_name:
        description: Azure ML workspace name
        type: string
        required: true
      aml_env_name:
        description: Environment name
        type: string
        required: true
      config_path_root_dir:
        description: Root dir for config file
        type: string
        required: true
        default: "named_entity_recognition"

jobs:
  deploy_aml_data_pipeline:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout current repository
        # NOTE(review): the original version tag was lost to e-mail
        # obfuscation in the page scrape; v3 is assumed — confirm the pin.
        uses: actions/checkout@v3

      - name: Set up python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML never reinterprets the version as a float.
          python-version: "3.9"

      - name: Azure login
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Configure Azure ML Agent
        uses: ./.github/actions/configure_azureml_agent

      - name: Load the current Azure subscription details
        id: subscription_details
        shell: bash
        run: |
          export subscriptionId=$(az account show --query id -o tsv)
          echo "SUBSCRIPTION_ID=$subscriptionId" >> $GITHUB_OUTPUT

      - name: Deploy data pipeline
        uses: ./.github/actions/execute_script
        with:
          step_name: "Deploy data pipeline"
          script_parameter: |
            python -m dataops.common.aml_pipeline \
              --subscription_id ${{ inputs.subscription_id }} \
              --resource_group_name ${{ inputs.resource_group_name }} \
              --workspace_name ${{ inputs.workspace_name }} \
              --aml_env_name ${{ inputs.aml_env_name }} \
              --config_path_root_dir ${{ inputs.config_path_root_dir }}

      - name: Create data store
        uses: ./.github/actions/execute_script
        with:
          step_name: "Create data store"
          script_parameter: |
            python -m dataops.common.aml_data_store \
              --subscription_id ${{ inputs.subscription_id }} \
              --resource_group_name ${{ inputs.resource_group_name }} \
              --workspace_name ${{ inputs.workspace_name }} \
              --config_path_root_dir ${{ inputs.config_path_root_dir }} \
              --sa_key ${{ secrets.SA_KEY }}

      - name: Register data asset
        uses: ./.github/actions/execute_script
        with:
          step_name: "Register data asset"
          script_parameter: |
            python -m dataops.common.aml_data_asset \
              --subscription_id ${{ inputs.subscription_id }} \
              --resource_group_name ${{ inputs.resource_group_name }} \
              --workspace_name ${{ inputs.workspace_name }} \
              --config_path_root_dir ${{ inputs.config_path_root_dir }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
""" | ||
dataops module. | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
""" | ||
common module. | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
""" | ||
This module creates the data assets. | ||
""" | ||
from azure.identity import DefaultAzureCredential | ||
from azure.ai.ml.entities import Data | ||
from azure.ai.ml import MLClient | ||
from azure.ai.ml.constants import AssetTypes | ||
import os | ||
import argparse | ||
import json | ||
|
||
pipeline_components = [] | ||
|
||
def get_aml_client(
    subscription_id: str,
    resource_group_name: str,
    workspace_name: str,
) -> "MLClient":
    """
    Create and return an Azure Machine Learning (AML) client.

    The client is built with ``DefaultAzureCredential`` and is bound to the
    given subscription, resource group and workspace. It is used to interact
    with Azure Machine Learning services.

    Args:
        subscription_id: The Azure subscription ID.
        resource_group_name: The name of the resource group in Azure.
        workspace_name: The name of the workspace in Azure Machine Learning.

    Returns:
        The ``MLClient`` instance constructed from the arguments above.
    """
    return MLClient(
        DefaultAzureCredential(),
        subscription_id=subscription_id,
        resource_group_name=resource_group_name,
        workspace_name=workspace_name,
    )
|
||
|
||
def register_data_asset(
    name,
    description,
    aml_client,
    data_store,
    file_path
):
    """
    Register a data asset in Azure Machine Learning.

    The asset is created (or updated) as a ``URI_FILE`` data asset whose path
    is ``azureml://datastores/<data_store>/paths/<file_path>``.

    Args:
        name: The name of the data asset.
        description: The description of the data asset.
        aml_client: The Azure Machine Learning client; its
            ``data.create_or_update`` method is called with the new asset.
        data_store: The name of the data store in Azure.
        file_path: The file path of the data asset inside the data store.
    """
    target_path = f"azureml://datastores/{data_store}/paths/{file_path}"
    aml_dataset = Data(
        path=target_path,
        type=AssetTypes.URI_FILE,
        description=description,
        name=name,
    )

    aml_client.data.create_or_update(aml_dataset)
|
||
|
||
def main():
    """
    Register every data asset listed in the DataOps config file.

    Parses the command-line arguments, loads
    ``<config_path_root_dir>/configs/dataops_config.json`` (resolved against
    the current working directory), builds an AML client and registers one
    data asset per entry of the config's ``DATA_ASSETS`` list, all against
    the data store named by ``DATA_STORE_NAME``.

    Command-line arguments (all required):
        --subscription_id: Azure subscription id.
        --resource_group_name: Azure resource group.
        --workspace_name: Azure ML workspace.
        --config_path_root_dir: Root dir for config file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--subscription_id",
        type=str,
        help="Azure subscription id",
        required=True,
    )
    parser.add_argument(
        "--resource_group_name",
        type=str,
        help="Azure resource group",
        required=True,
    )
    parser.add_argument(
        "--workspace_name",
        type=str,
        help="Azure ML workspace",
        required=True,
    )
    parser.add_argument(
        "--config_path_root_dir",
        type=str,
        help="Root dir for config file",
        required=True,
    )

    args = parser.parse_args()

    config_path = os.path.join(
        os.getcwd(),
        args.config_path_root_dir,
        "configs",
        "dataops_config.json",
    )
    # Use a context manager so the config file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(config_path, encoding="utf-8") as config_file:
        config = json.load(config_file)

    aml_client = get_aml_client(
        args.subscription_id,
        args.resource_group_name,
        args.workspace_name,
    )

    data_store = config["DATA_STORE_NAME"]
    for data_asset_config in config["DATA_ASSETS"]:
        register_data_asset(
            name=data_asset_config["NAME"],
            description=data_asset_config["DESCRIPTION"],
            aml_client=aml_client,
            data_store=data_store,
            file_path=data_asset_config["PATH"],
        )
|
||
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.