DataOps for LLMOps - AML Sample (#126)
The purpose of this PR is to add the first DataOps sample using AML.

The PR introduces the following new artefacts:

- DataOps folder: includes the scripts used by the CI/CD pipelines.
- named_entity_recognition/data_pipelines folder: a sample data pipeline to prepare the dataset for the NER use case.
- named_entity_recognition_data_aml_cd_workflow.yml: the GitHub CD pipeline under the .github/workflows folder.

---------

Co-authored-by: Raihan Alam <[email protected]>
Co-authored-by: mohanajuhi166 <[email protected]>
Co-authored-by: Ritesh Modi <[email protected]>
5 people authored May 17, 2024
1 parent ae91738 commit e1e6dbd
Showing 13 changed files with 988 additions and 0 deletions.
114 changes: 114 additions & 0 deletions .github/workflows/named_entity_recognition_data_aml_cd_workflow.yml
@@ -0,0 +1,114 @@
name: named_entity_recognition_data_aml_pipeline

on:
  # workflow_call makes this a reusable workflow that can be called by other workflows
  workflow_call:
    inputs:
      subscription_id:
        description: Azure subscription id
        type: string
        required: true
      resource_group_name:
        description: Azure resource group name
        type: string
        required: true
      workspace_name:
        description: Azure ML workspace name
        type: string
        required: true
      aml_env_name:
        description: Environment name
        type: string
        required: true
      config_path_root_dir:
        description: Root dir for config file
        type: string
        required: true
        default: "named_entity_recognition"

  # workflow_dispatch allows running the workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      subscription_id:
        description: Azure subscription id
        type: string
        required: true
      resource_group_name:
        description: Azure resource group name
        type: string
        required: true
      workspace_name:
        description: Azure ML workspace name
        type: string
        required: true
      aml_env_name:
        description: Environment name
        type: string
        required: true
      config_path_root_dir:
        description: Root dir for config file
        type: string
        required: true
        default: "named_entity_recognition"

jobs:
  deploy_aml_data_pipeline:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout current repository
        uses: actions/[email protected]

      - name: Set up python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9

      - name: Azure login
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}

      - name: Configure Azure ML Agent
        uses: ./.github/actions/configure_azureml_agent

      - name: Load the current Azure subscription details
        id: subscription_details
        shell: bash
        run: |
          export subscriptionId=$(az account show --query id -o tsv)
          echo "SUBSCRIPTION_ID=$subscriptionId" >> $GITHUB_OUTPUT

      - name: Deploy data pipeline
        uses: ./.github/actions/execute_script
        with:
          step_name: "Deploy data pipeline"
          script_parameter: |
            python -m dataops.common.aml_pipeline \
              --subscription_id ${{ inputs.subscription_id }} \
              --resource_group_name ${{ inputs.resource_group_name }} \
              --workspace_name ${{ inputs.workspace_name }} \
              --aml_env_name ${{ inputs.aml_env_name }} \
              --config_path_root_dir ${{ inputs.config_path_root_dir }}

      - name: Create data store
        uses: ./.github/actions/execute_script
        with:
          step_name: "Create data store"
          script_parameter: |
            python -m dataops.common.aml_data_store \
              --subscription_id ${{ inputs.subscription_id }} \
              --resource_group_name ${{ inputs.resource_group_name }} \
              --workspace_name ${{ inputs.workspace_name }} \
              --config_path_root_dir ${{ inputs.config_path_root_dir }} \
              --sa_key ${{ secrets.SA_KEY }}

      - name: Register data asset
        uses: ./.github/actions/execute_script
        with:
          step_name: "Register data asset"
          script_parameter: |
            python -m dataops.common.aml_data_asset \
              --subscription_id ${{ inputs.subscription_id }} \
              --resource_group_name ${{ inputs.resource_group_name }} \
              --workspace_name ${{ inputs.workspace_name }} \
              --config_path_root_dir ${{ inputs.config_path_root_dir }}
12 changes: 12 additions & 0 deletions README.md
@@ -20,6 +20,7 @@ As LLMs rapidly evolve, the importance of Prompt Engineering becomes increasingl
- LLM-infused applications are designed to understand and generate human-like text based on the input they receive. They consist of prompts that need engineering cadence and rigour.
- Prompt flow is a powerful feature that simplifies and streamlines the Prompt Engineering process for LLM-infused applications. It enables users to create, evaluate, and deploy high-quality flows with ease and efficiency.
- How do we best augment LLM-infused applications with LLMOps and engineering rigour? This template aims to assist in the development of those types of applications using Prompt flow and LLMOps.
- Bringing discipline to data preparation for LLM app development by following DataOps best practices.

# Solution

@@ -57,6 +58,7 @@ Each use case (set of Prompt flow standard and evaluation flows) should follow t
- environment : It contains a dockerfile used for running containers with flows for inferencing on Azure webapps.
- flows : It should contain at least two folders - one for standard Prompt flow related files and another for Evaluation flow related files. There can be multiple evaluation flow related folders.
- tests : contains unit tests for the flows
- data-pipelines : It contains the data pipelines to generate the datasets (experimentation, evaluation etc.) necessary for the flows. This folder will have sub-folders specific to the data engineering tool - Microsoft Fabric, Azure ML etc.

Additionally, there is an `experiment.yaml` file that configures the use-case (see file [description](./docs/the_experiment_file.md) and [specs](./docs/experiment.yaml) for more details). There is also a `sample-request.json` file containing test data for testing endpoints after deployment.

@@ -70,6 +72,8 @@ Additionally, there is a `experiment.yaml` file that configures the use-case (se

- The 'llmops' folder contains all the code related to flow execution, evaluation and deployment.

- The 'dataops' folder contains all the code related to data pipeline deployment.

- The 'local_execution' folder contains Python scripts for executing both the standard and evaluation flows locally.

# Documentation
@@ -133,6 +137,14 @@ python -m pip install promptflow promptflow-tools promptflow-sdk jinja2 promptfl

5. Write Python scripts similar to the provided examples in the local_execution folder.

# DataOps

DataOps combines aspects of DevOps, agile methodologies, and data management practices to streamline the process of collecting, processing, and analyzing data. DataOps helps bring discipline to building the datasets (training, experimentation, evaluation etc.) necessary for LLM app development.

The data pipelines are kept separate from the prompt engineering flows. The pipelines create the datasets, which are registered as data assets in Azure ML for the flows to consume, as sketched below. This separation makes it possible to scale and troubleshoot different parts of the system independently.
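
A minimal sketch of how a flow or helper script could consume one of the registered data assets; the asset name ner_experiment_data and the placeholder workspace details are assumptions for illustration, not part of this sample:

from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Connect to the Azure ML workspace that holds the registered data assets.
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace-name>",
)

# Fetch the latest version of a data asset registered by the data pipeline.
data_asset = ml_client.data.get(name="ner_experiment_data", label="latest")
print(data_asset.path)  # azureml:// URI of the underlying file in the datastore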

For details on how to get started with DataOps, please follow this document - [How to Configure DataOps](./docs/how_to_configure_dataops.md).

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
4 changes: 4 additions & 0 deletions dataops/__init__.py
@@ -0,0 +1,4 @@
"""
dataops module.
"""
4 changes: 4 additions & 0 deletions dataops/common/__init__.py
@@ -0,0 +1,4 @@
"""
common module.
"""
139 changes: 139 additions & 0 deletions dataops/common/aml_data_asset.py
@@ -0,0 +1,139 @@
"""
This module creates the data assets.
"""
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml import MLClient
from azure.ai.ml.constants import AssetTypes
import os
import argparse
import json

pipeline_components = []

"""
This function creates and returns an Azure Machine Learning (AML) client.
The AML client is used to interact with Azure Machine Learning services.
Args:
--subscription_id: The Azure subscription ID.
This argument is required for identifying the Azure subscription.
--resource_group_name: The name of the resource group in Azure.
This argument is required to specify the resource group in Azure.
--workspace_name: The name of the workspace in Azure Machine Learning.
This argument is required to specify the workspace in Azure Machine Learning.
"""


def get_aml_client(
subscription_id,
resource_group_name,
workspace_name,
):
aml_client = MLClient(
DefaultAzureCredential(),
subscription_id=subscription_id,
resource_group_name=resource_group_name,
workspace_name=workspace_name,
)

return aml_client


"""
This function registers a data asset in Azure Machine Learning.
The data asset is identified by its name and description, and is associated with a specific data store and file path.
Args:
--name: The name of the data asset.
This argument is required to specify the name of the data asset.
--description: The description of the data asset.
This argument is required to provide a description of the data asset.
--aml_client: The Azure Machine Learning client.
This argument is required to interact with Azure Machine Learning services.
--data_store: The name of the data store in Azure.
This argument is required to specify the data store in Azure.
--file_path: The file path of the data asset in the data store.
This argument is required to specify the file path of the data asset in the data store.
"""


def register_data_asset(
name,
description,
aml_client,
data_store,
file_path
):
target_path = f"azureml://datastores/{data_store}/paths/{file_path}"
aml_dataset = Data(
path=target_path,
type=AssetTypes.URI_FILE,
description=description,
name=name
)

aml_client.data.create_or_update(aml_dataset)


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--subscription_id",
type=str,
help="Azure subscription id",
required=True,
)
parser.add_argument(
"--resource_group_name",
type=str,
help="Azure resource group",
required=True,
)
parser.add_argument(
"--workspace_name",
type=str,
help="Azure ML workspace",
required=True,
)
parser.add_argument(
"--config_path_root_dir",
type=str,
help="Root dir for config file",
required=True,
)

args = parser.parse_args()

subscription_id = args.subscription_id
resource_group_name = args.resource_group_name
workspace_name = args.workspace_name
config_path_root_dir = args.config_path_root_dir

config_path = os.path.join(os.getcwd(), f"{config_path_root_dir}/configs/dataops_config.json")
config = json.load(open(config_path))

aml_client = get_aml_client(
subscription_id,
resource_group_name,
workspace_name,
)

data_store = config["DATA_STORE_NAME"]
data_asset_configs = config['DATA_ASSETS']
for data_asset_config in data_asset_configs:
data_asset_name = data_asset_config['NAME']
data_asset_file_path = data_asset_config['PATH']
data_asset_description = data_asset_config['DESCRIPTION']

register_data_asset(
name=data_asset_name,
description=data_asset_description,
aml_client=aml_client,
data_store=data_store,
file_path=data_asset_file_path
)


if __name__ == "__main__":
main()
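
For reference, a minimal sketch of the dataops_config.json structure this script expects (a DATA_STORE_NAME plus a DATA_ASSETS list of NAME/PATH/DESCRIPTION entries). Every value below is a hypothetical example, written out in Python purely for illustration:

import json

# Hypothetical config; the script looks for it at
# <config_path_root_dir>/configs/dataops_config.json (e.g. named_entity_recognition/configs/).
example_config = {
    "DATA_STORE_NAME": "workspaceblobstore",       # assumed datastore name
    "DATA_ASSETS": [
        {
            "NAME": "ner_experiment_data",         # assumed asset name
            "PATH": "ner/experiment/data.jsonl",   # path within the datastore
            "DESCRIPTION": "Experimentation dataset for the NER use case",
        },
    ],
}

with open("named_entity_recognition/configs/dataops_config.json", "w") as f:
    json.dump(example_config, f, indent=4)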