diff --git a/.github/workflows/named_entity_recognition_data_aml_cd_workflow.yml b/.github/workflows/named_entity_recognition_data_aml_cd_workflow.yml new file mode 100644 index 000000000..638823f00 --- /dev/null +++ b/.github/workflows/named_entity_recognition_data_aml_cd_workflow.yml @@ -0,0 +1,114 @@ +name: named_entity_recognition_data_aml_pipeline +on: + # workflow_call allows reusable workflow that can be called by other workflows + workflow_call: + inputs: + subscription_id: + description: Azure subscription id + type: string + required: true + resource_group_name: + description: Azure resource group name + type: string + required: true + workspace_name: + description: Azure ML workspace name + type: string + required: true + aml_env_name: + description: Environment name + type: string + required: true + config_path_root_dir: + description: Root dir for config file + type: string + required: true + default: "named_entity_recognition" + + # workflow_dispatch allows to run workflow manually from the Actions tab + workflow_dispatch: + inputs: + subscription_id: + description: Azure subscription id + type: string + required: true + resource_group_name: + description: Azure resource group name + type: string + required: true + workspace_name: + description: Azure ML workspace name + type: string + required: true + aml_env_name: + description: Environment name + type: string + required: true + config_path_root_dir: + description: Root dir for config file + type: string + required: true + default: "named_entity_recognition" + +jobs: + deploy_aml_data_pipeline: + runs-on: ubuntu-latest + + steps: + - name: Checkout current repository + uses: actions/checkout@v3.3.0 + + - name: Set up python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Azure login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + + - name: Configure Azure ML Agent + uses: ./.github/actions/configure_azureml_agent + + - name: Load the current Azure subscription details + id: subscription_details + shell: bash + run: | + export subscriptionId=$(az account show --query id -o tsv) + echo "SUBSCRIPTION_ID=$subscriptionId" >> $GITHUB_OUTPUT + + - name: Deploy data pipeline + uses: ./.github/actions/execute_script + with: + step_name: "Deploy data pipeline" + script_parameter: | + python -m dataops.common.aml_pipeline \ + --subscription_id ${{ inputs.subscription_id }} \ + --resource_group_name ${{ inputs.resource_group_name }} \ + --workspace_name ${{ inputs.workspace_name }} \ + --aml_env_name ${{ inputs.aml_env_name }} \ + --config_path_root_dir ${{ inputs.config_path_root_dir }} + + - name: Create data store + uses: ./.github/actions/execute_script + with: + step_name: "Create data store" + script_parameter: | + python -m dataops.common.aml_data_store \ + --subscription_id ${{ inputs.subscription_id }} \ + --resource_group_name ${{ inputs.resource_group_name }} \ + --workspace_name ${{ inputs.workspace_name }} \ + --config_path_root_dir ${{ inputs.config_path_root_dir }} \ + --sa_key ${{ secrets.SA_KEY }} + + - name: Register data asset + uses: ./.github/actions/execute_script + with: + step_name: "Register data asset" + script_parameter: | + python -m dataops.common.aml_data_asset \ + --subscription_id ${{ inputs.subscription_id }} \ + --resource_group_name ${{ inputs.resource_group_name }} \ + --workspace_name ${{ inputs.workspace_name }} \ + --config_path_root_dir ${{ inputs.config_path_root_dir }} \ No newline at end of file diff --git a/README.md b/README.md index 
187cc8a4b..f7be9601d 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ As LLMs rapidly evolve, the importance of Prompt Engineering becomes increasingl - LLM-infused applications are designed to understand and generate human-like text based on the input they receive. They comprise of prompts that need engineering cadence and rigour. - Prompt flow is a powerful feature that simplifies and streamlines the Prompt Engineering process for LLM-infused applications. It enables users to create, evaluate, and deploy high-quality flows with ease and efficiency. - How do we best augment LLM-infused applications with LLMOps and engineering rigour? This template aims to assist in the development of those types of applications using Prompt flow and LLMOps. +- Following DataOps best practices brings discipline to the data preparation needed for LLM app development. # Solution @@ -57,6 +58,7 @@ Each use case (set of Prompt flow standard and evaluation flows) should follow t - environment : It contains a dockerfile used for running containers with flows for inferencing on Azure webapps. - flows : It should contain minimally two folder - one for standard Prompt flow related files and another for Evaluation flow related file. There can be multiple evaluation flow related folders. - tests : contains unit tests for the flows +- data_pipelines : It contains the data pipelines that generate the datasets (experimentation, evaluation etc.) necessary for the flows. This folder has sub-folders specific to the data engineering tool - Microsoft Fabric, Azure ML etc. Additionally, there is a `experiment.yaml` file that configures the use-case (see file [description](./docs/the_experiment_file.md) and [specs](./docs/experiment.yaml) for more details). There is also a sample-request.json file containing test data for testing endpoints after deployment. @@ -70,6 +72,8 @@ Additionally, there is a `experiment.yaml` file that configures the use-case (se - The 'llmops' folder contains all the code related to flow execution, evaluation and deployment. +- The 'dataops' folder contains all the code related to data pipeline deployment. + - The 'local_execution' folder contains python scripts for executing both the standard and evaluation flow locally. # Documentation @@ -133,6 +137,14 @@ python -m pip install promptflow promptflow-tools promptflow-sdk jinja2 promptfl 5. Write python scripts similar to the provided examples in local_execution folder. +# DataOps + +DataOps combines aspects of DevOps, agile methodologies, and data management practices to streamline the process of collecting, processing, and analyzing data. DataOps can help bring discipline to building the datasets (training, experimentation, evaluation etc.) necessary for LLM app development. + +The data pipelines are kept separate from the prompt engineering flows. The data pipelines create the datasets, which are registered as data assets in Azure ML for the flows to consume. This approach makes it possible to scale and troubleshoot different parts of the system independently. + +For details on how to get started with DataOps, see [How to Configure DataOps](./docs/how_to_configure_dataops.md). + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a diff --git a/dataops/__init__.py b/dataops/__init__.py new file mode 100644 index 000000000..cfb6b7af7 --- /dev/null +++ b/dataops/__init__.py @@ -0,0 +1,4 @@ +""" +dataops module.
+ +""" diff --git a/dataops/common/__init__.py b/dataops/common/__init__.py new file mode 100644 index 000000000..863836886 --- /dev/null +++ b/dataops/common/__init__.py @@ -0,0 +1,4 @@ +""" +common module. + +""" diff --git a/dataops/common/aml_data_asset.py b/dataops/common/aml_data_asset.py new file mode 100644 index 000000000..058725181 --- /dev/null +++ b/dataops/common/aml_data_asset.py @@ -0,0 +1,139 @@ +""" +This module creates the data assets. +""" +from azure.identity import DefaultAzureCredential +from azure.ai.ml.entities import Data +from azure.ai.ml import MLClient +from azure.ai.ml.constants import AssetTypes +import os +import argparse +import json + +pipeline_components = [] + +""" +This function creates and returns an Azure Machine Learning (AML) client. +The AML client is used to interact with Azure Machine Learning services. + +Args: +--subscription_id: The Azure subscription ID. +This argument is required for identifying the Azure subscription. +--resource_group_name: The name of the resource group in Azure. +This argument is required to specify the resource group in Azure. +--workspace_name: The name of the workspace in Azure Machine Learning. +This argument is required to specify the workspace in Azure Machine Learning. +""" + + +def get_aml_client( + subscription_id, + resource_group_name, + workspace_name, +): + aml_client = MLClient( + DefaultAzureCredential(), + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name, + ) + + return aml_client + + +""" +This function registers a data asset in Azure Machine Learning. +The data asset is identified by its name and description, and is associated with a specific data store and file path. + +Args: +--name: The name of the data asset. +This argument is required to specify the name of the data asset. +--description: The description of the data asset. +This argument is required to provide a description of the data asset. +--aml_client: The Azure Machine Learning client. +This argument is required to interact with Azure Machine Learning services. +--data_store: The name of the data store in Azure. +This argument is required to specify the data store in Azure. +--file_path: The file path of the data asset in the data store. +This argument is required to specify the file path of the data asset in the data store. 
+""" + + +def register_data_asset( + name, + description, + aml_client, + data_store, + file_path +): + target_path = f"azureml://datastores/{data_store}/paths/{file_path}" + aml_dataset = Data( + path=target_path, + type=AssetTypes.URI_FILE, + description=description, + name=name + ) + + aml_client.data.create_or_update(aml_dataset) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--subscription_id", + type=str, + help="Azure subscription id", + required=True, + ) + parser.add_argument( + "--resource_group_name", + type=str, + help="Azure resource group", + required=True, + ) + parser.add_argument( + "--workspace_name", + type=str, + help="Azure ML workspace", + required=True, + ) + parser.add_argument( + "--config_path_root_dir", + type=str, + help="Root dir for config file", + required=True, + ) + + args = parser.parse_args() + + subscription_id = args.subscription_id + resource_group_name = args.resource_group_name + workspace_name = args.workspace_name + config_path_root_dir = args.config_path_root_dir + + config_path = os.path.join(os.getcwd(), f"{config_path_root_dir}/configs/dataops_config.json") + config = json.load(open(config_path)) + + aml_client = get_aml_client( + subscription_id, + resource_group_name, + workspace_name, + ) + + data_store = config["DATA_STORE_NAME"] + data_asset_configs = config['DATA_ASSETS'] + for data_asset_config in data_asset_configs: + data_asset_name = data_asset_config['NAME'] + data_asset_file_path = data_asset_config['PATH'] + data_asset_description = data_asset_config['DESCRIPTION'] + + register_data_asset( + name=data_asset_name, + description=data_asset_description, + aml_client=aml_client, + data_store=data_store, + file_path=data_asset_file_path + ) + + +if __name__ == "__main__": + main() diff --git a/dataops/common/aml_data_store.py b/dataops/common/aml_data_store.py new file mode 100644 index 000000000..ec7111c44 --- /dev/null +++ b/dataops/common/aml_data_store.py @@ -0,0 +1,146 @@ +""" +This module registers the data store. +""" +from azure.ai.ml import MLClient +from azure.identity import DefaultAzureCredential +from azure.ai.ml.entities import AzureBlobDatastore, AccountKeyConfiguration +import os +import argparse +import json + +pipeline_components = [] +""" +This function creates and returns an Azure Machine Learning (AML) client. +The AML client is used to interact with Azure Machine Learning services. + +Args: +--subscription_id: The Azure subscription ID. +This argument is required for identifying the Azure subscription. +--resource_group_name: The name of the resource group in Azure. +This argument is required to specify the resource group in Azure. +--workspace_name: The name of the workspace in Azure Machine Learning. +This argument is required to specify the workspace in Azure Machine Learning. +""" + + +def get_aml_client( + subscription_id, + resource_group_name, + workspace_name, +): + aml_client = MLClient( + DefaultAzureCredential(), + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name, + ) + + return aml_client + + +""" +This function registers a data store in Azure Machine Learning. +The data store is identified by its name and description, +and is associated with a specific storage account and container. + +Args: +--name_datastore: The name of the data store. +This argument is required to specify the name of the data store. +--description: The description of the data store. 
+This argument is required to provide a description of the data store. +--sa_account_name: The name of the storage account in Azure. +This argument is required to specify the storage account in Azure. +--sa_container_name: The name of the container in the storage account. +This argument is required to specify the container in the storage account. +--sa_key: The key of the storage account. +This argument is required to authenticate with the storage account. +--aml_client: The Azure Machine Learning client. +This argument is required to interact with Azure Machine Learning services. +""" + + +def register_data_store( + name_datastore, + description, + sa_account_name, + sa_container_name, + sa_key, + aml_client +): + store = AzureBlobDatastore( + name=name_datastore, + description=description, + account_name=sa_account_name, + container_name=sa_container_name, + credentials=AccountKeyConfiguration(account_key=sa_key) + ) + aml_client.create_or_update(store) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--subscription_id", + type=str, + help="Azure subscription id", + required=True, + ) + parser.add_argument( + "--resource_group_name", + type=str, + help="Azure resource group", + required=True, + ) + parser.add_argument( + "--workspace_name", + type=str, + help="Azure ML workspace", + required=True, + ) + parser.add_argument( + "--sa_key", + type=str, + help="Storage account key", + required=True, + ) + parser.add_argument( + "--config_path_root_dir", + type=str, + help="Root dir for config file", + required=True, + ) + + args = parser.parse_args() + + subscription_id = args.subscription_id + resource_group_name = args.resource_group_name + workspace_name = args.workspace_name + sa_key = args.sa_key + config_path_root_dir = args.config_path_root_dir + + config_path = os.path.join(os.getcwd(), + f"{config_path_root_dir}/configs/dataops_config.json") + config = json.load(open(config_path)) + + aml_client = get_aml_client( + subscription_id, + resource_group_name, + workspace_name, + ) + + storage_config = config['STORAGE'] + storage_account = storage_config['STORAGE_ACCOUNT'] + target_container_name = storage_config['TARGET_CONTAINER'] + + register_data_store( + name_datastore=config["DATA_STORE_NAME"], + description=config["DATA_STORE_DESCRIPTION"], + sa_account_name=storage_account, + sa_container_name=target_container_name, + sa_key=sa_key, + aml_client=aml_client + ) + + +if __name__ == "__main__": + main() diff --git a/dataops/common/aml_pipeline.py b/dataops/common/aml_pipeline.py new file mode 100644 index 000000000..049bca6cb --- /dev/null +++ b/dataops/common/aml_pipeline.py @@ -0,0 +1,365 @@ +""" +This module creates an AML job and schedules it for the data pipeline. +""" +from datetime import datetime +from azure.ai.ml.dsl import pipeline +from azure.identity import DefaultAzureCredential +from azure.ai.ml import command, UserIdentityConfiguration +from azure.ai.ml import Output +from azure.ai.ml import MLClient +from azure.ai.ml.entities import ( + JobSchedule, + CronTrigger +) +import os +import argparse +import json + +pipeline_components = [] + +""" +This function defines an AML pipeline for data preparation in Named Entity Recognition (NER) tasks. +The pipeline is identified by its name and description, and consists of a data preparation job. + +The data preparation job is the first component in the pipeline components list. +The output of the data preparation job is a target directory, which is returned by the pipeline.
+ +Decorator: +@pipeline: A decorator to declare this function as a pipeline. +It takes two arguments - name and description of the pipeline. + +Returns: +A dictionary with the target directory as the output of the data preparation job. +""" + + +@pipeline( + name="ner_data_prep_test", + description="data prep pipeline", +) +def ner_data_prep_pipeline( +): + prep_data_job = pipeline_components[0]( + ) + + return { + "target_dir": prep_data_job.outputs.target_dir + } + + +""" +This function executes a data preparation component for a data pipeline. +The data component is identified by its name, display name, +and description, and is associated with a specific environment, storage account, +source and target containers, source blob, assets, and custom compute. + +Args: +--name: The name of the data component. +This argument is required to specify the name of the data component. +--display_name: The display name of the data component. +This argument is required to specify the display name of the data component. +--description: The description of the data component. +This argument is required to provide a description of the data component. +--data_pipeline_code_dir: The directory of the data pipeline code. +This argument is required to specify the directory of the data pipeline code. +--environment: The environment for the data component. +This argument is required to specify the environment for the data component. +--storage_account: The storage account in Azure. +This argument is required to specify the storage account in Azure. +--source_container_name: The name of the source container in the storage account. +This argument is required to specify the source container in the storage account. +--target_container_name: The name of the target container in the storage account. +This argument is required to specify the target container in the storage account. +--source_blob: The name of the source blob in the source container. +This argument is required to specify the source blob in the source container. +--assets: The assets in the target container. +This argument is required to specify the assets in the target container. +--custom_compute: The custom compute for the data component. +This argument is required to specify the custom compute for the data component. +""" + + +def get_prep_data_component( + name, + display_name, + description, + data_pipeline_code_dir, + environment, + storage_account, + source_container_name, + target_container_name, + source_blob, + assets, + custom_compute +): + data_pipeline_code_dir = os.path.join(os.getcwd(), data_pipeline_code_dir) + + # Initialize an empty list to store components + prep_data_components = [] + asset_str = ":".join(map(str, assets)) + + prep_data_component = command( + name=name, + display_name=display_name, + description=description, + inputs={}, + outputs=dict( + target_dir=Output(type="uri_folder", mode="rw_mount"), + ), + code=data_pipeline_code_dir, + command=f"""python prep_data.py \ + --storage_account {storage_account} \ + --source_container_name {source_container_name} \ + --target_container_name {target_container_name} \ + --source_blob {source_blob} \ + --assets_str {asset_str} + """, + environment=environment, + compute=custom_compute, + identity=UserIdentityConfiguration() + ) + prep_data_components.append(prep_data_component) + + return prep_data_components + + +""" +This function creates and returns an Azure Machine Learning (AML) client. +The AML client is used to interact with Azure Machine Learning services. 
+ +Args: +--subscription_id: The Azure subscription ID. +This argument is required for identifying the Azure subscription. +--resource_group_name: The name of the resource group in Azure. +This argument is required to specify the resource group in Azure. +--workspace_name: The name of the workspace in Azure Machine Learning. +This argument is required to specify the workspace in Azure Machine Learning. +""" + + +def get_aml_client( + subscription_id, + resource_group_name, + workspace_name, +): + aml_client = MLClient( + DefaultAzureCredential(), + subscription_id=subscription_id, + resource_group_name=resource_group_name, + workspace_name=workspace_name, + ) + + return aml_client + + +""" +This function creates a pipeline job with a data component. +The pipeline job is associated with a specific component name, display name, +description, data pipeline, code directory, environment, storage account +source and target containers, source blob, assets, and custom compute. + +Args: +--component_name: The name of the data component. +This argument is required to specify the name of the data component. +--component_display_name: The display name of the data component. +This argument is required to specify the display name of the data component. +--component_description: The description of the data component. +This argument is required to provide a description of the data component. +--data_pipeline_code_dir: The directory of the data pipeline code. +This argument is required to specify the directory of the data pipeline code. +--aml_env_name: The name of the Azure Machine Learning environment. +This argument is required to specify the Azure Machine Learning environment. +--storage_account: The storage account in Azure. +This argument is required to specify the storage account in Azure. +--source_container_name: The name of the source container in the storage account. +This argument is required to specify the source container in the storage account. +--target_container_name: The name of the target container in the storage account. +This argument is required to specify the target container in the storage account. +--source_blob: The name of the source blob in the source container. +This argument is required to specify the source blob in the source container. +--assets: The assets in the target container. +This argument is required to specify the assets in the target container. +--custom_compute: The custom compute for the data component. +This argument is required to specify the custom compute for the data component. +""" + + +def create_pipeline_job( + component_name, + component_display_name, + component_description, + data_pipeline_code_dir, + aml_env_name, + storage_account, + source_container_name, + target_container_name, + source_blob, + assets, + custom_compute +): + prep_data_component = get_prep_data_component( + name=component_name, + display_name=component_display_name, + description=component_description, + data_pipeline_code_dir=data_pipeline_code_dir, + environment=aml_env_name, + storage_account=storage_account, + source_container_name=source_container_name, + target_container_name=target_container_name, + source_blob=source_blob, + assets=assets, + custom_compute=custom_compute + ) + + pipeline_components.extend(prep_data_component) + + pipeline_job = ner_data_prep_pipeline() + + return pipeline_job + + +""" +This function schedules a pipeline job. 
+The schedule is identified by its name, cron expression, and timezone, +and is associated with a specific job and Azure Machine Learning client. + +Args: +--schedule_name: The name of the schedule. +This argument is required to specify the name of the schedule. +--schedule_cron_expression: The cron expression for the schedule. +This argument is required to specify the cron expression for the schedule. +--schedule_timezone: The timezone for the schedule. +This argument is required to specify the timezone for the schedule. +--job: The job for the schedule. +This argument is required to specify the job for the schedule. +--aml_client: The Azure Machine Learning client. +This argument is required to interact with Azure Machine Learning services. +""" + + +def schedule_pipeline_job( + schedule_name, + schedule_cron_expression, + schedule_timezone, + job, + aml_client, +): + schedule_start_time = datetime.utcnow() + cron_trigger = CronTrigger( + expression=schedule_cron_expression, + start_time=schedule_start_time, + time_zone=schedule_timezone + ) + + job_schedule = JobSchedule( + name=schedule_name, trigger=cron_trigger, create_job=job + ) + + aml_client.schedules.begin_create_or_update( + schedule=job_schedule + ).result() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--subscription_id", + type=str, + help="Azure subscription id", + required=True, + ) + parser.add_argument( + "--resource_group_name", + type=str, + help="Azure resource group", + required=True, + ) + parser.add_argument( + "--workspace_name", + type=str, + help="Azure ML workspace", + required=True, + ) + parser.add_argument( + "--aml_env_name", + type=str, + help="Azure environment name", + required=True, + ) + parser.add_argument( + "--config_path_root_dir", + type=str, + help="Root dir for config file", + required=True, + ) + + args = parser.parse_args() + + subscription_id = args.subscription_id + resource_group_name = args.resource_group_name + workspace_name = args.workspace_name + aml_env_name = args.aml_env_name + config_path_root_dir = args.config_path_root_dir + + config_path = os.path.join(os.getcwd(), f"{config_path_root_dir}/configs/dataops_config.json") + config = json.load(open(config_path)) + + component_config = config['DATA_PREP_COMPONENT'] + component_name = component_config['COMPONENT_NAME'] + component_display_name = component_config['COMPONENT_DISPLAY_NAME'] + component_description = component_config['COMPONENT_DESCRIPTION'] + + storage_config = config['STORAGE'] + storage_account = storage_config['STORAGE_ACCOUNT'] + source_container_name = storage_config['SOURCE_CONTAINER'] + source_blob = storage_config['SOURCE_BLOB'] + target_container_name = storage_config['TARGET_CONTAINER'] + + path_config = config['PATH'] + data_pipeline_code_dir = path_config['DATA_PIPELINE_CODE_DIR'] + + schedule_config = config['SCHEDULE'] + schedule_name = schedule_config['NAME'] + schedule_cron_expression = schedule_config['CRON_EXPRESSION'] + schedule_timezone = schedule_config['TIMEZONE'] + + data_asset_configs = config['DATA_ASSETS'] + assets = [] + for data_asset_config in data_asset_configs: + assets.append(data_asset_config['PATH']) + + custom_compute = config["COMPUTE_NAME"] + + aml_client = get_aml_client( + subscription_id, + resource_group_name, + workspace_name, + ) + + job = create_pipeline_job( + component_name, + component_display_name, + component_description, + data_pipeline_code_dir, + aml_env_name, + storage_account, + source_container_name, + target_container_name, + 
source_blob, + assets, + custom_compute + ) + + schedule_pipeline_job( + schedule_name, + schedule_cron_expression, + schedule_timezone, + job, + aml_client + ) + + +if __name__ == "__main__": + main() diff --git a/docs/how_to_configure_dataops.md b/docs/how_to_configure_dataops.md new file mode 100644 index 000000000..f56b13cbd --- /dev/null +++ b/docs/how_to_configure_dataops.md @@ -0,0 +1,39 @@ +# How to Configure DataOps + +Implementing the DataOps pattern will help manage and scale the data pipelines. The following sections explain the necessary steps to integrate DataOps into the LLMOps pattern. + +## Prerequisites + +This document assumes that you have already gone through [How to Onboard new flow](./how_to_onboard_new_flows.md) and implemented the steps. Once you have all the components from that document in place, you can start setting up DataOps. + +**Data Pipeline Environment:** You will need storage account containers to store the raw and processed data used in the sample DataOps implementation. + +## The Sample Implementation + +This repository includes an implementation of DataOps for the `named_entity_recognition` sample. The sample implementation uses Azure Machine Learning to run the data pipelines. + +The data pipeline loads data from the source system, processes it, and stores it in the target location. The processed data is stored as JSONL files, which are registered as data assets. + +![dataops llmops](images/dataops_llmops.png) + +The sample CI/CD pipelines manage the lifecycle of the data pipelines. They build and deploy the pipelines to the target environments. The CI/CD pipelines also register the required Datastores and Data Assets for the processed JSONL files so that Prompt flow can consume them. + +If you are not using data pipelines to create the data assets, the flows will use the JSONL files inside the `data` folder to create the data assets. + +## Steps to Configure DataOps + +Follow these steps to configure DataOps for your flow: + +**New Folder for Data Pipelines:** The data pipelines for the `named_entity_recognition` flow are inside a sub-folder named `data_pipelines`. Create a similar folder under your flow folder for the data pipelines. + +**Configure Source and Target Location:** As mentioned earlier, the data pipeline loads data from a source storage account container and stores the processed data in a target storage account container. The processed data in the target storage account is mapped to an Azure Machine Learning Data Asset. Create these two containers and upload the source dataset to the source container. + +**Data Pipeline Configuration:** The `dataops_config.json` file contains the configuration for the data pipeline. + +You can start by copying an existing config file and modifying it with relevant values. Provide valid values for all the configuration elements. + +**Updating Flow Configuration:** The configuration of the use-case is managed by the `experiment.yaml` file (it sets the flow paths, datasets, and evaluations). The `experiment.yaml` in the repo uses local data files. If you are using DataOps, this config file needs to point to the Data Asset path, which has the format `azureml://datastores/[data_store_name]/paths/[file_path]`. + +Update the dataset elements in the `experiment.yaml` files and make sure each source field points to the Data Asset path in the `azureml:` format, for example as in the sketch below.
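+
+A minimal sketch of such a dataset entry is shown below. It assumes a `datasets` list with `name` and `source` fields; treat the field names as assumptions and align them with the `experiment.yaml` specification in this repo, since only the `source` value format is prescribed above. The values use the `ner_data_store` datastore and the `exp.jsonl` file registered by this sample.
+
+```yaml
+# Hypothetical sketch - field names are assumptions; match them to the experiment.yaml specs
+datasets:
+  - name: ner_exp
+    source: azureml://datastores/ner_data_store/paths/exp.jsonl
+```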
+ +**Create Data Pipelines:** The `named_entity_recognition` use case provides a sample Python file for a data pipeline - [prep_data.py](../named_entity_recognition/data_pipelines/aml/prep_data.py). This is a placeholder for your data pipeline code. Replace it with your actual data transformation script. \ No newline at end of file diff --git a/docs/images/dataops_llmops.png b/docs/images/dataops_llmops.png new file mode 100644 index 000000000..6d1ecc07c Binary files /dev/null and b/docs/images/dataops_llmops.png differ diff --git a/named_entity_recognition/configs/dataops_config.json b/named_entity_recognition/configs/dataops_config.json new file mode 100644 index 000000000..8519a1f9b --- /dev/null +++ b/named_entity_recognition/configs/dataops_config.json @@ -0,0 +1,40 @@ +{ + "DATA_STORE_NAME": "ner_data_store", + "COMPUTE_NAME": "ner_compute", + "DATA_STORE_DESCRIPTION": "pipeline data store description for evaluation", + "DATA_PREP_COMPONENT": + { + "COMPONENT_NAME": "prep_data_component", + "COMPONENT_DISPLAY_NAME": "Prepare data component", + "COMPONENT_DESCRIPTION": "Loading and processing data for prompt engineering" + }, + "STORAGE": + { + "STORAGE_ACCOUNT": "saner", + "SOURCE_CONTAINER": "source", + "SOURCE_BLOB": "ner_source.csv", + "TARGET_CONTAINER": "data" + }, + "PATH": + { + "DATA_PIPELINE_CODE_DIR": "named_entity_recognition/data_pipelines/aml" + }, + "SCHEDULE": + { + "NAME": "ner_data_pipeline_schedule", + "CRON_EXPRESSION": "10 14 * * 1", + "TIMEZONE": "Eastern Standard Time" + }, + "DATA_ASSETS":[ + { + "NAME": "ner_eval", + "PATH": "eval.jsonl", + "DESCRIPTION": "NER eval data asset" + }, + { + "NAME": "ner_exp", + "PATH": "exp.jsonl", + "DESCRIPTION": "NER experiment data asset" + } + ] +} \ No newline at end of file diff --git a/named_entity_recognition/data/source.txt b/named_entity_recognition/data/source.txt new file mode 100644 index 000000000..ee199586c --- /dev/null +++ b/named_entity_recognition/data/source.txt @@ -0,0 +1,16 @@ +text entity_type results +The software engineer is working on a new update for the application. job title software engineer +The project manager and the data analyst are collaborating to interpret the project data. job title "project manager, data analyst" +The marketing manager is coordinating with the graphic designer to create a new advertisement campaign. job title "marketing manager, graphic designer" +The CEO and CFO are discussing the financial forecast for the next quarter. job title "CEO, CFO" +The web developer and UX designer are working together to improve the website's user interface. job title "web developer, UX designer" +John finally decided to change his phone number after receiving too many spam calls. phone number None +"If you have any questions about our products, please call our customer service at (123) 456-7890." phone number (123) 456-7890 +"My new phone number is (098) 765-4321, please update your contact list." phone number (098) 765-4321 +The phone number (321) 654-0987 is no longer in service. phone number (321) 654-0987 +Please dial the following phone number: (555) 123-4567 to reach our technical support. phone number (555) 123-4567 +John Doe has been appointed as the new CEO of the company. people's full name John Doe +The novel 'The Great Gatsby' was written by F.
Scott Fitzgerald. people's full name F. Scott Fitzgerald +Mary Jane Watson and Peter Parker are characters in the Spider-Man series. people's full name "Mary Jane Watson, Peter Parker" +"The famous physicists, Albert Einstein and Isaac Newton, made significant contributions to the field of physics." people's full name "Isaac Newton, Albert Einstein" +The Eiffel Tower is an iconic landmark in Paris. people's full name None diff --git a/named_entity_recognition/data_pipelines/aml/prep_data.py b/named_entity_recognition/data_pipelines/aml/prep_data.py new file mode 100644 index 000000000..5d684ecc1 --- /dev/null +++ b/named_entity_recognition/data_pipelines/aml/prep_data.py @@ -0,0 +1,100 @@ +import argparse +import json + +import pandas as pd +from azure.identity import DefaultAzureCredential +from azure.storage.blob import BlobServiceClient +import io + +""" +This function prepares data for processing. +It reads a CSV file from a source blob storage, +converts the CSV data to JSONL (JSON Lines) format, +and then uploads the JSONL data to a target blob storage. + +Args: +--blob_service_client: The Azure blob service client. +This argument is required for interacting with Azure blob storage. +--source_container_name: The name of the source container in blob storage. +This argument is required to specify the source container from where the CSV data is read. +--target_container_name: The name of the target container in blob storage. +This argument is required to specify the target container to where the JSONL data is uploaded. +--source_blob: The name of the source blob in the source container. +This argument is required to specify the source blob from where the CSV data is read. +--target_data_assets: The target data assets in the target container. +This argument is required to specify the target data assets to where the JSONL data is uploaded. 
+""" + + +def prepare_data(blob_service_client, + source_container_name, + target_container_name, + source_blob, + target_data_assets): + print('Data processing component') + + source_blob_client = blob_service_client.get_blob_client(container=source_container_name, + blob=source_blob) + source_blob_content = source_blob_client.download_blob().readall() + + assets = [item.strip() for item in target_data_assets.split(":")] + + df = pd.read_csv(io.StringIO(source_blob_content.decode('utf-8'))) + + jsonl_list = [] + for _, row in df.iterrows(): + jsonl_list.append(json.dumps(row.to_dict())) + + # Upload JSONL data to the target container + for asset in assets: + target_blob_client = blob_service_client.get_blob_client(container=target_container_name, + blob=asset) + target_blob_client.upload_blob('\n'.join(jsonl_list), overwrite=True) + print(f"CSV data converted to JSONL and uploaded successfully!: {asset}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--storage_account", + type=str, + help="storage account", + ) + parser.add_argument( + "--source_container_name", + type=str, + help="source container name", + ) + parser.add_argument( + "--target_container_name", + type=str, + help="target container name", + ) + parser.add_argument( + "--source_blob", + type=str, + help="source blob file (csv)", + ) + parser.add_argument( + "--assets_str", + type=str, + help="target assets to be created as a string" + ) + + args = parser.parse_args() + storage_account = args.storage_account + source_container_name = args.source_container_name + target_container_name = args.target_container_name + source_blob = args.source_blob + target_data_assets = args.assets_str + + storage_account_url = f"https://{storage_account}.blob.core.windows.net" + + blob_service_client = BlobServiceClient(storage_account_url, + credential=DefaultAzureCredential()) + + prepare_data(blob_service_client, + source_container_name, + target_container_name, + source_blob, + target_data_assets) diff --git a/named_entity_recognition/environment/conda.yml b/named_entity_recognition/environment/conda.yml new file mode 100644 index 000000000..6aed6c93c --- /dev/null +++ b/named_entity_recognition/environment/conda.yml @@ -0,0 +1,9 @@ +name: named-entity-env +channels: + - conda-forge +dependencies: + - python=3.9 + - pip + - pip: + - jinja2 +