diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml index 85ae9f84a..91a437719 100644 --- a/.github/actions/azureml-test/action.yml +++ b/.github/actions/azureml-test/action.yml @@ -6,64 +6,39 @@ name: azureml-tests description: "Submit experiment to AzureML cluster" inputs: - # azureml experiment name EXP_NAME: required: true - type: string - # type of test - unit or nightly + description: AzureML experiment Name + ENV_NAME: + required: true + description: AzureML environment Name TEST_KIND: required: true - type: string - # test environment - cpu, gpu or spark - TEST_ENV: - required: false - type: string - # azureml compute credentials + description: Type of test - unit or nightly AZUREML_TEST_CREDENTIALS: required: true - type: string - # azureml compute subid + description: Credentials for AzureML login AZUREML_TEST_SUBID: required: true - type: string - # python version + description: AzureML subscription ID PYTHON_VERSION: required: true - type: string - # test group name + description: Python version used for the tests TEST_GROUP: required: true - type: string - # cpu cluster name - CPU_CLUSTER_NAME: - required: false - type: string - default: "cpu-cluster" - # gpu cluster name - GPU_CLUSTER_NAME: - required: false - type: string - default: "gpu-cluster" - # AzureML resource group name + description: Test group defined in test_group.py RG: required: false - type: string + description: AzureML resource group name default: "recommenders_project_resources" - # AzureML workspace name WS: required: false - type: string + description: AzureML workspace name default: "azureml-test-workspace" - # test logs path - TEST_LOGS_PATH: - required: false - type: string - default: '"test_logs.log"' - # pytest exit code - PYTEST_EXIT_CODE: + LOG_DIR: required: false - type: string - default: "pytest_exit_code.log" + description: Directory storing the test logs + default: "test_logs" runs: using: "composite" @@ -71,43 +46,45 @@ runs: - name: Setup python uses: actions/setup-python@v5 with: - python-version: "3.8" - - name: Install azureml-core and azure-cli on a GitHub hosted server + python-version: "3.10" + - name: Install AzureML Python SDK shell: bash - run: pip install --quiet "azureml-core>1,<2" "azure-cli>2,<3" + run: pip install --quiet "azure-ai-ml>1,<2" mlflow "azureml-mlflow>1,<2" - name: Log in to Azure uses: azure/login@v2 with: - creds: ${{inputs.AZUREML_TEST_CREDENTIALS}} - - name: Install wheel package - shell: bash - run: pip install --quiet wheel + creds: ${{ inputs.AZUREML_TEST_CREDENTIALS }} - name: Submit tests to AzureML shell: bash - run: >- + run: | + echo "::group::Running tests ..." python tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py \ - --subid ${{inputs.AZUREML_TEST_SUBID}} \ - --reponame "recommenders" \ - --branch ${{ github.ref }} \ - --rg ${{inputs.RG}} \ - --wsname ${{inputs.WS}} \ - --expname ${{inputs.EXP_NAME}}_${{inputs.TEST_GROUP}} \ - --testlogs ${{inputs.TEST_LOGS_PATH}} \ - --testkind ${{inputs.TEST_KIND}} \ - --conda_pkg_python ${{inputs.PYTHON_VERSION}} \ - --testgroup ${{inputs.TEST_GROUP}} \ - --disable-warnings \ - --sha "${GITHUB_SHA}" \ - --clustername $(if [[ ${{inputs.TEST_GROUP}} =~ "gpu" ]]; then echo "${{inputs.GPU_CLUSTER_NAME}}"; else echo "${{inputs.CPU_CLUSTER_NAME}}"; fi) \ - $(if [[ ${{inputs.TEST_GROUP}} =~ "gpu" ]]; then echo "--add_gpu_dependencies"; fi) \ - $(if [[ ${{inputs.TEST_GROUP}} =~ "spark" ]]; then echo "--add_spark_dependencies"; fi) - - name: Get exit status + --subid ${{ inputs.AZUREML_TEST_SUBID }} \ + --rg ${{ inputs.RG }} \ + --ws ${{ inputs.WS }} \ + --cluster ${{ contains(inputs.TEST_GROUP, 'gpu') && 'gpu-cluster' || 'cpu-cluster' }} \ + --expname ${{ inputs.EXP_NAME }} \ + --envname ${{ inputs.ENV_NAME }} \ + --testkind ${{ inputs.TEST_KIND}} \ + --python-version ${{ inputs.PYTHON_VERSION }} \ + --testgroup ${{ inputs.TEST_GROUP }} \ + --sha ${GITHUB_SHA} + echo "::endgroup::" + - name: Post tests + if: ${{ ! cancelled() }} shell: bash - id: exit_status - run: echo "code=$(cat ${{inputs.PYTEST_EXIT_CODE}})" >> $GITHUB_OUTPUT - - name: Check Success/Failure - if: ${{ steps.exit_status.outputs.code != 0 }} - uses: actions/github-script@v7 + run: | + echo "::group::Pytest logs" + python tests/ci/azureml_tests/post_pytest.py \ + --subid ${{ inputs.AZUREML_TEST_SUBID }} \ + --rg ${{ inputs.RG }} \ + --ws ${{ inputs.WS }} \ + --expname ${{ inputs.EXP_NAME }} \ + --log-dir ${{ inputs.LOG_DIR }} + echo "::endgroup::" + - name: Save logs + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 with: - script: | - core.setFailed('All tests did not pass!') + name: logs-${{ inputs.TEST_GROUP }}-python${{ inputs.PYTHON_VERSION }} + path: ${{ inputs.LOG_DIR }} diff --git a/.github/actions/get-test-groups/action.yml b/.github/actions/get-test-groups/action.yml index 39364fa81..dc50e4b93 100644 --- a/.github/actions/get-test-groups/action.yml +++ b/.github/actions/get-test-groups/action.yml @@ -6,18 +6,17 @@ name: get-test-groups description: "Get test group names from tests_groups.py" inputs: - # type of test - unit or nightly TEST_KIND: required: true - type: string - # test environment - cpu, gpu or spark + description: Type of test - unit or nightly TEST_ENV: required: false - type: string + description: Test environment - cpu, gpu or spark default: 'cpu' outputs: test_groups: - value: ${{steps.get_test_groups.outputs.test_groups}} + description: A list of test groups + value: ${{ steps.get_test_groups.outputs.test_groups }} runs: using: "composite" diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 93e414564..89fc64757 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -34,7 +34,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -67,7 +67,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -76,9 +76,9 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'nightly_tests' + EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - TEST_ENV: 'cpu' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index 3b9f6d6b4..16e3e6ed2 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -34,7 +34,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -67,7 +67,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -76,9 +76,9 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'nightly_tests' + EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - TEST_ENV: 'gpu' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index 8f28be6f2..97789fccf 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -33,7 +33,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -66,7 +66,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -75,9 +75,9 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'nightly_tests' + EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - TEST_ENV: 'spark' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index b39268318..ed3b5a98d 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -23,7 +23,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -56,7 +56,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -65,7 +65,8 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'unit_tests' + EXP_NAME: recommenders-unit-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.sha }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'unit' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} diff --git a/setup.py b/setup.py index 631d6cd83..03df519ed 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ "nltk>=3.8.1,<4", # requires tqdm "notebook>=6.5.5,<8", # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests "numba>=0.57.0,<1", + "numpy<2.0.0", # FIXME: Remove numpy<2.0.0 once cornac release a version newer than 2.2.1 that resolve ImportError: numpy.core.multiarray failed to import. "pandas>2.0.0,<3.0.0", # requires numpy "pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'", # For generating fake datasets "pandera[strategies]>=0.15.0;python_version>='3.9'", diff --git a/tests/ci/azureml_tests/aml_utils.py b/tests/ci/azureml_tests/aml_utils.py new file mode 100644 index 000000000..d24ec1361 --- /dev/null +++ b/tests/ci/azureml_tests/aml_utils.py @@ -0,0 +1,198 @@ +# Copyright (c) Recommenders contributors. +# Licensed under the MIT License. + +""" +This module includes utilities for tests on AzureML via AML Python SDK v2. +See +* https://learn.microsoft.com/en-us/azure/machine-learning/concept-v2?view=azureml-api-2 +* https://learn.microsoft.com/en-us/azure/machine-learning/reference-migrate-sdk-v1-mlflow-tracking?view=azureml-api-2&tabs=aml%2Ccli%2Cmlflow +""" +import pathlib +import tempfile + +from azure.ai.ml import MLClient, command +from azure.ai.ml.entities import AmlCompute, BuildContext, Environment, Workspace +from azure.ai.ml.exceptions import JobException +from azure.core.exceptions import ResourceExistsError +from azure.identity import DefaultAzureCredential + +def get_client(subscription_id, resource_group, workspace_name): + """ + Get the client with specified AzureML workspace, or create one if not existing. + See https://github.com/Azure/azureml-examples/blob/main/sdk/python/resources/workspace/workspace.ipynb + """ + params = dict( + credential=DefaultAzureCredential(), + subscription_id=subscription_id, + resource_group_name=resource_group, + ) + client = MLClient(**params) + + workspace = client.workspaces.get(workspace_name) + if workspace is None: + workspace = client.workspaces.begin_create( + Workspace(name=workspace_name) + ).result() + + params["workspace_name"] = workspace_name + client = MLClient(**params) + return client + + +def create_or_start_compute(client, name, size, max_instances): + """ + Start the specified compute. + See https://github.com/Azure/azureml-examples/blob/main/sdk/python/resources/compute/compute.ipynb + """ + compute = client.compute.get(name) + if compute is None: + compute = client.compute.begin_create_or_update( + AmlCompute( + name=name, + type="amlcompute", + size=size, + max_instances=max_instances, + ) + ).result() + + +def get_or_create_environment( + client, + environment_name, + use_gpu, + use_spark, + conda_pkg_jdk, + python_version, + commit_sha, +): + """ + AzureML requires the run environment to be setup prior to submission. + This configures a docker persistent compute. + See https://github.com/Azure/azureml-examples/blob/main/sdk/python/assets/environment/environment.ipynb + + Args: + client (MLClient): the client to interact with AzureML services + environment_name (str): Environment name + use_gpu (bool): True if gpu packages should be + added to the conda environment, else False + use_spark (bool): True if PySpark packages should be + added to the conda environment, else False + conda_pkg_jdk (str): "openjdk=8" by default + python_version (str): python version, such as "3.9" + commit_sha (str): the commit that triggers the workflow + """ + conda_env_name = "reco" + conda_env_yml = "environment.yml" + condafile = fr""" +name: {conda_env_name} +channels: + - conda-forge +dependencies: + - python={python_version} + - {conda_pkg_jdk} + - pip + - pip: + - pymanopt@https://github.com/pymanopt/pymanopt/archive/fb36a272cdeecb21992cfd9271eb82baafeb316d.zip + - recommenders[dev{",gpu" if use_gpu else ""}{",spark" if use_spark else ""}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha} +""" + # See https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 + image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04" + # See https://github.com/Azure/AzureML-Containers/blob/master/base/gpu/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 + dockerfile = fr"""# syntax=docker/dockerfile:1 +FROM nvcr.io/nvidia/cuda:12.5.1-devel-ubuntu22.04 +SHELL ["/bin/bash", "-c"] +USER root:root +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && \ + apt-get install -y wget git-all && \ + apt-get clean -y && \ + rm -rf /var/lib/apt/lists/* + +# Install Conda +ENV CONDA_PREFIX /opt/miniconda +RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_24.5.0-0-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -bf -p ${{CONDA_PREFIX}} && \ + ${{CONDA_PREFIX}}/bin/conda update --all -c conda-forge -y && \ + ${{CONDA_PREFIX}}/bin/conda clean -ay && \ + rm -rf ${{CONDA_PREFIX}}/pkgs && \ + rm /tmp/miniconda.sh && \ + find / -type d -name __pycache__ | xargs rm -rf + +# Create Conda environment +COPY {conda_env_yml} /tmp/{conda_env_yml} +RUN ${{CONDA_PREFIX}}/bin/conda env create -f /tmp/{conda_env_yml} + +# Activate Conda environment +ENV CONDA_DEFAULT_ENV {conda_env_name} +ENV CONDA_PREFIX ${{CONDA_PREFIX}}/envs/${{CONDA_DEFAULT_ENV}} +ENV PATH="${{CONDA_PREFIX}}/bin:${{PATH}}" LD_LIBRARY_PATH="${{CONDA_PREFIX}}/lib:$LD_LIBRARY_PATH" +""" + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = pathlib.Path(tmpdir) + dockerfile_path = tmpdir / "Dockerfile" + condafile_path = tmpdir / conda_env_yml + build = BuildContext(path=tmpdir, dockerfile_path=dockerfile_path.name) + + with open(dockerfile_path, "w") as file: + file.write(dockerfile) + with open(condafile_path, "w") as file: + file.write(condafile) + + try: + client.environments.create_or_update( + Environment( + name=environment_name, + image=None if use_gpu else image, + build=build if use_gpu else None, + conda_file=None if use_gpu else condafile_path, + ) + ) + except ResourceExistsError: + pass + + +def run_tests( + client, + compute, + environment_name, + experiment_name, + script, + testgroup, + testkind, +): + """ + Pytest on AzureML compute. + See https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/single-step/debug-and-monitor/debug-and-monitor.ipynb + """ + job = client.jobs.create_or_update( + command( + experiment_name=experiment_name, + compute=compute, + environment=f"{environment_name}@latest", + code="./", + command=( + f"python {script} " + f"--expname {experiment_name} " + f"--testgroup {testgroup} " + f"--testkind {testkind}" + ), + ) + ) + client.jobs.stream(job.name) + job = client.jobs.get(job.name) + if job.status != "Completed": + raise JobException("Job Not Completed!") + + +def correct_resource_name(resource_name): + """ + Resource name can only contain alphanumeric characters, dashes, and + underscores, with a limit of 255 characters. + """ + name = resource_name.replace(".", "_") + name = name.replace("/", "_") + return name diff --git a/tests/ci/azureml_tests/post_pytest.py b/tests/ci/azureml_tests/post_pytest.py new file mode 100644 index 000000000..b457e709d --- /dev/null +++ b/tests/ci/azureml_tests/post_pytest.py @@ -0,0 +1,96 @@ +# Copyright (c) Recommenders contributors. +# Licensed under the MIT License. + +""" +This Python script completes post test tasks such as downloading logs. +""" + +import argparse +import mlflow +import logging +import pathlib + +from aml_utils import get_client, correct_resource_name + + +def parse_args(): + """ + Parse command line arguments. + """ + + parser = argparse.ArgumentParser(description="Process some inputs") + + parser.add_argument( + "--rg", action="store", + default="recommender", + help="Azure Resource Group", + ) + parser.add_argument( + "--ws", action="store", + default="RecoWS", + help="AzureML workspace name", + ) + parser.add_argument( + "--subid", + action="store", + default="123456", + help="Azure Subscription ID", + ) + parser.add_argument( + "--expname", + action="store", + default="persistentAzureML", + help="Experiment name on AzureML", + ) + parser.add_argument( + "--log-dir", + action="store", + default="test_logs", + help="Test logs will be downloaded to this path", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + logger = logging.getLogger("post_pytest.py") + args = parse_args() + + logger.info(f"Setting up workspace {args.ws}") + client = get_client( + subscription_id=args.subid, + resource_group=args.rg, + workspace_name=args.ws, + ) + + # See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow-configure-tracking?view=azureml-api-2&tabs=python%2Cmlflow#configure-mlflow-tracking-uri + logger.info(f"Configuring mlflow") + mlflow.set_tracking_uri( + client.workspaces.get(client.workspace_name).mlflow_tracking_uri + ) + + # See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-track-experiments-mlflow?view=azureml-api-2 + logger.info(f"Searching runs") + experiment_name = correct_resource_name(args.expname) + runs = mlflow.search_runs( + experiment_names=[experiment_name], + max_results=1, + output_format="list", + ) + if runs: + run = runs[0] + + # See https://www.mlflow.org/docs/latest/python_api/mlflow.artifacts.html#mlflow.artifacts.download_artifacts + # For more details on logs, see + # * https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?view=azureml-api-2&tabs=interactive#view-and-download-diagnostic-logs + # * https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/debugging/ + logger.info(f"Downloading AzureML logs") + mlflow.artifacts.download_artifacts( + run_id=run.info.run_id, + dst_path=args.log_dir, + ) + log_path = pathlib.Path("user_logs/std_log.txt") + with open(pathlib.Path(args.log_dir) / log_path, "r") as file: + print(f"\nDumping logs in {log_path}") + print("=====================================") + print(file.read()) diff --git a/tests/ci/azureml_tests/run_groupwise_pytest.py b/tests/ci/azureml_tests/run_groupwise_pytest.py index 92e1ee2bd..8a97fa481 100644 --- a/tests/ci/azureml_tests/run_groupwise_pytest.py +++ b/tests/ci/azureml_tests/run_groupwise_pytest.py @@ -2,90 +2,57 @@ # Licensed under the MIT License. """ -run_pytest.py is the script submitted to Azure ML that runs pytest. +run_groupwise_pytest.py is the script submitted to Azure ML that runs pytest. pytest runs all tests in the specified test folder unless parameters are set otherwise. """ -import sys +import argparse import logging import pytest -import argparse -import glob -import pkg_resources -from azureml.core import Run -from test_groups import nightly_test_groups, pr_gate_test_groups - -if __name__ == "__main__": +import sys - logger = logging.getLogger("submit_groupwise_azureml_pytest.py") - logging.basicConfig(stream=sys.stdout, level=logging.INFO) +from test_groups import nightly_test_groups, pr_gate_test_groups +def parse_args(): + """ + Parse command line arguments. + """ parser = argparse.ArgumentParser(description="Process inputs") + parser.add_argument( + "--expname", + action="store", + default="persistentAzureML", + help="Experiment name on AzureML", + ) parser.add_argument( "--testkind", - "-k", action="store", default="unit", help="Test kind - nightly or unit", ) parser.add_argument( "--testgroup", - "-g", action="store", default="group_cpu_001", help="Group name for the tests", ) - # Flag to indicate whether to turn off the warnings - parser.add_argument( - "--disable-warnings", - action="store_true", - help="Turn off warnings", - ) - args = parser.parse_args() + return parser.parse_args() + +if __name__ == "__main__": + + logger = logging.getLogger("run_groupwise_pytest.py") + + args = parse_args() if args.testkind == "nightly": test_group = nightly_test_groups[args.testgroup] else: test_group = pr_gate_test_groups[args.testgroup] - logger.info(f"Python version: {sys.version}") - - logger.info("Installed packages:") - for p in pkg_resources.working_set: - logger.info(f" {p.project_name}:{p.version}") - - logger.info("Tests to be executed") - logger.info(str(test_group)) - - # Run.get_context() is needed to save context as pytest causes corruption - # of env vars - run = Run.get_context() - - logger.info("Executing tests now...") - - # Add options to pytest command (Duration and disable warnings) + # Add options to pytest command (Duration) pytest_string = test_group + ["--durations"] + ["0"] - if args.disable_warnings is True: - pytest_string += ["--disable-warnings"] # Execute pytest command - pytest_exit_code = pytest.main(pytest_string) - - logger.info("Test execution completed!") - - # log pytest exit code as a metric - # to be used to indicate success/failure in github workflow - run.log("pytest_exit_code", pytest_exit_code.value) - - # # - # # Leveraged code from this notebook: - # # https://msdata.visualstudio.com/Vienna/_search?action=contents&text=upload_folder&type=code&lp=code-Project&filters=ProjectFilters%7BVienna%7DRepositoryFilters%7BAzureMlCli%7D&pageSize=25&sortOptions=%5B%7B%22field%22%3A%22relevance%22%2C%22sortOrder%22%3A%22desc%22%7D%5D&result=DefaultCollection%2FVienna%2FAzureMlCli%2FGBmaster%2F%2Fsrc%2Fazureml-core%2Fazureml%2Fcore%2Frun.py - # logger.info("os.listdir files {}".format(os.listdir("."))) - - # upload pytest stdout file - logs_path = ( - glob.glob("**/70_driver_log.txt", recursive=True) - + glob.glob("**/std_log.txt", recursive=True) - )[0] - run.upload_file(name="test_logs", path_or_stream=logs_path) + logger.info("Executing tests now...") + sys.exit(pytest.main(pytest_string)) diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py index 761fe8950..4ce6106bf 100644 --- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py +++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py @@ -4,7 +4,7 @@ """ This python script sets up an environment on AzureML and submits a script to it to run pytest. It is usually intended to be used as -part of a DevOps pipeline which runs testing on a github repo but +part of a DevOps pipeline which runs testing on a GitHub repo but can also be used from command line. Many parameters are set to default values and some are expected to be passed @@ -14,418 +14,110 @@ Args: - Required: - --clustername (str): the Azure cluster for this run. It can already exist - or it will be created. - --subid (str): the Azure subscription id - - Optional but suggested, this info will be stored on Azure as - text information as part of the experiment: - --pr (str): the Github PR number - --reponame (str): the Github repository name - --branch (str): the branch being run - It is also possible to put any text string in these. + See parse_args() below for more details. Example: Usually, this script is run by a DevOps pipeline. It can also be run from cmd line. >>> python tests/ci/submit_groupwise_azureml_pytest.py \ - --clustername 'cluster-d3-v2' \ - --subid '12345678-9012-3456-abcd-123456789012' \ - --pr '666' \ - --reponame 'Recommenders' \ - --branch 'staging' + --subid '12345678-9012-3456-abcd-123456789012' ... """ import argparse import logging -from azureml.core.authentication import AzureCliAuthentication -from azureml.core import Workspace -from azureml.core import Experiment -from azureml.core.runconfig import RunConfiguration, DockerConfiguration -from azureml.core.conda_dependencies import CondaDependencies -from azureml.core.script_run_config import ScriptRunConfig -from azureml.core.compute import ComputeTarget, AmlCompute -from azureml.core.compute_target import ComputeTargetException -from azureml.core.workspace import WorkspaceException - - -def setup_workspace( - workspace_name, subscription_id, resource_group, cli_auth, location -): - """ - This sets up an Azure Workspace. - An existing Azure Workspace is used or a new one is created if needed for - the pytest run. - - Args: - workspace_name (str): Centralized location on Azure to work - with all the artifacts used by AzureML - service - subscription_id (str): the Azure subscription id - resource_group (str): Azure Resource Groups are logical collections of - assets associated with a project. Resource groups - make it easy to track or delete all resources - associated with a project by tracking or deleting - the Resource group. - cli_auth Azure authentication - location (str): workspace reference - - Returns: - ws: workspace reference - """ - logger.debug("setup: workspace_name is {}".format(workspace_name)) - logger.debug("setup: resource_group is {}".format(resource_group)) - logger.debug("setup: subid is {}".format(subscription_id)) - logger.debug("setup: location is {}".format(location)) - - try: - # use existing workspace if there is one - ws = Workspace.get( - name=workspace_name, - subscription_id=subscription_id, - resource_group=resource_group, - auth=cli_auth, - ) - except WorkspaceException: - # this call might take a minute or two. - logger.debug("Creating new workspace") - ws = Workspace.create( - name=workspace_name, - subscription_id=subscription_id, - resource_group=resource_group, - # create_resource_group=True, - location=location, - auth=cli_auth, - show_output=False, - ) - return ws - - -def setup_persistent_compute_target(workspace, cluster_name, vm_size, max_nodes): - """ - Set up a persistent compute target on AzureML. - A persistent compute target runs noticeably faster than a - regular compute target for subsequent runs. The benefit - is that AzureML manages turning the compute on/off as needed for - each job so the user does not need to do this. - - Args: - workspace (str): Centralized location on Azure to work with - all the - artifacts used by AzureML service - cluster_name (str): the Azure cluster for this run. It can - already exist or it will be created. - vm_size (str): Azure VM size, like STANDARD_D3_V2 - max_nodes (int): Number of VMs, max_nodes=4 will - autoscale up to 4 VMs - Returns: - cpu_cluster : cluster reference - """ - # setting vmsize and num nodes creates a persistent AzureML - # compute resource - - logger.debug("setup: cluster_name {}".format(cluster_name)) - # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets - - try: - cpu_cluster = ComputeTarget(workspace=workspace, name=cluster_name) - logger.debug("setup: Found existing cluster, use it.") - except ComputeTargetException: - logger.debug("setup: create cluster") - compute_config = AmlCompute.provisioning_configuration( - vm_size=vm_size, - max_nodes=max_nodes, - ssh_public_access_enabled=True, - idle_time_before_scale_down=3600, # 1 hour - ) - cpu_cluster = ComputeTarget.create(workspace, cluster_name, compute_config) - cpu_cluster.wait_for_completion(show_output=False) - return cpu_cluster - - -def create_run_config( - cpu_cluster, - add_gpu_dependencies, - add_spark_dependencies, - conda_pkg_jdk, - conda_pkg_python, - commit_sha, -): - """ - AzureML requires the run environment to be setup prior to submission. - This configures a docker persistent compute. Even though - it is called Persistent compute, AzureML handles startup/shutdown - of the compute environment. - - Args: - cpu_cluster (str) : Names the cluster for the test - In the case of unit tests, any of - the following: - - Reco_cpu_test - - Reco_gpu_test - add_gpu_dependencies (bool) : True if gpu packages should be - added to the conda environment, else False - add_spark_dependencies (bool) : True if PySpark packages should be - added to the conda environment, else False - commit_sha (str) : the commit that triggers the workflow - - Return: - run_azuremlcompute : AzureML run config - """ - - run_azuremlcompute = RunConfiguration() - run_azuremlcompute.target = cpu_cluster - if not add_gpu_dependencies: - # https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 - run_azuremlcompute.environment.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04" - else: - run_azuremlcompute.environment.docker.base_image = None - # Use the latest CUDA - # See - # * https://learn.microsoft.com/en-us/azure/machine-learning/how-to-train-with-custom-image?view=azureml-api-1#use-a-custom-dockerfile-optional - # * https://github.com/Azure/AzureML-Containers/blob/master/base/gpu/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 - run_azuremlcompute.environment.docker.base_dockerfile = r""" -FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 -USER root:root -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update && \ - apt-get install -y wget git-all && \ - apt-get clean -y && \ - rm -rf /var/lib/apt/lists/* -# Conda Environment -# Pin pip=20.1.1 due to the issue: No module named 'ruamel' -# See https://learn.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py#troubleshooting -ENV MINICONDA_VERSION py38_23.3.1-0 -ENV PATH /opt/miniconda/bin:$PATH -ENV CONDA_PACKAGE 23.5.0 -RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -bf -p /opt/miniconda && \ - conda install -y conda=${CONDA_PACKAGE} python=3.8 pip=20.1.1 && \ - conda update --all -c conda-forge -y && \ - conda clean -ay && \ - rm -rf /opt/miniconda/pkgs && \ - rm /tmp/miniconda.sh && \ - find / -type d -name __pycache__ | xargs rm -rf -""" - - # Use conda_dependencies.yml to create a conda environment in - # the Docker image for execution - # False means the user will provide a conda file for setup - # True means the user will manually configure the environment - run_azuremlcompute.environment.python.user_managed_dependencies = False - - conda_dep = CondaDependencies() - conda_dep.add_conda_package(conda_pkg_python) - conda_dep.add_pip_package( - "pymanopt@https://github.com/pymanopt/pymanopt/archive/fb36a272cdeecb21992cfd9271eb82baafeb316d.zip" - ) - - # install recommenders - reco_extras = "dev" - conda_dep.add_conda_package("anaconda::git") - if add_gpu_dependencies and add_spark_dependencies: - conda_dep.add_channel("conda-forge") - conda_dep.add_conda_package(conda_pkg_jdk) - reco_extras = reco_extras + ",spark,gpu" - elif add_gpu_dependencies: - reco_extras = reco_extras + ",gpu" - elif add_spark_dependencies: - conda_dep.add_channel("conda-forge") - conda_dep.add_conda_package(conda_pkg_jdk) - reco_extras = reco_extras + ",spark" - - conda_dep.add_pip_package( - f"recommenders[{reco_extras}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha}" - ) - - run_azuremlcompute.environment.python.conda_dependencies = conda_dep - return run_azuremlcompute - - -def create_experiment(workspace, experiment_name): - """ - AzureML requires an experiment as a container of trials. - This will either create a new experiment or use an - existing one. - - Args: - workspace (str) : name of AzureML workspace - experiment_name (str) : AzureML experiment name - Return: - exp - AzureML experiment - """ - - logger.debug("create: experiment_name {}".format(experiment_name)) - exp = Experiment(workspace=workspace, name=experiment_name) - return exp - - -def submit_experiment_to_azureml( - test, run_config, experiment, test_group, test_kind, warnings -): - - """ - Submitting the experiment to AzureML actually runs the script. - - Args: - test (str): Pytest script, folder/test such as ./tests/ci/run_pytest.py - run_config (obj): Environment configuration - experiment (obj): Instance of an Experiment, a collection of - trials where each trial is a run. - test_group (str): Name of the test group. - test_kind (str): Name of the test kind, such as nightly or unit. - pytestargs (str): Pytest arguments. - - Return: - obj: AzureML run or trial - """ - - arguments = ["--testgroup", test_group, "--testkind", test_kind] - if warnings is True: - arguments.append("--disable-warnings") - - script_run_config = ScriptRunConfig( - source_directory=".", - script=test, - run_config=run_config, - docker_runtime_config=DockerConfiguration(use_docker=True), - arguments=arguments, - ) - - run = experiment.submit(script_run_config) - # waits only for configuration to complete - run.wait_for_completion(show_output=True, wait_post_processing=True) - - # test logs can also be found on azure - # go to azure portal to see log in azure ws and look for experiment name - # and look for individual run - logger.debug("files {}".format(run.get_file_names)) - - return run +from aml_utils import ( + correct_resource_name, + create_or_start_compute, + get_client, + get_or_create_environment, + run_tests, +) -def create_arg_parser(): +def parse_args(): """ Many of the argument defaults are used as arg_parser makes it easy to use defaults. The user has many options they can select. """ parser = argparse.ArgumentParser(description="Process some inputs") + parser.add_argument( "--sha", action="store", - help="the commit that triggers the workflow", + help="the commit triggering the workflow", ) - # script to run pytest parser.add_argument( - "--test", + "--script", action="store", default="tests/ci/azureml_tests/run_groupwise_pytest.py", - help="location of script to run pytest", + help="Path of script to run pytest", ) - # max num nodes in Azure cluster parser.add_argument( "--maxnodes", action="store", default=4, - help="specify the maximum number of nodes for the run", + help="Maximum number of nodes for the run", ) - # Test group parser.add_argument( - "--testgroup", action="store", default="group_criteo", help="Test Group" + "--testgroup", + action="store", + default="group_criteo", + help="Test Group", ) - # Azure resource group parser.add_argument( - "--rg", action="store", default="recommender", help="Azure Resource Group" + "--rg", + action="store", + default="recommender", + help="Azure Resource Group", ) - # AzureML workspace Name parser.add_argument( - "--wsname", action="store", default="RecoWS", help="AzureML workspace name" + "--ws", + action="store", + default="RecoWS", + help="AzureML workspace name", ) - # AzureML clustername parser.add_argument( - "--clustername", + "--cluster", action="store", default="azuremlcompute", - help="Set name of Azure cluster", + help="AzureML cluster name", ) - # Azure VM size parser.add_argument( "--vmsize", action="store", default="STANDARD_D3_V2", - help="Set the size of the VM either STANDARD_D3_V2", + help="VM size", ) - # Azure subscription id, when used in a pipeline, it is stored in keyvault parser.add_argument( - "--subid", action="store", default="123456", help="Azure Subscription ID" + "--subid", + action="store", + default="123456", + help="Azure Subscription ID", ) - # AzureML experiment name parser.add_argument( "--expname", action="store", default="persistentAzureML", - help="experiment name on Azure", + help="Experiment name on AzureML", ) - # Azure datacenter location - parser.add_argument("--location", default="EastUS", help="Azure location") - # github repo, stored in AzureML experiment for info purposes parser.add_argument( - "--reponame", + "--envname", action="store", - default="--reponame MyGithubRepo", - help="GitHub repo being tested", + default="recommenders", + help="Environment name on AzureML", ) - # github branch, stored in AzureML experiment for info purposes - parser.add_argument( - "--branch", - action="store", - default="--branch MyGithubBranch", - help=" Identify the branch test test is run on", - ) - # github pull request, stored in AzureML experiment for info purposes - parser.add_argument( - "--pr", - action="store", - default="--pr PRTestRun", - help="If a pr triggered the test, list it here", - ) - # flag to indicate whether gpu dependencies should be included in conda env - parser.add_argument( - "--add_gpu_dependencies", - action="store_true", - help="include packages for GPU support", - ) - # flag to indicate whether pyspark dependencies should be included in conda env - parser.add_argument( - "--add_spark_dependencies", - action="store_true", - help="include packages for PySpark support", - ) - # path where test logs should be downloaded - parser.add_argument( - "--testlogs", - action="store", - default="test_logs.log", - help="Test logs will be downloaded to this path", - ) - # conda package name for jdk parser.add_argument( "--conda_pkg_jdk", action="store", default="openjdk=8", - help="conda package name for jdk", + help="Conda package for JDK", ) - # conda package name for python parser.add_argument( - "--conda_pkg_python", + "--python-version", action="store", - default="python=3.7", - help="conda package for Python", + default="3.8", + help="Python version", ) parser.add_argument( "--testkind", @@ -433,73 +125,59 @@ def create_arg_parser(): default="unit", help="Test kind - nightly or unit", ) - # Flag to indicate whether to turn off the warnings - parser.add_argument( - "--disable-warnings", - action="store_true", - help="Turn off warnings", - ) - args = parser.parse_args() - return args + return parser.parse_args() if __name__ == "__main__": - logger = logging.getLogger("submit_groupwise_azureml_pytest.py") - args = create_arg_parser() - cli_auth = AzureCliAuthentication() + args = parse_args() - workspace = setup_workspace( - workspace_name=args.wsname, + logger.info(f"Setting up workspace {args.ws}") + client = get_client( subscription_id=args.subid, resource_group=args.rg, - cli_auth=cli_auth, - location=args.location, - ) - - cpu_cluster = setup_persistent_compute_target( - workspace=workspace, - cluster_name=args.clustername, - vm_size=args.vmsize, - max_nodes=args.maxnodes, - ) - - run_config = create_run_config( - cpu_cluster=cpu_cluster, - add_gpu_dependencies=args.add_gpu_dependencies, - add_spark_dependencies=args.add_spark_dependencies, + workspace_name=args.ws, + ) + + logger.info(f"Setting up compute {args.cluster}") + create_or_start_compute( + client=client, + name=args.cluster, + size=args.vmsize, + max_instances=args.maxnodes + ) + + # TODO: Unlike Azure DevOps pipelines, GitHub Actions only has simple + # string functions like startsWith() and contains(). And AzureML + # only accepts simple names that do not contain '.' and '/'. + # correct_resource_name() is used to replace '.' and '/' with '_' + # which makes names in the workflow and on AzureML inconsistent. + # For example, a name + # * in the workflow + # recommenders-unit-group_cpu_001-python3.8-c8adeafabc011b549f875dc145313ffbe3fc53a8 + # * on AzureML + # recommenders-unit-group_cpu_001-python3_8-c8adeafabc011b549f875dc145313ffbe3fc53a8 + environment_name = correct_resource_name(args.envname) + logger.info(f"Setting up environment {environment_name}") + get_or_create_environment( + client=client, + environment_name=environment_name, + use_gpu=True if "gpu" in args.testgroup else False, + use_spark=True if "spark" in args.testgroup else False, conda_pkg_jdk=args.conda_pkg_jdk, - conda_pkg_python=args.conda_pkg_python, + python_version=args.python_version, commit_sha=args.sha, ) - logger.info("exp: In Azure, look for experiment named {}".format(args.expname)) - - # create new or use existing experiment - experiment = Experiment(workspace=workspace, name=args.expname) - run = submit_experiment_to_azureml( - test=args.test, - run_config=run_config, - experiment=experiment, - test_group=args.testgroup, - test_kind=args.testkind, - warnings=args.disable_warnings, + experiment_name = correct_resource_name(args.expname) + logger.info(f"Running experiment {experiment_name}") + run_tests( + client=client, + compute=args.cluster, + environment_name=environment_name, + experiment_name=experiment_name, + script=args.script, + testgroup=args.testgroup, + testkind=args.testkind, ) - - # add helpful information to experiment on Azure - run.tag("Python", args.conda_pkg_python) - run.tag("RepoName", args.reponame) - run.tag("Branch", args.branch) - run.tag("PR", args.pr) - run.tag("script", args.test) - run.tag("testgroup", args.testgroup) - run.tag("testkind", args.testkind) - - # download logs file from AzureML - run.download_file(name="test_logs", output_file_path=args.testlogs) - - # save pytest exit code - metrics = run.get_metrics() - with open("pytest_exit_code.log", "w") as f: - f.write(str(metrics.get("pytest_exit_code")))