From 38713c28758444cf5fb4aa91ddc19bcc0d671a36 Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Tue, 17 Dec 2024 15:28:23 -0800
Subject: [PATCH] Enable composable benchmark configs for flexible
 model+device+optimization scheduling

---
 .ci/scripts/gather_benchmark_configs.py | 204 ++++++++++++++++++++++++
 .ci/scripts/test_llama.sh               |   2 +-
 .ci/scripts/test_model.sh               |   8 +-
 .github/workflows/android-perf.yml      | 178 +++++++++------------
 .github/workflows/apple-perf.yml        | 164 +++++++++----------
 5 files changed, 360 insertions(+), 196 deletions(-)
 create mode 100755 .ci/scripts/gather_benchmark_configs.py

diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
new file mode 100755
index 00000000000..bcdb8c88e86
--- /dev/null
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import logging
+import os
+import re
+from typing import Any, Dict
+
+from examples.models import MODEL_NAME_TO_MODEL
+
+
+# Device pools for AWS Device Farm
+DEVICE_POOLS = {
+    "apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
+    "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
+    "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
+    "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
+}
+
+# Predefined benchmark configurations
+BENCHMARK_CONFIGS = {
+    "xplat": [
+        "xnnpack_q8",
+        "hf_xnnpack_fp32",
+        "llama3_fb16",
+        "llama3_spinquant",
+        "llama3_qlora",
+    ],
+    "android": [
+        "qnn_q8",
+        "llama3_qnn_htp",
+    ],
+    "ios": [
+        "coreml_fp16",
+        "mps",
+        "llama3_coreml_ane",
+    ],
+}
+
+
+def parse_args() -> Any:
+    """
+    Parse command-line arguments.
+
+    Returns:
+        argparse.Namespace: Parsed command-line arguments.
+
+    Example:
+        parse_args() -> Namespace(models=['mv3', 'meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8'],
+                                  os='android',
+                                  devices=['samsung_galaxy_s22'])
+    """
+    from argparse import ArgumentParser
+
+    def comma_separated(value: str):
+        """
+        Parse a comma-separated string into a list.
+        """
+        return value.split(",")
+
+    parser = ArgumentParser("Gather all benchmark configs.")
+    parser.add_argument(
+        "--os",
+        type=str,
+        choices=["android", "ios"],
+        help="The target OS.",
+    )
+    parser.add_argument(
+        "--models",
+        type=comma_separated,  # Use the custom parser for comma-separated values
+        help=f"Comma-separated model IDs or names. Valid values include {MODEL_NAME_TO_MODEL}.",
+    )
+    parser.add_argument(
+        "--devices",
+        type=comma_separated,  # Use the custom parser for comma-separated values
+        help=f"Comma-separated device names. Available devices: {list(DEVICE_POOLS.keys())}",
+    )
+
+    return parser.parse_args()
+
+
+def set_output(name: str, val: Any) -> None:
+    """
+    Set the output value to be used by other GitHub jobs.
+
+    Args:
+        name (str): The name of the output variable.
+        val (Any): The value to set for the output variable.
+
+    Example:
+        set_output("benchmark_configs", {"include": [...]})
+    """
+
+    if os.getenv("GITHUB_OUTPUT"):
+        print(f"Setting {name}={val} in GitHub output")
+        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
+            print(f"{name}={val}", file=env)
+    else:
+        print(f"::set-output name={name}::{val}")
+
+
+def is_valid_huggingface_model_id(model_name: str) -> bool:
+    """
+    Validate if the model name matches the pattern for HuggingFace model IDs.
+
+    Args:
+        model_name (str): The model name to validate.
+
+    Returns:
+        bool: True if the model name matches the valid pattern, False otherwise.
+
+    Example:
+        is_valid_huggingface_model_id('meta-llama/Llama-3.2-1B') -> True
+    """
+    pattern = r"^[a-zA-Z0-9-_]+/[a-zA-Z0-9-_.]+$"
+    return bool(re.match(pattern, model_name))
+
+
+def get_benchmark_configs() -> Dict[str, Dict]:
+    """
+    Gather benchmark configurations for a given set of models on the target operating system and devices.
+
+    Args:
+        None
+
+    Returns:
+        Dict[str, Dict]: A dictionary containing the benchmark configurations.
+
+    Example:
+        get_benchmark_configs() -> {
+            "include": [
+                {"model": "meta-llama/Llama-3.2-1B", "config": "hf_xnnpack_fp32", "device": "arn:aws:..."},
+                {"model": "mv3", "config": "xnnpack_q8", "device": "arn:aws:..."},
+                ...
+            ]
+        }
+    """
+    args = parse_args()
+    target_os = args.os
+    devices = args.devices
+    models = args.models
+
+    benchmark_configs = {"include": []}
+
+    for model_name in models:
+        configs = []
+        if is_valid_huggingface_model_id(model_name):
+            if model_name.startswith("meta-llama/"):
+                # LLaMA models
+                repo_name = model_name.split("meta-llama/")[1]
+                if "qlora" in repo_name.lower():
+                    configs.append("llama3_qlora")
+                elif "spinquant" in repo_name.lower():
+                    configs.append("llama3_spinquant")
+                else:
+                    configs.append("llama3_fb16")
+                configs.extend(
+                    [
+                        config
+                        for config in BENCHMARK_CONFIGS.get(target_os, [])
+                        if config.startswith("llama")
+                    ]
+                )
+            else:
+                # Non-LLaMA models
+                configs.append("hf_xnnpack_fp32")
+        elif model_name in MODEL_NAME_TO_MODEL:
+            # ExecuTorch in-tree non-GenAI models
+            configs.append("xnnpack_q8")
+            configs.extend(
+                [
+                    config
+                    for config in BENCHMARK_CONFIGS.get(target_os, [])
+                    if not config.startswith("llama")
+                ]
+            )
+        else:
+            # Skip unknown models with a warning
+            logging.warning(f"Unknown or invalid model name '{model_name}'. Skipping.")
+            continue
+
+        # Add configurations for each valid device
+        for device in devices:
+            if device not in DEVICE_POOLS:
+                logging.warning(f"Unsupported device '{device}'. Skipping.")
+                continue
+            for config in configs:
+                record = {
+                    "model": model_name,
+                    "config": config,
+                    "device": DEVICE_POOLS[device],
+                }
+                benchmark_configs["include"].append(record)
+
+    set_output("benchmark_configs", json.dumps(benchmark_configs))
+
+
+if __name__ == "__main__":
+    get_benchmark_configs()
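As a sanity check on the routing above, here is a rough illustration of what the script emits for an Android run. This is a minimal sketch that mirrors the logic in get_benchmark_configs(); the model choice is arbitrary, it is not actual output, and the device ARN is abridged:

    # Illustrative only: roughly what --os android --models llama --devices samsung_galaxy_s22
    # yields. "llama" is an in-tree model, so it gets "xnnpack_q8" plus the non-llama
    # Android config "qnn_q8"; every config is then paired with each resolved device pool.
    import json

    include = [
        {"model": "llama", "config": "xnnpack_q8", "device": "arn:aws:devicefarm:us-west-2:..."},
        {"model": "llama", "config": "qnn_q8", "device": "arn:aws:devicefarm:us-west-2:..."},
    ]
    print(f"benchmark_configs={json.dumps({'include': include})}")
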
Skipping.") + continue + for config in configs: + record = { + "model": model_name, + "config": config, + "device": DEVICE_POOLS[device], + } + benchmark_configs["include"].append(record) + + set_output("benchmark_configs", json.dumps(benchmark_configs)) + + +if __name__ == "__main__": + get_benchmark_configs() diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 6d009ebad51..ddc7ad46185 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -208,7 +208,7 @@ prepare_artifacts_upload() { PARAMS="params.json" CHECKPOINT_FILE_NAME="" touch "${PARAMS}" -if [[ "${MODEL_NAME}" == "stories110M" ]]; then +if [[ "${MODEL_NAME}" == "llama" ]] || [[ "${MODEL_NAME}" == "stories"* ]] || [[ "${MODEL_NAME}" == "tinyllama" ]]; then CHECKPOINT_FILE_NAME="stories110M.pt" download_stories_model_artifacts else diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 0727eecf770..6e8749fe26d 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -209,25 +209,25 @@ test_model_with_mps() { if [[ "${BACKEND}" == "portable" ]]; then echo "Testing ${MODEL_NAME} with portable kernels..." test_model -elif [[ "${BACKEND}" == "qnn" ]]; then +elif [[ "${BACKEND}" == *"qnn"* ]]; then echo "Testing ${MODEL_NAME} with qnn..." test_model_with_qnn if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi -elif [[ "${BACKEND}" == "coreml" ]]; then +elif [[ "${BACKEND}" == *"coreml"* ]]; then echo "Testing ${MODEL_NAME} with coreml..." test_model_with_coreml if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi -elif [[ "${BACKEND}" == "mps" ]]; then +elif [[ "${BACKEND}" == *"mps"* ]]; then echo "Testing ${MODEL_NAME} with mps..." test_model_with_mps if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi -elif [[ "${BACKEND}" == "xnnpack" ]]; then +elif [[ "${BACKEND}" == *"xnnpack"* ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." WITH_QUANTIZATION=true WITH_DELEGATION=true diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 8dbdecbee7d..dfab6b3d297 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -26,16 +26,6 @@ on: required: false type: string default: samsung_galaxy_s22 - delegates: - description: Backend delegates - required: false - type: string - default: xnnpack - threadpool: - description: Run with threadpool? - required: false - type: boolean - default: false benchmark_configs: description: The list of configs used the benchmark required: false @@ -52,16 +42,6 @@ on: required: false type: string default: samsung_galaxy_s22 - delegates: - description: Backend delegates - required: false - type: string - default: xnnpack - threadpool: - description: Run with threadpool? - required: false - type: boolean - default: false benchmark_configs: description: The list of configs used the benchmark required: false @@ -73,12 +53,16 @@ concurrency: jobs: set-parameters: - runs-on: linux.2xlarge + runs-on: ubuntu-22.04 outputs: - models: ${{ steps.set-parameters.outputs.models }} - devices: ${{ steps.set-parameters.outputs.devices }} - delegates: ${{ steps.set-parameters.outputs.delegates }} + benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }} steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.10' - name: Set parameters id: set-parameters shell: bash @@ -86,11 +70,10 @@ jobs: # Separate default values from the workflow dispatch. 
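The comparisons above switch from exact matches to substring matches because composed config names such as "llama3_qnn_htp" or "coreml_fp16" now carry the backend name inside a longer string. A small Python sketch of the same routing; the "portable" fallback is an assumption for illustration:

    # Config names come from BENCHMARK_CONFIGS in gather_benchmark_configs.py.
    configs = ["xnnpack_q8", "llama3_qnn_htp", "coreml_fp16", "mps"]
    for config in configs:
        backend = next(
            (b for b in ("qnn", "coreml", "mps", "xnnpack") if b in config),
            "portable",  # assumed default when no delegate name is embedded
        )
        print(config, "->", backend)
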
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 8dbdecbee7d..dfab6b3d297 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -26,16 +26,6 @@ on:
         required: false
         type: string
         default: samsung_galaxy_s22
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
-      threadpool:
-        description: Run with threadpool?
-        required: false
-        type: boolean
-        default: false
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -52,16 +42,6 @@ on:
         required: false
         type: string
         default: samsung_galaxy_s22
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
-      threadpool:
-        description: Run with threadpool?
-        required: false
-        type: boolean
-        default: false
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -73,12 +53,16 @@ concurrency:
 
 jobs:
   set-parameters:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-22.04
     outputs:
-      models: ${{ steps.set-parameters.outputs.models }}
-      devices: ${{ steps.set-parameters.outputs.devices }}
-      delegates: ${{ steps.set-parameters.outputs.delegates }}
+      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
     steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -86,11 +70,10 @@ jobs:
           # Separate default values from the workflow dispatch. This ensures defaults are accessible
           # during scheduled runs and provides flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
           CRON_DEFAULT_DEVICES: samsung_galaxy_s22
-          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }}
         run: |
-          set -ex
+          set -eux
           MODELS="${{ inputs.models }}"
           if [ -z "$MODELS" ]; then
             MODELS="$CRON_DEFAULT_MODELS"
           fi
@@ -99,42 +82,17 @@ jobs:
           DEVICES="${{ inputs.devices }}"
           if [ -z "$DEVICES" ]; then
             DEVICES="$CRON_DEFAULT_DEVICES"
           fi
-          DELEGATES="${{ inputs.delegates }}"
-          if [ -z "$DELEGATES" ]; then
-            DELEGATES="$CRON_DEFAULT_DELEGATES"
-          fi
-
-          # Mapping devices to their corresponding device-pool-arn
-          declare -A DEVICE_POOL_ARNS
-          DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
-          DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
-          DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
-
-          # Resolve device names with their corresponding ARNs
-          if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
-            DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
-          fi
-          declare -a MAPPED_ARNS=()
-          for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
-            if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
-              echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
-              exit 1
-            fi
-            MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
-          done
-
-          echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
-          MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
- echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT - echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \ + --os "android" \ + --models $MODELS \ + --devices $DEVICES prepare-test-specs: runs-on: linux.2xlarge needs: set-parameters strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false steps: - uses: actions/checkout@v3 @@ -146,7 +104,7 @@ jobs: set -eux # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip" + MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" # We could write a script to properly use jinja here, but there is only one variable, # so let's just sed it sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2 @@ -160,7 +118,7 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }} + ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} retention-days: 1 if-no-files-found: error path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml @@ -171,9 +129,7 @@ jobs: needs: set-parameters secrets: inherit strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false with: runner: linux.2xlarge.memory @@ -188,30 +144,33 @@ jobs: echo "::group::Setting up dev environment" CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - if [[ ${{ matrix.delegate }} == "qnn" ]]; then + if [[ ${{ matrix.config }} == *"qnn"* ]]; then PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + pip list + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }} echo "::endgroup::" - echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" BUILD_MODE="cmake" if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - pip install accelerate sentencepiece # HuggingFace model. 
           HF_MODEL_REPO=${{ matrix.model }}
-          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
             # Llama models on Hugging Face
-            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+            if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -237,7 +196,7 @@ jobs:
                 --use_spin_quant native \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
-            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+            elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
               # QAT + LoRA
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -264,27 +223,47 @@ jobs:
                 --output_name "${OUT_ET_MODEL_NAME}.pte" \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
+              # Original BF16 version, without any quantization
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -X \
+                -d bf16 \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
+              echo "ANDROID_NDK_ROOT: $ANDROID_NDK_ROOT"
+              echo "QNN_SDK_ROOT: $QNN_SDK_ROOT"
+              echo "EXECUTORCH_ROOT: $EXECUTORCH_ROOT"
+              export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
+              export PYTHONPATH=$EXECUTORCH_ROOT/..
+
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
+                --compile_only \
+                --ptq 16a4w \
+                -m SM8650 \
+                --model_size 1B \
+                --model_mode kv \
+                --prompt "Once"
+
+              OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
+              find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
-name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \; + ls -lh "${OUT_ET_MODEL_NAME}.pte" else - if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then - # Original BF16 version, without any quantization - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - else - # By default, test with the Hugging Face model and the xnnpack recipe - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") - python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi + # By default, test with the Hugging Face model and the xnnpack recipe + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") + python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" + ls -lh "${OUT_ET_MODEL_NAME}.pte" fi else echo "Unsupported model ${{ matrix.model }}" @@ -295,16 +274,16 @@ jobs: ls -lh model.zip mkdir -p "${ARTIFACTS_DIR_NAME}" mv model.zip "${ARTIFACTS_DIR_NAME}" - elif [[ ${{ matrix.model }} =~ ^stories* ]]; then + elif [[ ${{ matrix.model }} == "llama" ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 - if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" - elif [[ ${{ matrix.delegate }} == "qnn" ]]; then + elif [[ ${{ matrix.config }} == *"qnn"* ]]; then DELEGATE_CONFIG="qnn" else - echo "Unsupported delegate ${{ matrix.delegate }}" + echo "Unsupported delegate ${{ matrix.config }}" exit 1 fi DTYPE="fp32" @@ -318,7 +297,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \ "${{ matrix.model }}" \ "${BUILD_MODE}" \ - "${{ matrix.delegate }}" \ + "${{ matrix.config }}" \ "${ARTIFACTS_DIR_NAME}" fi echo "::endgroup::" @@ -363,10 +342,7 @@ jobs: - build-benchmark-app - export-models strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} - device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false with: # Due to scheduling a job may be pushed beyond the default 60m threshold @@ -379,7 +355,7 @@ jobs: device-pool-arn: ${{ matrix.device }} android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk - test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/android-llm-device-farm-test-spec.yml + test-spec: 
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index f2a897f72f1..b9761074131 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -26,11 +26,6 @@ on:
         required: false
         type: string
         default: apple_iphone_15
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -47,11 +42,6 @@ on:
         required: false
         type: string
         default: apple_iphone_15
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -63,12 +53,16 @@ concurrency:
 
 jobs:
   set-parameters:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-22.04
     outputs:
-      models: ${{ steps.set-parameters.outputs.models }}
-      devices: ${{ steps.set-parameters.outputs.devices }}
-      delegates: ${{ steps.set-parameters.outputs.delegates }}
+      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
     steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -76,11 +70,10 @@ jobs:
           # Separate default values from the workflow dispatch. This ensures defaults are accessible
           # during scheduled runs and provides flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l' || 'stories110M' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
           CRON_DEFAULT_DEVICES: apple_iphone_15
-          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,coreml,mps' || 'xnnpack' }}
         run: |
-          set -ex
+          set -eux
           MODELS="${{ inputs.models }}"
           if [ -z "$MODELS" ]; then
             MODELS="$CRON_DEFAULT_MODELS"
           fi
@@ -89,40 +82,19 @@ jobs:
           DEVICES="${{ inputs.devices }}"
           if [ -z "$DEVICES" ]; then
             DEVICES="$CRON_DEFAULT_DEVICES"
           fi
-          DELEGATES="${{ inputs.delegates }}"
-          if [ -z "$DELEGATES" ]; then
-            DELEGATES="$CRON_DEFAULT_DELEGATES"
-          fi
-
-          # Mapping devices to their corresponding device-pool-arn
-          declare -A DEVICE_POOL_ARNS
-          DEVICE_POOL_ARNS[apple_iphone_15]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d"
-
-          # Resolve device names with their corresponding ARNs
-          if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
-            DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
-          fi
-          declare -a MAPPED_ARNS=()
-          for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
-            if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
-              echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
-              exit 1
-            fi
-            MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
-          done
+          PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \
+            --os "ios" \
+            --models $MODELS \
+            --devices $DEVICES
 
-          echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
-          MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
- echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT - echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + echo "benchmark_configs is: ${{ steps.set-parameters.outputs.benchmark_configs }}" prepare-test-specs: runs-on: linux.2xlarge needs: set-parameters strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false steps: - uses: actions/checkout@v3 @@ -132,8 +104,10 @@ jobs: working-directory: extension/benchmark/apple/Benchmark run: | set -eux + + echo "DEBUG: ${{ matrix.model }}" # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip" + MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" # We could write a script to properly use jinja here, but there is only one variable, # so let's just sed it sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2 @@ -146,7 +120,7 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }} + ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} retention-days: 1 if-no-files-found: error path: extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml @@ -157,9 +131,7 @@ jobs: needs: set-parameters secrets: inherit strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false with: # NB: Need to use our AWS MacOS runner to upload large models to S3 @@ -181,12 +153,12 @@ jobs: GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - if [[ ${{ matrix.delegate }} == "coreml" ]]; then + if [[ ${{ matrix.config }} == *"coreml"* ]]; then PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/coreml/scripts/install_requirements.sh fi - if [[ ${{ matrix.delegate }} == "mps" ]]; then + if [[ ${{ matrix.config }} == *"mps"* ]]; then PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/mps/install_requirements.sh fi @@ -194,23 +166,25 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + ${CONDA_RUN} pip install accelerate sentencepiece + pip list + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }} echo "::endgroup::" - echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" BUILD_MODE="cmake" if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - ${CONDA_RUN} pip install accelerate sentencepiece 
           # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
           HF_MODEL_REPO=${{ matrix.model }}
-          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
             # Llama models on Hugging Face
-            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+            if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -236,7 +210,7 @@ jobs:
                 --use_spin_quant native \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
-            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+            elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
               # QAT + LoRA
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -263,27 +237,40 @@ jobs:
                 --output_name "${OUT_ET_MODEL_NAME}.pte" \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
+              # Original BF16 version, without any quantization
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -X \
+                -d bf16 \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
+              # ANE
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -E "4,32" \
+                -kv \
+                --disable_dynamic_shape \
+                --coreml \
+                --coreml-ios 18 \
+                --coreml-quantize c4w \
+                --coreml-compute-units cpu_and_ne \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             else
-              if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
-                # Original BF16 version, without any quantization
-                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-                ${CONDA_RUN} python -m examples.models.llama.export_llama \
-                  --model "llama3_2" \
-                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-                  --params "${DOWNLOADED_PATH}/params.json" \
-                  -kv \
-                  --use_sdpa_with_kv_cache \
-                  -X \
-                  -d bf16 \
-                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-                  --output_name="${OUT_ET_MODEL_NAME}.pte"
-                ls -lh "${OUT_ET_MODEL_NAME}.pte"
-              else
-                # By default, test with the Hugging Face model and the xnnpack recipe
-                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
-                ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
-                ls -lh "${OUT_ET_MODEL_NAME}.pte"
-              fi
+              # By default, test with the Hugging Face model and the xnnpack recipe
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+              ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             fi
           else
             echo "Unsupported model ${{ matrix.model }}"
@@ -294,23 +281,23 @@ jobs:
           ls -lh model.zip
           mkdir -p "${ARTIFACTS_DIR_NAME}"
           mv model.zip "${ARTIFACTS_DIR_NAME}"
-        elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
+        elif [[ ${{ matrix.model }} == "llama" ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
             bash examples/models/llama/install_requirements.sh
           # Test llama2
-          if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
+          if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then
             DELEGATE_CONFIG="xnnpack+custom+qe"
-          elif [[ ${{ matrix.delegate }} == "coreml" ]]; then
+          elif [[ ${{ matrix.config }} == *"coreml"* ]]; then
             DELEGATE_CONFIG="coreml"
-          elif [[ ${{ matrix.delegate }} == "mps" ]]; then
+          elif [[ ${{ matrix.config }} == *"mps"* ]]; then
             DELEGATE_CONFIG="mps"
           fi
           DTYPE="fp32"
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
             bash .ci/scripts/test_llama.sh \
-              -model "${{ matrix.model }}" \
+              -model "stories110M" \
               -build_tool "${BUILD_MODE}" \
              -dtype "${DTYPE}" \
              -mode "${DELEGATE_CONFIG}" \
@@ -320,7 +307,7 @@ jobs:
             bash .ci/scripts/test_model.sh \
               "${{ matrix.model }}" \
               "${BUILD_MODE}" \
-              "${{ matrix.delegate }}" \
+              "${{ matrix.config }}" \
               "${ARTIFACTS_DIR_NAME}"
         fi
         echo "::endgroup::"
@@ -435,10 +422,7 @@ jobs:
       contents: read
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     strategy:
-      matrix:
-        model: ${{ fromJson(needs.set-parameters.outputs.models) }}
-        delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
-        device: ${{ fromJson(needs.set-parameters.outputs.devices) }}
+      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
       fail-fast: false
     with:
       # Due to scheduling, a job may be pushed beyond the default 60m threshold
@@ -453,7 +437,7 @@ jobs:
       # Uploaded to S3 from the previous job
       ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
-      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/default-ios-device-farm-appium-test-spec.yml
+      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/default-ios-device-farm-appium-test-spec.yml
 
   upload-benchmark-results:
     needs: