From 38713c28758444cf5fb4aa91ddc19bcc0d671a36 Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Tue, 17 Dec 2024 15:28:23 -0800
Subject: [PATCH] Enable composable benchmark configs for flexible
 model+device+optimization scheduling

---
 .ci/scripts/gather_benchmark_configs.py | 204 ++++++++++++++++++++++++
 .ci/scripts/test_llama.sh               |   2 +-
 .ci/scripts/test_model.sh               |   8 +-
 .github/workflows/android-perf.yml      | 178 +++++++++------------
 .github/workflows/apple-perf.yml        | 164 +++++++++----------
 5 files changed, 360 insertions(+), 196 deletions(-)
 create mode 100755 .ci/scripts/gather_benchmark_configs.py

diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
new file mode 100755
index 00000000000..bcdb8c88e86
--- /dev/null
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import logging
+import os
+import re
+from typing import Any, Dict
+
+from examples.models import MODEL_NAME_TO_MODEL
+
+
+# Device pools for AWS Device Farm
+DEVICE_POOLS = {
+    "apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
+    "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
+    "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
+    "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
+}
+
+# Predefined benchmark configurations
+BENCHMARK_CONFIGS = {
+    "xplat": [
+        "xnnpack_q8",
+        "hf_xnnpack_fp32",
+        "llama3_fb16",
+        "llama3_spinquant",
+        "llama3_qlora",
+    ],
+    "android": [
+        "qnn_q8",
+        "llama3_qnn_htp",
+    ],
+    "ios": [
+        "coreml_fp16",
+        "mps",
+        "llama3_coreml_ane",
+    ],
+}
+
+
+def parse_args() -> Any:
+    """
+    Parse command-line arguments.
+
+    Returns:
+        argparse.Namespace: Parsed command-line arguments.
+
+    Example:
+        parse_args() -> Namespace(models=['mv3', 'meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8'],
+                                  os='android',
+                                  devices=['samsung_galaxy_s22'])
+    """
+    from argparse import ArgumentParser
+
+    def comma_separated(value: str):
+        """
+        Parse a comma-separated string into a list.
+        """
+        return value.split(",")
+
+    parser = ArgumentParser("Gather all benchmark configs.")
+    parser.add_argument(
+        "--os",
+        type=str,
+        choices=["android", "ios"],
+        help="The target OS.",
+    )
+    parser.add_argument(
+        "--models",
+        type=comma_separated,  # Use the custom parser for comma-separated values
+        help=f"Comma-separated model IDs or names. Valid values include {MODEL_NAME_TO_MODEL}.",
+    )
+    parser.add_argument(
+        "--devices",
+        type=comma_separated,  # Use the custom parser for comma-separated values
+        help=f"Comma-separated device names. Available devices: {list(DEVICE_POOLS.keys())}",
+    )
+
+    return parser.parse_args()
+
+
+def set_output(name: str, val: Any) -> None:
+    """
+    Set the output value to be used by other GitHub jobs.
+
+    Args:
+        name (str): The name of the output variable.
+        val (Any): The value to set for the output variable.
+
+    Example:
+        set_output("benchmark_configs", {"include": [...]})
+    """
+
+    if os.getenv("GITHUB_OUTPUT"):
+        print(f"Setting {name}={val} in GitHub output")
+        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
+            print(f"{name}={val}", file=env)
+    else:
+        print(f"::set-output name={name}::{val}")
+
+
+def is_valid_huggingface_model_id(model_name: str) -> bool:
+    """
+    Validate if the model name matches the pattern for HuggingFace model IDs.
+
+    Args:
+        model_name (str): The model name to validate.
+
+    Returns:
+        bool: True if the model name matches the valid pattern, False otherwise.
+
+    Example:
+        is_valid_huggingface_model_id('meta-llama/Llama-3.2-1B') -> True
+    """
+    pattern = r"^[a-zA-Z0-9-_]+/[a-zA-Z0-9-_.]+$"
+    return bool(re.match(pattern, model_name))
+
+
+def get_benchmark_configs() -> Dict[str, Dict]:
+    """
+    Gather benchmark configurations for a given set of models on the target operating system and devices.
+
+    Args:
+        None
+
+    Returns:
+        Dict[str, Dict]: A dictionary containing the benchmark configurations.
+
+    Example:
+        get_benchmark_configs() -> {
+            "include": [
+                {"model": "meta-llama/Llama-3.2-1B", "config": "hf_xnnpack_fp32", "device": "arn:aws:..."},
+                {"model": "mv3", "config": "xnnpack_q8", "device": "arn:aws:..."},
+                ...
+            ]
+        }
+    """
+    args = parse_args()
+    target_os = args.os
+    devices = args.devices
+    models = args.models
+
+    benchmark_configs = {"include": []}
+
+    for model_name in models:
+        configs = []
+        if is_valid_huggingface_model_id(model_name):
+            if model_name.startswith("meta-llama/"):
+                # LLaMA models
+                repo_name = model_name.split("meta-llama/")[1]
+                if "qlora" in repo_name.lower():
+                    configs.append("llama3_qlora")
+                elif "spinquant" in repo_name.lower():
+                    configs.append("llama3_spinquant")
+                else:
+                    configs.append("llama3_fb16")
+                configs.extend(
+                    [
+                        config
+                        for config in BENCHMARK_CONFIGS.get(target_os, [])
+                        if config.startswith("llama")
+                    ]
+                )
+            else:
+                # Non-LLaMA models
+                configs.append("hf_xnnpack_fp32")
+        elif model_name in MODEL_NAME_TO_MODEL:
+            # ExecuTorch in-tree non-GenAI models
+            configs.append("xnnpack_q8")
+            configs.extend(
+                [
+                    config
+                    for config in BENCHMARK_CONFIGS.get(target_os, [])
+                    if not config.startswith("llama")
+                ]
+            )
+        else:
+            # Skip unknown models with a warning
+            logging.warning(f"Unknown or invalid model name '{model_name}'. Skipping.")
+            continue
+
+        # Add configurations for each valid device
+        for device in devices:
+            if device not in DEVICE_POOLS:
+                logging.warning(f"Unsupported device '{device}'. Skipping.")
+                continue
+            for config in configs:
+                record = {
+                    "model": model_name,
+                    "config": config,
+                    "device": DEVICE_POOLS[device],
+                }
+                benchmark_configs["include"].append(record)
+
+    set_output("benchmark_configs", json.dumps(benchmark_configs))
+
+
+if __name__ == "__main__":
+    get_benchmark_configs()
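As a sanity check on the routing above, here is a rough illustration of what the script emits for an Android run. This is a minimal sketch that mirrors the logic in get_benchmark_configs(); the model choice is arbitrary, it is not actual output, and the device ARN is abridged:

    # Illustrative only: roughly what --os android --models llama --devices samsung_galaxy_s22
    # yields. "llama" is an in-tree model, so it gets "xnnpack_q8" plus the non-llama
    # Android config "qnn_q8"; every config is then paired with each resolved device pool.
    import json

    include = [
        {"model": "llama", "config": "xnnpack_q8", "device": "arn:aws:devicefarm:us-west-2:..."},
        {"model": "llama", "config": "qnn_q8", "device": "arn:aws:devicefarm:us-west-2:..."},
    ]
    print(f"benchmark_configs={json.dumps({'include': include})}")
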
Skipping.") + continue + for config in configs: + record = { + "model": model_name, + "config": config, + "device": DEVICE_POOLS[device], + } + benchmark_configs["include"].append(record) + + set_output("benchmark_configs", json.dumps(benchmark_configs)) + + +if __name__ == "__main__": + get_benchmark_configs() diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 6d009ebad51..ddc7ad46185 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -208,7 +208,7 @@ prepare_artifacts_upload() { PARAMS="params.json" CHECKPOINT_FILE_NAME="" touch "${PARAMS}" -if [[ "${MODEL_NAME}" == "stories110M" ]]; then +if [[ "${MODEL_NAME}" == "llama" ]] || [[ "${MODEL_NAME}" == "stories"* ]] || [[ "${MODEL_NAME}" == "tinyllama" ]]; then CHECKPOINT_FILE_NAME="stories110M.pt" download_stories_model_artifacts else diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 0727eecf770..6e8749fe26d 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -209,25 +209,25 @@ test_model_with_mps() { if [[ "${BACKEND}" == "portable" ]]; then echo "Testing ${MODEL_NAME} with portable kernels..." test_model -elif [[ "${BACKEND}" == "qnn" ]]; then +elif [[ "${BACKEND}" == *"qnn"* ]]; then echo "Testing ${MODEL_NAME} with qnn..." test_model_with_qnn if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi -elif [[ "${BACKEND}" == "coreml" ]]; then +elif [[ "${BACKEND}" == *"coreml"* ]]; then echo "Testing ${MODEL_NAME} with coreml..." test_model_with_coreml if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi -elif [[ "${BACKEND}" == "mps" ]]; then +elif [[ "${BACKEND}" == *"mps"* ]]; then echo "Testing ${MODEL_NAME} with mps..." test_model_with_mps if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi -elif [[ "${BACKEND}" == "xnnpack" ]]; then +elif [[ "${BACKEND}" == *"xnnpack"* ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." WITH_QUANTIZATION=true WITH_DELEGATION=true diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 8dbdecbee7d..dfab6b3d297 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -26,16 +26,6 @@ on: required: false type: string default: samsung_galaxy_s22 - delegates: - description: Backend delegates - required: false - type: string - default: xnnpack - threadpool: - description: Run with threadpool? - required: false - type: boolean - default: false benchmark_configs: description: The list of configs used the benchmark required: false @@ -52,16 +42,6 @@ on: required: false type: string default: samsung_galaxy_s22 - delegates: - description: Backend delegates - required: false - type: string - default: xnnpack - threadpool: - description: Run with threadpool? - required: false - type: boolean - default: false benchmark_configs: description: The list of configs used the benchmark required: false @@ -73,12 +53,16 @@ concurrency: jobs: set-parameters: - runs-on: linux.2xlarge + runs-on: ubuntu-22.04 outputs: - models: ${{ steps.set-parameters.outputs.models }} - devices: ${{ steps.set-parameters.outputs.devices }} - delegates: ${{ steps.set-parameters.outputs.delegates }} + benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }} steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.10' - name: Set parameters id: set-parameters shell: bash @@ -86,11 +70,10 @@ jobs: # Separate default values from the workflow dispatch. 
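The comparisons above switch from exact matches to substring matches because composed config names such as "llama3_qnn_htp" or "coreml_fp16" now carry the backend name inside a longer string. A small Python sketch of the same routing; the "portable" fallback is an assumption for illustration:

    # Config names come from BENCHMARK_CONFIGS in gather_benchmark_configs.py.
    configs = ["xnnpack_q8", "llama3_qnn_htp", "coreml_fp16", "mps"]
    for config in configs:
        backend = next(
            (b for b in ("qnn", "coreml", "mps", "xnnpack") if b in config),
            "portable",  # assumed default when no delegate name is embedded
        )
        print(config, "->", backend)
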
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 8dbdecbee7d..dfab6b3d297 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -26,16 +26,6 @@ on:
         required: false
         type: string
         default: samsung_galaxy_s22
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
-      threadpool:
-        description: Run with threadpool?
-        required: false
-        type: boolean
-        default: false
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -52,16 +42,6 @@ on:
         required: false
         type: string
         default: samsung_galaxy_s22
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
-      threadpool:
-        description: Run with threadpool?
-        required: false
-        type: boolean
-        default: false
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -73,12 +53,16 @@ concurrency:
 
 jobs:
   set-parameters:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-22.04
     outputs:
-      models: ${{ steps.set-parameters.outputs.models }}
-      devices: ${{ steps.set-parameters.outputs.devices }}
-      delegates: ${{ steps.set-parameters.outputs.delegates }}
+      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
     steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -86,11 +70,10 @@ jobs:
           # Separate default values from the workflow dispatch. This ensures defaults are accessible
           # during scheduled runs and provides flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
           CRON_DEFAULT_DEVICES: samsung_galaxy_s22
-          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }}
         run: |
-          set -ex
+          set -eux
           MODELS="${{ inputs.models }}"
           if [ -z "$MODELS" ]; then
             MODELS="$CRON_DEFAULT_MODELS"
           fi
@@ -99,42 +82,17 @@ jobs:
           DEVICES="${{ inputs.devices }}"
           if [ -z "$DEVICES" ]; then
             DEVICES="$CRON_DEFAULT_DEVICES"
           fi
-          DELEGATES="${{ inputs.delegates }}"
-          if [ -z "$DELEGATES" ]; then
-            DELEGATES="$CRON_DEFAULT_DELEGATES"
-          fi
-
-          # Mapping devices to their corresponding device-pool-arn
-          declare -A DEVICE_POOL_ARNS
-          DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
-          DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
-          DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
-
-          # Resolve device names with their corresponding ARNs
-          if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
-            DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
-          fi
-          declare -a MAPPED_ARNS=()
-          for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
-            if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
-              echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
-              exit 1
-            fi
-            MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
-          done
-
-          echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
-          MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
- echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT - echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \ + --os "android" \ + --models $MODELS \ + --devices $DEVICES prepare-test-specs: runs-on: linux.2xlarge needs: set-parameters strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false steps: - uses: actions/checkout@v3 @@ -146,7 +104,7 @@ jobs: set -eux # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip" + MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" # We could write a script to properly use jinja here, but there is only one variable, # so let's just sed it sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2 @@ -160,7 +118,7 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }} + ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} retention-days: 1 if-no-files-found: error path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml @@ -171,9 +129,7 @@ jobs: needs: set-parameters secrets: inherit strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false with: runner: linux.2xlarge.memory @@ -188,30 +144,33 @@ jobs: echo "::group::Setting up dev environment" CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - if [[ ${{ matrix.delegate }} == "qnn" ]]; then + if [[ ${{ matrix.config }} == *"qnn"* ]]; then PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + pip list + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }} echo "::endgroup::" - echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" BUILD_MODE="cmake" if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - pip install accelerate sentencepiece # HuggingFace model. 
           HF_MODEL_REPO=${{ matrix.model }}
-          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
             # Llama models on Hugging Face
-            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+            if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -237,7 +196,7 @@ jobs:
                 --use_spin_quant native \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
-            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+            elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
               # QAT + LoRA
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -264,27 +223,47 @@ jobs:
                 --output_name "${OUT_ET_MODEL_NAME}.pte" \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
+              # Original BF16 version, without any quantization
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -X \
+                -d bf16 \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
+              echo "ANDROID_NDK_ROOT: $ANDROID_NDK_ROOT"
+              echo "QNN_SDK_ROOT: $QNN_SDK_ROOT"
+              echo "EXECUTORCH_ROOT: $EXECUTORCH_ROOT"
+              export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
+              export PYTHONPATH=$EXECUTORCH_ROOT/..
+
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
+                --compile_only \
+                --ptq 16a4w \
+                -m SM8650 \
+                --model_size 1B \
+                --model_mode kv \
+                --prompt "Once"
+
+              OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
+              find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
-name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \; + ls -lh "${OUT_ET_MODEL_NAME}.pte" else - if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then - # Original BF16 version, without any quantization - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - python -m examples.models.llama.export_llama \ - --model "llama3_2" \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - else - # By default, test with the Hugging Face model and the xnnpack recipe - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") - python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi + # By default, test with the Hugging Face model and the xnnpack recipe + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") + python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" + ls -lh "${OUT_ET_MODEL_NAME}.pte" fi else echo "Unsupported model ${{ matrix.model }}" @@ -295,16 +274,16 @@ jobs: ls -lh model.zip mkdir -p "${ARTIFACTS_DIR_NAME}" mv model.zip "${ARTIFACTS_DIR_NAME}" - elif [[ ${{ matrix.model }} =~ ^stories* ]]; then + elif [[ ${{ matrix.model }} == "llama" ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 - if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" - elif [[ ${{ matrix.delegate }} == "qnn" ]]; then + elif [[ ${{ matrix.config }} == *"qnn"* ]]; then DELEGATE_CONFIG="qnn" else - echo "Unsupported delegate ${{ matrix.delegate }}" + echo "Unsupported delegate ${{ matrix.config }}" exit 1 fi DTYPE="fp32" @@ -318,7 +297,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \ "${{ matrix.model }}" \ "${BUILD_MODE}" \ - "${{ matrix.delegate }}" \ + "${{ matrix.config }}" \ "${ARTIFACTS_DIR_NAME}" fi echo "::endgroup::" @@ -363,10 +342,7 @@ jobs: - build-benchmark-app - export-models strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} - device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false with: # Due to scheduling a job may be pushed beyond the default 60m threshold @@ -379,7 +355,7 @@ jobs: device-pool-arn: ${{ matrix.device }} android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk - test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/android-llm-device-farm-test-spec.yml + test-spec: 
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index f2a897f72f1..b9761074131 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -26,11 +26,6 @@ on:
         required: false
         type: string
         default: apple_iphone_15
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -47,11 +42,6 @@ on:
         required: false
         type: string
         default: apple_iphone_15
-      delegates:
-        description: Backend delegates
-        required: false
-        type: string
-        default: xnnpack
       benchmark_configs:
         description: The list of configs used by the benchmark
         required: false
@@ -63,12 +53,16 @@ concurrency:
 
 jobs:
   set-parameters:
-    runs-on: linux.2xlarge
+    runs-on: ubuntu-22.04
     outputs:
-      models: ${{ steps.set-parameters.outputs.models }}
-      devices: ${{ steps.set-parameters.outputs.devices }}
-      delegates: ${{ steps.set-parameters.outputs.delegates }}
+      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
     steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'false'
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -76,11 +70,10 @@ jobs:
           # Separate default values from the workflow dispatch. This ensures defaults are accessible
           # during scheduled runs and provides flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l' || 'stories110M' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
           CRON_DEFAULT_DEVICES: apple_iphone_15
-          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,coreml,mps' || 'xnnpack' }}
         run: |
-          set -ex
+          set -eux
           MODELS="${{ inputs.models }}"
           if [ -z "$MODELS" ]; then
             MODELS="$CRON_DEFAULT_MODELS"
           fi
@@ -89,40 +82,19 @@ jobs:
           DEVICES="${{ inputs.devices }}"
           if [ -z "$DEVICES" ]; then
             DEVICES="$CRON_DEFAULT_DEVICES"
           fi
-          DELEGATES="${{ inputs.delegates }}"
-          if [ -z "$DELEGATES" ]; then
-            DELEGATES="$CRON_DEFAULT_DELEGATES"
-          fi
-
-          # Mapping devices to their corresponding device-pool-arn
-          declare -A DEVICE_POOL_ARNS
-          DEVICE_POOL_ARNS[apple_iphone_15]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d"
-
-          # Resolve device names with their corresponding ARNs
-          if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
-            DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
-          fi
-          declare -a MAPPED_ARNS=()
-          for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
-            if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
-              echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
-              exit 1
-            fi
-            MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
-          done
+          PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \
+            --os "ios" \
+            --models $MODELS \
+            --devices $DEVICES
 
-          echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
-          MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
- echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT - echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + echo "benchmark_configs is: ${{ steps.set-parameters.outputs.benchmark_configs }}" prepare-test-specs: runs-on: linux.2xlarge needs: set-parameters strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false steps: - uses: actions/checkout@v3 @@ -132,8 +104,10 @@ jobs: working-directory: extension/benchmark/apple/Benchmark run: | set -eux + + echo "DEBUG: ${{ matrix.model }}" # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip" + MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" # We could write a script to properly use jinja here, but there is only one variable, # so let's just sed it sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2 @@ -146,7 +120,7 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }} + ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} retention-days: 1 if-no-files-found: error path: extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml @@ -157,9 +131,7 @@ jobs: needs: set-parameters secrets: inherit strategy: - matrix: - model: ${{ fromJson(needs.set-parameters.outputs.models) }} - delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} fail-fast: false with: # NB: Need to use our AWS MacOS runner to upload large models to S3 @@ -181,12 +153,12 @@ jobs: GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ .ci/scripts/setup-macos.sh "${BUILD_TOOL}" - if [[ ${{ matrix.delegate }} == "coreml" ]]; then + if [[ ${{ matrix.config }} == *"coreml"* ]]; then PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/coreml/scripts/install_requirements.sh fi - if [[ ${{ matrix.delegate }} == "mps" ]]; then + if [[ ${{ matrix.config }} == *"mps"* ]]; then PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/mps/install_requirements.sh fi @@ -194,23 +166,25 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + ${CONDA_RUN} pip install accelerate sentencepiece + pip list + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }} echo "::endgroup::" - echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" BUILD_MODE="cmake" if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - ${CONDA_RUN} pip install accelerate sentencepiece 
           # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
           HF_MODEL_REPO=${{ matrix.model }}
-          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
             # Llama models on Hugging Face
-            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+            if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -236,7 +210,7 @@ jobs:
                 --use_spin_quant native \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
-            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+            elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
               # QAT + LoRA
               # Download prequantized checkpoint from Hugging Face
               DOWNLOADED_PATH=$(
@@ -263,27 +237,40 @@ jobs:
                 --output_name "${OUT_ET_MODEL_NAME}.pte" \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
+              # Original BF16 version, without any quantization
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -X \
+                -d bf16 \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then
+              # ANE
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -E "4,32" \
+                -kv \
+                --disable_dynamic_shape \
+                --coreml \
+                --coreml-ios 18 \
+                --coreml-quantize c4w \
+                --coreml-compute-units cpu_and_ne \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             else
-              if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
-                # Original BF16 version, without any quantization
-                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
-                ${CONDA_RUN} python -m examples.models.llama.export_llama \
-                  --model "llama3_2" \
-                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
-                  --params "${DOWNLOADED_PATH}/params.json" \
-                  -kv \
-                  --use_sdpa_with_kv_cache \
-                  -X \
-                  -d bf16 \
-                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
-                  --output_name="${OUT_ET_MODEL_NAME}.pte"
-                ls -lh "${OUT_ET_MODEL_NAME}.pte"
-              else
-                # By default, test with the Hugging Face model and the xnnpack recipe
-                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
-                ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
-                ls -lh "${OUT_ET_MODEL_NAME}.pte"
-              fi
+              # By default, test with the Hugging Face model and the xnnpack recipe
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+              ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             fi
           else
             echo "Unsupported model ${{ matrix.model }}"
@@ -294,23 +281,23 @@ jobs:
           ls -lh model.zip
           mkdir -p "${ARTIFACTS_DIR_NAME}"
           mv model.zip "${ARTIFACTS_DIR_NAME}"
-        elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
+        elif [[ ${{ matrix.model }} == "llama" ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
             bash examples/models/llama/install_requirements.sh
           # Test llama2
-          if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
+          if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then
             DELEGATE_CONFIG="xnnpack+custom+qe"
-          elif [[ ${{ matrix.delegate }} == "coreml" ]]; then
+          elif [[ ${{ matrix.config }} == *"coreml"* ]]; then
             DELEGATE_CONFIG="coreml"
-          elif [[ ${{ matrix.delegate }} == "mps" ]]; then
+          elif [[ ${{ matrix.config }} == *"mps"* ]]; then
             DELEGATE_CONFIG="mps"
           fi
           DTYPE="fp32"
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
             bash .ci/scripts/test_llama.sh \
-              -model "${{ matrix.model }}" \
+              -model "stories110M" \
               -build_tool "${BUILD_MODE}" \
              -dtype "${DTYPE}" \
              -mode "${DELEGATE_CONFIG}" \
@@ -320,7 +307,7 @@ jobs:
             bash .ci/scripts/test_model.sh \
               "${{ matrix.model }}" \
               "${BUILD_MODE}" \
-              "${{ matrix.delegate }}" \
+              "${{ matrix.config }}" \
               "${ARTIFACTS_DIR_NAME}"
         fi
         echo "::endgroup::"
@@ -435,10 +422,7 @@ jobs:
       contents: read
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     strategy:
-      matrix:
-        model: ${{ fromJson(needs.set-parameters.outputs.models) }}
-        delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
-        device: ${{ fromJson(needs.set-parameters.outputs.devices) }}
+      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
       fail-fast: false
     with:
       # Due to scheduling, a job may be pushed beyond the default 60m threshold
@@ -453,7 +437,7 @@ jobs:
       # Uploaded to S3 from the previous job
       ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
-      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/default-ios-device-farm-appium-test-spec.yml
+      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/default-ios-device-farm-appium-test-spec.yml
 
   upload-benchmark-results:
     needs: