# Changes to SDPA to support no kv cache export #28910
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow "pull": triggered by pull requests, by pushes to `main` and
# `release/*` branches, and by manual dispatch.
name: pull

on:
  pull_request:
  push:
    branches:
      - main
      - release/*
  workflow_dispatch:

# Runs are grouped by workflow name plus PR number (or commit SHA when there is
# no PR), and further split by whether the event is workflow_dispatch or
# schedule. A newer run in the same group cancels the one still in progress.
# No `schedule` trigger is declared above, so that last term is always false.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
gather-models:
  # Computes the list of models to test and exposes it as the `models` job
  # output, which test-models-linux reads as its strategy matrix.
  runs-on: ubuntu-22.04
  outputs:
    models: ${{ steps.gather-models.outputs.models }}
  steps:
    # Submodules are not fetched for this job.
    - uses: actions/checkout@v3
      with:
        submodules: 'false'
    # The version is quoted so YAML keeps it as the string 3.10, not the float 3.1.
    - uses: actions/setup-python@v4
      with:
        python-version: '3.10'
    - name: Extract the list of models to test
      id: gather-models
      run: |
        set -eux
        # Run the gather script with the repo root on PYTHONPATH, passing the triggering event name.
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "${GITHUB_EVENT_NAME}"
test-setup-linux-gcc:
  # Runs the Linux setup script and the "add" model test on the gcc9 image,
  # via the shared linux_job.yml reusable workflow from pytorch/test-infra.
  name: test-setup-linux-gcc
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-gcc9
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      # Exercise the "add" model end to end on the portable backend.
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable"
test-models-linux:
  # Fans out over the matrix emitted by gather-models. Each matrix entry
  # supplies the runner, timeout, model, build tool, backend and the
  # demo_backend_delegation flag used below.
  name: test-models-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  needs: gather-models
  strategy:
    matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
    fail-fast: false
  with:
    runner: ${{ matrix.runner }}
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: ${{ matrix.timeout }}
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # Copy the matrix entry into shell variables.
      MODEL_NAME=${{ matrix.model }}
      BUILD_TOOL=${{ matrix.build-tool }}
      BACKEND=${{ matrix.backend }}
      DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      # Build and test ExecuTorch for the selected model/backend combination.
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
test-llama-runner-linux:
  # Runs the llama test script for stories110M across dtype/mode combinations
  # and uploads the resulting artifacts (the `android` job depends on this one).
  name: test-llama-runner-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    matrix:
      dtype: [fp32]
      mode:
        - portable
        - xnnpack+custom
        - xnnpack+custom+qe
        - xnnpack+custom+quantize_kv
        - xnnpack+quantize_kv
      # Extra bf16 combinations on top of the fp32 x mode product above.
      include:
        - dtype: bf16
          mode: portable
        - dtype: bf16
          mode: custom
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 900
    upload-artifact: android-models
    upload-artifact-to-s3: true
    script: |
      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      DTYPE=${{ matrix.dtype }}
      BUILD_TOOL="cmake"
      MODE=${{ matrix.mode }}
      ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}"
      # Replace EVERY '+' with '-': modes such as xnnpack+custom+qe contain more
      # than one, and a single-substitution (${VAR/+/-}) would leave the rest.
      ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME//+/-}"

      # Setup executorch
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      # Install requirements for export_llama
      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
      # Test llama2
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}"
test-llama-runner-linux-android:
  # Invokes the llama Android build script on the clang12-android image.
  name: test-llama-runner-linux-android
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12-android
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"
      PYTHON_EXECUTABLE=python \
      bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}"
test-custom-ops-linux:
  # Sets up ExecuTorch, then runs the portable custom-ops example test script.
  name: test-custom-ops-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      # Custom-ops example test.
      PYTHON_EXECUTABLE=python bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}"
test-selective-build-linux:
  # Sets up ExecuTorch, then runs the selective-build example test script.
  name: test-selective-build-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      # Selective-build example test.
      PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
test-llava-runner-linux:
  # Llava: installs requirements, runs the Python unit test module, then the
  # end-to-end test script. Uses the larger linux.24xlarge runner.
  name: test-llava-runner-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.24xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

      # Install with pybind enabled for xnnpack.
      bash install_requirements.sh --pybind xnnpack

      # Llava needs both the llama and the llava requirement sets.
      bash examples/models/llama/install_requirements.sh
      bash examples/models/llava/install_requirements.sh

      # Python unit tests.
      python -m unittest examples.models.llava.test.test_llava

      # End-to-end run (export, tokenizer and runner).
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh
test-quantized-aot-lib-linux:
  # Sets up ExecuTorch, then runs the xnnpack quantization test script for mv2.
  name: test-quantized-aot-lib-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      PYTHON_EXECUTABLE=python bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" mv2
test-pybind-build-linux:
  # Runs setup with the XNNPACK and pybind build flags set, then checks that
  # the portable_lib pybindings module can be imported.
  name: test-pybind-build-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # Build the module for executorch.extension.pybindings.portable_lib.
      BUILD_TOOL="cmake"
      PYTHON_EXECUTABLE=python \
      EXECUTORCH_BUILD_XNNPACK=ON \
      EXECUTORCH_BUILD_PYBIND=ON \
      bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"

      # Smoke test: the import must succeed.
      python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
test-binary-size-linux-gcc:
  # Guards the size of the stripped size_test binary on the gcc9 image:
  # the job fails when the file is larger than the threshold below.
  name: test-binary-size-linux-gcc
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-gcc9
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # Run the size-test build script, then strip the binary before measuring it.
      bash test/build_size_test.sh
      strip cmake-out/test/size_test

      # Read the size in bytes directly; parsing `ls -la` columns through an
      # unquoted array is fragile (word splitting / globbing).
      size=$(stat -c '%s' cmake-out/test/size_test)

      # threshold=48120 on devserver with gcc11.4
      # todo(lfq): update once binary size is below 50kb.
      threshold="51504"
      if [[ "$size" -le "$threshold" ]]; then
        echo "Success $size <= $threshold"
      else
        echo "Fail $size > $threshold"
        exit 1
      fi
test-binary-size-linux:
  # Guards the size of the stripped size_test binary on the clang12 image:
  # the job fails when the file is larger than the threshold below.
  name: test-binary-size-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # Run the size-test build script, then strip the binary before measuring it.
      bash test/build_size_test.sh
      strip cmake-out/test/size_test

      # Read the size in bytes directly; parsing `ls -la` columns through an
      # unquoted array is fragile (word splitting / globbing).
      size=$(stat -c '%s' cmake-out/test/size_test)

      # threshold=48120 on devserver with gcc11.4
      # todo(lfq): update once binary size is below 50kb.
      threshold="51784"
      if [[ "$size" -le "$threshold" ]]; then
        echo "Success $size <= $threshold"
      else
        echo "Fail $size > $threshold"
        exit 1
      fi
android:
  # Calls the repository-local _android.yml reusable workflow, and only starts
  # once test-llama-runner-linux has finished.
  needs: test-llama-runner-linux
  uses: ./.github/workflows/_android.yml
unittest:
  # Calls the repository-local _unittest.yml reusable workflow with the
  # clang12 Docker image as its only input.
  uses: ./.github/workflows/_unittest.yml
  with:
    docker-image: executorch-ubuntu-22.04-clang12
unittest-arm:
  # Runs the Arm backend pytest suite (backends/arm/test) with coverage on the
  # arm-sdk image. Unlike the other jobs, this one uses linux_job_v2.yml.
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-arm-sdk
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      set -eux

      # The generic Linux job chooses to use base env, not the one setup by the image
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      BUILD_TOOL="cmake"

      # Set up ExecuTorch with the pybind and Arm bare-metal build flags enabled.
      # Invoked through `bash`, matching every other setup-linux.sh call in this
      # workflow, so the step does not depend on the script's executable bit.
      PYTHON_EXECUTABLE=python \
      EXECUTORCH_BUILD_PYBIND=ON \
      EXECUTORCH_BUILD_ARM_BAREMETAL=ON \
      bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"

      # utils.sh is sourced first; install_arm is presumably defined there.
      source .ci/scripts/utils.sh
      # Install Arm dependencies
      install_arm

      # Run pytest with coverage
      pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test
test-llama-runner-qnn-linux:
  # Runs the llama test script for stories110M in qnn mode on the qnn-sdk
  # image, once per pt2e quantization setting.
  name: test-llama-runner-qnn-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    matrix:
      dtype: [fp32]
      pt2e_quantize: [qnn_16a16w, qnn_8a8w]
      mode: [qnn]
    fail-fast: false
  with:
    runner: linux.2xlarge
    docker-image: executorch-ubuntu-22.04-qnn-sdk
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 900
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # Copy the matrix entry into shell variables.
      DTYPE=${{ matrix.dtype }}
      BUILD_TOOL="cmake"
      MODE=${{ matrix.mode }}
      PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}

      # QNN dependency setup and SDK build scripts run before the ExecuTorch setup.
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
      PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh

      # ExecuTorch setup.
      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
      # Requirements for export_llama.
      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
      # Llama test.
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
test-phi-3-mini-runner-linux:
  # phi-3-mini: installs requirements, then runs the end-to-end test script.
  name: test-phi-3-mini-runner-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.24xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

      # Install with pybind enabled for xnnpack.
      bash install_requirements.sh --pybind xnnpack

      # Model-specific requirements.
      bash examples/models/phi-3-mini/install_requirements.sh

      # End-to-end run (export, tokenizer and runner).
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh
test-eval_llama-wikitext-linux:
  # Installs the llama requirements, then runs the eval_llama wikitext test script.
  name: test-eval_llama-wikitext-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.24xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

      # Install with pybind enabled for xnnpack.
      bash install_requirements.sh --pybind xnnpack

      # Llama requirements.
      bash examples/models/llama/install_requirements.sh

      # eval_llama, wikitext task.
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh
test-eval_llama-mmlu-linux:
  # Installs the llama requirements, then runs the eval_llama mmlu test script.
  name: test-eval_llama-mmlu-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.24xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

      # Install with pybind enabled for xnnpack.
      bash install_requirements.sh --pybind xnnpack

      # Llama requirements.
      bash examples/models/llama/install_requirements.sh

      # eval_llama, mmlu task.
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh
test-llama_runner_eager-linux:
  # Installs the llama requirements, then runs the eager-mode llama runner test script.
  name: test-llama_runner_eager-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.24xlarge
    docker-image: executorch-ubuntu-22.04-clang12
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

      # Install with pybind enabled for xnnpack.
      bash install_requirements.sh --pybind xnnpack

      # Llama requirements.
      bash examples/models/llama/install_requirements.sh

      # Llama runner in eager mode.
      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh
test-mediatek-models-linux:
  # Currently only activates the conda environment on the mediatek-sdk image;
  # no test commands are run yet.
  name: test-mediatek-models-linux
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  strategy:
    fail-fast: false
  with:
    runner: linux.24xlarge
    docker-image: executorch-ubuntu-22.04-mediatek-sdk
    submodules: 'true'
    # PR events test the PR head commit; every other event tests github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      # The generic Linux job starts in conda's base env, so switch to the last
      # environment conda lists instead.
      CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
      conda activate "${CONDA_ENV}"

      # placeholder for mediatek to add more tests