[CI] Add CI workflow to run compute-benchmarks on incoming syclos PRs (#14454)

This PR:
- adds a "benchmark" mode to sycl-linux-run-tests.yml, which benchmarks a
  given SYCL branch/build using
  [compute-benchmarks](https://github.com/intel/compute-benchmarks/)
- stores benchmark results in a git repo, and
- aggregates benchmark results to produce a median, which is used to pass
  or fail the benchmark workflow

The current plan is to enable this benchmark to run nightly in order to
catch regressions, although there is potential for this workflow to be
used in precommit. As a result, many components in this workflow are
either separate reusable components or written directly with precommit
in mind. The current benchmarking workflow functions as follows:
1. An "aggregate" workflow is run, which aggregates historic benchmark
results in the aforementioned git repo and produces a historical median
- This calls upon aggregate.py to handle the computational
heavy-lifting
2. The core benchmarking workflow is run:
- This calls upon benchmark.sh, which handles the logic for building and
running compute-benchmarks
- Then, compare.py is called to compare the new benchmark data against
the historical median generated prior
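The aggregate-then-compare flow described above can be sketched in Python. This is an illustrative sketch only, not the actual aggregate.py/compare.py API; the names `aggregate_median` and `within_tolerance` are hypothetical:

```python
# Hypothetical sketch of the aggregate-then-compare flow: compute a
# historical median from prior runs, then pass/fail a new result based
# on its deviation from that median.
from statistics import median

def aggregate_median(samples: list[float]) -> float:
    """Produce the historical median from prior benchmark samples."""
    return median(samples)

def within_tolerance(result: float, hist_median: float, tolerance: float) -> bool:
    """Pass iff the new result deviates from the historical median by
    at most `tolerance` (a fraction, e.g. 0.5 means 50%)."""
    return abs(result - hist_median) <= tolerance * hist_median

historical = [10.2, 9.8, 10.0, 10.4, 9.9]  # prior runs of one metric
m = aggregate_median(historical)
print(m)                                   # 10.0
print(within_tolerance(10.3, m, 0.5))      # True: within 50% of median
print(within_tolerance(16.0, m, 0.5))      # False: flagged as regression
```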

The workflows are fully configurable via benchmark-ci.conf; enabled
compute-benchmarks tests can be configured via enabled_tests.conf.

Feel free to test out the workflow via manual dispatches of
sycl-linux-run-tests.yml on branch benchmarking-workflow, but be aware
that the run will currently always fail, as GitHub repository secrets
are not yet added.

---------

Co-authored-by: aelovikov-intel <[email protected]>
ianayl and aelovikov-intel authored Feb 21, 2025
1 parent 8a9e847 commit 5250c0e
Showing 13 changed files with 1,237 additions and 1 deletion.
52 changes: 52 additions & 0 deletions .github/workflows/sycl-benchmark-aggregate.yml
@@ -0,0 +1,52 @@
name: Aggregate compute-benchmark averages from historical data

# The benchmarking workflow in sycl-linux-run-tests.yml passes or fails based on
# how the benchmark results compare to a historical average: This historical
# average is calculated in this workflow, which aggregates historical data and
# produces measures of central tendency (median in this case) used for this
# purpose.

on:
workflow_dispatch:
inputs:
lookback_days:
description: |
Number of days from today to look back in historical results.
This sets the age limit of data used in the average calculation: any
benchmark results created more than `lookback_days` before today are
excluded from the historical average.
type: number
required: true
workflow_call:
inputs:
lookback_days:
type: number
required: true
secrets:
LLVM_SYCL_BENCHMARK_TOKEN:
description: |
GitHub token used by the faceless account to push newly calculated
medians.
required: true


permissions:
contents: read

jobs:
aggregate:
name: Aggregate average (median) value for all metrics
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: |
devops/scripts/benchmarking
devops/benchmarking
devops/actions/benchmarking
- name: Aggregate benchmark results and produce historical average
uses: ./devops/actions/benchmarking/aggregate
with:
lookback_days: ${{ inputs.lookback_days }}
env:
GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}
12 changes: 11 additions & 1 deletion .github/workflows/sycl-linux-run-tests.yml
@@ -25,7 +25,7 @@ on:
required: False
tests_selector:
description: |
Two possible options: "e2e" and "cts".
Three possible options: "e2e", "cts", and "compute-benchmarks".
type: string
default: "e2e"

@@ -152,6 +152,7 @@ on:
options:
- e2e
- cts
- compute-benchmarks

env:
description: |
@@ -314,3 +315,12 @@ jobs:
sycl_cts_artifact: ${{ inputs.sycl_cts_artifact }}
target_devices: ${{ inputs.target_devices }}
retention-days: ${{ inputs.retention-days }}

- name: Run compute-benchmarks on SYCL
if: inputs.tests_selector == 'compute-benchmarks'
uses: ./devops/actions/run-tests/benchmark
with:
target_devices: ${{ inputs.target_devices }}
env:
RUNNER_TAG: ${{ inputs.runner }}
GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}
40 changes: 40 additions & 0 deletions .github/workflows/sycl-nightly.yml
@@ -243,6 +243,46 @@ jobs:
sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
sycl_cts_artifact: sycl_cts_bin

aggregate_benchmark_results:
if: always() && !cancelled()
name: Aggregate benchmark results and produce historical averages
uses: ./.github/workflows/sycl-benchmark-aggregate.yml
secrets:
LLVM_SYCL_BENCHMARK_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}
with:
lookback_days: 100

run-sycl-benchmarks:
needs: [ubuntu2204_build, aggregate_benchmark_results]
if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }}
strategy:
fail-fast: false
matrix:
include:
- name: Run compute-benchmarks on L0 Gen12
runner: '["Linux", "gen12"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true
- name: Run compute-benchmarks on L0 PVC
runner: '["Linux", "pvc"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: false
uses: ./.github/workflows/sycl-linux-run-tests.yml
secrets: inherit
with:
name: ${{ matrix.name }}
runner: ${{ matrix.runner }}
image_options: ${{ matrix.image_options }}
target_devices: ${{ matrix.target_devices }}
tests_selector: compute-benchmarks
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
ref: ${{ github.sha }}
sycl_toolchain_artifact: sycl_linux_default
sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}

nightly_build_upload:
name: Nightly Build Upload
if: ${{ github.ref_name == 'sycl' }}
95 changes: 95 additions & 0 deletions devops/actions/benchmarking/aggregate/action.yml
@@ -0,0 +1,95 @@
name: 'Aggregate compute-benchmark results and produce historical averages'

# The benchmarking workflow in sycl-linux-run-tests.yml passes or fails based on
# how the benchmark results compare to a historical average: This historical
# average is calculated in this composite workflow, which aggregates historical
# data and produces measures of central tendency (median in this case) used for
# this purpose.
#
# This action assumes that /devops has been checked out in ./devops. This action
# also assumes that GITHUB_TOKEN was properly set in env, because according to
# GitHub, that's the recommended way to pass a secret into a GitHub
# action:
#
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions#accessing-your-secrets
#

inputs:
lookback_days:
type: number
required: true

runs:
using: "composite"
steps:
- name: Obtain oldest timestamp allowed for data in aggregation
shell: bash
run: |
# DO NOT use inputs.lookback_days directly, only use SANITIZED_TIMESTAMP.
SANITIZED_LOOKBACK_DAYS="$(echo '${{ inputs.lookback_days }}' | grep -oE '^[0-9]+$')"
if [ -z "$SANITIZED_LOOKBACK_DAYS" ]; then
echo "Please ensure inputs.lookback_days is a number."
exit 1
fi
SANITIZED_TIMESTAMP="$(date -d "$SANITIZED_LOOKBACK_DAYS days ago" +%Y%m%d_%H%M%S)"
if [ -z "$(echo "$SANITIZED_TIMESTAMP" | grep -oE '^[0-9]{8}_[0-9]{6}$' )" ]; then
echo "Invalid timestamp generated: is inputs.lookback_days valid?"
exit 1
fi
echo "SANITIZED_TIMESTAMP=$SANITIZED_TIMESTAMP" >> $GITHUB_ENV
- name: Load benchmarking configuration
shell: bash
run: |
$(python ./devops/scripts/benchmarking/load_config.py ./devops constants)
echo "SANITIZED_PERF_RES_GIT_REPO=$SANITIZED_PERF_RES_GIT_REPO" >> $GITHUB_ENV
echo "SANITIZED_PERF_RES_GIT_BRANCH=$SANITIZED_PERF_RES_GIT_BRANCH" >> $GITHUB_ENV
- name: Checkout historical performance results repository
shell: bash
run: |
if [ ! -d ./llvm-ci-perf-results ]; then
git clone -b "$SANITIZED_PERF_RES_GIT_BRANCH" "https://github.com/$SANITIZED_PERF_RES_GIT_REPO" ./llvm-ci-perf-results
fi
- name: Run aggregator on historical results
shell: bash
run: |
# The current format of the historical results repository is:
#
# /<ONEAPI_DEVICE_SELECTOR>/<runner>/<test name>
#
# Thus, a min/max depth of 3 is used to enumerate all test cases in the
# repository. Test name is also derived from here.
find ./llvm-ci-perf-results -mindepth 3 -maxdepth 3 -type d ! -path '*.git*' |
while read -r dir; do
test_name="$(basename "$dir")"
python ./devops/scripts/benchmarking/aggregate.py ./devops "$test_name" "$dir" "$SANITIZED_TIMESTAMP"
done
- name: Upload average to the repo
shell: bash
run: |
cd ./llvm-ci-perf-results
git config user.name "SYCL Benchmarking Bot"
git config user.email "[email protected]"
git pull
# Make sure changes have been made
if git diff --quiet && git diff --cached --quiet; then
echo "No changes to median, skipping push."
else
git add .
git commit -m "[GHA] Aggregate median data from $SANITIZED_TIMESTAMP to $(date +%Y%m%d_%H%M%S)"
git push "https://[email protected]/$SANITIZED_PERF_RES_GIT_REPO.git" "$SANITIZED_PERF_RES_GIT_BRANCH"
fi
- name: Find aggregated average results artifact here
if: always()
shell: bash
run: |
cat << EOF
#
# Artifact link for aggregated averages here:
#
EOF
- name: Archive new medians
if: always()
uses: actions/upload-artifact@v4
with:
name: llvm-ci-perf-results new medians
path: ./llvm-ci-perf-results/**/*-median.csv
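The action above derives a cutoff timestamp from `lookback_days` (via `date -d "... days ago" +%Y%m%d_%H%M%S`) and only aggregates results newer than that cutoff. A minimal Python equivalent, assuming the timestamp format shown in the action; `filter_results` is a hypothetical helper, not part of aggregate.py:

```python
# Sketch of the lookback cutoff: results older than `lookback_days` are
# excluded from the historical median.
from datetime import datetime, timedelta

def cutoff_timestamp(lookback_days: int, now: datetime) -> str:
    """Oldest allowed result timestamp, in the workflow's
    YYYYMMDD_HHMMSS format."""
    return (now - timedelta(days=lookback_days)).strftime("%Y%m%d_%H%M%S")

def filter_results(timestamps: list[str], cutoff: str) -> list[str]:
    """Keep only results at or after the cutoff. The YYYYMMDD_HHMMSS
    format sorts lexicographically, so string comparison suffices."""
    return [t for t in timestamps if t >= cutoff]

now = datetime(2025, 2, 21, 12, 0, 0)
cut = cutoff_timestamp(7, now)
print(cut)  # 20250214_120000
print(filter_results(["20250210_090000", "20250220_090000"], cut))
# ['20250220_090000']
```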
107 changes: 107 additions & 0 deletions devops/actions/run-tests/benchmark/action.yml
@@ -0,0 +1,107 @@
name: 'Run compute-benchmarks'

# Run compute-benchmarks on SYCL
#
# This action assumes SYCL is in ./toolchain, and that /devops has been
# checked out in ./devops. This action also assumes that GITHUB_TOKEN
# was properly set in env, because according to GitHub, that's the
# recommended way to pass a secret into a GitHub action:
#
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions#accessing-your-secrets
#
# This action also expects a RUNNER_TAG environment variable to be set to the
# runner tag used to run this workflow: Currently, only gen12 and pvc on Linux
# are fully supported. Although this workflow won't stop you from running on
# other devices, note that only gen12 and pvc have been tested to work.
#

inputs:
target_devices:
type: string
required: True

runs:
using: "composite"
steps:
- name: Check specified runner type / target backend
shell: bash
env:
TARGET_DEVICE: ${{ inputs.target_devices }}
run: |
case "$RUNNER_TAG" in
'["Linux", "gen12"]' | '["Linux", "pvc"]') ;;
*)
echo "#"
echo "# WARNING: Only gen12/pvc on Linux is fully supported."
echo "# This workflow is not guaranteed to work with other runners."
echo "#" ;;
esac
# inputs.target_devices is not used directly, as that would allow code injection
case "$TARGET_DEVICE" in
level_zero:*) ;;
*)
echo "#"
echo "# WARNING: Only level_zero backend is fully supported."
echo "# This workflow is not guaranteed to work with other backends."
echo "#" ;;
esac
- name: Run compute-benchmarks
shell: bash
run: |
cat << EOF
#
# NOTE TO DEVELOPERS:
#
Check later steps of the workflow: This job produces an artifact with:
- benchmark results from passing/failing tests
- log containing all failing (too slow) benchmarks
- log containing all erroring benchmarks
While this step in the workflow provides debugging output describing this
information, it might be easier to inspect the logs from the artifact
instead.
EOF
export ONEAPI_DEVICE_SELECTOR="${{ inputs.target_devices }}"
export CMPLR_ROOT=./toolchain
echo "-----"
sycl-ls
echo "-----"
./devops/scripts/benchmarking/benchmark.sh -n '${{ runner.name }}' -s || exit 1
- name: Push compute-benchmarks results
if: always()
shell: bash
run: |
# TODO -- waiting on security clearance
# Load configuration values
$(python ./devops/scripts/benchmarking/load_config.py ./devops constants)
cd "./llvm-ci-perf-results"
git config user.name "SYCL Benchmarking Bot"
git config user.email "[email protected]"
git pull
git add .
# Make sure changes have been made
if git diff --quiet && git diff --cached --quiet; then
echo "No new results added, skipping push."
else
git commit -m "[GHA] Upload compute-benchmarks results from https://github.com/intel/llvm/actions/runs/${{ github.run_id }}"
git push "https://[email protected]/$SANITIZED_PERF_RES_GIT_REPO.git" "$SANITIZED_PERF_RES_GIT_BRANCH"
fi
- name: Find benchmark result artifact here
if: always()
shell: bash
run: |
cat << EOF
#
# Artifact link for benchmark results here:
#
EOF
- name: Archive compute-benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: Compute-benchmark run ${{ github.run_id }} (${{ runner.name }})
path: ./artifact
44 changes: 44 additions & 0 deletions devops/benchmarking/config.ini
@@ -0,0 +1,44 @@
;
; This file contains configuration options to change the behaviour of the
; benchmarking workflow in sycl-linux-run-tests.yml.
;
; DO NOT USE THE CONTENTS OF THIS FILE DIRECTLY -- Due to security concerns, the
; contents of this file must be sanitized before use.
; See: /devops/scripts/benchmarking/common.py
;

; Compute-benchmark compile/run options
[compute_bench]
; Value for -j during compilation of compute-benchmarks
compile_jobs = 2
; Number of iterations to run compute-benchmark tests
iterations = 100

; Options for benchmark result metrics (to record/compare against)
[metrics]
; Sets the metrics to record/aggregate in the historical average.
; Format: comma-separated list of column names in compute-benchmark results
recorded = Median,StdDev
; Sets the tolerance for each recorded metric and their allowed deviation from
; the historical average. Metrics not included here are not compared against
; when passing/failing benchmark results.
; Format: comma-separated list of <metric>:<deviation percentage in decimals>
tolerances = Median:0.5

; Options for computing historical averages
[average]
; Number of days (from today) to look back for results when computing historical
; average
cutoff_range = 7
; Minimum number of samples required to compute a historical average
min_threshold = 3

; ONEAPI_DEVICE_SELECTOR linting/options
[device_selector]
; Backends to allow in device_selector
enabled_backends = level_zero,opencl,cuda,hip
; native_cpu is disabled

; Devices to allow in device_selector
enabled_devices = cpu,gpu
; fpga is disabled
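The config above uses standard ini syntax, so its sections can be read with Python's stdlib configparser. A hedged sketch of consuming the `[metrics]` section; the `Median:0.5` tolerance format is taken from the file itself, while the helper name `parse_tolerances` is illustrative (the real parsing lives in the devops scripts):

```python
# Sketch: reading the [metrics] section of a config.ini like the one
# above, and parsing the <metric>:<deviation> tolerance entries.
import configparser

CONFIG_TEXT = """
[metrics]
recorded = Median,StdDev
tolerances = Median:0.5
"""

def parse_tolerances(raw: str) -> dict[str, float]:
    """Turn 'Median:0.5,StdDev:0.2' into {'Median': 0.5, 'StdDev': 0.2}."""
    out = {}
    for entry in raw.split(","):
        metric, _, deviation = entry.partition(":")
        out[metric.strip()] = float(deviation)
    return out

cfg = configparser.ConfigParser()
cfg.read_string(CONFIG_TEXT)
recorded = [m.strip() for m in cfg["metrics"]["recorded"].split(",")]
tolerances = parse_tolerances(cfg["metrics"]["tolerances"])
print(recorded)    # ['Median', 'StdDev']
print(tolerances)  # {'Median': 0.5}
```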
