[CI] Tune nightly benchmarking job for better reliability (#17122)

ianayl · sarnex · web-flow · commit e38db3a203b1 · 2025-03-13T14:49:35.000Z
This PR tunes the nightly benchmarking job to produce more consistent results:

- Lowers the tolerance threshold for accepted benchmarking results from 50% to 8%.
  - Nightly was flaking before, even with a 50% tolerance threshold.
- Raises the number of iterations to 5000.
  - Using 10,000 iterations did not result in significantly more stable performance, although this may change as we obtain more data.
  - However, the PVC benchmarking job in the overall nightly workflow now takes ~47 minutes, whereas before it took ~14 minutes.
  - This should not have a major impact on execution time, however, considering the E2E tests take ~42 minutes: since both of these jobs run in parallel on different machines, the theoretical effect on the overall workflow should only be about 5 minutes, although this depends on whether machines can be scheduled in time.
- Changes the benchmarking workflows in sycl-nightly.yml to use the tuned PVC_PERF runner.
  - Untuned machines exhibit large variations when running compute-benchmarks (20-25%, up to 50% in the worst case); such variations are unacceptable and make the results not particularly useful.
- Disables nightly benchmarking on gen12.
  - Gen12 machines are currently untuned. As with untuned PVC machines, their results are not accurate and not worth serious nightly benchmarking.
- Adds guards for benchmarking jobs to prevent benchmark runs in forks (see #14454 (comment)).

---------

Co-authored-by: Nick Sarnie <nick.sarnie@intel.com>
diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml
@@ -126,6 +126,7 @@ on:
           - '["cts-cpu"]'
           - '["Linux", "build"]'
           - '["cuda"]'
+          - '["PVC_PERF"]'
       image:
         type: choice
         options:
@@ -170,17 +171,14 @@ on:
           Extra options to be added to LIT_OPTS.
         default: ''
 
-      install_igc_driver:
+      reset_intel_gpu:
+        description: |
+          Reset Intel GPUs
         type: choice
         options:
           - false
           - true
 
-      install_dev_igc_driver:
-        type: choice
-        options:
-          - false
-          - true
       e2e_testing_mode:
         type: choice
         options:
diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml
@@ -247,7 +247,7 @@ jobs:
       sycl_cts_artifact: sycl_cts_bin
 
   aggregate_benchmark_results:
-    if: always() && !cancelled()
+    if: github.repository == 'intel/llvm' && !cancelled()
     name: Aggregate benchmark results and produce historical averages
     uses: ./.github/workflows/sycl-benchmark-aggregate.yml
     secrets:
@@ -262,13 +262,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: Run compute-benchmarks on L0 Gen12
-            runner: '["Linux", "gen12"]'
-            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
-            target_devices: level_zero:gpu
-            reset_intel_gpu: true
           - name: Run compute-benchmarks on L0 PVC
-            runner: '["Linux", "pvc"]'
+            runner: '["PVC_PERF"]'
             image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
             target_devices: level_zero:gpu
             reset_intel_gpu: true
diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml
@@ -46,6 +46,27 @@ runs:
           echo "# This workflow is not guaranteed to work with other backends."
           echo "#" ;;
       esac
+  - name: Compute CPU core range to run benchmarks on
+    shell: bash
+    run: |
+      # Taken from ur-benchmark-reusable.yml:
+
+      # Compute the core range for the first NUMA node; second node is used by
+      # UMF. Skip the first 4 cores as the kernel is likely to schedule more
+      # work on these.
+      CORES="$(lscpu | awk '
+        /NUMA node0 CPU|On-line CPU/ {line=$0}
+        END {
+          split(line, a, " ")
+          split(a[4], b, ",")
+          sub(/^0/, "4", b[1])
+          print b[1]
+        }')"
+      echo "CPU core range to use: $CORES"
+      echo "CORES=$CORES" >> $GITHUB_ENV
+
+      ZE_AFFINITY_MASK=0
+      echo "ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK" >> $GITHUB_ENV
   - name: Run compute-benchmarks
     shell: bash
     run: |
@@ -69,7 +90,7 @@ runs:
       echo "-----"
       sycl-ls
       echo "-----"
-      ./devops/scripts/benchmarking/benchmark.sh -n '${{ runner.name }}' -s || exit 1
+      taskset -c "$CORES" ./devops/scripts/benchmarking/benchmark.sh -n '${{ runner.name }}' -s || exit 1
   - name: Push compute-benchmarks results
     if: always()
     shell: bash
diff --git a/devops/benchmarking/config.ini b/devops/benchmarking/config.ini
@@ -10,9 +10,9 @@
 ; Compute-benchmark compile/run options
 [compute_bench]
 ; Value for -j during compilation of compute-benchmarks
-compile_jobs = 2
+compile_jobs = 40
 ; Number of iterations to run compute-benchmark tests
-iterations = 100
+iterations = 5000
 
 ; Options for benchmark result metrics (to record/compare against)
 [metrics]
@@ -23,15 +23,15 @@ recorded = Median,StdDev
 ; the historical average. Metrics not included here are not compared against
 ; when passing/failing benchmark results.
 ; Format: comma-separated list of <metric>:<deviation percentage in decimals>
-tolerances = Median:0.5
+tolerances = Median:0.08
 
 ; Options for computing historical averages
 [average]
 ; Number of days (from today) to look back for results when computing historical
 ; average 
 cutoff_range = 7
 ; Minimum number of samples required to compute a historical average
-min_threshold = 3
+min_threshold = 10
 
 ; ONEAPI_DEVICE_SELECTOR linting/options
 [device_selector]