From b6e4cfc096d22ca79371f3336d0a013c30cac524 Mon Sep 17 00:00:00 2001
From: Lars Eggert <lars@eggert.org>
Date: Fri, 7 Feb 2025 12:30:13 +0200
Subject: [PATCH] ci: Add delta to `main` to bench table (#2426)

* ci: Add delta to `main` to bench table

WIP

* Again

* Delta

* Fix

* More

* More

* More

* More

* Again

* Again

* More

* Again

* Again

* Again

* Again

* Again

* Again

* Again

* Minimize diff
---
 .github/workflows/bench.yml | 110 ++++++++++++++++++++++++++++--------
 1 file changed, 88 insertions(+), 22 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index d73ab48ab5..4e80e34354 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -9,7 +9,7 @@ env:
   CARGO_PROFILE_BENCH_BUILD_OVERRIDE_DEBUG: true
   CARGO_PROFILE_RELEASE_DEBUG: true
   TOOLCHAIN: stable
-  PERF_OPT: record -F997 --call-graph fp -g
+  PERF_OPT: record -F4999 --call-graph fp -g
   SCCACHE_CACHE_SIZE: 128G
   SCCACHE_DIRECT: true
   MTU: 1504 # https://github.com/microsoft/msquic/issues/4618
@@ -113,24 +113,28 @@ jobs:
           # both a client and a server, thus benefiting from multiple CPU cores.
           #
           # Run all benchmarks at elevated priority.
-          taskset -c 0 nice -n -20 cargo "+$TOOLCHAIN" bench --workspace --exclude neqo-bin --features bench -- --noplot | tee results.txt
+          taskset -c 0 nice -n -20 setarch --addr-no-randomize \
+            cargo "+$TOOLCHAIN" bench --workspace --exclude neqo-bin --features bench -- --noplot | tee results.txt
           sudo ip link set dev lo mtu "$MTU"
-          nice -n -20 cargo "+$TOOLCHAIN" bench --package neqo-bin --features bench -- --noplot | tee -a results.txt
+          nice -n -20 setarch --addr-no-randomize \
+            cargo "+$TOOLCHAIN" bench --package neqo-bin --features bench -- --noplot | tee -a results.txt
 
-      # Compare various configurations of neqo against msquic, and gather perf data
+      # Compare various configurations of neqo against msquic and google/quiche, and gather perf data
       # during the hyperfine runs.
       - name: Compare neqo, msquic and google/quiche
         env:
           HOST: 127.0.0.1
           PORT: 4433
           SIZE: 33554432 # 32 MB
-          RUNS: 20
+          RUNS: 30
         run: |
           TMP=$(mktemp -d)
           # Make a cert and key for msquic and google.
           openssl req -nodes -new -x509 -keyout "$TMP/key" -out "$TMP/cert" -subj "/CN=DOMAIN" 2>/dev/null
-          # Make a test file for msquic to serve.
+          # Make test files for msquic to serve.
           truncate -s "$SIZE" "$TMP/$SIZE"
+          BIGSIZE=$(bc -l <<< "$SIZE * $RUNS")
+          truncate -s "$BIGSIZE" "$TMP/$BIGSIZE"
           # Define the commands to run for each client and server.
           declare -A client_cmd=(
             ["neqo"]="target/release/neqo-client _cc _pacing --output-dir . _flags -Q 1 https://$HOST:$PORT/$SIZE"
@@ -170,6 +174,18 @@ jobs:
             CMD=${CMD//_flags/$flags}
           }
 
+          # A Welch's t-test to determine if a performance change is statistically significant.
+          # We use this later to highlight significant changes in the results.
+          cat <<EOF > welch.R
+            args <- commandArgs(trailingOnly = TRUE)
+            baseline <- scan(args[1], what = numeric())
+            result <- scan(args[2], what = numeric())
+            t_result <- t.test(baseline, result, alternative = "two.sided")
+            p_value <- t_result\$p.value
+            alpha <- 0.05
+            quit(status = as.integer(p_value < alpha))
+          EOF
+
           # See https://github.com/microsoft/msquic/issues/4618#issuecomment-2422611592
           sudo ip link set dev lo mtu "$MTU"
           for server in neqo google msquic; do
@@ -204,32 +220,78 @@ jobs:
                   transmogrify "${server_cmd[$server]}" "$cc" "$pacing" "${neqo_flags[$client]}"
                   FILENAME="$client-$server$EXT"
                   # shellcheck disable=SC2086
-                  taskset -c 0 nice -n -20 perf $PERF_OPT -o "$FILENAME.server.perf" $CMD &
+                  taskset -c 0 nice -n -20 setarch --addr-no-randomize \
+                    perf $PERF_OPT -o "$FILENAME.server.perf" $CMD &
                   PID=$!
                   transmogrify "${client_cmd[$client]}" "$cc" "$pacing" "${neqo_flags[$server]}"
                   # shellcheck disable=SC2086
-                  taskset -c 1 nice -n -20 \
-                    perf $PERF_OPT -o "$FILENAME.client.perf" \
-                      hyperfine --command-name "$TAG" --time-unit millisecond  \
-                        --export-json "hyperfine/$FILENAME.json" \
-                        --export-markdown "hyperfine/$FILENAME.md" \
-                        --output null --warmup 3 --runs $RUNS --prepare "sleep 1" "$CMD" |
-                      tee -a comparison.txt
+                  taskset -c 1 nice -n -20 setarch --addr-no-randomize \
+                    hyperfine --command-name "$TAG" --time-unit millisecond  \
+                      --export-json "hyperfine/$FILENAME.json" \
+                      --export-markdown "hyperfine/$FILENAME.md" \
+                      --output null --warmup 5 --runs $RUNS --prepare "sleep 1" "$CMD" |
+                    tee -a comparison.txt
                   echo >> comparison.txt
-                  kill $PID
-                  cat "hyperfine/$FILENAME.md" >> steps.md
                   # Sanity check the size of the last retrieved file.
                   # google/quiche outputs the HTTP header, too, so we can't just check for -eq.
                   [ "$(wc -c <"$SIZE")" -ge "$SIZE" ] || exit 1
+
+                  # Do a longer client run with perf separately. We used to just wrap the hyperfine command above in perf,
+                  # but that uses different processes for the individual runs, and there is apparently no way to merge
+                  # the perf profiles of those different runs.
+                  CMD=${CMD//$SIZE/$BIGSIZE}
+                  # shellcheck disable=SC2086
+                  taskset -c 1 nice -n -20 setarch --addr-no-randomize \
+                    perf $PERF_OPT -o "$FILENAME.client.perf" $CMD > /dev/null 2>&1
+                  kill $PID
+
+                  # Figure out if any performance difference to `main` is statistically relevant, and indicate that.
+                  BASELINE="hyperfine-main/$FILENAME.json"
+                  RESULT="hyperfine/$FILENAME.json"
+                  BASELINE_MEAN=$(jq -r '.results[0].mean' "$BASELINE")
+                  MEAN=$(jq -r '.results[0].mean' "$RESULT")
+                  # Even though we tell hyperfine to use milliseconds, it still outputs in seconds when dumping to JSON.
+                  DELTA=$(bc -l <<< "($MEAN - $BASELINE_MEAN) * 1000")
+                  PERCENT=$(bc -l <<< "sqrt(($MEAN - $BASELINE_MEAN)^2) / ($BASELINE_MEAN + $MEAN)/2 * 100")
+                  echo "Baseline: $BASELINE_MEAN, Result: $MEAN, Delta: $DELTA, Percent: $PERCENT"
+
+                  # If a performance change is statistically significant, highlight it.
+                  jq -r '.results[0].times[]' "$BASELINE" > baseline.txt
+                  jq -r '.results[0].times[]' "$RESULT" > result.txt
+                  if Rscript welch.R baseline.txt result.txt 2> /dev/null; then
+                    if (( $(bc -l <<< "$DELTA > 0") )); then
+                      echo "Performance has regressed: $BASELINE_MEAN -> $MEAN"
+                      SYMBOL=":broken_heart:"
+                      FORMAT='**'
+                    else
+                      echo "Performance has improved: $BASELINE_MEAN -> $MEAN"
+                      SYMBOL=":green_heart:"
+                      FORMAT='**'
+                    fi
+                  else
+                    echo "No statistically significant change: $BASELINE_MEAN -> $MEAN"
+                  fi
+
+                  {
+                    grep -Ev '^\|(:| Command)' < "hyperfine/$FILENAME.md" | \
+                      sed -E 's/`//g; s/,/ \| /g;' | cut -f1-8 -d\| | tr -d '\n'
+                    printf "| %s %s%.1f%s | %s%.1f%%%s |\n" \
+                      "$SYMBOL" "$FORMAT" "$DELTA" "$FORMAT" "$FORMAT" "$PERCENT" "$FORMAT"
+                  } >> steps.md
                 done
               done
             done
           done
-          # Merge the results tables generated by hyperfine into a single table.
-          echo "Transfer of $SIZE bytes over loopback, $RUNS runs." > comparison.md
-          awk '(!/^\| Command/ || !c++) && (!/^\|:/ || !d++)' < steps.md |\
-            sed -E 's/`//g; s/^\|:/\|:---\|:---\|:---\|/g; s/,/ \| /g; s/^\| Command/\| Client \| Server \| CC \| Pace/g' |\
-            cut -f1-8 -d\| | sed -e 's/$/|/' >> comparison.md
+
+          # Make a single results table.
+          {
+            echo "Transfer of $SIZE bytes over loopback, $RUNS runs. All unit-less numbers are in milliseconds."
+            echo
+            # shellcheck disable=SC2016
+            echo '| Client | Server | CC | Pacing | Mean ± σ | Min | Max | Δ `main` | Δ `main` |'
+            echo '|:---|:---|:---|---|---:|---:|---:|---:|---:|'
+            cat steps.md
+          } > comparison.md
           rm -r "$TMP"
 
       # Re-enable turboboost, hyperthreading and use powersave governor.
@@ -265,7 +327,7 @@ jobs:
             {
               echo "Performance differences relative to $SHA."
               echo
-            } >> results.md
+            } | tee sha.md >> results.md
           fi
           sed -E -e 's/^                 //gi' \
                  -e 's/((change|time|thrpt):[^%]*% )([^%]*%)(.*)/\1<b>\3<\/b>\4/gi' results.txt |\
@@ -277,6 +339,10 @@ jobs:
           {
             echo
             echo "### Client/server transfer results"
+            SHA=$(cat target/criterion/baseline-sha.txt || true)
+            if [ -n "$SHA" ]; then
+              cat sha.md >> results.md
+            fi
             cat comparison.md
           } >> results.md
           cat results.md > "$GITHUB_STEP_SUMMARY"