From b6e4cfc096d22ca79371f3336d0a013c30cac524 Mon Sep 17 00:00:00 2001 From: Lars Eggert Date: Fri, 7 Feb 2025 12:30:13 +0200 Subject: [PATCH] ci: Add delta to `main` to bench table (#2426) * ci: Add delta to `main` to bench table WIP * Again * Delta * Fix * More * More * More * More * Again * Again * More * Again * Again * Again * Again * Again * Again * Again * Minimize diff --- .github/workflows/bench.yml | 110 ++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 22 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index d73ab48ab5..4e80e34354 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -9,7 +9,7 @@ env: CARGO_PROFILE_BENCH_BUILD_OVERRIDE_DEBUG: true CARGO_PROFILE_RELEASE_DEBUG: true TOOLCHAIN: stable - PERF_OPT: record -F997 --call-graph fp -g + PERF_OPT: record -F4999 --call-graph fp -g SCCACHE_CACHE_SIZE: 128G SCCACHE_DIRECT: true MTU: 1504 # https://github.com/microsoft/msquic/issues/4618 @@ -113,24 +113,28 @@ jobs: # both a client and a server, thus benefiting from multiple CPU cores. # # Run all benchmarks at elevated priority. - taskset -c 0 nice -n -20 cargo "+$TOOLCHAIN" bench --workspace --exclude neqo-bin --features bench -- --noplot | tee results.txt + taskset -c 0 nice -n -20 setarch --addr-no-randomize \ + cargo "+$TOOLCHAIN" bench --workspace --exclude neqo-bin --features bench -- --noplot | tee results.txt sudo ip link set dev lo mtu "$MTU" - nice -n -20 cargo "+$TOOLCHAIN" bench --package neqo-bin --features bench -- --noplot | tee -a results.txt + nice -n -20 setarch --addr-no-randomize \ + cargo "+$TOOLCHAIN" bench --package neqo-bin --features bench -- --noplot | tee -a results.txt - # Compare various configurations of neqo against msquic, and gather perf data + # Compare various configurations of neqo against msquic and google/quiche, and gather perf data # during the hyperfine runs. - name: Compare neqo, msquic and google/quiche env: HOST: 127.0.0.1 PORT: 4433 SIZE: 33554432 # 32 MB - RUNS: 20 + RUNS: 30 run: | TMP=$(mktemp -d) # Make a cert and key for msquic and google. openssl req -nodes -new -x509 -keyout "$TMP/key" -out "$TMP/cert" -subj "/CN=DOMAIN" 2>/dev/null - # Make a test file for msquic to serve. + # Make test files for msquic to serve. truncate -s "$SIZE" "$TMP/$SIZE" + BIGSIZE=$(bc -l <<< "$SIZE * $RUNS") + truncate -s "$BIGSIZE" "$TMP/$BIGSIZE" # Define the commands to run for each client and server. declare -A client_cmd=( ["neqo"]="target/release/neqo-client _cc _pacing --output-dir . _flags -Q 1 https://$HOST:$PORT/$SIZE" @@ -170,6 +174,18 @@ jobs: CMD=${CMD//_flags/$flags} } + # A Welch's t-test to determine if a performance change is statistically significant. + # We use this later to highlight significant changes in the results. + cat < welch.R + args <- commandArgs(trailingOnly = TRUE) + baseline <- scan(args[1], what = numeric()) + result <- scan(args[2], what = numeric()) + t_result <- t.test(baseline, result, alternative = "two.sided") + p_value <- t_result\$p.value + alpha <- 0.05 + quit(status = as.integer(p_value < alpha)) + EOF + # See https://github.com/microsoft/msquic/issues/4618#issuecomment-2422611592 sudo ip link set dev lo mtu "$MTU" for server in neqo google msquic; do @@ -204,32 +220,78 @@ jobs: transmogrify "${server_cmd[$server]}" "$cc" "$pacing" "${neqo_flags[$client]}" FILENAME="$client-$server$EXT" # shellcheck disable=SC2086 - taskset -c 0 nice -n -20 perf $PERF_OPT -o "$FILENAME.server.perf" $CMD & + taskset -c 0 nice -n -20 setarch --addr-no-randomize \ + perf $PERF_OPT -o "$FILENAME.server.perf" $CMD & PID=$! transmogrify "${client_cmd[$client]}" "$cc" "$pacing" "${neqo_flags[$server]}" # shellcheck disable=SC2086 - taskset -c 1 nice -n -20 \ - perf $PERF_OPT -o "$FILENAME.client.perf" \ - hyperfine --command-name "$TAG" --time-unit millisecond \ - --export-json "hyperfine/$FILENAME.json" \ - --export-markdown "hyperfine/$FILENAME.md" \ - --output null --warmup 3 --runs $RUNS --prepare "sleep 1" "$CMD" | - tee -a comparison.txt + taskset -c 1 nice -n -20 setarch --addr-no-randomize \ + hyperfine --command-name "$TAG" --time-unit millisecond \ + --export-json "hyperfine/$FILENAME.json" \ + --export-markdown "hyperfine/$FILENAME.md" \ + --output null --warmup 5 --runs $RUNS --prepare "sleep 1" "$CMD" | + tee -a comparison.txt echo >> comparison.txt - kill $PID - cat "hyperfine/$FILENAME.md" >> steps.md # Sanity check the size of the last retrieved file. # google/quiche outputs the HTTP header, too, so we can't just check for -eq. [ "$(wc -c <"$SIZE")" -ge "$SIZE" ] || exit 1 + + # Do a longer client run with perf separately. We used to just wrap the hyperfine command above in perf, + # but that uses different processes for the individual runs, and there is apparently no way to merge + # the perf profiles of those different runs. + CMD=${CMD//$SIZE/$BIGSIZE} + # shellcheck disable=SC2086 + taskset -c 1 nice -n -20 setarch --addr-no-randomize \ + perf $PERF_OPT -o "$FILENAME.client.perf" $CMD > /dev/null 2>&1 + kill $PID + + # Figure out if any performance difference to `main` is statistically relevant, and indicate that. + BASELINE="hyperfine-main/$FILENAME.json" + RESULT="hyperfine/$FILENAME.json" + BASELINE_MEAN=$(jq -r '.results[0].mean' "$BASELINE") + MEAN=$(jq -r '.results[0].mean' "$RESULT") + # Even though we tell hyperfine to use milliseconds, it still outputs in seconds when dumping to JSON. + DELTA=$(bc -l <<< "($MEAN - $BASELINE_MEAN) * 1000") + PERCENT=$(bc -l <<< "sqrt(($MEAN - $BASELINE_MEAN)^2) / ($BASELINE_MEAN + $MEAN)/2 * 100") + echo "Baseline: $BASELINE_MEAN, Result: $MEAN, Delta: $DELTA, Percent: $PERCENT" + + # If a performance change is statistically significant, highlight it. + jq -r '.results[0].times[]' "$BASELINE" > baseline.txt + jq -r '.results[0].times[]' "$RESULT" > result.txt + if Rscript welch.R baseline.txt result.txt 2> /dev/null; then + if (( $(bc -l <<< "$DELTA > 0") )); then + echo "Performance has regressed: $BASELINE_MEAN -> $MEAN" + SYMBOL=":broken_heart:" + FORMAT='**' + else + echo "Performance has improved: $BASELINE_MEAN -> $MEAN" + SYMBOL=":green_heart:" + FORMAT='**' + fi + else + echo "No statistically significant change: $BASELINE_MEAN -> $MEAN" + fi + + { + grep -Ev '^\|(:| Command)' < "hyperfine/$FILENAME.md" | \ + sed -E 's/`//g; s/,/ \| /g;' | cut -f1-8 -d\| | tr -d '\n' + printf "| %s %s%.1f%s | %s%.1f%%%s |\n" \ + "$SYMBOL" "$FORMAT" "$DELTA" "$FORMAT" "$FORMAT" "$PERCENT" "$FORMAT" + } >> steps.md done done done done - # Merge the results tables generated by hyperfine into a single table. - echo "Transfer of $SIZE bytes over loopback, $RUNS runs." > comparison.md - awk '(!/^\| Command/ || !c++) && (!/^\|:/ || !d++)' < steps.md |\ - sed -E 's/`//g; s/^\|:/\|:---\|:---\|:---\|/g; s/,/ \| /g; s/^\| Command/\| Client \| Server \| CC \| Pace/g' |\ - cut -f1-8 -d\| | sed -e 's/$/|/' >> comparison.md + + # Make a single results table. + { + echo "Transfer of $SIZE bytes over loopback, $RUNS runs. All unit-less numbers are in milliseconds." + echo + # shellcheck disable=SC2016 + echo '| Client | Server | CC | Pacing | Mean ± σ | Min | Max | Δ `main` | Δ `main` |' + echo '|:---|:---|:---|---|---:|---:|---:|---:|---:|' + cat steps.md + } > comparison.md rm -r "$TMP" # Re-enable turboboost, hyperthreading and use powersave governor. @@ -265,7 +327,7 @@ jobs: { echo "Performance differences relative to $SHA." echo - } >> results.md + } | tee sha.md >> results.md fi sed -E -e 's/^ //gi' \ -e 's/((change|time|thrpt):[^%]*% )([^%]*%)(.*)/\1\3<\/b>\4/gi' results.txt |\ @@ -277,6 +339,10 @@ jobs: { echo echo "### Client/server transfer results" + SHA=$(cat target/criterion/baseline-sha.txt || true) + if [ -n "$SHA" ]; then + cat sha.md >> results.md + fi cat comparison.md } >> results.md cat results.md > "$GITHUB_STEP_SUMMARY"