|
| 1 | +""" |
| 2 | +Proof-of-concept: run a program under Cachegrind, combining all the various |
| 3 | +metrics into one single performance metric. |
| 4 | +
|
| 5 | +Requires Python 3. |
| 6 | +
|
| 7 | +License: https://opensource.org/licenses/MIT |
| 8 | +
|
| 9 | +## Features |
| 10 | +
|
| 11 | +* Disables ASLR. |
| 12 | +* Sets consistent cache sizes. |
| 13 | +* Calculates a combined performance metric. |
| 14 | +
|
| 15 | +For more information see the detailed write up at: |
| 16 | +
|
| 17 | +https://pythonspeed.com/articles/consistent-benchmarking-in-ci/ |
| 18 | +
|
| 19 | +## Usage |
| 20 | +
|
| 21 | +This script has no compatibility guarantees; I recommend copying it into your |
| 22 | +repository. To use: |
| 23 | +
|
| 24 | +$ python3 cachegrind.py ./yourprogram --yourparam=yourvalues |
| 25 | +
|
| 26 | +If you're benchmarking Python, make sure to set PYTHONHASHSEED to a fixed value |
| 27 | +(e.g. `export PYTHONHASHSEED=1234`). Other languages may have similar |
| 28 | +requirements to reduce variability. |
| 29 | +
|
| 30 | +The last line printed will be a combined performance metric, but you can tweak |
| 31 | +the script to extract more info, or use it as a library. |
| 32 | +
|
| 33 | +Copyright © 2020, Hyphenated Enterprises LLC. |
| 34 | +""" |
| 35 | + |
| 36 | +import json |
| 37 | +from typing import List, Dict |
| 38 | +from subprocess import check_call, check_output |
| 39 | +import sys |
| 40 | +from tempfile import NamedTemporaryFile |
| 41 | + |
# Machine architecture (e.g. b"x86_64") from `uname -m`.  Note check_output()
# returns bytes on Python 3; subprocess accepts bytes argv elements on POSIX,
# which is where setarch/valgrind run anyway.
ARCH = check_output(["uname", "-m"]).strip()
| 43 | + |
| 44 | + |
def run_with_cachegrind(args_list: List[str]) -> Dict[str, int]:
    """
    Run the given program and arguments under Cachegrind, and parse the
    Cachegrind output file into a dict of event counts.

    For now we just ignore program output, and in general this is not robust.

    :param args_list: The command to benchmark, as a list of argv strings.
    :return: Mapping from Cachegrind event name (e.g. "Ir", "DLmr") to count.
    :raises subprocess.CalledProcessError: if setarch/valgrind/the program
        exits non-zero.
    """
    # Use the file as a context manager so the temporary file is always
    # closed and deleted, even if valgrind fails (previously it was leaked).
    with NamedTemporaryFile("r+") as temp_file:
        check_call([
            # Disable ASLR so addresses (and thus cache behavior) are
            # consistent across runs:
            "setarch",
            ARCH,
            "-R",
            "valgrind",
            "--tool=cachegrind",
            # Set some reasonable L1 and LL values, based on Haswell. You can
            # set your own; the important part is that they are consistent
            # across runs, instead of the default of copying from the current
            # machine.
            "--I1=32768,8,64",
            "--D1=32768,8,64",
            "--LL=8388608,16,64",
            "--cachegrind-out-file=" + temp_file.name,
        ] + args_list)
        # Parse while still inside the `with`, before the file is deleted.
        return parse_cachegrind_output(temp_file)
| 69 | + |
| 70 | + |
def parse_cachegrind_output(temp_file) -> Dict[str, int]:
    """
    Parse a Cachegrind output file into a dict of event counts.

    :param temp_file: An iterable of text lines (e.g. an open file) in
        Cachegrind's output format, containing an "events: ..." header line
        and ending with a "summary: ..." line.
    :return: Mapping from event name to its summary count.
    :raises ValueError: if the "events:" or "summary:" line is missing.
    """
    lines = iter(temp_file)
    # Find the header listing the event names, in column order:
    header = None
    for line in lines:
        if line.startswith("events: "):
            header = line[len("events: "):].strip()
            break
    if header is None:
        # Previously this fell through to a NameError; fail loudly instead.
        raise ValueError("No 'events:' line found in Cachegrind output")
    # The summary counts are on the last line of the file:
    last_line = None
    for line in lines:
        last_line = line
    # A real error, not an assert: asserts are stripped under `python -O`.
    if last_line is None or not last_line.startswith("summary: "):
        raise ValueError("No 'summary:' line found in Cachegrind output")
    values = last_line[len("summary:"):].strip()
    return dict(zip(header.split(), [int(i) for i in values.split()]))
| 83 | + |
| 84 | + |
def get_counts(cg_results: Dict[str, int]) -> Dict[str, int]:
    """
    Given the result of run_with_cachegrind(), figure out how many memory
    accesses were served by each level of the memory hierarchy.

    We pretend there's no L2 since Cachegrind doesn't currently support it.

    Caveats: we're not including time to process instructions, only time to
    access instruction cache(s), so we're assuming time to fetch and run an
    instruction is the same as time to retrieve data if they're both in L1
    cache.
    """
    # Last-level misses (data read/write, instruction read) went to RAM:
    ram_hits = cg_results["DLmr"] + cg_results["DLmw"] + cg_results["ILmr"]

    # L1 misses that were not also last-level misses were served by the
    # simulated last-level ("L3") cache:
    l1_misses = cg_results["I1mr"] + cg_results["D1mw"] + cg_results["D1mr"]
    l3_hits = l1_misses - ram_hits

    # Everything else was an L1 hit:
    total_accesses = cg_results["Ir"] + cg_results["Dr"] + cg_results["Dw"]
    l1_hits = total_accesses - l3_hits - ram_hits

    # Sanity check: the three buckets partition all accesses.
    assert total_accesses == l1_hits + l3_hits + ram_hits

    return {"l1": l1_hits, "l3": l3_hits, "ram": ram_hits}
| 113 | + |
| 114 | + |
def combined_instruction_estimate(counts: Dict[str, int]) -> int:
    """
    Given the result of get_counts(), return an estimate of the total time
    the program took to run.

    Multipliers were determined empirically, but some research suggests
    they're a reasonable approximation for cache time ratios.  L3 is probably
    too low, but then we're not simulating L2...
    """
    # Relative cost of one access, per memory-hierarchy level:
    weights = {"l1": 1, "l3": 5, "ram": 35}
    return sum(weight * counts[level] for level, weight in weights.items())
| 124 | + |
| 125 | + |
def github_action_benchmark_json(value):
    """
    Wrap a single benchmark score in the JSON layout expected by the
    github-action-benchmark tool: a list with one named entry.
    """
    entry = {
        "name": "score",
        "unit": "",
        "value": value,
    }
    return json.dumps([entry])
| 134 | + |
| 135 | + |
if __name__ == "__main__":
    # Pipeline: run the command under Cachegrind, bucket the event counts by
    # cache level, combine into one score, and emit github-action-benchmark
    # compatible JSON.
    cachegrind_events = run_with_cachegrind(sys.argv[1:])
    level_counts = get_counts(cachegrind_events)
    score = combined_instruction_estimate(level_counts)
    print(github_action_benchmark_json(score))
0 commit comments