Skip to content

Commit

Permalink
Add TorchAO speedup metric vs eager (#6178)
Browse files Browse the repository at this point in the history
Addresses the first part of
#6176

This PR adds another speedup metric vs eager. Because this is the TorchAO
dashboard, I think it's more appropriate to show TorchAO vs compile and
TorchAO vs eager instead of TorchAO vs compile and compile vs eager,
because the last one (compile vs eager) is a better fit for the PT2
inductor dashboard instead. @jerryzh168 What do you think?

I also fix another UX issue by showing the oldest commit in the time range
as the default base commit instead.

### Testing


https://torchci-git-fork-huydhn-improve-ao-speedup-metric-fbopensource.vercel.app/benchmark/llms?startTime=Thu%2C%2009%20Jan%202025%2010%3A21%3A42%20GMT&stopTime=Thu%2C%2016%20Jan%202025%2010%3A21%3A42%20GMT&granularity=day&lBranch=main&lCommit=2cddc67fe700579043e3e2d395d983764298b82e9746e9b2663c583710b3b08c&rBranch=main&rCommit=399034112cd82562f0d651bda8a8b5ab8840703ee0b40cd136d85181164d2280&repoName=pytorch%2Fao&modelName=All%20Models&backendName=All%20Backends&dtypeName=All%20DType&deviceName=All%20Devices
  • Loading branch information
huydhn authored Jan 16, 2025
1 parent 62ce219 commit cb2e2d9
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 32 deletions.
6 changes: 6 additions & 0 deletions torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ WITH benchmarks AS (
tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
tupleElement(o.runners [ 1 ], 'type')
) AS arch,
IF(
tupleElement(o.benchmark, 'extra_info') [ 'compile' ] = '',
'true', -- Default to true
tupleElement(o.benchmark, 'extra_info') [ 'compile' ]
) AS use_torch_compile,
DATE_TRUNC(
{granularity: String },
fromUnixTimestamp(o.timestamp)
Expand Down Expand Up @@ -71,6 +76,7 @@ SELECT
dtype,
device,
arch,
toBool(use_torch_compile) AS use_torch_compile,
granularity_bucket
FROM
benchmarks
Expand Down
21 changes: 14 additions & 7 deletions torchci/components/benchmark/llms/ModelGraphPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import {
import { TIME_FIELD_NAME } from "components/benchmark/common";
import {
DEFAULT_DEVICE_NAME,
DEFAULT_DTYPE_NAME,
DEFAULT_MODEL_NAME,
LLMsBenchmarkData,
METRIC_DISPLAY_HEADERS,
Expand All @@ -18,7 +17,10 @@ import {
TimeSeriesPanelWithData,
} from "components/metrics/panels/TimeSeriesPanel";
import dayjs from "dayjs";
import { computeSpeedup } from "lib/benchmark/aoUtils";
import {
computeSpeedup,
TORCHAO_SPEEDUP_METRIC_NAMES,
} from "lib/benchmark/aoUtils";
import { computeGeomean, useBenchmark } from "lib/benchmark/llmUtils";
import { BranchAndCommit } from "lib/types";

Expand Down Expand Up @@ -64,7 +66,12 @@ export function GraphPanel({
);
}

const dataWithSpeedup = computeSpeedup(repoName, data);
const dataWithSpeedup = computeSpeedup(
repoName,
computeSpeedup(repoName, data, false, true),
true,
false
);

// Clamp to the nearest granularity (e.g. nearest hour) so that the times will
// align with the data we get from the database
Expand All @@ -80,8 +87,10 @@ export function GraphPanel({
const chartData: { [k: string]: any } = {};
const graphSeries: { [k: string]: any } = {};
metricNames.forEach((metric: string) => {
// TODO (huydhn): Only display aggregated speedup metric for now
if (modelName === DEFAULT_MODEL_NAME && metric !== "speedup") {
if (
modelName === DEFAULT_MODEL_NAME &&
!TORCHAO_SPEEDUP_METRIC_NAMES.includes(metric)
) {
chartData[metric] = [];
return;
}
Expand Down Expand Up @@ -115,8 +124,6 @@ export function GraphPanel({
.filter((record: LLMsBenchmarkData) => {
return (
record.model === modelName &&
(record.dtype === dtypeName ||
dtypeName === DEFAULT_DTYPE_NAME) &&
(`${record.device} (${record.arch})` === deviceName ||
deviceName === DEFAULT_DEVICE_NAME) &&
record.metric === metric
Expand Down
5 changes: 4 additions & 1 deletion torchci/components/benchmark/llms/common.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ export const METRIC_DISPLAY_HEADERS: { [k: string]: string } = {
token_per_sec: "Token per second",
flops_utilization: "FLOPs utilization",
"compilation_time(s)": "Compilation Time (s)",
speedup: "Speedup",
compile_vs_eager_speedup: "Compile vs eager speedup",
autoquant_vs_compile_speedup: "Autoquant vs compile speedup",
eager_speedup: "Eager speedup",
};
// The variable name is a bit dumb, but it tells if a higher metric value
// is good or bad so that we can highlight it on the dashboard accordingly.
Expand Down Expand Up @@ -53,6 +55,7 @@ export interface LLMsBenchmarkData {
device: string;
arch: string;
display?: string;
use_torch_compile?: boolean;
}

export interface BranchAndCommitPerfData extends BranchAndCommit {
Expand Down
104 changes: 89 additions & 15 deletions torchci/lib/benchmark/aoUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@ export const TORCHAO_BASELINE = "noquant";
// here on the dashboard
const SPEEDUP_METRICS = ["tok/s", "time_ms(avg)", "time_s(avg)", "img_s(avg)"];

// Names of the aggregated speedup metrics computed for the TorchAO dashboard.
// Consumers use this list both to prepend the speedup metrics to the metric
// list and to decide which metrics to graph in the aggregated (All Models) view.
export const TORCHAO_SPEEDUP_METRIC_NAMES = [
"autoquant_vs_compile_speedup",
"compile_vs_eager_speedup",
"eager_speedup",
];
// Different speedup metrics, the key is quantization-torch.compile
// i.e. "<dtype>-<useTorchCompile>". "noquant-false" is looked up for baseline
// (noquant) records, while "-true" is looked up with the dtype part left empty
// for non-baseline records — see how computeSpeedup builds the lookup keys.
export const TORCHAO_SPEEDUP_METRIC_NAMES_MAPPING: { [key: string]: string } = {
"noquant-false": "compile_vs_eager_speedup",
"-true": "autoquant_vs_compile_speedup",
};

// TODO (huydhn): Use this function to convert the generic benchmark data to the old
// CompilerPerformanceData format. This is needed until the TorchInductor dashboard
// is migrated to the new format
Expand Down Expand Up @@ -54,48 +65,111 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) {
return Object.values(convertData);
}

export function computeSpeedup(repoName: string, data: LLMsBenchmarkData[]) {
export function computeSpeedup(
repoName: string,
data: LLMsBenchmarkData[],
useTorchCompile: boolean,
usebaseCommitbaseline: boolean
) {
if (repoName !== TORCHAO_REPO) {
return data;
}

const baselineMetrics: { [key: string]: LLMsBenchmarkData } = {};
// https://github.com/pytorch/test-infra/pull/6178#issuecomment-2596338457, we want
// to show 3 different speedup in AO:
// - Current eager perf vs base commit eager
const baseCommitBaseline: { [key: string]: LLMsBenchmarkData } = {};
// - Current compile perf vs current eager
// - Current autoquant perf vs current compile
const currentCommitBaseline: { [key: string]: LLMsBenchmarkData } = {};

data.forEach((r: LLMsBenchmarkData) => {
if (r.dtype !== TORCHAO_BASELINE) {
if (
r.dtype !== TORCHAO_BASELINE ||
r.use_torch_compile !== useTorchCompile
) {
return;
}

const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
baselineMetrics[k] = r;
const baseCommitKey = `${r.model} ${r.metric} ${r.device} ${r.arch}`;
const currentCommitKey = `${r.workflow_id} ${r.job_id} ${baseCommitKey}`;

// To compare against the current commit
currentCommitBaseline[currentCommitKey] = r;

// To compare against the oldest base commit
if (
!usebaseCommitbaseline ||
(baseCommitKey in baseCommitBaseline &&
baseCommitBaseline[baseCommitKey].workflow_id < r.workflow_id)
) {
return;
}
baseCommitBaseline[baseCommitKey] = r;
});

const withSpeedup: LLMsBenchmarkData[] = [];
data.forEach((r: LLMsBenchmarkData) => {
if (r.dtype === TORCHAO_BASELINE) {
return;
withSpeedup.push(r);

// Compute eager speedup vs the base commit baseline
if (r.dtype === TORCHAO_BASELINE && r.use_torch_compile === false) {
if (SPEEDUP_METRICS.includes(r.metric)) {
const k = `${r.model} ${r.metric} ${r.device} ${r.arch}`;
if (
k in baseCommitBaseline &&
baseCommitBaseline[k].actual !== 0 &&
r.actual !== 0 &&
baseCommitBaseline[k].workflow_id <= r.workflow_id
) {
const speedup = r.metric.includes("time")
? baseCommitBaseline[k].actual / r.actual
: r.actual / baseCommitBaseline[k].actual;

withSpeedup.push({
...r,
metric: "eager_speedup",
actual: Number(speedup.toFixed(2)),
target: 0,
});
}
}
}

if (SPEEDUP_METRICS.includes(r.metric)) {
const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
if (
k in baselineMetrics &&
baselineMetrics[k].actual !== 0 &&
k in currentCommitBaseline &&
currentCommitBaseline[k].actual !== 0 &&
r.actual !== 0
) {
const speedup = r.metric.includes("time")
? baselineMetrics[k].actual / r.actual
: r.actual / baselineMetrics[k].actual;
? currentCommitBaseline[k].actual / r.actual
: r.actual / currentCommitBaseline[k].actual;

const speedupMetricName =
r.dtype === TORCHAO_BASELINE
? // Compile vs eager
r !== currentCommitBaseline[k]
? TORCHAO_SPEEDUP_METRIC_NAMES_MAPPING[
`${r.dtype}-${useTorchCompile}`
]
: ""
: // Autoquant vs compile or vs eager
TORCHAO_SPEEDUP_METRIC_NAMES_MAPPING[`-${useTorchCompile}`];

if (!speedupMetricName) {
return;
}

withSpeedup.push({
...r,
metric: "speedup",
actual: Number(speedup.toFixed(4)),
metric: speedupMetricName,
actual: Number(speedup.toFixed(2)),
target: 0,
});
}
}

withSpeedup.push(r);
});

return withSpeedup;
Expand Down
30 changes: 21 additions & 9 deletions torchci/pages/benchmark/llms.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ import CopyLink from "components/CopyLink";
import GranularityPicker from "components/GranularityPicker";
import { Granularity } from "components/metrics/panels/TimeSeriesPanel";
import dayjs from "dayjs";
import { computeSpeedup, TORCHAO_BASELINE } from "lib/benchmark/aoUtils";
import {
computeSpeedup,
TORCHAO_BASELINE,
TORCHAO_SPEEDUP_METRIC_NAMES,
} from "lib/benchmark/aoUtils";
import { useBenchmark } from "lib/benchmark/llmUtils";
import { fetcher } from "lib/GeneralUtils";
import { BranchAndCommit } from "lib/types";
Expand Down Expand Up @@ -82,11 +86,22 @@ function Report({
);
}

const lDataWithSpeedup = computeSpeedup(repoName, lData);
const rDataWithSpeedup = computeSpeedup(repoName, rData);
const lDataWithSpeedup = computeSpeedup(
repoName,
computeSpeedup(repoName, lData, false, true),
true,
false
);

const rDataWithSpeedup = computeSpeedup(
repoName,
computeSpeedup(repoName, rData, false, true),
true,
false
);

if (repoName === "pytorch/ao") {
metricNames = ["speedup", ...metricNames];
metricNames = [...TORCHAO_SPEEDUP_METRIC_NAMES, ...metricNames];
}

return (
Expand Down Expand Up @@ -288,10 +303,7 @@ export default function Page() {
];
const dtypeNames: string[] = _.compact([
DEFAULT_DTYPE_NAME,
..._.filter(
_.uniq(data.map((r: any) => r.dtype)) as string[],
(r: string) => r !== TORCHAO_BASELINE
),
...(_.uniq(data.map((r: any) => r.dtype)) as string[]),
]);
const metricNames: string[] = _.uniq(data.map((r: any) => r.metric));

Expand Down Expand Up @@ -367,7 +379,7 @@ export default function Page() {
commit={lCommit}
setCommit={setLCommit}
titlePrefix={"Base"}
fallbackIndex={1} // Default to previous commit
fallbackIndex={-1} // Default to oldest commit
timeRange={timeRange}
/>
<Divider orientation="vertical" flexItem>
Expand Down

0 comments on commit cb2e2d9

Please sign in to comment.