Skip to content

Commit 5de69da

Browse files
authored
Filter out devices that are not run (#6277)
Fixes pytorch/executorch#7986 (for real this time). The problem is that the records in the benchmark database alone don't have the information to differentiate between benchmarks that failed to run and benchmarks that were not run at all. Both show up as 0 on the dashboard. Note that we could do a join with the `workflow_job` table to get this information, but that's a rather slow and expensive route. So, I opt for a quicker approach: keep track of the list of valid devices on the dashboard side. A valid device is one that was run by the selected commit and has at least one non-zero record there. ### Testing https://torchci-git-fork-huydhn-better-model-filter-fbopensource.vercel.app/benchmark/llms?repoName=pytorch%2Fexecutorch
1 parent 14317d7 commit 5de69da

File tree

3 files changed

+75
-9
lines changed

3 files changed

+75
-9
lines changed

torchci/components/benchmark/llms/SummaryPanel.tsx

+15-7
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,18 @@ export function SummaryPanel({
142142
flex: 1,
143143
cellClassName: (params: GridCellParams<any, any>) => {
144144
const v = params.value;
145-
if (v === undefined || v.l.actual === 0) {
145+
if (v === undefined) {
146146
return "";
147147
}
148148

149149
// l is the old (base) value, r is the new value
150150
const l = v.l.actual;
151151
const r = v.r.actual;
152152

153+
if (!v.highlight) {
154+
return "";
155+
}
156+
153157
if (lCommit === rCommit) {
154158
return "";
155159
} else {
@@ -163,6 +167,11 @@ export function SummaryPanel({
163167
return styles.error;
164168
}
165169

170+
// If it didn't run and now it runs, mark it as green
171+
if (l === 0) {
172+
return styles.ok;
173+
}
174+
166175
if (metric in IS_INCREASING_METRIC_VALUE_GOOD) {
167176
// Higher value
168177
if (r - l > RELATIVE_THRESHOLD * l) {
@@ -206,12 +215,11 @@ export function SummaryPanel({
206215
: "";
207216
const showTarget =
208217
target && target != 0 ? `[target = ${target}]` : "";
209-
const isNewModel = l === 0 ? "(NEW!)" : "";
210218

211-
if (lCommit === rCommit || l === r) {
219+
if (lCommit === rCommit || !v.highlight) {
212220
return `${r} ${rPercent} ${showTarget}`;
213221
} else {
214-
return `${l} ${lPercent}${r} ${rPercent} ${showTarget} ${isNewModel} `;
222+
return `${l} ${lPercent}${r} ${rPercent} ${showTarget}`;
215223
}
216224
},
217225
};
@@ -225,9 +233,9 @@ export function SummaryPanel({
225233
<Grid2
226234
size={{ xs: 12, lg: 12 }}
227235
height={
228-
data.length > 98
229-
? 98 * ROW_HEIGHT
230-
: data.length * ROW_HEIGHT + ROW_GAP
236+
data.length > 90
237+
? 90 * ROW_HEIGHT
238+
: (data.length + 1) * ROW_HEIGHT + ROW_GAP
231239
}
232240
>
233241
<TablePanelWithData

torchci/components/benchmark/llms/common.tsx

+11-1
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,22 @@ export const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
2727
flops_utilization: true,
2828
"compilation_time(s)": false,
2929
speedup: true,
30+
"avg_inference_latency(ms)": false,
31+
"model_load_time(ms)": false,
32+
"peak_inference_mem_usage(mb)": false,
33+
"peak_load_mem_usuage(mb)": false,
34+
"generate_time(ms)": false,
3035
};
3136
export const METRIC_DISPLAY_SHORT_HEADERS: { [k: string]: string } = {
3237
"memory_bandwidth(GB/s)": "Bandwidth",
3338
token_per_sec: "TPS",
3439
flops_utilization: "FLOPs",
3540
"compilation_time(s)": "CompTime",
41+
"avg_inference_latency(ms)": "InferenceTime",
42+
"model_load_time(ms)": "LoadTime",
43+
"peak_inference_mem_usage(mb)": "InferenceMem",
44+
"peak_load_mem_usuage(mb)": "LoadMem",
45+
"generate_time(ms)": "GenerateTime",
3646
};
3747
export const DEFAULT_DEVICE_NAME = "All Devices";
3848
export const DEFAULT_ARCH_NAME = "All Platforms";
@@ -45,7 +55,7 @@ export const ARCH_NAMES: { [k: string]: string[] } = {
4555
};
4656

4757
// Relative thresholds
48-
export const RELATIVE_THRESHOLD = 0.05;
58+
export const RELATIVE_THRESHOLD = 0.1;
4959

5060
export interface LLMsBenchmarkData {
5161
granularity_bucket: string;

torchci/lib/benchmark/llmUtils.ts

+49-1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,33 @@ export function combineLeftAndRight(
8888
});
8989
}
9090

91+
// NB: This is a hack to keep track of valid devices. The problem is that the records
92+
// in the benchmark database alone don't have the information to differentiate between
93+
// benchmarks that are failed to run and benchmarks that are not run. Both show up as
94+
// 0 on the dashboard. Note that we can do a join with workflow_job table to get this
95+
// information, but it's a rather slow and expensive route
96+
const validDevices = new Set<string>();
97+
const validBackends = new Set<string>();
98+
// First round to get all the valid devices
99+
Object.keys(dataGroupedByModel).forEach((key: string) => {
100+
const [model, backend, dtype, device, arch] = key.split(";");
101+
const row: { [k: string]: any } = {
102+
// Keep the name as as the row ID as DataGrid requires it
103+
name: `${model} ${backend} (${dtype} / ${device} / ${arch})`,
104+
};
105+
106+
for (const metric in dataGroupedByModel[key]) {
107+
const record = dataGroupedByModel[key][metric];
108+
const hasL = "l" in record;
109+
const hasR = "r" in record;
110+
111+
if (hasL && hasR) {
112+
validDevices.add(device);
113+
validBackends.add(`${model} ${backend}`);
114+
}
115+
}
116+
});
117+
91118
// Transform the data into a displayable format
92119
const data: { [k: string]: any }[] = [];
93120
Object.keys(dataGroupedByModel).forEach((key: string) => {
@@ -102,6 +129,20 @@ export function combineLeftAndRight(
102129
const hasL = "l" in record;
103130
const hasR = "r" in record;
104131

132+
// Skip devices and models that weren't run in this commit
133+
if (
134+
(validDevices.size !== 0 && !validDevices.has(device)) ||
135+
(validBackends.size !== 0 && !validBackends.has(`${model} ${backend}`))
136+
) {
137+
continue;
138+
}
139+
140+
// No overlapping between left and right commits, just show what it's on the
141+
// right commit instead of showing a blank page
142+
if (!hasR) {
143+
continue;
144+
}
145+
105146
if (!("metadata" in row)) {
106147
row["metadata"] = {
107148
model: model,
@@ -151,10 +192,17 @@ export function combineLeftAndRight(
151192
actual: 0,
152193
target: 0,
153194
},
195+
highlight:
196+
validDevices.size !== 0 &&
197+
validBackends.has(`${model} ${backend}`) &&
198+
hasL &&
199+
hasR,
154200
};
155201
}
156202

157-
data.push(row);
203+
if ("metadata" in row) {
204+
data.push(row);
205+
}
158206
});
159207

160208
return data;

0 commit comments

Comments (0)