Skip to content

Commit 5de69da

Browse files
authored
Filter out devices that are not run (#6277)
Fixes pytorch/executorch#7986 (for real this time). The problem is that the records in the benchmark database alone don't have the information to differentiate between benchmarks that failed to run and benchmarks that were not run at all. Both show up as 0 on the dashboard. Note that we could do a join with the `workflow_job` table to get this information, but that's a rather slow and expensive route. So, I opt for a quicker approach: keep track of the list of valid devices on the dashboard side. A valid device is one that was run by the selected commit and has at least one non-zero record there. ### Testing https://torchci-git-fork-huydhn-better-model-filter-fbopensource.vercel.app/benchmark/llms?repoName=pytorch%2Fexecutorch
1 parent 14317d7 commit 5de69da

File tree

3 files changed

+75
-9
lines changed

3 files changed

+75
-9
lines changed

torchci/components/benchmark/llms/SummaryPanel.tsx

+15-7
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,18 @@ export function SummaryPanel({
142142
flex: 1,
143143
cellClassName: (params: GridCellParams<any, any>) => {
144144
const v = params.value;
145-
if (v === undefined || v.l.actual === 0) {
145+
if (v === undefined) {
146146
return "";
147147
}
148148

149149
// l is the old (base) value, r is the new value
150150
const l = v.l.actual;
151151
const r = v.r.actual;
152152

153+
if (!v.highlight) {
154+
return "";
155+
}
156+
153157
if (lCommit === rCommit) {
154158
return "";
155159
} else {
@@ -163,6 +167,11 @@ export function SummaryPanel({
163167
return styles.error;
164168
}
165169

170+
// If it didn't run and now it runs, mark it as green
171+
if (l === 0) {
172+
return styles.ok;
173+
}
174+
166175
if (metric in IS_INCREASING_METRIC_VALUE_GOOD) {
167176
// Higher value
168177
if (r - l > RELATIVE_THRESHOLD * l) {
@@ -206,12 +215,11 @@ export function SummaryPanel({
206215
: "";
207216
const showTarget =
208217
target && target != 0 ? `[target = ${target}]` : "";
209-
const isNewModel = l === 0 ? "(NEW!)" : "";
210218

211-
if (lCommit === rCommit || l === r) {
219+
if (lCommit === rCommit || !v.highlight) {
212220
return `${r} ${rPercent} ${showTarget}`;
213221
} else {
214-
return `${l} ${lPercent}${r} ${rPercent} ${showTarget} ${isNewModel} `;
222+
return `${l} ${lPercent}${r} ${rPercent} ${showTarget}`;
215223
}
216224
},
217225
};
@@ -225,9 +233,9 @@ export function SummaryPanel({
225233
<Grid2
226234
size={{ xs: 12, lg: 12 }}
227235
height={
228-
data.length > 98
229-
? 98 * ROW_HEIGHT
230-
: data.length * ROW_HEIGHT + ROW_GAP
236+
data.length > 90
237+
? 90 * ROW_HEIGHT
238+
: (data.length + 1) * ROW_HEIGHT + ROW_GAP
231239
}
232240
>
233241
<TablePanelWithData

torchci/components/benchmark/llms/common.tsx

+11-1
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,22 @@ export const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
2727
flops_utilization: true,
2828
"compilation_time(s)": false,
2929
speedup: true,
30+
"avg_inference_latency(ms)": false,
31+
"model_load_time(ms)": false,
32+
"peak_inference_mem_usage(mb)": false,
33+
"peak_load_mem_usuage(mb)": false,
34+
"generate_time(ms)": false,
3035
};
3136
export const METRIC_DISPLAY_SHORT_HEADERS: { [k: string]: string } = {
3237
"memory_bandwidth(GB/s)": "Bandwidth",
3338
token_per_sec: "TPS",
3439
flops_utilization: "FLOPs",
3540
"compilation_time(s)": "CompTime",
41+
"avg_inference_latency(ms)": "InferenceTime",
42+
"model_load_time(ms)": "LoadTime",
43+
"peak_inference_mem_usage(mb)": "InferenceMem",
44+
"peak_load_mem_usuage(mb)": "LoadMem",
45+
"generate_time(ms)": "GenerateTime",
3646
};
3747
export const DEFAULT_DEVICE_NAME = "All Devices";
3848
export const DEFAULT_ARCH_NAME = "All Platforms";
@@ -45,7 +55,7 @@ export const ARCH_NAMES: { [k: string]: string[] } = {
4555
};
4656

4757
// Relative thresholds
48-
export const RELATIVE_THRESHOLD = 0.05;
58+
export const RELATIVE_THRESHOLD = 0.1;
4959

5060
export interface LLMsBenchmarkData {
5161
granularity_bucket: string;

torchci/lib/benchmark/llmUtils.ts

+49-1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,33 @@ export function combineLeftAndRight(
8888
});
8989
}
9090

91+
// NB: This is a hack to keep track of valid devices. The problem is that the records
92+
// in the benchmark database alone don't have the information to differentiate between
93+
// benchmarks that are failed to run and benchmarks that are not run. Both show up as
94+
// 0 on the dashboard. Note that we can do a join with workflow_job table to get this
95+
// information, but it's a rather slow and expensive route
96+
const validDevices = new Set<string>();
97+
const validBackends = new Set<string>();
98+
// First round to get all the valid devices
99+
Object.keys(dataGroupedByModel).forEach((key: string) => {
100+
const [model, backend, dtype, device, arch] = key.split(";");
101+
const row: { [k: string]: any } = {
102+
// Keep the name as as the row ID as DataGrid requires it
103+
name: `${model} ${backend} (${dtype} / ${device} / ${arch})`,
104+
};
105+
106+
for (const metric in dataGroupedByModel[key]) {
107+
const record = dataGroupedByModel[key][metric];
108+
const hasL = "l" in record;
109+
const hasR = "r" in record;
110+
111+
if (hasL && hasR) {
112+
validDevices.add(device);
113+
validBackends.add(`${model} ${backend}`);
114+
}
115+
}
116+
});
117+
91118
// Transform the data into a displayable format
92119
const data: { [k: string]: any }[] = [];
93120
Object.keys(dataGroupedByModel).forEach((key: string) => {
@@ -102,6 +129,20 @@ export function combineLeftAndRight(
102129
const hasL = "l" in record;
103130
const hasR = "r" in record;
104131

132+
// Skip devices and models that weren't run in this commit
133+
if (
134+
(validDevices.size !== 0 && !validDevices.has(device)) ||
135+
(validBackends.size !== 0 && !validBackends.has(`${model} ${backend}`))
136+
) {
137+
continue;
138+
}
139+
140+
// No overlapping between left and right commits, just show what it's on the
141+
// right commit instead of showing a blank page
142+
if (!hasR) {
143+
continue;
144+
}
145+
105146
if (!("metadata" in row)) {
106147
row["metadata"] = {
107148
model: model,
@@ -151,10 +192,17 @@ export function combineLeftAndRight(
151192
actual: 0,
152193
target: 0,
153194
},
195+
highlight:
196+
validDevices.size !== 0 &&
197+
validBackends.has(`${model} ${backend}`) &&
198+
hasL &&
199+
hasR,
154200
};
155201
}
156202

157-
data.push(row);
203+
if ("metadata" in row) {
204+
data.push(row);
205+
}
158206
});
159207

160208
return data;

0 commit comments

Comments (0)