diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql
index d393e48141..d1bfdec706 100644
--- a/torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql
+++ b/torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql
@@ -23,6 +23,11 @@ WITH benchmarks AS (
             tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
             tupleElement(o.runners [ 1 ], 'type')
         ) AS arch,
+        IF(
+            tupleElement(o.benchmark, 'extra_info') [ 'compile' ] = '',
+            'true', -- Default to true
+            tupleElement(o.benchmark, 'extra_info') [ 'compile' ]
+        ) AS use_torch_compile,
         DATE_TRUNC(
             {granularity: String },
             fromUnixTimestamp(o.timestamp)
@@ -71,6 +76,7 @@ SELECT
     dtype,
     device,
     arch,
+    toBool(use_torch_compile) AS use_torch_compile,
     granularity_bucket
 FROM
     benchmarks
diff --git a/torchci/components/benchmark/llms/ModelGraphPanel.tsx b/torchci/components/benchmark/llms/ModelGraphPanel.tsx
index aa0a83a8ba..1ad3a09f4d 100644
--- a/torchci/components/benchmark/llms/ModelGraphPanel.tsx
+++ b/torchci/components/benchmark/llms/ModelGraphPanel.tsx
@@ -6,7 +6,6 @@ import {
 import { TIME_FIELD_NAME } from "components/benchmark/common";
 import {
   DEFAULT_DEVICE_NAME,
-  DEFAULT_DTYPE_NAME,
   DEFAULT_MODEL_NAME,
   LLMsBenchmarkData,
   METRIC_DISPLAY_HEADERS,
@@ -18,7 +17,10 @@ import {
   TimeSeriesPanelWithData,
 } from "components/metrics/panels/TimeSeriesPanel";
 import dayjs from "dayjs";
-import { computeSpeedup } from "lib/benchmark/aoUtils";
+import {
+  computeSpeedup,
+  TORCHAO_SPEEDUP_METRIC_NAMES,
+} from "lib/benchmark/aoUtils";
 import { computeGeomean, useBenchmark } from "lib/benchmark/llmUtils";
 import { BranchAndCommit } from "lib/types";
 
@@ -64,7 +66,12 @@ export function GraphPanel({
     );
   }
 
-  const dataWithSpeedup = computeSpeedup(repoName, data);
+  const dataWithSpeedup = computeSpeedup(
+    repoName,
+    computeSpeedup(repoName, data, false, true),
+    true,
+    false
+  );
 
   // Clamp to the nearest granularity (e.g. nearest hour) so that the times will
   // align with the data we get from the database
@@ -80,8 +87,10 @@
   const chartData: { [k: string]: any } = {};
   const graphSeries: { [k: string]: any } = {};
   metricNames.forEach((metric: string) => {
-    // TODO (huydhn): Only display aggregated speedup metric for now
-    if (modelName === DEFAULT_MODEL_NAME && metric !== "speedup") {
+    if (
+      modelName === DEFAULT_MODEL_NAME &&
+      !TORCHAO_SPEEDUP_METRIC_NAMES.includes(metric)
+    ) {
       chartData[metric] = [];
       return;
     }
@@ -115,8 +124,6 @@
       .filter((record: LLMsBenchmarkData) => {
         return (
           record.model === modelName &&
-          (record.dtype === dtypeName ||
-            dtypeName === DEFAULT_DTYPE_NAME) &&
           (`${record.device} (${record.arch})` === deviceName ||
             deviceName === DEFAULT_DEVICE_NAME) &&
           record.metric === metric
diff --git a/torchci/components/benchmark/llms/common.tsx b/torchci/components/benchmark/llms/common.tsx
index bf76204015..38bc9dadea 100644
--- a/torchci/components/benchmark/llms/common.tsx
+++ b/torchci/components/benchmark/llms/common.tsx
@@ -14,7 +14,9 @@ export const METRIC_DISPLAY_HEADERS: { [k: string]: string } = {
   token_per_sec: "Token per second",
   flops_utilization: "FLOPs utilization",
   "compilation_time(s)": "Compilation Time (s)",
-  speedup: "Speedup",
+  compile_vs_eager_speedup: "Compile vs eager speedup",
+  autoquant_vs_compile_speedup: "Autoquant vs compile speedup",
+  eager_speedup: "Eager speedup",
 };
 // The variable name is a bit dumb, but it tells if a higher metric value
 // is good or bad so that we can highlight it on the dashboard accordingly.
@@ -53,6 +55,7 @@ export interface LLMsBenchmarkData {
   device: string;
   arch: string;
   display?: string;
+  use_torch_compile?: boolean;
 }
 
 export interface BranchAndCommitPerfData extends BranchAndCommit {
diff --git a/torchci/lib/benchmark/aoUtils.ts b/torchci/lib/benchmark/aoUtils.ts
index f8e9911198..62e9eb5657 100644
--- a/torchci/lib/benchmark/aoUtils.ts
+++ b/torchci/lib/benchmark/aoUtils.ts
@@ -10,6 +10,17 @@ export const TORCHAO_BASELINE = "noquant";
 // here on the dashboard
 const SPEEDUP_METRICS = ["tok/s", "time_ms(avg)", "time_s(avg)", "img_s(avg)"];
 
+export const TORCHAO_SPEEDUP_METRIC_NAMES = [
+  "autoquant_vs_compile_speedup",
+  "compile_vs_eager_speedup",
+  "eager_speedup",
+];
+// Different speedup metrics; the key is <quantization>-<torch.compile>
+export const TORCHAO_SPEEDUP_METRIC_NAMES_MAPPING: { [key: string]: string } = {
+  "noquant-false": "compile_vs_eager_speedup",
+  "-true": "autoquant_vs_compile_speedup",
+};
+
 // TODO (huydhn): Use this function to convert the generic benchmark data to the old
 // CompilerPerformanceData format. This is needed until the TorchInductor dashboard
 // is migrated to the new format
@@ -54,48 +65,111 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) {
   return Object.values(convertData);
 }
 
-export function computeSpeedup(repoName: string, data: LLMsBenchmarkData[]) {
+export function computeSpeedup(
+  repoName: string,
+  data: LLMsBenchmarkData[],
+  useTorchCompile: boolean,
+  useBaseCommitBaseline: boolean
+) {
   if (repoName !== TORCHAO_REPO) {
     return data;
   }
 
-  const baselineMetrics: { [key: string]: LLMsBenchmarkData } = {};
+  // https://github.com/pytorch/test-infra/pull/6178#issuecomment-2596338457, we want
+  // to show 3 different speedups in AO:
+  // - Current eager perf vs base commit eager
+  const baseCommitBaseline: { [key: string]: LLMsBenchmarkData } = {};
+  // - Current compile perf vs current eager
+  // - Current autoquant perf vs current compile
+  const currentCommitBaseline: { [key: string]: LLMsBenchmarkData } = {};
+
   data.forEach((r: LLMsBenchmarkData) => {
-    if (r.dtype !== TORCHAO_BASELINE) {
+    if (
+      r.dtype !== TORCHAO_BASELINE ||
+      r.use_torch_compile !== useTorchCompile
+    ) {
       return;
     }
 
-    const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
-    baselineMetrics[k] = r;
+    const baseCommitKey = `${r.model} ${r.metric} ${r.device} ${r.arch}`;
+    const currentCommitKey = `${r.workflow_id} ${r.job_id} ${baseCommitKey}`;
+
+    // To compare against the current commit
+    currentCommitBaseline[currentCommitKey] = r;
+
+    // To compare against the oldest base commit
+    if (
+      !useBaseCommitBaseline ||
+      (baseCommitKey in baseCommitBaseline &&
+        baseCommitBaseline[baseCommitKey].workflow_id < r.workflow_id)
+    ) {
+      return;
+    }
+    baseCommitBaseline[baseCommitKey] = r;
   });
 
   const withSpeedup: LLMsBenchmarkData[] = [];
   data.forEach((r: LLMsBenchmarkData) => {
-    if (r.dtype === TORCHAO_BASELINE) {
-      return;
+    withSpeedup.push(r);
+
+    // Compute eager speedup vs the base commit baseline
+    if (r.dtype === TORCHAO_BASELINE && r.use_torch_compile === false) {
+      if (SPEEDUP_METRICS.includes(r.metric)) {
+        const k = `${r.model} ${r.metric} ${r.device} ${r.arch}`;
+        if (
+          k in baseCommitBaseline &&
+          baseCommitBaseline[k].actual !== 0 &&
+          r.actual !== 0 &&
+          baseCommitBaseline[k].workflow_id <= r.workflow_id
+        ) {
+          const speedup = r.metric.includes("time")
+            ? baseCommitBaseline[k].actual / r.actual
+            : r.actual / baseCommitBaseline[k].actual;
+
+          withSpeedup.push({
+            ...r,
+            metric: "eager_speedup",
+            actual: Number(speedup.toFixed(2)),
+            target: 0,
+          });
+        }
+      }
     }
 
     if (SPEEDUP_METRICS.includes(r.metric)) {
       const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
       if (
-        k in baselineMetrics &&
-        baselineMetrics[k].actual !== 0 &&
+        k in currentCommitBaseline &&
+        currentCommitBaseline[k].actual !== 0 &&
         r.actual !== 0
       ) {
         const speedup = r.metric.includes("time")
-          ? baselineMetrics[k].actual / r.actual
-          : r.actual / baselineMetrics[k].actual;
+          ? currentCommitBaseline[k].actual / r.actual
+          : r.actual / currentCommitBaseline[k].actual;
+
+        const speedupMetricName =
+          r.dtype === TORCHAO_BASELINE
+            ? // Compile vs eager
+            r !== currentCommitBaseline[k]
+              ? TORCHAO_SPEEDUP_METRIC_NAMES_MAPPING[
+                  `${r.dtype}-${useTorchCompile}`
+                ]
+              : ""
+            : // Autoquant vs compile or vs eager
+              TORCHAO_SPEEDUP_METRIC_NAMES_MAPPING[`-${useTorchCompile}`];
+
+        if (!speedupMetricName) {
+          return;
+        }
 
         withSpeedup.push({
           ...r,
-          metric: "speedup",
-          actual: Number(speedup.toFixed(4)),
+          metric: speedupMetricName,
+          actual: Number(speedup.toFixed(2)),
           target: 0,
         });
       }
     }
-
-    withSpeedup.push(r);
   });
 
   return withSpeedup;
diff --git a/torchci/pages/benchmark/llms.tsx b/torchci/pages/benchmark/llms.tsx
index 1cb9f6f0fe..8568df6728 100644
--- a/torchci/pages/benchmark/llms.tsx
+++ b/torchci/pages/benchmark/llms.tsx
@@ -21,7 +21,11 @@ import CopyLink from "components/CopyLink";
 import GranularityPicker from "components/GranularityPicker";
 import { Granularity } from "components/metrics/panels/TimeSeriesPanel";
 import dayjs from "dayjs";
-import { computeSpeedup, TORCHAO_BASELINE } from "lib/benchmark/aoUtils";
+import {
+  computeSpeedup,
+  TORCHAO_BASELINE,
+  TORCHAO_SPEEDUP_METRIC_NAMES,
+} from "lib/benchmark/aoUtils";
 import { useBenchmark } from "lib/benchmark/llmUtils";
 import { fetcher } from "lib/GeneralUtils";
 import { BranchAndCommit } from "lib/types";
@@ -82,11 +86,22 @@ function Report({
     );
   }
 
-  const lDataWithSpeedup = computeSpeedup(repoName, lData);
-  const rDataWithSpeedup = computeSpeedup(repoName, rData);
+  const lDataWithSpeedup = computeSpeedup(
+    repoName,
+    computeSpeedup(repoName, lData, false, true),
+    true,
+    false
+  );
+
+  const rDataWithSpeedup = computeSpeedup(
+    repoName,
+    computeSpeedup(repoName, rData, false, true),
+    true,
+    false
+  );
 
   if (repoName === "pytorch/ao") {
-    metricNames = ["speedup", ...metricNames];
+    metricNames = [...TORCHAO_SPEEDUP_METRIC_NAMES, ...metricNames];
   }
 
   return (
@@ -288,10 +303,7 @@ export default function Page() {
   ];
   const dtypeNames: string[] = _.compact([
     DEFAULT_DTYPE_NAME,
-    ..._.filter(
-      _.uniq(data.map((r: any) => r.dtype)) as string[],
-      (r: string) => r !== TORCHAO_BASELINE
-    ),
+    ...(_.uniq(data.map((r: any) => r.dtype)) as string[]),
   ]);
   const metricNames: string[] = _.uniq(data.map((r: any) => r.metric));
 
@@ -367,7 +379,7 @@ export default function Page() {
           commit={lCommit}
           setCommit={setLCommit}
           titlePrefix={"Base"}
-          fallbackIndex={1} // Default to previous commit
+          fallbackIndex={-1} // Default to oldest commit
           timeRange={timeRange}
         />
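
Note on how the two computeSpeedup passes compose (a minimal sketch, not part
of the patch): the record values, model name, and device below are
hypothetical; only the computeSpeedup signature, the LLMsBenchmarkData field
names, and the emitted metric names come from the changes above.

  import { computeSpeedup } from "lib/benchmark/aoUtils";
  import { LLMsBenchmarkData } from "components/benchmark/llms/common";

  // Build a record with just the fields computeSpeedup reads; the cast keeps
  // the sketch short, since LLMsBenchmarkData has more fields than shown here.
  const record = (
    dtype: string,
    useTorchCompile: boolean,
    actual: number
  ): LLMsBenchmarkData =>
    ({
      workflow_id: 1000, // a single commit in this example
      job_id: 1,
      model: "llama-2-7b", // hypothetical model name
      metric: "tok/s", // one of SPEEDUP_METRICS
      actual,
      target: 0,
      dtype,
      device: "cuda",
      arch: "A100",
      use_torch_compile: useTorchCompile,
    } as LLMsBenchmarkData);

  const data = [
    record("noquant", false, 100), // eager baseline
    record("noquant", true, 150), // torch.compile
    record("autoquant", true, 180), // autoquant on top of compile
  ];

  // Pass 1 (useTorchCompile=false, useBaseCommitBaseline=true) keys both
  // baselines on eager noquant rows: it emits "eager_speedup" (current eager
  // vs the oldest commit's eager; 1.00 here since there is one commit) and
  // "compile_vs_eager_speedup" (150 / 100 = 1.50).
  // Pass 2 (useTorchCompile=true, useBaseCommitBaseline=false) keys the
  // per-commit baseline on compiled noquant rows: it emits
  // "autoquant_vs_compile_speedup" (180 / 150 = 1.20).
  const withSpeedup = computeSpeedup(
    "pytorch/ao",
    computeSpeedup("pytorch/ao", data, false, true),
    true,
    false
  );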