diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml
index bfd6064ba1..f8ef31fcc2 100644
--- a/.github/workflows/benchmarks-reusable.yml
+++ b/.github/workflows/benchmarks-reusable.yml
@@ -220,11 +220,12 @@ jobs:
           --compute-runtime ${{ inputs.compute_runtime_commit }}
           --build-igc
           ${{ inputs.upload_report && '--output-html' || '' }}
+          ${{ inputs.pr_no != 0 && '--output-markdown' || '' }}
           ${{ inputs.bench_script_params }}
 
     - name: Print benchmark results
       run: |
-        cat ${{ github.workspace }}/ur-repo/benchmark_results.md
+        cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true
 
     - name: Add comment to PR
       uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 7de3926daf..edcb5c02f2 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -24,7 +24,7 @@ on:
         type: number
         required: true
       bench_script_params:
-        description: Parameters passed to script executing benchmark
+        description: Parameters passed to the script executing the benchmarks (passing `--compare baseline` is recommended)
         type: string
         required: false
         default: ''
diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md
index 9cef0e52a3..ec4c75c3bf 100644
--- a/scripts/benchmarks/README.md
+++ b/scripts/benchmarks/README.md
@@ -27,7 +27,7 @@ You can also include additional benchmark parameters, such as environment variab
 Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request.
 
-By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data.
+It is recommended to compare all benchmark runs against `baseline` by passing `--compare baseline` in the benchmark parameters. `baseline` is a well-established set of the latest data.
 
 You must be a member of the `oneapi-src` organization to access these features.
 
@@ -35,13 +35,14 @@ You must be a member of the `oneapi-src` organization to access these features.
 
 By default, the benchmark results are not stored. To store them, use the option `--save <name>`. This will make the results available for comparison during the next benchmark runs.
 
-To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result.
-
-If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`.
+You can compare benchmark results using the `--compare` option. The comparison is presented in a markdown output file (see below). To calculate the relative performance of new results against previously saved data, use `--compare <name>` (e.g. `--compare baseline`). To compare only stored data, without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `name1` indicates the baseline for the relative performance calculation and `--dry-run` prevents the script from running the benchmarks. Listing more than two `--compare` options results in displaying only execution times, without statistical analysis.
 
 Baseline, as well as baseline-v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).
 
+## Output formats
+You can display the results as an HTML file by using `--output-html` and as a markdown file by using `--output-markdown`. Due to the character limit for posting PR comments, the final content of the markdown file might be reduced. In order to obtain the full markdown output, use `--output-markdown full`.
+
 ## Requirements
 
 ### Python
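As a concrete illustration of the comparison workflow described in the README above, here is a minimal, hypothetical sketch of driving the script with the documented flags; the working directory and the saved-result names are made up, and only the flag names come from the change:

```python
import subprocess

# Hypothetical invocation: compare two previously saved result sets without
# re-running any benchmarks, anchor the relative-performance column on
# "baseline", and request the full (untruncated) markdown report.
cmd = [
    "./main.py", "/tmp/benchmarks_workdir",
    "--dry-run",                     # skip running the benchmarks themselves
    "--compare", "baseline",         # first saved data set (the reference)
    "--compare", "my_feature_run",   # second saved data set (made-up name)
    "--relative-perf", "baseline",   # which saved name anchors the ratios
    "--output-markdown", "full",     # do not trim to the PR comment limit
]
subprocess.run(cmd, check=True)
```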
diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py
index 77524a6e02..a740c02672 100755
--- a/scripts/benchmarks/main.py
+++ b/scripts/benchmarks/main.py
@@ -189,9 +189,12 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
         benchmark.teardown()
         print("complete.")
 
-    this_name = "This PR"
-    chart_data = {this_name : results}
+    this_name = options.current_run_name
+    chart_data = {}
+
+    if not options.dry_run:
+        chart_data = {this_name : results}
 
     history = BenchmarkHistory(directory)
     # limit how many files we load.
@@ -199,7 +202,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
     history.load(1000)
 
     # remove duplicates. this can happen if e.g., --compare baseline is specified manually.
-    compare_names = list(dict.fromkeys(compare_names))
+    compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else []
 
     for name in compare_names:
         compare_result = history.get_compare(name)
@@ -207,7 +210,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
             chart_data[name] = compare_result.results
 
     if options.output_markdown:
-        markdown_content = generate_markdown(this_name, chart_data)
+        markdown_content = generate_markdown(this_name, chart_data, options.output_markdown)
 
         with open('benchmark_results.md', 'w') as file:
             file.write(markdown_content)
@@ -251,7 +254,7 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true")
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
-    parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
+    parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append")
    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
     parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold)
     parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
@@ -261,12 +264,13 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
     parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
     parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
+    parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
     parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
-    parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
     parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
     parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime")
     parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev)
     parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc)
+    parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name)
 
     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -283,12 +287,13 @@ def validate_and_parse_env_args(env_args):
     options.exit_on_failure = args.exit_on_failure
     options.compare = Compare(args.compare_type)
     options.compare_max = args.compare_max
-    options.output_html = args.output_html
     options.output_markdown = args.output_markdown
+    options.output_html = args.output_html
     options.dry_run = args.dry_run
     options.umf = args.umf
     options.iterations_stddev = args.iterations_stddev
     options.build_igc = args.build_igc
+    options.current_run_name = args.relative_perf
 
     if args.build_igc and args.compute_runtime is None:
         parser.error("--build-igc requires --compute-runtime to be set")
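The reworked `--output-markdown` flag above relies on argparse's `nargs='?'`/`const` behaviour. A standalone sketch (parser and values invented for illustration, not taken from `main.py`) shows how the three possible spellings parse:

```python
import argparse

# Illustration of nargs='?' with const: the flag may be omitted, given bare,
# or given with an explicit value such as "full".
parser = argparse.ArgumentParser()
parser.add_argument("--output-markdown", nargs="?", const="short", default=None)

print(parser.parse_args([]).output_markdown)                             # None
print(parser.parse_args(["--output-markdown"]).output_markdown)          # short
print(parser.parse_args(["--output-markdown", "full"]).output_markdown)  # full
```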
diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py
index 1bd79f6878..772fee2e02 100644
--- a/scripts/benchmarks/options.py
+++ b/scripts/benchmarks/options.py
@@ -6,6 +6,10 @@ class Compare(Enum):
     AVERAGE = 'average'
     MEDIAN = 'median'
 
+class MarkdownSize(Enum):
+    SHORT = 'short'
+    FULL = 'full'
+
 @dataclass
 class Options:
     workdir: str = None
@@ -20,8 +24,8 @@ class Options:
     verbose: bool = False
     compare: Compare = Compare.LATEST
     compare_max: int = 10 # average/median over how many results
+    output_markdown: MarkdownSize = MarkdownSize.SHORT
     output_html: bool = False
-    output_markdown: bool = True
     dry_run: bool = False
     # these two should probably be merged into one setting
     stddev_threshold: float = 0.02
@@ -32,6 +36,7 @@ class Options:
     extra_env_vars: dict = field(default_factory=dict)
     compute_runtime_tag: str = '24.52.32224.10'
     build_igc: bool = False
+    current_run_name: str = "This PR"
 
 options = Options()
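Because the command line delivers `--output-markdown` as a plain string while `Options.output_markdown` defaults to a `MarkdownSize` member, the following self-contained snippet (illustrative only, not code from this patch) shows how a string-valued `Enum` bridges the two representations:

```python
from enum import Enum

# Stand-in mirroring the MarkdownSize enum introduced in options.py.
class MarkdownSize(Enum):
    SHORT = 'short'
    FULL = 'full'

# A CLI string can be promoted to the enum by value lookup...
assert MarkdownSize('full') is MarkdownSize.FULL
# ...and the enum hands its plain string back via .value.
assert MarkdownSize.SHORT.value == 'short'
```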
diff --git a/scripts/benchmarks/output_markdown.py b/scripts/benchmarks/output_markdown.py
index fc3b65507b..552e924f4f 100644
--- a/scripts/benchmarks/output_markdown.py
+++ b/scripts/benchmarks/output_markdown.py
@@ -1,12 +1,13 @@
-# Copyright (C) 2024 Intel Corporation
-# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# Copyright (C) 2024-2025 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+# Exceptions.
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import collections, re
+import collections
 from benches.result import Result
-from options import options
-import math
+from options import options, MarkdownSize
+import ast
 
 class OutputLine:
     def __init__(self, name):
@@ -14,6 +15,8 @@ def __init__(self, name):
         self.diff = None
         self.bars = None
         self.row = ""
+        self.suite = "Unknown"
+        self.explicit_group = ""
 
     def __str__(self):
         return f"(Label:{self.label}, diff:{self.diff})"
@@ -21,40 +24,167 @@ def __str__(self):
     def __repr__(self):
         return self.__str__()
 
-# Function to generate the markdown collapsible sections for each variant
-def generate_markdown_details(results: list[Result]):
-    markdown_sections = []
-
-    markdown_sections.append(f"""
-<details>
-<summary>Benchmark details - environment, command...</summary>
-""")
-
-    for res in results:
-        env_vars_str = '\n'.join(f"{key}={value}" for key, value in res.env.items())
-        markdown_sections.append(f"""
-<details>
-<summary>{res.label}</summary>
-
-#### Environment Variables:
-{env_vars_str}
-
-#### Command:
-{' '.join(res.command)}
-
-</details>
-""")
-
-    markdown_sections.append(f"""
-</details>
-""")
-    return "\n".join(markdown_sections)
+# The number of required columns in the markdown table,
+# independent of the chart_data content.
+# Required columns:
+# - benchmark_name
+#
+# optional +1: relative performance
+num_info_columns = 1
+
+# Number of compared runs required for the relative performance change
+# calculation. If more saved baselines are provided for comparison, the
+# relative performance is not calculated, since the basic (and hopefully
+# most common) use case for this script is comparing the performance of
+# a PR with the main branch.
+num_baselines_required_for_rel_change = 2
+
+# Maximum number of characters allowed by request validation
+# when posting comments in GitHub PRs
+max_markdown_size = 65536
+
+
+def is_relative_perf_comparison_to_be_performed(chart_data:
+                                                dict[str, list[Result]],
+                                                baseline_name: str):
+    return (len(chart_data) == num_baselines_required_for_rel_change) and \
+           (baseline_name in chart_data.keys())
+
+
+def get_chart_markdown_header(chart_data: dict[str, list[Result]],
+                              baseline_name: str):
+    summary_header = ''
+    final_num_columns = num_info_columns
+
+    if is_relative_perf_comparison_to_be_performed(chart_data, baseline_name):
+        summary_header = "| Benchmark | " + " | ".join(chart_data.keys()) + \
+                         " | Change |\n"
+        final_num_columns += 1
+    else:
+        summary_header = "| Benchmark | " + " | ".join(chart_data.keys()) + \
+                         " |\n"
+
+    summary_header += "|---" * (len(chart_data) + final_num_columns) + "|\n"
+
+    return summary_header
+
+
+def get_improved_regressed_summary(is_improved: bool, rows_count: int):
+    title = "Improved"
+    if not is_improved:
+        title = "Regressed"
+
+    summary = (
+        "\n<details>\n"
+        "<summary>\n"
+        f"{title} {rows_count} "
+        f"(threshold {options.epsilon*100:.2f}%)\n"
+        "</summary>\n\n"
+    )
+
+    return summary
+
+
+def get_relative_perf_summary(group_size: int, group_name: str):
+    summary = (
+        "\n<details>\n"
+        f"<summary> Relative perf in group {group_name} "
+        f"({group_size})\n"
+        "</summary>\n\n"
+    )
+
+    return summary
+
+
+def get_main_branch_run_name(chart_data: dict[str, list[Result]],
+                             baseline_name: str):
+    for key in chart_data.keys():
+        if key != baseline_name:
+            return key
+
+    return None
+
+
+def get_available_markdown_size(current_markdown_size: int):
+    return max(0, max_markdown_size - current_markdown_size)
+
+
+def is_content_in_size_limit(content_size: int, current_markdown_size: int):
+    return content_size <= get_available_markdown_size(current_markdown_size)
+
+
+def get_explicit_group_name(result: Result):
+    explicit_group_name = result.explicit_group
+
+    if explicit_group_name != "":
+        return explicit_group_name
+    else:
+        return "Other"
+
+
+# Function to generate the markdown collapsible sections for each variant
+def generate_markdown_details(results: list[Result],
+                              current_markdown_size: int,
+                              markdown_size: MarkdownSize):
+    markdown_sections = []
+    markdown_start = ("\n<details>\n"
+                      "<summary>Benchmark details - environment, command...</summary>"
+                      "\n")
+    markdown_sections.append(markdown_start)
+
+    for res in results:
+        env_dict = res.env
+        command = res.command
+
+        # If data is collected from already saved results,
+        # the content is parsed as strings
+        if isinstance(res.env, str):
+            # Since the scripts would be used solely on data prepared
+            # by our scripts, this should be safe.
+            # However, it may need an additional blessing:
+            # https://docs.python.org/3/library/ast.html#ast.literal_eval
+            env_dict = ast.literal_eval(res.env)
+        if isinstance(res.command, str):
+            command = ast.literal_eval(res.command)
+
+        section = ("\n<details>\n"
+                   f"<summary>{res.label}</summary>\n\n"
+                   "#### Command:\n"
+                   f"{' '.join(command)}\n\n")
+
+        if env_dict:
+            env_vars_str = '\n'.join(f"{key}={value}"
+                                     for key, value in env_dict.items())
+            section += (f"#### Environment Variables:\n {env_vars_str}\n")
+
+        section += "\n</details>\n"
+
+        markdown_sections.append(section)
+
+    markdown_sections.append("\n</details>\n")
+
+    full_markdown = "\n".join(markdown_sections)
+
+    if markdown_size == MarkdownSize.FULL:
+        return full_markdown
+    else:
+        if is_content_in_size_limit(len(full_markdown), current_markdown_size):
+            return full_markdown
+        else:
+            return "\nBenchmark details contain too many chars to display\n"
+
 
-def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]):
-    summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n"
-    summary_table += "|---" * (len(chart_data) + 4) + "|\n"
+def generate_summary_table_and_chart(chart_data: dict[str, list[Result]],
+                                     baseline_name: str,
+                                     markdown_size: MarkdownSize):
+    summary_table = get_chart_markdown_header(chart_data=chart_data,
+                                              baseline_name=baseline_name)
 
     # Collect all benchmarks and their results
+    # key: benchmark name,
+    # value: dict(run_name : single_result_in_the_given_run)
     benchmark_results = collections.defaultdict(dict)
+
+    # key: run name
+    # results: results from different benchmarks collected in the named run
     for key, results in chart_data.items():
         for res in results:
             benchmark_results[res.name][key] = res
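To make the size-budget logic above easier to follow, here is a compact, self-contained sketch of the same idea (the limit mirrors `max_markdown_size`; the report text and helper name are invented): a section is appended only if it still fits under GitHub's comment limit, otherwise a short placeholder is emitted instead.

```python
MAX_MARKDOWN_CHARS = 65536  # mirrors the max_markdown_size constant above

def append_if_it_fits(report: str, section: str, placeholder: str) -> str:
    # Remaining budget for this comment; never negative.
    remaining = max(0, MAX_MARKDOWN_CHARS - len(report))
    return report + (section if len(section) <= remaining else placeholder)

report = "# Summary\n| Benchmark | This PR | baseline |\n"
details = "<details><summary>Benchmark details</summary>...</details>"
report = append_if_it_fits(
    report, details,
    "\nBenchmark details contain too many chars to display\n")
```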
@@ -62,159 +192,209 @@ def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]):
 
     # Generate the table rows
     output_detailed_list = []
-
-    global_product = 1
-    mean_cnt = 0
-    improved = 0
-    regressed = 0
-    no_change = 0
-
     for bname, results in benchmark_results.items():
         oln = OutputLine(bname)
         oln.row = f"| {bname} |"
         best_value = None
         best_key = None
 
-        # Determine the best value
+        are_suite_group_assigned = False
+
+        # Determine the best value for the given benchmark, among the results
+        # from all saved runs specified by --compare
+        # key: run name,
+        # res: single result collected in the given run
         for key, res in results.items():
-            if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value):
+            if not are_suite_group_assigned:
+                oln.suite = res.suite
+                oln.explicit_group = get_explicit_group_name(res)
+
+                are_suite_group_assigned = True
+
+            if best_value is None or \
+               (res.lower_is_better and res.value < best_value) or \
+               (not res.lower_is_better and res.value > best_value):
                 best_value = res.value
                 best_key = key
 
-        # Generate the row with the best value highlighted
+        # Generate the row with all the results from saved runs specified by
+        # --compare,
+        # Highlight the best value in the row with data
         if options.verbose: print(f"Results: {results}")
         for key in chart_data.keys():
             if key in results:
                 intv = results[key].value
                 if key == best_key:
-                    oln.row += f" <ins>{intv:3f}</ins> {results[key].unit} |" # Highlight the best value
+                    # Highlight the best value
+                    oln.row += f" <ins>{intv:3f}</ins> {results[key].unit} |"
                 else:
                     oln.row += f" {intv:.3f} {results[key].unit} |"
             else:
                 oln.row += " - |"
 
-        if len(chart_data.keys()) == 2:
-            key0 = list(chart_data.keys())[0]
-            key1 = list(chart_data.keys())[1]
-            if (key0 in results) and (key1 in results):
-                v0 = results[key0].value
-                v1 = results[key1].value
+        if is_relative_perf_comparison_to_be_performed(chart_data,
+                                                       baseline_name):
+            pr_key = baseline_name
+            main_key = get_main_branch_run_name(chart_data, baseline_name)
+
+            if (pr_key in results) and (main_key in results):
+                pr_val = results[pr_key].value
+                main_val = results[main_key].value
                 diff = None
-                if v0 != 0 and results[key0].lower_is_better:
-                    diff = v1/v0
-                elif v1 != 0 and not results[key0].lower_is_better:
-                    diff = v0/v1
+                if pr_val != 0 and results[pr_key].lower_is_better:
+                    diff = main_val / pr_val
+                elif main_val != 0 and not results[pr_key].lower_is_better:
+                    diff = pr_val / main_val
 
                 if diff != None:
-                    oln.row += f"{(diff * 100):.2f}%"
                     oln.diff = diff
 
         output_detailed_list.append(oln)
 
-    sorted_detailed_list = sorted(output_detailed_list, key=lambda x: (x.diff is not None, x.diff), reverse=True)
+    sorted_detailed_list = sorted(output_detailed_list, key=lambda x:
+                                  (x.diff is not None, x.diff), reverse=True)
 
-    diff_values = [oln.diff for oln in sorted_detailed_list if oln.diff is not None]
+    diff_values = [oln.diff for oln in sorted_detailed_list
+                   if oln.diff is not None]
+
+    improved_rows = []
+    regressed_rows = []
 
     if len(diff_values) > 0:
-        max_diff = max(max(diff_values) - 1, 1 - min(diff_values))
-
         for oln in sorted_detailed_list:
             if oln.diff != None:
-                oln.row += f" | {(oln.diff - 1)*100:.2f}%"
                 delta = oln.diff - 1
-                oln.bars = round(10*(oln.diff - 1)/max_diff) if max_diff != 0.0 else 0
-                if oln.bars == 0 or abs(delta) < options.epsilon:
-                    oln.row += " | . |"
-                elif oln.bars > 0:
-                    oln.row += f" | {'+' * oln.bars} |"
-                else:
-                    oln.row += f" | {'-' * (-oln.bars)} |"
+                oln.row += f" {delta*100:.2f}%"
 
-                mean_cnt += 1
                 if abs(delta) > options.epsilon:
                     if delta > 0:
-                        improved+=1
+                        improved_rows.append(oln.row + " | \n")
                     else:
-                        regressed+=1
-                else:
-                    no_change+=1
-
-                global_product *= oln.diff
-            else:
-                oln.row += " | |"
 
+                        regressed_rows.append(oln.row + " | \n")
 
             if options.verbose: print(oln.row)
 
             summary_table += oln.row + "\n"
     else:
         for oln in sorted_detailed_list:
-            oln.row += " | |"
-
             if options.verbose: print(oln.row)
 
             summary_table += oln.row + "\n"
 
-    grouped_objects = collections.defaultdict(list)
-
-    for oln in output_detailed_list:
-        s = oln.label
-        prefix = re.match(r'^[^_\s]+', s)[0]
-        grouped_objects[prefix].append(oln)
-
-    grouped_objects = dict(grouped_objects)
-
-    if mean_cnt > 0:
-        global_mean = global_product ** (1/mean_cnt)
-        summary_line = f"Total {mean_cnt} benchmarks in mean. "
-        summary_line += "\n" + f"Geomean {global_mean*100:.3f}%. \nImproved {improved} Regressed {regressed} (threshold {options.epsilon*100:.2f}%)"
-    else:
+    regressed_rows.reverse()
+
+    is_at_least_one_diff = False
+    summary_line = ''
+
+    if len(improved_rows) > 0:
+        is_at_least_one_diff = True
+        summary_line += get_improved_regressed_summary(
+            is_improved=True,
+            rows_count=len(improved_rows)
+        )
+        summary_line += get_chart_markdown_header(
+            chart_data=chart_data,
+            baseline_name=baseline_name
+        )
+
+        for row in improved_rows:
+            summary_line += row
+
+        summary_line += "\n</details>"
+
+    if len(regressed_rows) > 0:
+        is_at_least_one_diff = True
+        summary_line += get_improved_regressed_summary(
+            is_improved=False,
+            rows_count=len(regressed_rows)
+        )
+
+        summary_line += get_chart_markdown_header(
+            chart_data=chart_data,
+            baseline_name=baseline_name
+        )
+
+        for row in regressed_rows:
+            summary_line += row
+
+        summary_line += "\n</details>"
+
+    if not is_at_least_one_diff:
         summary_line = f"No diffs to calculate performance change"
 
     if options.verbose: print(summary_line)
 
-    summary_table = "\n## Performance change in benchmark groups\n"
-
-    for name, outgroup in grouped_objects.items():
-        outgroup_s = sorted(outgroup, key=lambda x: (x.diff is not None, x.diff), reverse=True)
-        product = 1.0
-        n = len(outgroup_s)
-        r = 0
-        for oln in outgroup_s:
-            if oln.diff != None:
-                product *= oln.diff
-                r += 1
-        if r > 0:
-            summary_table += f"""
-<details>
-<summary> Relative perf in group {name} ({n}): {math.pow(product, 1/r)*100:.3f}% </summary>
-
-"""
-        else:
-            summary_table += f"""
-<details>
-<summary> Relative perf in group {name} ({n}): cannot calculate </summary>
-
-"""
-        summary_table += "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n"
-        summary_table += "|---" * (len(chart_data) + 4) + "|\n"
-
-        for oln in outgroup_s:
-            summary_table += f"{oln.row}\n"
-
-        summary_table += f"""
-</details>
-
-"""
+    grouped_in_suites = collections.defaultdict(lambda:
+                                                collections.defaultdict(list))
+    for oln in output_detailed_list:
+        grouped_in_suites[oln.suite][oln.explicit_group].append(oln)
+
+    for suite_name, suite_groups in grouped_in_suites.items():
+        summary_table += f"<details><summary>{suite_name}</summary>\n\n"
+
+        for name, outgroup in suite_groups.items():
+            outgroup_s = sorted(outgroup, key=lambda x:
+                                (x.diff is not None, x.diff), reverse=True)
+
+            summary_table += get_relative_perf_summary(
+                group_size=len(outgroup_s),
+                group_name=name
+            )
+            summary_table += get_chart_markdown_header(chart_data,
+                                                       baseline_name)
+
+            for oln in outgroup_s:
+                summary_table += f"{oln.row}\n"
+
+            summary_table += "\n</details>\n\n"
+
+        summary_table += "</details>"
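For readers unfamiliar with the nested `defaultdict` used for the suite/group layout above, this tiny standalone sketch (with invented suite, group, and benchmark names) performs the same two-level grouping:

```python
import collections
from dataclasses import dataclass

@dataclass
class Row:                      # invented stand-in for OutputLine
    suite: str
    explicit_group: str
    name: str

rows = [
    Row("SuiteA", "GroupX", "bench_1"),
    Row("SuiteA", "GroupX", "bench_2"),
    Row("SuiteB", "Other", "bench_3"),
]

grouped = collections.defaultdict(lambda: collections.defaultdict(list))
for row in rows:
    grouped[row.suite][row.explicit_group].append(row)

for suite, groups in grouped.items():
    for group, members in groups.items():
        print(suite, group, [m.name for m in members])
```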
 
+    if markdown_size == MarkdownSize.FULL:
+        return summary_line, summary_table
+    else:
+        full_content_size = len(summary_table) + len(summary_line)
+
+        if is_content_in_size_limit(content_size=full_content_size,
+                                    current_markdown_size=0):
+            return summary_line, summary_table
+        else:
+            if is_content_in_size_limit(content_size=len(summary_line),
+                                        current_markdown_size=0):
+                return summary_line, ''
+            else:
+                return (
+                    "\n# Summary\n"
+                    "Benchmark output is too large to display\n\n"
+                )
 
-    return summary_line, summary_table
 
-def generate_markdown(name: str, chart_data: dict[str, list[Result]]):
-    (summary_line, summary_table) = generate_summary_table_and_chart(chart_data)
-
-    return f"""
-# Summary
-{summary_line}\n
-(<ins>result</ins> is better)\n
-{summary_table}
-# Details
-{generate_markdown_details(chart_data[name])}
-"""
+
+def generate_markdown(name: str,
+                      chart_data: dict[str, list[Result]],
+                      markdown_size: MarkdownSize):
+    (summary_line, summary_table) = generate_summary_table_and_chart(
+        chart_data,
+        name,
+        markdown_size
+    )
+
+    current_markdown_size = len(summary_line) + len(summary_table)
+
+    generated_markdown = (
+        "\n# Summary\n"
+        "(Emphasized values are the best results)\n"
+        f"{summary_line}\n"
+        f"{summary_table}\n\n"
+    )
+
+    if name in chart_data.keys():
+        markdown_details = generate_markdown_details(chart_data[name],
+                                                     current_markdown_size,
+                                                     markdown_size)
+        generated_markdown += (
+            "\n# Details\n"
+            f"{markdown_details}\n"
+        )
+
+    return generated_markdown