From 4cc007a4010418e6b9d4625b03bfdc33751e864e Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 6 Sep 2024 08:51:52 -0500 Subject: [PATCH 1/6] Move utility tools from triton-mlir to main_perf branch - Plot layout script - occ.sh - amdgcn-cfg --- .../perf-kernels/tools/amdgcn-cfg/README.md | 14 + .../tools/amdgcn-cfg/amdgcn-cfg.py | 226 +++++ python/perf-kernels/tools/occ.sh | 71 ++ .../perf-kernels/tools/plot-layout/README.md | 117 +++ .../tools/plot-layout/plot_layout.py | 341 +++++++ .../tools/plot-layout/tikzplot.tex | 880 ++++++++++++++++++ 6 files changed, 1649 insertions(+) create mode 100644 python/perf-kernels/tools/amdgcn-cfg/README.md create mode 100644 python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py create mode 100755 python/perf-kernels/tools/occ.sh create mode 100644 python/perf-kernels/tools/plot-layout/README.md create mode 100755 python/perf-kernels/tools/plot-layout/plot_layout.py create mode 100755 python/perf-kernels/tools/plot-layout/tikzplot.tex diff --git a/python/perf-kernels/tools/amdgcn-cfg/README.md b/python/perf-kernels/tools/amdgcn-cfg/README.md new file mode 100644 index 000000000000..bea420ea530c --- /dev/null +++ b/python/perf-kernels/tools/amdgcn-cfg/README.md @@ -0,0 +1,14 @@ +# Control Flow Graph Generator from AMDGCN assembly + +The script reads an assembly file and generates a Control Flow Graph (CFG) for each function in the file. The graph can be saved in `dot`, `svg` and `pdf` formats. The nodes of a graph can be represented with 1) just labels or 2) the corresponding assembly code. The edges of a graph can help to identify cycles and, thus, to provide better navigation through the code. + + +### Basic usage + +``` +python ./amdgcn-cfg.py -i <input-assembly-file> -o <output-dir>/<output-prefix> -f [dot|svg|pdf] +``` + +`dot`-files can be visualized with [this](https://dreampuf.github.io/GraphvizOnline) online tool. You just need to copy and paste the content of a generated `dot`-file. + +By default, the nodes are named with basic block labels. 
Use `-v` or `--verbose` option to add assembly source code to corresponding nodes. diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py new file mode 100644 index 000000000000..4100528f28db --- /dev/null +++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py @@ -0,0 +1,226 @@ +import os +import argparse +import re +from collections import OrderedDict +import graphviz + + +class Options: + def __init__(self, input_file, output_file, verbose, format): + if not os.path.exists(input_file): + raise RuntimeError('input file is not provided') + + output_dir = os.path.dirname(output_file) + if not os.path.exists(output_dir): + raise RuntimeError('output directory does not exist') + + self.input_file = input_file + self.output_file = output_file + self.verbose = verbose + self.format = format + self.output_dir = output_dir + + +class Block: + def __init__(self, label, code): + self.label = label + self.code = code + self.edges = [] + + +class Kernel: + def __init__(self, kernel_name, blocks): + self.name = kernel_name + self.blocks = blocks + self.cfg = None + + +begin_label = 'Begin' +end_label = 'End' + + +def find_kernel(text): + func_name_expr = r'^([^\s^\.]\w.+):' + func_name = None + start = None + for index, line in enumerate(text): + match = re.search(func_name_expr, line) + if not match is None: + func_name = match[1] + start = index + break + if start == None: + return None, None, None + + end = None + for index, line in enumerate(text): + if not re.search(r's_endpgm', line) is None: + end = index + break + + if end == None: + return None, None, None + + return func_name, text[start:end+1], end + + +def find_label(kernel): + label = None + index = None + for index, line in enumerate(kernel): + match = re.search(r'^\.(\w+):', line) + if not match is None: + label = match[1] + break + return label, index + + +def get_block_list(kernel): + label, index = find_label(kernel) + + blocks = OrderedDict() + 
if (index > 1): + blocks[begin_label] = Block(begin_label, kernel[:index-1]) + + while label != None: + kernel = kernel[index+1:] + next_label, next_index = find_label(kernel) + if next_label is None: + code = kernel[index:] + else: + code = kernel[:next_index] + blocks[label] = Block(label, code) + + label = next_label + index = next_index + + blocks[end_label] = Block(end_label, []) + + return blocks + + +def find_terminators(code): + terminator_labels = [] + for line in code: + branch = re.search(r'(c)?branch.*\s+\.?(.*)', line) + if not branch is None: + is_condional = True if len(branch.groups()) == 2 else False + label_idx = 2 if is_condional else 1 + terminator_labels.append(branch[label_idx]) + if not is_condional: + return terminator_labels, True + end = re.search(r's_endpgm', line) + if not end is None: + terminator_labels.append(end_label) + return terminator_labels, True + + return terminator_labels, False + + +def add_edges(kernel): + keys = list(kernel.blocks.keys()) + for index, curr_label in enumerate(keys): + if curr_label == end_label: + continue + + code = kernel.blocks[curr_label].code + terminators, is_last_unconditional = find_terminators(code[:-1]) + + if is_last_unconditional: + # unconditional jump in the middle of the block + break + + # handle the last terminator in the current BB + last_terminator, is_unconditional = find_terminators([code[-1]]) + + is_conditional = not is_unconditional + next_block_label = keys[index + 1] + is_next_covered = next_block_label in terminators + + if last_terminator: + terminators.extend(last_terminator) + if is_conditional and not is_next_covered: + next_block_label = keys[index + 1] + terminators.append(next_block_label) + else: + if not is_next_covered: + next_block_label = keys[index + 1] + terminators.append(next_block_label) + + assert(len(terminators)) + kernel.blocks[curr_label].edges = terminators + + +def generate_cfg(kernel, options): + graph = graphviz.Digraph(f'{kernel.name}') + for curr_label 
in kernel.blocks: + block = kernel.blocks[curr_label] + asm = [line.strip() for line in block.code] + if options.verbose: + label_text = repr('\n'.join([f'{curr_label}', *asm])) + else: + label_text = curr_label + graph.node(curr_label, + shape='rect', + labeljust='l', + margin='0.01', + label=label_text) + + for curr_label in kernel.blocks: + block = kernel.blocks[curr_label] + for edge in block.edges: + graph.edge(curr_label, edge) + + return graph + + +def main(options): + asm = [] + with open(options.input_file, 'r') as file: + context = file.readlines() + for line in context: + asm.append(line[:-1]) + + kernels = [] + last_end_index = 0 + while last_end_index != None: + func_name, kernel_asm, last_end_index = find_kernel(asm) + if kernel_asm == None: + break + + blocks = get_block_list(kernel_asm) + kernel = Kernel(func_name, blocks) + add_edges(kernel) + + cfg = generate_cfg(kernel, options) + kernel.cfg = cfg + kernels.append(kernel) + asm = asm[last_end_index+1:] + + for index, kernel in enumerate(kernels): + output_file_name = f'{options.output_file}.kernel-{index}' + if options.format == 'dot': + with open(f'{output_file_name}.dot', 'w') as file: + file.write(str(kernel.cfg)) + file.write('\n') + else: + kernel.cfg.render(filename=f'{output_file_name}', + format=options.format, + ).replace('\\', '/') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Generates Control Flow Graph (CFG) from amdgcn assembly file", + ) + parser.add_argument("-i", "--input", type=str, default=None, help="input file") + parser.add_argument("-o", "--output", type=str, default=None, help="output file prefix") + parser.add_argument("-v", "--verbose", action='store_true', help='verbose output') + parser.add_argument("-f", "--format", choices=['dot','svg', 'pdf'], + default="dot", + help="output format type") + args = parser.parse_args() + + options = Options(args.input, args.output, args.verbose, args.format) + + main(options) diff --git 
a/python/perf-kernels/tools/occ.sh b/python/perf-kernels/tools/occ.sh new file mode 100755 index 000000000000..51c8f9095907 --- /dev/null +++ b/python/perf-kernels/tools/occ.sh @@ -0,0 +1,71 @@ +#! /bin/bash + +## $1: input script that contains one kernel + +rm -rf ~/.triton/cache/ + +export MLIR_ENABLE_DUMP=1 +export AMDGCN_ENABLE_DUMP=1 +## Assume CDNA arch +SIMD=4 +LDS_SIZE=65536 +TOTAL_VGPR=512 + +get_occ_per_CU() { + ## $1: vgpr count + vgpr=$1 + occPerEU=$((TOTAL_VGPR/vgpr)) + if [[ $vgpr -gt 256 ]]; then + occPerEU=1 + elif [[ $vgpr -gt 168 ]]; then + occPerEU=2 + elif [[ $vgpr -gt 128 ]]; then + occPerEU=3 + elif [[ $vgpr -gt 96 ]]; then + occPerEU=4 + elif [[ $vgpr -gt 80 ]]; then + occPerEU=5 + elif [[ $vgpr -gt 72 ]]; then + occPerEU=6 + elif [[ $vgpr -gt 64 ]]; then + occPerEU=7 + else + occPerEU=8 + fi + + occPerCU=$((occPerEU*SIMD/num_warps)) + echo $occPerCU +} + +$1 > output.mlir 2>&1 + +LDS_line=$(sed -n '/triton_gpu\.shared\ /p' output.mlir | tail -n 1 | grep -o 'triton_gpu.shared = [0-9]*') +numWarps_line=$(sed -n '/triton_gpu\.num-warps/p' output.mlir | tail -n 1 | grep -o 'triton_gpu.num-warps. = [0-9]*') + +LDS=${LDS_line##*=} +num_warps=${numWarps_line##*=} +echo "LDS: $LDS, num_warps: $num_warps" + +VGPRs=$(sed -n '/vgpr_count/p' output.mlir | tail -n 1 | awk '{print $2}') +SPILLs=$(sed -n '/vgpr_spill/p' output.mlir | tail -n 1 | awk '{print $2}') + +echo "VGPRS: $VGPRs (spill: $SPILLs)" + +occLDSPerCU=$((LDS_SIZE/LDS)) +occVgprPerCU=$(get_occ_per_CU $VGPRs) +occPerCU=$occVgprPerCU +if [ $occLDSPerCU -lt $occVgprPerCU ];then + occPerCU=$occLDSPerCU +fi +occPerEU=$((occPerCU*num_warps/SIMD)) +echo "occupancy: $occPerEU waves/SIMD or $occPerCU workgroups/CU (occLDSPerCU: $occLDSPerCU, occVgprPerCU: $occVgprPerCU)" + +perf=$(tail -n 2 output.mlir) +echo "$perf" + +## remove distracting info from the assembly +sed -i '/local_/! 
{/\.loc/d}' output.mlir +sed -i '/\.Ltmp.*:/d' output.mlir +sed -i '/AMD clang version/d' output.mlir + +sed -n '/AMDGCN/, $p' output.mlir > output.amdgcn diff --git a/python/perf-kernels/tools/plot-layout/README.md b/python/perf-kernels/tools/plot-layout/README.md new file mode 100644 index 000000000000..e12cf9441d37 --- /dev/null +++ b/python/perf-kernels/tools/plot-layout/README.md @@ -0,0 +1,117 @@ +# Plot script for triton layouts + +This script is used to draw triton layouts in the context of matmul. +Here is the help info from the script. + +```bash +>$ python3 plot_layout.py -h +usage: Draw triton layouts [-h] [-shape SHAPE SHAPE SHAPE] [-plot {blocked,dot,wmma,lds}] [-nonKDim {16,32}] [-sizePerThread SIZEPERTHREAD SIZEPERTHREAD] [-threadsPerWarp THREADSPERWARP THREADSPERWARP] + [-warpsPerCTA WARPSPERCTA WARPSPERCTA] [-order ORDER ORDER] [-kWidth {4,8,16}] [-lds_layout {swizzle,padding,none}] [-lds_access {read,write,none}] [-wave_size {32,64}] [-o O] [-mfmaTrans] [-keep] + +options: + -h, --help show this help message and exit + -shape SHAPE SHAPE SHAPE + Tensor shape in the form of M,N,K + -plot {blocked,dot,wmma,lds} + choose plot mode + -nonKDim {16,32} mfma instruction dim + -sizePerThread SIZEPERTHREAD SIZEPERTHREAD + -threadsPerWarp THREADSPERWARP THREADSPERWARP + -warpsPerCTA WARPSPERCTA WARPSPERCTA + -order ORDER ORDER + -kWidth {4,8,16} number of elements per thread + -lds_layout {swizzle,padding,none} + choose the LDS data layout + -lds_access {read,write,none} + choose LDS access mode + -wave_size {32,64} choose the wmma instruction mode + -o O output pdf file name (without surfix) + -mfmaTrans If set, then use mfma.trans layout + -keep If set, keep the generated .tex file +``` + +## Installation +This script does not require torch or triton to be installed. The only package +it depends on is latex. 
On Ubuntu, do +```bash +sudo apt install texlive-full +``` + +## Draw blocked layout (`-plot blocked`) + +Examples: +```bash +python3 plot_layout.py -plot blocked -shape 128 128 64 -sizePerThread 1 8 -threadsPerWarp 8 8 -warpsPerCTA 4 1 +python3 plot_layout.py -plot blocked -shape 16 128 64 -sizePerThread 1 8 -threadsPerWarp 16 4 -warpsPerCTA 1 2 +python3 plot_layout.py -plot blocked -shape 32 128 64 -sizePerThread 8 1 -threadsPerWarp 4 16 -warpsPerCTA 1 2 -order 0 1 +``` + +Blocked layouts are used during global load. They are used to describe the layout of the tensor +for pointers and results. +We can provide tensor shape (`-shape M N K`) and blocked layout parameters ( +`-sizePerThread x y`, `-threadsPerWarp x y`, and `-warpsPerCTA x y`). +We can also provide the order of the tensor as `-order x y` to control which dim +is the fastest changing dimension. + +Notes +- All of the gemm dims (M, N, and K) are needed when providing the shape. But only + M and K will be used to plot the layout of the tensor. +- The script does not support the case when threads are loading elements that are + out of the boundary of the tensor dimensions. This means (the script also asserts that each product evenly divides the dimension) + - For M: sizePerThread[0] * threadsPerWarp[0] * warpsPerCTA[0] <= M + - For K: sizePerThread[1] * threadsPerWarp[1] * warpsPerCTA[1] <= K + + +## Draw mfma operand and result layouts (`-plot dot`) + +Examples: +```bash +python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 32 -kWidth 4 +python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 32 -kWidth 8 +python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 32 -kWidth 8 -mfmaTrans +python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 16 -kWidth 8 +python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 16 -kWidth 16 +``` + +This mode draws two graphs: +1. The layout of the whole tile for tile A, B, and C +2. 
The layout of a single mfma block, operands and results of one or more mfma + instructions that share the same accumulating VGPRs. + This view has thread distributions among tensor elements. + +Knobs +- `-kWidth`: the number of elements that will be loaded into one thread at once +- `-nonKDim`: 16 or 32, which is used to control the mfma instruction size +- `-mfmaTrans`: if set, the transposed mfma layout will be plotted. + +Notes +- The layout shows the mapping from the threads/wave to the elements in the + original tensor. It does not care how the elements are arranged in LDS, e.g. + swizzled to avoid bank conflicts. +- The script does not allow settings for data type or k dim of the mfma instruction. + These are controlled indirectly by the `-kWidth` flag. + - For example, if we want `mfma_32x32x8xf16`, we can set `-nonKDim 32` and `-kWidth 4`. + - If we want `mfma_32x32x16xf8`, we can set `-nonKDim 32` and `-kWidth 8`. + + +## Draw LDS access (`-plot lds`) + +Examples: +```bash +python3 plot_layout.py -plot lds -lds_layout none -lds_access none -shape 128 128 64 -kWidth 8 +``` + +Knobs +- `kWidth` here means the vector size when accessing LDS +- Three options for `-lds_layout`: + - `none`: no swizzling, no padding + - `padding`: padding at every 128B + - `swizzle`: apply the swizzling pattern, which is derived from tensor shape and kWidth. +- Three options for `-lds_access`: + - `none`: do not plot access pattern + - `read`: plot accessed elements during ds_read + - `write`: plot accessed elements during ds_write. Note that this needs some information from + global load. Therefore, we need to provide `-sizePerThread` and `-threadsPerWarp`. + +Notes +- This mode is rarely used. If you have any questions, please contact Lixun Zhang directly. 
diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py new file mode 100755 index 000000000000..c2387905f3e0 --- /dev/null +++ b/python/perf-kernels/tools/plot-layout/plot_layout.py @@ -0,0 +1,341 @@ +import argparse +import sys +import yaml +import os +import glob +import subprocess + + +def draw_preamble_cmd(): + return '''\\documentclass[tikz, border=1mm, dvipsnames]{standalone} +\\usepackage{ifthen} +\\usepackage{tikz} +\\usetikzlibrary{arrows.meta,arrows} +\\usetikzlibrary{intersections} +\\usetikzlibrary{calc, quotes} +\\usetikzlibrary{patterns} +\\usepackage{xparse} + +\\ExplSyntaxOn +\\NewExpandableDocumentCommand{\\bitwiseXor}{mm} + { + \\recuenco_bitwise_xor:nn { #1 } { #2 } + } + +\\cs_new:Nn \\recuenco_bitwise_xor:nn + { + \\int_from_bin:e + { + \\__recuenco_bitwise_xor:ee { \\int_to_bin:n { #1 } } { \\int_to_bin:n { #2 } } + } + } +\\cs_generate_variant:Nn \\int_from_bin:n { e } + +\\cs_new:Nn \\__recuenco_bitwise_xor:nn + { + \\__recuenco_bitwise_xor_binary:ee + { + \\prg_replicate:nn + { + \\int_max:nn { \\tl_count:n { #1 } } { \\tl_count:n { #2 } } - \\tl_count:n { #1 } + } + { 0 } + #1 + } + { + \\prg_replicate:nn + { + \\int_max:nn { \\tl_count:n { #1 } } { \\tl_count:n { #2 } } - \\tl_count:n { #2 } + } + { 0 } + #2 + } + } +\\cs_generate_variant:Nn \\__recuenco_bitwise_xor:nn { ee } + +\\cs_new:Nn \\__recuenco_bitwise_xor_binary:nn + { + \\__recuenco_bitwise_xor_binary:w #1;#2; + } +\\cs_generate_variant:Nn \\__recuenco_bitwise_xor_binary:nn { ee } + +\\cs_new:Npn \\__recuenco_bitwise_xor_binary:w #1#2;#3#4; + { + \\int_abs:n { #1-#3 } + \\tl_if_empty:nF { #2 } { \\__recuenco_bitwise_xor_binary:w #2;#4; } + } + +\\ExplSyntaxOff''' + + +def draw_dot_layout_cmd(M, N, K, mfmaNonKDim, warpsPerCTA, trans, kpack): + return f'''\\begin{{document}} + \\begin{{tikzpicture}} + \\def\\scale{{1}} + \\def\\elem{{0.04}} + \\coordinate (C TL) at (0,0); + \\def\\opColorAL{{magenta}} + 
\\def\\opColorAR{{cyan}} + \\def\\opColorBL{{Maroon}} + \\def\\opColorBR{{BlueGreen}} + \\drawDot{{{M}}}{{{N}}}{{{K}}}{{{mfmaNonKDim}}}{{{warpsPerCTA[0]}}}{{{warpsPerCTA[1]}}}{{{trans}}}{{{kpack}}} + + \\coordinate (C TL) at ($(C TL)+({N}*\elem+32*\elem, 0)$); + \\def\\mfmaTrans{{{trans}}} + + %% Draw zoomed in view of mfma + \\def\\elem{{.16}} + \\pgfmathsetmacro{{\\gap}}{{\\elem*5}} + \\pgfmathsetmacro{{\\nonTrans}}{{1-\\mfmaTrans}} + \\pgfmathsetmacro{{\\groups}}{{64/{mfmaNonKDim}}} + \\coordinate (C TL) at ($(C TL)+(.5*\\gap+1.2*\\nonTrans*\\gap+\\groups*{kpack}*\\elem, 0)$); + \\drawMFMAInstr{{{mfmaNonKDim}}}{{{kpack}}}{{\\mfmaTrans}} + + \\end{{tikzpicture}} +\\end{{document}}''' + + +def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, + order): + return f'''\\begin{{document}} + \\begin{{tikzpicture}} + \\def\\scale{{1}} + \\def\\elem{{0.06}} + \\coordinate (TL) at (0,0); + \\drawBlockedTensor{{{M}}}{{{K}}}{{{sizePerThread[0]}}}{{{sizePerThread[1]}}}{{{threadsPerWarp[0]}}}{{{warpsPerCTA[0]}}}{{{warpsPerCTA[1]}}}{{{order[0]}}} + \\end{{tikzpicture}} +\\end{{document}}''' + + +def draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread, + threadsPerWarp): + if ldsLayout == 'swizzle': + hasSwizzle = 1 + elif ldsLayout == 'padding': + hasSwizzle = 2 + else: + hasSwizzle = 0 + + if ldsAccess == 'read': + accessMode = 1 + elif ldsAccess == 'write': + accessMode = 2 + else: + accessMode = 0 + + return f'''\\begin{{document}} + \\begin{{tikzpicture}} + \\def\\scale{{1}} + \\def\\M{{{M}}} + \\def\\K{{{K}}} + \\def\\vec{{{kpack}}} + \\def\\hasSwizzle{{{hasSwizzle}}} + \\def\\accessMode{{{accessMode}}} + + \\def\\sizePerThreadK{{{sizePerThread[1]}}} + \\def\\sizePerThreadM{{{sizePerThread[0]}}} + \\def\\threadsPerWarpK{{{threadsPerWarp[1]}}} + + \\def\\elem{{0.18}} + \\coordinate (TL) at (0,0); + \\drawTensorLayoutGlobalMem + \\coordinate (TL) at ($(TL)+(0, -24*\\elem-10*\\elem)$); + 
\\drawLDSLayoutTritonSwizzling{{\\hasSwizzle}}{{\\accessMode}} + \\end{{tikzpicture}} +\\end{{document}}''' + + +def draw_wmma_instr_cmd(waveSize): + wmma_mode = 0 if waveSize == 32 else 1 + return f'''\\begin{{document}} + \\begin{{tikzpicture}} + \\def\\scale{{1}} + \\coordinate (C TL) at (0,0); + \\def\\elem{{0.25}} + \\drawWMMAInstr{{{wmma_mode}}}{{1}} + \\end{{tikzpicture}} +\\end{{document}}''' + + +def run_bash_command(commandstring): + proc = subprocess.run(commandstring, + shell=True, + check=True, + executable='/bin/bash', + stdout=subprocess.PIPE) + return proc.stdout.splitlines() + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="Draw triton layouts", + allow_abbrev=False, + ) + ## tensor shapes + parser.add_argument("-shape", + type=int, + nargs=3, + default=(32, 128, 64), + help='Tensor shape in the form of M,N,K') + parser.add_argument("-plot", + type=str, + default="blocked", + choices=['blocked', 'dot', 'wmma', 'lds'], + help='choose plot mode') + parser.add_argument( + "-nonKDim", + type=int, + default=32, + choices=[16, 32], + help='mfma instruction dim') + ## blocked layout parameters + parser.add_argument("-sizePerThread", type=int, nargs=2, default=(1, 4)) + parser.add_argument("-threadsPerWarp", type=int, nargs=2, default=(16, 4)) + parser.add_argument("-warpsPerCTA", type=int, nargs=2, default=(1, 4)) + parser.add_argument("-order", type=int, nargs=2, default=(1, 0)) + ## LDS access parameters + parser.add_argument("-kWidth", + type=int, + default=4, + choices=[4, 8, 16], + help='number of elements per thread') + parser.add_argument("-lds_layout", + type=str, + default="none", + choices=['swizzle', 'padding', 'none'], + help='choose the LDS data layout') + parser.add_argument("-lds_access", + type=str, + default="none", + choices=['read', 'write', 'none'], + help='choose LDS access mode') + ## wmma instruction layout parameter + parser.add_argument("-wave_size", + type=int, + default=32, + choices=[32, 64], + help='choose the 
wmma instruction mode') + + parser.add_argument("-o", + type=str, + default="myplot", + help='output pdf file name (without surfix)') + parser.add_argument("-mfmaTrans", + action='store_true', + default=False, + help='If set, then use mfma.trans layout') + parser.add_argument("-keep", + action='store_true', + default=False, + help='If set, keep the generated .tex file') + + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + shape = args.shape + M = shape[0] + N = shape[1] + K = shape[2] + plot_mode = args.plot + mfmaNonKDim = args.nonKDim + kpack = args.kWidth + trans = 1 if args.mfmaTrans else 0 + ofilename = args.o + keepSrc = args.keep + + ldsLayout = args.lds_layout + ldsAccess = args.lds_access + + waveSize = args.wave_size + + sizePerThread = args.sizePerThread + threadsPerWarp = args.threadsPerWarp + warpsPerCTA = args.warpsPerCTA + order = args.order + + CTAShape = [] + if plot_mode == 'blocked': + print(f"Plotting tensor M={M},K={K} with blocked layout:") + print(f"sizePerThread={sizePerThread}", end=" ") + print(f"threadsPerWarp={threadsPerWarp}", end=" ") + print(f"warpsPerCTA={warpsPerCTA}", end=" ") + print(f"order={order}", end=" ") + CTAShape.append(sizePerThread[0] * threadsPerWarp[0] * warpsPerCTA[0]) + CTAShape.append(sizePerThread[1] * threadsPerWarp[1] * warpsPerCTA[1]) + + if plot_mode == 'dot': + mfma_inst_str = "mfma_32x32" if mfmaNonKDim == 32 else "mfma_16x16" + mfma_trans_str = ".trans" if trans else "" + print(f"Plotting dot operation with shapes M={M},N={N},K={K}") + print("MFMA: " + mfma_inst_str + mfma_trans_str + f" kWidth = {kpack}", end=" ") + print(f"warpsPerCTA={warpsPerCTA}", end=" ") + CTAShape.append(mfmaNonKDim * warpsPerCTA[0]) + CTAShape.append(mfmaNonKDim * warpsPerCTA[1]) + + if plot_mode == 'blocked' or plot_mode == 'dot': + print(f"CTAShape={CTAShape}") + assert M != 0 and CTAShape[ + 0] <= M and M % CTAShape[0] == 0, "bad tensor dimension M" + + if plot_mode == 'blocked': + assert K != 
0 and CTAShape[ + 1] <= K and K % CTAShape[1] == 0, "bad tensor dimension K" + + if plot_mode == 'dot': + assert N != 0 and CTAShape[ + 1] <= N and N % CTAShape[1] == 0, "bad tensor dimension N" + assert K != 0 and K % (2 * kpack) == 0, "bad tensor dimension K" + + if plot_mode == 'lds': + print(f"Plotting LDS access for tensor M={M},K={K} with vec={kpack}") + if ldsAccess == 'write': + print( + f"sizePerThread={sizePerThread}, threadsPerWarp={threadsPerWarp}" + ) + + with open("myplot.tex", 'w') as f_plot: + with open("tikzplot.tex") as file: + tikz_code = file.read() + + preamble_str = draw_preamble_cmd() + + draw_blockedLayout_str = draw_blocked_layout_cmd( + M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order) + + draw_dotLayout_str = draw_dot_layout_cmd(M, N, K, mfmaNonKDim, + warpsPerCTA, trans, kpack) + + draw_lds_str = draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, + sizePerThread, threadsPerWarp) + + draw_wmma_str = draw_wmma_instr_cmd(waveSize) + + f_plot.write(preamble_str + "\n") + f_plot.write(tikz_code) + if plot_mode == 'blocked': + f_plot.write(draw_blockedLayout_str) + elif plot_mode == 'dot': + f_plot.write(draw_dotLayout_str) + elif plot_mode == 'lds': + f_plot.write(draw_lds_str) + elif plot_mode == 'wmma': + f_plot.write(draw_wmma_str) + + run_bash_command(f"pdflatex -jobname {ofilename} myplot.tex") + print(f"plot saved in {ofilename}.pdf") + + ## Remove au files + os.remove(f"{ofilename}.aux") + os.remove(f"{ofilename}.log") + if not keepSrc: + os.remove("myplot.tex") + run_bash_command("rm -rf ./auto") + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/python/perf-kernels/tools/plot-layout/tikzplot.tex b/python/perf-kernels/tools/plot-layout/tikzplot.tex new file mode 100755 index 000000000000..e6292f7002e9 --- /dev/null +++ b/python/perf-kernels/tools/plot-layout/tikzplot.tex @@ -0,0 +1,880 @@ +\newcommand{\drawBlockedWave}[5]{ + %% + %% Draw a wave coverage with blocked layout + %% + %% Wave TL: pre defined 
top-left coordinate of the wave + %% \elem: pre defined variable + %% + %% #1: sizePerThread[0] --> sizePerThreadM + %% #2: sizePerThread[1] --> sizePerThreadN + %% #3: threadsPerWarp[0] --> threadsPerWarpM + %% #4: threadsPerWarp[1] --> threadsPerWarpN + %% #5: fastest changing dim --> order + + \pgfmathsetmacro{\sizePerThreadM}{#1} + \pgfmathsetmacro{\sizePerThreadN}{#2} + \pgfmathsetmacro{\threadsPerWarpM}{#3} + \pgfmathsetmacro{\threadsPerWarpN}{#4} + \pgfmathsetmacro{\order}{#5} + + \pgfmathsetmacro{\waveSizeM}{\sizePerThreadM*\threadsPerWarpM} + \pgfmathsetmacro{\waveSizeN}{\sizePerThreadN*\threadsPerWarpN} + + \foreach \tid in {0,...,63}{ + \pgfmathsetmacro{\tidM}{int(\tid/\threadsPerWarpN)} + \pgfmathsetmacro{\tidN}{mod(\tid,\threadsPerWarpN)} + \coordinate (Thread TL) at ($(Wave TL)+(\tidN*\sizePerThreadN*\elem, -\tidM*\sizePerThreadM*\elem)$); + \pgfmathsetmacro{\ratio}{\tidM*10} + + \ifthenelse{\tid = 0}{ + \draw [line width = 0.01mm, fill=red] (Thread TL) + rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem); + }{ + \draw [line width = 0.01mm, fill=blue!\ratio!white] (Thread TL) + rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem); + } + } + \draw (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem); +} + +\newcommand{\drawBlockedCTA}[7]{ + %% + %% Draw a CTA coverage with blocked layout + %% + %% CTA TL: pre defined top-left coordinate of the CTA + %% \elem: pre defined variable + %% + %% #1: sizePerThread[0] --> sizePerThreadM + %% #2: sizePerThread[1] --> sizePerThreadN + %% #3: threadsPerWarp[0] --> threadsPerWarpM + %% #4: threadsPerWarp[1] --> threadsPerWarpN + %% #5: warpsPerCTA[0] --> warpsPerCTAM + %% #6: warpsPerCTA[1] --> warpsPerCTAN + %% #7: fastest changing dim --> order + + \pgfmathsetmacro{\sizePerThreadM}{#1} + \pgfmathsetmacro{\sizePerThreadN}{#2} + \pgfmathsetmacro{\threadsPerWarpM}{#3} + \pgfmathsetmacro{\threadsPerWarpN}{#4} + \pgfmathsetmacro{\warpsPerCTAM}{#5} + \pgfmathsetmacro{\warpsPerCTAN}{#6} + 
\pgfmathsetmacro{\order}{#7} + + \pgfmathsetmacro{\CTASizeM}{\sizePerThreadM*\threadsPerWarpM*\warpsPerCTAM} + \pgfmathsetmacro{\CTASizeN}{\sizePerThreadN*\threadsPerWarpN*\warpsPerCTAN} + \pgfmathsetmacro{\waveSizeM}{\sizePerThreadM*\threadsPerWarpM} + \pgfmathsetmacro{\waveSizeN}{\sizePerThreadN*\threadsPerWarpN} + + \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAM*\warpsPerCTAN-1} + + \coordinate (Wave TL) at (CTA TL); + \drawBlockedWave{\sizePerThreadM}{\sizePerThreadN}{\threadsPerWarpM}{\threadsPerWarpN}{\order} + \foreach \waveId in {0,...,\maxWaveId}{ + \ifthenelse{\order=1} + { + \pgfmathsetmacro{\waveCoordM}{int(\waveId/\warpsPerCTAN)} + \pgfmathsetmacro{\waveCoordN}{mod(\waveId,\warpsPerCTAN)} + \pgfmathsetmacro{\rot}{0} + }{ + \pgfmathsetmacro{\waveCoordM}{mod(\waveId,\warpsPerCTAM)} + \pgfmathsetmacro{\waveCoordN}{int(\waveId/\warpsPerCTAM)} + \pgfmathsetmacro{\rot}{90} + } + + \coordinate (Wave TL) at ($(CTA TL)+(\waveCoordN*\waveSizeN*\elem, -\waveCoordM*\waveSizeM*\elem)$); + \draw [ultra thin] (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem) + node [pos=.5, scale=.6*\scale, inner sep=0, fill=white, rotate=\rot] {wave\waveId}; + } + + \draw [thick] (CTA TL) rectangle ++(\CTASizeN*\elem, -\CTASizeM*\elem); +} + +\newcommand{\drawBlockedTensor}[8]{ + %% + %% Draw a tensor with blocked layout of the following parameters + %% sizePerThread[2] + %% threadsPerWarp[2] + %% warpsPerCTA[2] + %% order[2] + %% + %% TL: pre defined top-left coordinate of the tensor + %% \elem: pre defined variable + %% + %% #1: tensorShape[0] --> M + %% #2: tensorShape[1] --> N + %% #3: sizePerThread[0] --> sizePerThreadM + %% #4: sizePerThread[1] --> sizePerThreadN + %% #5: threadsPerWarp[0] --> threadsPerWarpM + %% Note that threadsPerWarp[1] is calculated by 64/threadsPerWarp[0] + %% #6: warpsPerCTA[0] --> warpsPerCTAM + %% #7: warpsPerCTA[1] --> warpsPerCTAN + %% #8: fastest changing dim --> order + + \pgfmathsetmacro{\M}{#1} + \pgfmathsetmacro{\N}{#2} + 
\pgfmathsetmacro{\sizePerThreadM}{#3} + \pgfmathsetmacro{\sizePerThreadN}{#4} + \pgfmathsetmacro{\threadsPerWarpM}{#5} + \pgfmathsetmacro{\warpsPerCTAM}{#6} + \pgfmathsetmacro{\warpsPerCTAN}{#7} + \pgfmathsetmacro{\order}{#8} + + \pgfmathsetmacro{\threadsPerWarpN}{64/\threadsPerWarpM} + \pgfmathsetmacro{\CTASizeM}{\sizePerThreadM*\threadsPerWarpM*\warpsPerCTAM} + \pgfmathsetmacro{\CTASizeN}{\sizePerThreadN*\threadsPerWarpN*\warpsPerCTAN} + \pgfmathsetmacro{\CTARepM}{\M/\CTASizeM} + \pgfmathsetmacro{\CTARepN}{\N/\CTASizeN} + \pgfmathsetmacro{\maxCTAId}{\CTARepM*\CTARepN-1} + + \foreach \ctaId in {0,...,\maxCTAId}{ + \pgfmathsetmacro{\ctaCoordM}{int(\ctaId/\CTARepN)} + \pgfmathsetmacro{\ctaCoordN}{mod(\ctaId,\CTARepN)} + \coordinate (CTA TL) at ($(TL)+(\ctaCoordN*\CTASizeN*\elem, -\ctaCoordM*\CTASizeM*\elem)$); + \drawBlockedCTA{\sizePerThreadM}{\sizePerThreadN}{\threadsPerWarpM}{\threadsPerWarpN}{\warpsPerCTAM}{\warpsPerCTAN}{\order} + } + + \node [scale=.7*\scale, above, rotate=90] at ($(TL)+(0, -.5*\M*\elem)$) {M=\M}; + \node [scale=.7*\scale, above] at ($(TL)+(.5*\N*\elem, 0)$) {K=\N}; + + \def\zoomR{1.5} + \coordinate (zoomin BL) at ($(TL)+(0, .3)$); + + \foreach \hl in {0,...,\sizePerThreadM}{ + \draw ($(zoomin BL)+(0, \hl*\elem*\zoomR)$) -- ++(\sizePerThreadN*\elem*\zoomR,0); + } + \foreach \vl in {0,...,\sizePerThreadN}{ + \draw ($(zoomin BL)+(\vl*\elem*\zoomR, 0)$) -- ++(0, \sizePerThreadM*\elem*\zoomR); + } + + \node [scale=.6*\scale, left] at ($(zoomin BL)+(0, .5*\sizePerThreadM*\elem*\zoomR)$) {$t_0$}; + \node [scale=.6*\scale, right] at ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, .5*\sizePerThreadM*\elem*\zoomR)$) {\sizePerThreadM$\times$\sizePerThreadN}; + + \draw [densely dotted] (TL) -- (zoomin BL); + \draw [densely dotted] ($(TL)+(\sizePerThreadN*\elem, 0)$) -- ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, 0)$); + \draw [fill=red] (TL) rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem); +} + +\newcommand{\drawBlockMFMALayoutLarge}[3]{ + %% + 
%% Draw a single block of MFMA_32x32x8xf16 or MFMA_16x16x16xf16 + %% + %% block TL: pre-defined top-left coordinate of the block + %% \elem: pre defined variable + %% + %% #1: 1 for mfma.trans, 0 for normal mfma + %% #2: mfmaNonKDim + %% #3: verbose. 1 means draw tid in each vec; 0 means draw nothing + + \pgfmathsetmacro{\trans}{#1} + \pgfmathsetmacro{\nonTrans}{1-#1} + \pgfmathsetmacro{\nonKDim}{#2} + \pgfmathsetmacro{\maxTID}{\nonKDim-1} + \pgfmathsetmacro{\groups}{64/\nonKDim} + \pgfmathsetmacro{\maxGID}{\groups-1} + \pgfmathsetmacro{\maxIVec}{\nonKDim*\nonKDim/256-1} + \pgfmathsetmacro{\verbose}{#3} + \foreach \iVec in {0,...,\maxIVec} { + \coordinate (wave TL) at ($(block TL)+(\trans*\iVec*\groups*4*\elem, -\nonTrans*\iVec*\groups*4*\elem)$); + \foreach \tg in {0,...,\maxGID}{ + \pgfmathsetmacro{\colID}{\tg+4} + \pgfmathsetmacro{\col}{\Colors[\colID]} + \foreach \tid in {0,...,\maxTID} { + \pgfmathsetmacro{\ratio}{\tid*2.5*\groups+15} + \ifthenelse{\verbose=0}{ + \draw [line width=0.005mm, fill=\col!\ratio!white] + ($(wave TL)+(\nonTrans*\tid*\elem+\tg*\trans*4*\elem, -\trans*\tid*\elem-\tg*\nonTrans*4*\elem)$) + rectangle ++(\nonTrans*\elem+\trans*4*\elem, -\nonTrans*4*\elem-\trans*\elem); + }{ + \pgfmathsetmacro{\drawTid}{int(\tid+\tg*\nonKDim)} + \draw [line width=0.005mm, fill=\col!\ratio!white] + ($(wave TL)+(\nonTrans*\tid*\elem+\tg*\trans*4*\elem, -\trans*\tid*\elem-\tg*\nonTrans*4*\elem)$) + rectangle ++(\nonTrans*\elem+\trans*4*\elem, -\nonTrans*4*\elem-\trans*\elem) + node [pos=.5, scale=.35*\scale, rotate=90*\nonTrans] {t\drawTid}; + } + } + } + } + \draw [thick] (block TL) rectangle ++(\nonKDim*\elem, -\nonKDim*\elem); +} + + +\newcommand{\drawTensorMFMALayout}[6]{ + %% + %% Draw a tensor with mfma layout. 
+ %% + %% C TL: pre defined top-left coordinates of the tensor + %% + %% #1: M + %% #2: N + %% #3: MFMA nonKDim + %% #4: warpsPerCTA[0] + %% #5: warpsPerCTA[1] + %% #6: 1 for mfma.trans, 0 for normal mfma + + \pgfmathsetmacro{\tensorShapeH}{#1} + \pgfmathsetmacro{\tensorShapeW}{#2} + \pgfmathsetmacro{\mfmaNonKDim}{#3} + \pgfmathsetmacro{\warpsPerCTAH}{#4} + \pgfmathsetmacro{\warpsPerCTAW}{#5} + \pgfmathsetmacro{\mfmaTrans}{#6} + + \coordinate (old TL) at (TL); + \coordinate (TL) at (C TL); + + + \pgfmathsetmacro{\CTARepH}{\tensorShapeH/\mfmaNonKDim/\warpsPerCTAH} + \pgfmathsetmacro{\CTARepW}{\tensorShapeW/\mfmaNonKDim/\warpsPerCTAW} + \pgfmathsetmacro{\maxCTAId}{\CTARepH*\CTARepW-1} + \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAH*\warpsPerCTAW-1} + \pgfmathsetmacro{\CTASizeH}{\warpsPerCTAH*\mfmaNonKDim} + \pgfmathsetmacro{\CTASizeW}{\warpsPerCTAW*\mfmaNonKDim} + + + \foreach \ctaId in {0,...,\maxCTAId}{ + \pgfmathsetmacro{\ctaCoordH}{int(\ctaId/\CTARepW)} + \pgfmathsetmacro{\ctaCoordW}{mod(\ctaId,\CTARepW)} + \coordinate (CTA TL) at ($(TL)+(\ctaCoordW*\CTASizeW*\elem, -\ctaCoordH*\CTASizeH*\elem)$); + %% Draw a detailed view of wave0 in each CTA + \coordinate (block TL) at (CTA TL); + \drawBlockMFMALayoutLarge{\mfmaTrans}{\mfmaNonKDim}{0} + + \foreach \waveId in {0,...,\maxWaveId}{ + \pgfmathsetmacro{\waveCoordH}{int(\waveId/\warpsPerCTAW)} + \pgfmathsetmacro{\waveCoordW}{mod(\waveId,\warpsPerCTAW)} + \coordinate (block TL) at ($(CTA TL)+(\waveCoordW*\mfmaNonKDim*\elem, -\waveCoordH*\mfmaNonKDim*\elem)$); + %% Inside the loop, only draw a rectangle + \draw [ultra thin] (block TL) rectangle ++(\mfmaNonKDim*\elem, -\mfmaNonKDim*\elem) + node [scale=.7*\mfmaNonKDim/32*\scale, pos=.5, fill=white, inner sep=0] {wave\waveId}; + } + + %% Draw the outline of each CTA rep + \draw [ultra thick] (CTA TL) rectangle ++(\CTASizeW*\elem, -\CTASizeH*\elem); + } + + \coordinate (TL) at (old TL); +} + +\newcommand{\drawMFMAOperand}[4]{ + %% + %% Draw one mfma operand + %% + %% mfma op 
TL: pre defined coordinates of the top-left + %% \elem: pre defined variable + %% + %% #1: mfmaNonKDim + %% #2: kpack + %% #3: 0 for opA and 1 for opB + %% #4: verbose. 1 means draw tid in each vec; 0 means draw nothing + + \pgfmathsetmacro{\nonKDim}{#1} + \pgfmathsetmacro{\maxGID}{64/\nonKDim-1} + \pgfmathsetmacro{\maxTID}{\nonKDim-1} + \pgfmathsetmacro{\kpack}{#2} + \pgfmathsetmacro{\opIdxA}{#3} + \pgfmathsetmacro{\opIdxB}{1-\opIdxA} + \pgfmathsetmacro{\verbose}{#4} + + \foreach \col/\tg in {0,...,\maxGID}{ + \pgfmathsetmacro{\col}{\Colors[\tg]} + \foreach \tid in {0,...,\maxTID} { + % \pgfmathsetmacro{\ratio}{\tid*2.5+15} + \ifthenelse{\verbose=0}{ + \draw [line width=0.005mm, fill=\col] + ($(mfma op TL)+(\tg*\kpack*\elem*\opIdxB+\tid*\elem*\opIdxA, -\tid*\elem*\opIdxB-\tg*\kpack*\elem*\opIdxA)$) + rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA); + }{ + \pgfmathsetmacro{\drawTid}{int(\tid+\tg*\nonKDim)} + \draw [line width=0.005mm, fill=\col] + ($(mfma op TL)+(\tg*\kpack*\elem*\opIdxB+\tid*\elem*\opIdxA, -\tid*\elem*\opIdxB-\tg*\kpack*\elem*\opIdxA)$) + rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA) + node [pos=.5, scale=.35*\scale, rotate=90*\opIdxA] {t\drawTid}; + } + } + } +} + +\newcommand{\drawWaveOperand}[4]{ + %% + %% Draw the part of the tensor that is one operand of the wave + %% + %% Op TL: pre defined coordinates of the top-left of the operand + %% \elem: pre defined variable + %% + %% #1: K + %% #2: mfmaNonKDim + %% #3: kpack + %% #4: 0 for opA and 1 for opB + + \pgfmathsetmacro{\K}{#1} + \pgfmathsetmacro{\nonKDim}{#2} + \pgfmathsetmacro{\groups}{64/\nonKDim} + \pgfmathsetmacro{\kpack}{#3} + \pgfmathsetmacro{\opIdx}{#4} + \pgfmathsetmacro{\opIdxOther}{1-\opIdx} + + \coordinate (TL) at (Op TL); + + \pgfmathsetmacro{\numKRep}{\K/\kpack/\groups} + \pgfmathsetmacro{\maxKRepId}{\numKRep-1} + + \foreach \repId in {0,...,\maxKRepId}{ + \coordinate (mfma op TL) at 
($(TL)+(\repId*\groups*\kpack*\elem*\opIdxOther, -\repId*\groups*\kpack*\elem*\opIdx)$); + \drawMFMAOperand{\nonKDim}{\kpack}{\opIdx}{0} + \draw [thick] (mfma op TL) rectangle + ++(\groups*\kpack*\elem*\opIdxOther+\nonKDim*\opIdx*\elem, -\nonKDim*\opIdxOther*\elem-\groups*\kpack*\elem*\opIdx); + } +} + +\newcommand{\drawDotOperands}[7]{ + %% + %% Draw operand tensors of dot + %% + %% A TL and B TL: pre defined top-left coordinates of A and B tensor + %% \elem: pre defined variable + %% + %% #1: M + %% #2: N + %% #3: K + %% #4: MFMA nonKDim + %% #5: warpsPerCTA[0] + %% #6: warpsPerCTA[1] + %% #7: kpack + + \pgfmathsetmacro{\M}{#1} + \pgfmathsetmacro{\N}{#2} + \pgfmathsetmacro{\K}{#3} + \pgfmathsetmacro{\mfmaNonKDim}{#4} + \pgfmathsetmacro{\warpsPerCTAM}{#5} + \pgfmathsetmacro{\warpsPerCTAN}{#6} + \pgfmathsetmacro{\kpack}{#7} + + %% operand A + \pgfmathsetmacro{\CTARepM}{\M/\warpsPerCTAM/\mfmaNonKDim} + \pgfmathsetmacro{\maxCTAIdM}{\CTARepM-1} + \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAM-1} + \foreach \ctaId in {0,...,\maxCTAIdM}{ + \coordinate (CTA TL) at ($(A TL)+(0, -\ctaId*\warpsPerCTAM*\mfmaNonKDim*\elem)$); + \foreach \waveId in {0,...,\maxWaveId}{ + \coordinate (wave TL) at ($(CTA TL)+(0, -\waveId*\mfmaNonKDim*\elem)$); + \draw [ultra thin] (wave TL) rectangle ++(\K*\elem, -\mfmaNonKDim*\elem); + } + %% Only draw the detailed view of the first wave in CTA + \coordinate (Op TL) at (CTA TL); + \drawWaveOperand{\K}{\mfmaNonKDim}{\kpack}{0} + + %% Draw the outline of each CTA rep + \draw [ultra thick] (CTA TL) rectangle ++(\K*\elem, -\warpsPerCTAM*\mfmaNonKDim*\elem); + } + \draw [ultra thin] (A TL) rectangle ++(\K*\elem, -\M*\elem); + + + %% operand B + \pgfmathsetmacro{\CTARepN}{\N/\warpsPerCTAN/\mfmaNonKDim} + \pgfmathsetmacro{\maxCTAIdN}{\CTARepN-1} + \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAN-1} + \foreach \ctaId in {0,...,\maxCTAIdN}{ + \coordinate (CTA TL) at ($(B TL)+(\ctaId*\warpsPerCTAN*\mfmaNonKDim*\elem, 0)$); + \foreach \waveId in {0,...,\maxWaveId}{ 
+ \coordinate (wave TL) at ($(CTA TL)+(\waveId*\mfmaNonKDim*\elem ,0)$); + \draw [ultra thin] (wave TL) rectangle ++(\mfmaNonKDim*\elem, -\K*\elem); + } + %% Only draw the detailed view of the first wave in CTA + \coordinate (Op TL) at (CTA TL); + \drawWaveOperand{\K}{\mfmaNonKDim}{\kpack}{1} + + %% Draw the outline of each CTA rep + \draw [ultra thick] (CTA TL) rectangle ++(\warpsPerCTAN*\mfmaNonKDim*\elem, -\K*\elem); + } + \draw [ultra thin] (B TL) rectangle ++(\N*\elem, -\K*\elem); +} + + +\newcommand{\drawDot}[8]{ + %% + %% Draw C = dot A, B + %% + %% C TL: pre defined top-left coordinates of the result tensor + %% \elem: pre defined variable + %% + %% #1: M + %% #2: N + %% #3: K + %% #4: MFMA nonKDim + %% #5: warpsPerCTA[0] + %% #6: warpsPerCTA[1] + %% #7: 1 for mfma.trans, 0 for normal mfma + %% #8: kpack + + \pgfmathsetmacro{\M}{#1} + \pgfmathsetmacro{\N}{#2} + \pgfmathsetmacro{\K}{#3} + \pgfmathsetmacro{\mfmaNonKDim}{#4} + \pgfmathsetmacro{\groups}{64/\mfmaNonKDim} + \pgfmathsetmacro{\warpsPerCTAM}{#5} + \pgfmathsetmacro{\warpsPerCTAN}{#6} + \pgfmathsetmacro{\mfmaTrans}{#7} + \pgfmathsetmacro{\kpack}{#8} + \pgfmathsetmacro{\kdim}{int(\groups*\kpack)} + + \pgfmathsetmacro{\gap}{\elem*20} + \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$); + \coordinate (B TL) at ($(C TL)+(0, \gap+\K*\elem)$); + + \drawDotOperands{\M}{\N}{\K}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\kpack} + + \drawTensorMFMALayout{\M}{\N}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\mfmaTrans} + + %% Draw labels + \node [scale=\scale, above] at ($(A TL)+(.5*\K*\elem, 0)$) {K=\K}; + \node [scale=\scale, above, rotate=90] at ($(A TL)+(0, -.5*\M*\elem)$) {M=\M}; + + \node [scale=\scale, above, rotate=90] at ($(B TL)+(0, -.5*\K*\elem)$) {K=\K}; + \node [scale=\scale, above] at ($(B TL)+(.5*\N*\elem, 0)$) {N=\N}; + + \node [scale=\scale, above left] at (A TL) {A}; + \node [scale=\scale, above left] at (B TL) {B}; + \node [scale=\scale, above left] at (C TL) {C}; + + %% label nonKDim + 
\node [scale=.8*\scale, left] at ($(A TL)+(0, -.5*\mfmaNonKDim*\elem)$) {\mfmaNonKDim}; + \node [scale=.8*\scale, above] at ($(B TL)+(.5*\mfmaNonKDim*\elem, 0)$) {\mfmaNonKDim}; + %% label kpack + \node [scale=.8*\scale, above] at ($(A TL)+(0.5*\groups*\kpack*\elem, 0)$) {\kdim}; + \node [scale=.8*\scale, left] at ($(B TL)+(0, -0.5*\groups*\kpack*\elem)$) {\kdim}; +} + +\newcommand{\Colors}{{ + "red", + "YellowGreen", + "blue", + "Maroon", + "orange", + "cyan", + "magenta", + "brown", + "teal", + "purple", + "gray", + "Green", + "BlueGreen", + "violet", + "olive", + "darkgray", + }} + +\newcommand{\drawTensorLayoutGlobalMem}{ + %% + %% Draw tensor layout in global memory without any swizzling + %% + %% TL: pre defined top-left coordinates of the tensor in global memory + %% \elem: pre defined variable + %% \Colors: a pre defined array of 16 colors + %% + %% The following variables are also expected to be pre defined + %% \M + %% \K + %% \vec: number of elements in a group + + \pgfmathsetmacro{\numVecK}{\K/\vec} + \pgfmathsetmacro{\maxVecId}{16*\numVecK-1} + \pgfmathsetmacro{\drawM}{20} + + %% Draw the tensor, but only draw 32 rows + \draw (TL) rectangle ++(\K*\elem, -\drawM*\elem); + %% Draw detailed vec view of the tensor + \foreach \vecId in {0,...,\maxVecId}{ + + \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)} + \pgfmathsetmacro{\vecCoordK}{mod(\vecId,\numVecK)} + \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$); + + \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))} + \pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)} + \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]} + \pgfmathsetmacro{\ratio}{100-floor(\vecCoordK/16)*40} + + \draw [ultra thin, fill=\vecColor!\ratio!white] (vec TL) rectangle ++(\vec*\elem, -\elem) + node [pos=.5, scale=.6*\scale, white] {m\vecCoordM}; + + } + %% M and K dim + \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem-8*\elem)$) {M=\M}; + \node [scale=.8*\scale, left] at 
($(TL)+(0, -.5*16*\elem)$) {16}; + \node [scale=\scale, above] at ($(TL)+(.5*\K*\elem, 0)$) {K=\K}; + %% label for vecSize + \def\vecR{1.5} + \coordinate (vec TL) at ($(TL)+(-.25*\vec*\elem, 3*\elem*\vecR)$); + \pgfmathsetmacro{\maxVec}{\vec-1} + \foreach \vecId in {0,...,\maxVec}{ + \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR); + } + \draw [densely dotted] (TL) -- ($(vec TL)+(0, -\elem*\vecR)$); + \draw [densely dotted] ($(TL)+(\vec*\elem, 0)$) -- ($(vec TL)+(\vec*\elem*\vecR, -\elem*\vecR)$); + \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec}; +} + + + +\newcommand{\drawLDSLayoutTritonSwizzling}[2]{ + %% + %% Draw tensor layout in LDS with swizzling + %% + %% TL: pre defined top-left coordinates of the tensor in LDS + %% \elem: pre defined variable + %% \Colors: a pre defined array of 16 colors + %% + %% The following three variables are expected to be pre defined + %% \M + %% \K + %% \vec: number of elements in a group + %% + %% #1: hasSwizzle, 0 means no swizzling and no padding, + %% 1 means optimal swizzling + %% 2 means padding + %% #2: access mode, 0 means draw nothing, 1 means ds_read, 2 means ds_write + %% For ds_write access, the following variables are assumed to be pre defined + %% \sizePerThreadK + %% \sizePerThreadM + %% \threadsPerWarpK + + \pgfmathsetmacro{\hasSwizzle}{#1} + \pgfmathsetmacro{\accessMode}{#2} + \pgfmathsetmacro{\numVecK}{\K/\vec} + + %% Assuming fp16 data type + \pgfmathsetmacro{\LDSK}{64} + \pgfmathsetmacro{\numLDSVec}{\LDSK/\vec} + \pgfmathsetmacro{\swizzleK}{max(\LDSK, \K)} + \pgfmathsetmacro{\LDSM}{int(\M/\LDSK*\K)} + + \ifthenelse{\accessMode = 2}{ + %% \accessMode == 2, draw 8 rows + \pgfmathsetmacro{\maxVecId}{8*\numVecK-1} + \pgfmathsetmacro{\drawM}{8*\K/\LDSK+4} + }{ + %% \accessMode == 0 or 1, draw 16 rows + \pgfmathsetmacro{\maxVecId}{16*\numVecK-1} + \pgfmathsetmacro{\drawM}{16*\K/\LDSK+4} + } + + %% Parameters used for swizzling + 
\pgfmathsetmacro{\numVecSwizzleK}{\swizzleK/\vec} + %% perPhase = ceil(LDSK / K) + %% The number of the rows of the tensor that can share the same swizzling pattern + \pgfmathsetmacro{\perPhase}{ceil(\LDSK/\K)} + %% maxPhase: the total number of different swizzling patterns + \ifthenelse{\hasSwizzle=0}{ + %% When swizzling is disabled + \pgfmathsetmacro{\maxPhase}{1} + }{ + %% When vec is small enough, we want 16/perPhase different swizzling patterns + %% When vec is large, we can only have 64 / \vec different swizzling pattern at most + \pgfmathsetmacro{\maxPhase}{min(16/\perPhase,64/\vec)} + } + + %% Draw the LDS + \draw (TL) rectangle ++(\LDSK*\elem, -\drawM*\elem); + + %% Draw detailed vec view of LDS + \foreach \vecId in {0,...,\maxVecId}{ + \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)} + \pgfmathsetmacro{\vecCoordK}{int(mod(\vecId,\numVecK))} + \pgfmathsetmacro{\rawPhase}{floor(\vecId/\numVecSwizzleK)} + %% vec color + \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))} + \pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)} + \pgfmathsetmacro{\ratio}{100-floor(\vecCoordK/16)*40} + \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]} + + %% old vec coordinates + \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$); + + %% new vec coordinates in LDS by swizzling + %% The following two conditions correspond to the relation between \LDSK and \K + \ifthenelse{\LDSK < \K}{ + \pgfmathsetmacro{\vecLDSM}{\vecCoordM*\K/\LDSK+floor(\vecCoordK*\vec/\LDSK)} + \pgfmathsetmacro{\vecLDSK}{int(mod(\vecCoordK, \LDSK/\vec))} + }{ + \pgfmathsetmacro{\vecLDSM}{floor(\vecCoordM/\perPhase)} + \pgfmathsetmacro{\vecLDSK}{int(\vecCoordK+mod(\vecCoordM,\perPhase)*\numVecK)} + } + %% + \pgfmathsetmacro{\phase}{int(mod(\rawPhase, \maxPhase))} + %% Compute the swizzled col id + \pgfmathsetmacro{\vecLDSKSwizzled}{\bitwiseXor{\vecLDSK}{\phase}} + + %% new vec coordinates in LDS by padding + \pgfmathsetmacro{\numPads}{floor(\vecId/\numLDSVec)} + 
\pgfmathsetmacro{\bankId}{\vec/2*\vecId+\numPads} + \pgfmathsetmacro{\vecPadM}{int(\bankId/32)} + \pgfmathsetmacro{\vecPadK}{int(mod(\bankId,32))} + + \ifthenelse{\hasSwizzle = 2}{ + %% vec coordinates by padding + \coordinate (new vec TL) at ($(TL)+(\vecPadK*2*\elem, -\vecPadM*\elem)$); + \pgfmathsetmacro{\tailBankId}{int(\vecPadK+\vec/2-1)} + }{ + %% vec coordinates by swizzling + \coordinate (new vec TL) at ($(TL)+(\vecLDSKSwizzled*\vec*\elem, -\vecLDSM*\elem)$); + \pgfmathsetmacro{\tailBankId}{0} + } + + \ifthenelse{\hasSwizzle = 2 \AND \tailBankId > 31}{ + \pgfmathsetmacro{\nextBanks}{\tailBankId-31} + \pgfmathsetmacro{\leftBanks}{\vec/2 - \nextBanks} + \draw [ultra thin, fill=\vecColor!\ratio!white] (new vec TL) rectangle ++(\leftBanks*2*\elem, -\elem) + node [pos=.5, scale=.6*\scale, white] {m\vecCoordM}; + \draw [ultra thin, fill=\vecColor!\ratio!white] ($(TL)+(0, -\vecPadM*\elem-\elem)$) + rectangle ++(\nextBanks*2*\elem, -\elem) node [pos=.5, scale=.6*\scale, white] {m\vecCoordM}; + }{ + \draw [ultra thin, fill=\vecColor!\ratio!white] (new vec TL) rectangle ++(\vec*\elem, -\elem) + node [pos=.5, scale=.6*\scale, white] {m\vecCoordM}; + } + + %% ds_read + %% Highlight the elements the first 16 threads access in the first cycle + %% This is used to visualize bank conflicts + \ifthenelse{\accessMode = 1}{ + \ifthenelse{\vecCoordK = 0}{ + \draw [fill=white] (new vec TL) rectangle ++(\elem, -\elem); + \draw (new vec TL) -- ++(\elem, -\elem); + \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem); + }{} + }{} + + %% Draw ds_write pattern + \ifthenelse{\accessMode = 2}{ + %% First compute the coverage of the first 16 threads + \pgfmathsetmacro{\covK}{min(16, \threadsPerWarpK)*\sizePerThreadK/\vec} + \pgfmathsetmacro{\covM}{ceil(16/\threadsPerWarpK)*\sizePerThreadM} + %% Check conditions for the first 16 threads + \pgfmathsetmacro{\vecInThread}{int(mod(\vecCoordK, \sizePerThreadK/\vec))} + \ifthenelse{\vecInThread=0}{ + \ifthenelse{\vecCoordK<\covK \AND 
\vecCoordM<\covM}{ + \draw [fill=white] (new vec TL) rectangle ++(\elem, -\elem); + \draw (new vec TL) -- ++(\elem, -\elem); + \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem); + }{} + }{} + }{} + + %% Label the phase of each line if swizzling is used + \ifthenelse{\hasSwizzle = 2}{}{ + \pgfmathsetmacro{\lastVecId}{int(64/\vec)-1} + \ifthenelse{\vecLDSKSwizzled = \lastVecId}{ + \draw [ultra thin] ($(new vec TL)+(\vec*\elem, -.5*\elem)$) -- ++(\elem, 0) + node [scale=.6*\scale, right] {\phase}; + }{} + } + } + + %% Draw boundary of 32 banks + %% Assume fp16 data type + \foreach \bank in {0,...,31}{ + \draw [ultra thin, gray] ($(TL)+(\bank*2*\elem, 0)$) -- ++(0, 2*\elem) + node [scale=.6*\scale, right, black] {\bank}; + } + \draw [ultra thin, gray] ($(TL)+(32*2*\elem, 0)$) -- ++(0, 2*\elem); + \node [scale=.6*\scale, left, black] at ($(TL)+(0, 2*\elem)$) {bank id}; + + \node [scale=\scale, above] at ($(TL)+(.5*\LDSK*\elem, 3*\elem)$) {LDS 32 banks}; + \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem)$) {LDSM=\LDSM}; + + %% label phase if swizzling is used + \ifthenelse{\hasSwizzle = 2}{}{ + \node [scale=.6*\scale, above right] at($(TL)+(32*2*\elem, 0)$) {phase}; + } +} + +\newcommand{\drawMFMAInstr}[3]{ + %% + %% Draw layout of mfma instructions with tid labeled + %% + %% C TL: pre defined top-left coordinates of the output matrix + %% \elem: pre defined variable + %% + %% #1: mfmaNonKDim + %% #2: kpack + %% #3: mfmaTrans + \pgfmathsetmacro{\mfmaNonKDim}{#1} + \pgfmathsetmacro{\groups}{64/\mfmaNonKDim} + \pgfmathsetmacro{\kpack}{#2} + \pgfmathsetmacro{\mfmaTrans}{#3} + \pgfmathsetmacro{\nonTrans}{1-#3} + + \pgfmathsetmacro{\gap}{\elem*5} + \coordinate (mfma opA TL) at ($(C TL)+(-.5*\gap-1.2*\nonTrans*\gap-\groups*\kpack*\elem, 0)$); + \coordinate (mfma op TL) at (mfma opA TL); + \drawMFMAOperand{\mfmaNonKDim}{\kpack}{0}{1} + \coordinate (mfma op TL) at ($(C TL)+(0, 1.5*\gap+.5*\mfmaTrans*\gap+\groups*\kpack*\elem)$); + 
\drawMFMAOperand{\mfmaNonKDim}{\kpack}{1}{1} + + \coordinate (block TL) at (C TL); + \drawBlockMFMALayoutLarge{\mfmaTrans}{\mfmaNonKDim}{1} + + %% Draw labels + \def\vecR{1.5} + \coordinate (vec TL) at ($(mfma opA TL)+(-.25*\kpack*\elem, 3*\elem*\vecR)$); + \pgfmathsetmacro{\maxVec}{\kpack-1} + \foreach \vecId in {0,...,\maxVec}{ + \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR); + } + \draw [densely dotted] (mfma opA TL) -- ($(vec TL)+(0, -\elem*\vecR)$); + \draw [densely dotted] ($(mfma opA TL)+(\kpack*\elem, 0)$) -- ($(vec TL)+(\kpack*\elem*\vecR, -\elem*\vecR)$); + \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\kpack*\elem*\vecR, 0)$) {vec=\kpack}; + + \coordinate (vec TL) at ($(mfma op TL)+(-3*\elem*\vecR, .25*\kpack*\elem)$); + \foreach \vecId in {0,...,\maxVec}{ + \draw ($(vec TL)+(0, -\vecId*\elem*\vecR)$) rectangle ++(\elem*\vecR, -\elem*\vecR); + } + \draw [densely dotted] (mfma op TL) -- ($(vec TL)+(\elem*\vecR,0)$); + \draw [densely dotted] ($(mfma op TL)+(0, -\kpack*\elem)$) -- ($(vec TL)+(\elem*\vecR, -\kpack*\elem*\vecR)$); + \node [scale=.8*\scale, above, rotate=90] at ($(vec TL)+(0, -.5*\kpack*\elem*\vecR)$) {vec=\kpack}; + + \node [scale=\scale, below] at ($(block TL)+(.5*\mfmaNonKDim*\elem,-\mfmaNonKDim*\elem)$) {outC}; + \ifthenelse{\mfmaTrans=0}{ + \node [scale=\scale, below] at ($(mfma opA TL)+(\kpack*\elem, -\mfmaNonKDim*\elem)$) {opA}; + \node [scale=\scale, above] at (mfma op TL) {opB}; + \coordinate (vec TL) at ($(block TL)+(-3*\elem-\elem*\vecR, .25*4*\elem)$); + \foreach \vecId in {0,1,2,3}{ + \draw ($(vec TL)+(0, -\vecId*\elem*\vecR)$) rectangle ++(\elem*\vecR, -\elem*\vecR); + } + \draw [densely dotted] (block TL) -- ++(-3*\elem, .25*4*\elem); + \draw [densely dotted] ($(block TL)+(0, -4*\elem)$) -- ++(-3*\elem, -.25*4*\elem); + \node [scale=.8*\scale, above, rotate=90] at ($(vec TL)+(0, -.5*4*\elem*\vecR)$) {vec=4}; + \node [scale=.8*\scale, above, align=center] at ($(block 
TL)+(.5*\mfmaNonKDim*\elem, 0)$) {mfmaLayout\\trans=False}; + }{ + \node [scale=\scale, below] at ($(mfma opA TL)+(\kpack*\elem, -\mfmaNonKDim*\elem)$) {opB}; + \node [scale=\scale, above] at (mfma op TL) {opA}; + \coordinate (vec TL) at ($(block TL)+(-.25*4*\elem, 3*\elem+\elem*\vecR)$); + \foreach \vecId in {0,1,2,3}{ + \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR); + } + \draw [densely dotted] (block TL) -- ++(-.25*4*\elem, 3*\elem); + \draw [densely dotted] ($(block TL)+(4*\elem, 0)$) -- ++(.25*4*\elem, 3*\elem); + \node [scale=.8*\scale, above] at ($(vec TL)+(.5*4*\elem*\vecR, 0)$) {vec=4}; + \node [scale=.8*\scale, above, align=center] at ($(block TL)+(16*\elem, 0)$) {mfmaLayout\\trans=True}; + } +} + +\newcommand{\drawWMMAOperand}[3]{ + %% + %% Draw the layout of one operand of WMMA instruction + %% + %% #1: opIdx. 0 for opA, 1 for opB + %% #2: verbose. 1 means draw tid in each vec; 0 means draw nothing + %% #3: mode. 0 for w32, 1 for w64 + %% + %% wmma op TL: pre defined top-left coordinates of the operand matrix + + \pgfmathsetmacro{\isOpB}{#1} + \pgfmathsetmacro{\isOpA}{1-\isOpB} + \pgfmathsetmacro{\verbose}{#2} + \pgfmathsetmacro{\isWLarge}{#3} + + \foreach \row in {0,...,15}{ + \pgfmathsetmacro{\ratio}{\row*5+15} + \coordinate (vec TL) at ($(wmma op TL)+(\row*\isOpB*\elem, -\row*\elem*\isOpA)$); + \ifthenelse{\isWLarge=1}{ + \pgfmathsetmacro{\tidone}{int(\row+16)} + \pgfmathsetmacro{\tidtwo}{int(\row+32)} + \pgfmathsetmacro{\tidthree}{int(\row+48)} + \draw [line width=0.005mm, fill=brown!\ratio!white] (vec TL) + rectangle ++(16*\elem*\isOpA+\elem*\isOpB, -\elem*\isOpA-16*\elem*\isOpB) + node [scale=0.4*\scale, pos=.5, rotate=90*\isOpB] {t\row, t\tidone, t\tidtwo, t\tidthree}; + }{ + \pgfmathsetmacro{\tidone}{int(\row+16)} + \draw [line width=0.005mm, fill=brown!\ratio!white] (vec TL) + rectangle ++(16*\elem*\isOpA+\elem*\isOpB, -\elem*\isOpA-16*\elem*\isOpB) + node [scale=0.4*\scale, pos=.5, rotate=90*\isOpB] {t\row, 
t\tidone}; + } + } +} + +\newcommand{\drawWMMAResult}[2]{ + %% + %% Draw layout of WMMA result tensor + %% + %% #1: verbose. 1 means draw tid in each vec; 0 means draw nothing + %% #2: mode. 0 for w32, 1 for w64 + + \pgfmathsetmacro{\verbose}{#1} + \pgfmathsetmacro{\isWLarge}{#2} + + \pgfmathsetmacro{\numElem}{256} + \pgfmathsetmacro{\maxElemId}{\numElem-1} + + \foreach \elemId in {0,...,\maxElemId}{ + %% figure out the rowID + \pgfmathsetmacro{\rowId}{floor(\elemId/16)} + %% figure out the colID + \pgfmathsetmacro{\colId}{mod(\elemId,16)} + %% figure out the tid and color + \ifthenelse{\isWLarge=1}{ + \pgfmathsetmacro{\tid}{int(mod(\elemId,64))} + \pgfmathsetmacro{\laneId}{mod(\elemId,64)} + }{ + \pgfmathsetmacro{\tid}{int(mod(\elemId,32))} + \pgfmathsetmacro{\laneId}{mod(\elemId,32)} + } + %% figure out the color + \pgfmathsetmacro{\colorId}{floor(\laneId/16)} + \pgfmathsetmacro{\vecColor}{\Colors[\colorId]} + %% Coordinate + \coordinate (vec TL) at ($(C TL)+(\colId*\elem, -\rowId*\elem)$); + \draw [line width=0.005mm, fill=\vecColor!60!white] (vec TL) rectangle ++(\elem, -\elem) + node [scale=.4*\scale, pos=.5] {t\tid}; + } + + +} + +\newcommand{\drawWMMAInstr}[2]{ + %% + %% Draw wmma instruction layouts 16x16x16 + %% + %% #1: mode. 0 for w32, 1 for w64 + %% #2: verbose. 
1 means draw tid in each vec; 0 means draw nothing + %% + %% C TL: pre defined top-left coordinates of output matrix + %% \elem: pre defined element size + + + \pgfmathsetmacro{\isWLarge}{#1} + \pgfmathsetmacro{\verbose}{#2} + + \pgfmathsetmacro{\gap}{\elem*2} + \coordinate (wmma op TL) at ($(C TL)+(-\gap-16*\elem, 0)$); + \coordinate (wmma opA TL) at (wmma op TL); + \drawWMMAOperand{0}{\verbose}{\isWLarge} + \coordinate (wmma op TL) at ($(C TL)+(0, \gap+16*\elem)$); + \drawWMMAOperand{1}{\verbose}{\isWLarge} + + \drawWMMAResult{1}{\isWLarge} + + %% labels + \pgfmathsetmacro{\gap}{\elem} + \node [above left, scale=\scale] at (wmma opA TL) {A}; + \node [above left, scale=\scale] at (wmma op TL) {B}; + \node [above right, scale=\scale] at ($(C TL)+(16*\elem, 0)$) {C}; + + %% A k dim + \node [scale=.8*\scale] (k dim A) at ($(wmma opA TL)+(8*\elem,\gap)$) {16}; + \draw [->, >=stealth] (k dim A.west) -- ($(wmma opA TL)+(0, \gap)$); + \draw [->, >=stealth] (k dim A.east) -- ($(wmma opA TL)+(16*\elem, \gap)$); + + %% B K dim + \node [scale=.8*\scale, rotate=90] (k dim B) at ($(wmma op TL)+(-\gap, -8*\elem)$) {16}; + \draw [->, >=stealth] (k dim B.east) -- ($(wmma op TL)+(-\gap, 0)$); + \draw [->, >=stealth] (k dim B.west) -- ($(wmma op TL)+(-\gap, -16*\elem)$); + + %% C M dim + \node [scale=.8*\scale] (m dim) at ($(C TL)+(8*\elem,-16*\elem-\gap)$) {16}; + \draw [->, >=stealth] (m dim.west) -- ($(C TL)+(0, -16*\elem-\gap)$); + \draw [->, >=stealth] (m dim.east) -- ($(C TL)+(16*\elem, -16*\elem-\gap)$); + + %% C N dim + \node [scale=.8*\scale, rotate=-90] (n dim) at ($(C TL)+(16*\elem+\gap, -8*\elem)$) {16}; + \draw [->, >=stealth] (n dim.west) -- ($(C TL)+(16*\elem+\gap, 0)$); + \draw [->, >=stealth] (n dim.east) -- ($(C TL)+(16*\elem+\gap, -16*\elem)$); +} From 5880a6b96bf7aec807c2fb45323a762f61bed1fd Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 6 Sep 2024 08:57:48 -0500 Subject: [PATCH 2/6] yapf format --- .../tools/amdgcn-cfg/amdgcn-cfg.py | 338 
+++++++++--------- .../tools/plot-layout/plot_layout.py | 91 ++--- 2 files changed, 188 insertions(+), 241 deletions(-) diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py index 4100528f28db..9c3bcbea9d70 100644 --- a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py +++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py @@ -6,33 +6,36 @@ class Options: - def __init__(self, input_file, output_file, verbose, format): - if not os.path.exists(input_file): - raise RuntimeError('input file is not provided') - output_dir = os.path.dirname(output_file) - if not os.path.exists(output_dir): - raise RuntimeError('output directory does not exist') + def __init__(self, input_file, output_file, verbose, format): + if not os.path.exists(input_file): + raise RuntimeError('input file is not provided') - self.input_file = input_file - self.output_file = output_file - self.verbose = verbose - self.format = format - self.output_dir = output_dir + output_dir = os.path.dirname(output_file) + if not os.path.exists(output_dir): + raise RuntimeError('output directory does not exist') + + self.input_file = input_file + self.output_file = output_file + self.verbose = verbose + self.format = format + self.output_dir = output_dir class Block: - def __init__(self, label, code): - self.label = label - self.code = code - self.edges = [] + + def __init__(self, label, code): + self.label = label + self.code = code + self.edges = [] class Kernel: - def __init__(self, kernel_name, blocks): - self.name = kernel_name - self.blocks = blocks - self.cfg = None + + def __init__(self, kernel_name, blocks): + self.name = kernel_name + self.blocks = blocks + self.cfg = None begin_label = 'Begin' @@ -40,187 +43,180 @@ def __init__(self, kernel_name, blocks): def find_kernel(text): - func_name_expr = r'^([^\s^\.]\w.+):' - func_name = None - start = None - for index, line in enumerate(text): - match = re.search(func_name_expr, line) - if not 
match is None: - func_name = match[1] - start = index - break - if start == None: - return None, None, None - - end = None - for index, line in enumerate(text): - if not re.search(r's_endpgm', line) is None: - end = index - break - - if end == None: - return None, None, None - - return func_name, text[start:end+1], end + func_name_expr = r'^([^\s^\.]\w.+):' + func_name = None + start = None + for index, line in enumerate(text): + match = re.search(func_name_expr, line) + if not match is None: + func_name = match[1] + start = index + break + if start == None: + return None, None, None + + end = None + for index, line in enumerate(text): + if not re.search(r's_endpgm', line) is None: + end = index + break + + if end == None: + return None, None, None + + return func_name, text[start:end + 1], end def find_label(kernel): - label = None - index = None - for index, line in enumerate(kernel): - match = re.search(r'^\.(\w+):', line) - if not match is None: - label = match[1] - break - return label, index + label = None + index = None + for index, line in enumerate(kernel): + match = re.search(r'^\.(\w+):', line) + if not match is None: + label = match[1] + break + return label, index def get_block_list(kernel): - label, index = find_label(kernel) + label, index = find_label(kernel) - blocks = OrderedDict() - if (index > 1): - blocks[begin_label] = Block(begin_label, kernel[:index-1]) + blocks = OrderedDict() + if (index > 1): + blocks[begin_label] = Block(begin_label, kernel[:index - 1]) - while label != None: - kernel = kernel[index+1:] - next_label, next_index = find_label(kernel) - if next_label is None: - code = kernel[index:] - else: - code = kernel[:next_index] - blocks[label] = Block(label, code) + while label != None: + kernel = kernel[index + 1:] + next_label, next_index = find_label(kernel) + if next_label is None: + code = kernel[index:] + else: + code = kernel[:next_index] + blocks[label] = Block(label, code) - label = next_label - index = next_index + label = 
next_label + index = next_index - blocks[end_label] = Block(end_label, []) + blocks[end_label] = Block(end_label, []) - return blocks + return blocks def find_terminators(code): - terminator_labels = [] - for line in code: - branch = re.search(r'(c)?branch.*\s+\.?(.*)', line) - if not branch is None: - is_condional = True if len(branch.groups()) == 2 else False - label_idx = 2 if is_condional else 1 - terminator_labels.append(branch[label_idx]) - if not is_condional: - return terminator_labels, True - end = re.search(r's_endpgm', line) - if not end is None: - terminator_labels.append(end_label) - return terminator_labels, True - - return terminator_labels, False + terminator_labels = [] + for line in code: + branch = re.search(r'(c)?branch.*\s+\.?(.*)', line) + if not branch is None: + is_condional = True if len(branch.groups()) == 2 else False + label_idx = 2 if is_condional else 1 + terminator_labels.append(branch[label_idx]) + if not is_condional: + return terminator_labels, True + end = re.search(r's_endpgm', line) + if not end is None: + terminator_labels.append(end_label) + return terminator_labels, True + + return terminator_labels, False def add_edges(kernel): - keys = list(kernel.blocks.keys()) - for index, curr_label in enumerate(keys): - if curr_label == end_label: - continue + keys = list(kernel.blocks.keys()) + for index, curr_label in enumerate(keys): + if curr_label == end_label: + continue - code = kernel.blocks[curr_label].code - terminators, is_last_unconditional = find_terminators(code[:-1]) + code = kernel.blocks[curr_label].code + terminators, is_last_unconditional = find_terminators(code[:-1]) - if is_last_unconditional: - # unconditional jump in the middle of the block - break + if is_last_unconditional: + # unconditional jump in the middle of the block + break - # handle the last terminator in the current BB - last_terminator, is_unconditional = find_terminators([code[-1]]) + # handle the last terminator in the current BB + last_terminator, 
is_unconditional = find_terminators([code[-1]]) - is_conditional = not is_unconditional - next_block_label = keys[index + 1] - is_next_covered = next_block_label in terminators - - if last_terminator: - terminators.extend(last_terminator) - if is_conditional and not is_next_covered: - next_block_label = keys[index + 1] - terminators.append(next_block_label) - else: - if not is_next_covered: + is_conditional = not is_unconditional next_block_label = keys[index + 1] - terminators.append(next_block_label) + is_next_covered = next_block_label in terminators - assert(len(terminators)) - kernel.blocks[curr_label].edges = terminators + if last_terminator: + terminators.extend(last_terminator) + if is_conditional and not is_next_covered: + next_block_label = keys[index + 1] + terminators.append(next_block_label) + else: + if not is_next_covered: + next_block_label = keys[index + 1] + terminators.append(next_block_label) + + assert (len(terminators)) + kernel.blocks[curr_label].edges = terminators def generate_cfg(kernel, options): - graph = graphviz.Digraph(f'{kernel.name}') - for curr_label in kernel.blocks: - block = kernel.blocks[curr_label] - asm = [line.strip() for line in block.code] - if options.verbose: - label_text = repr('\n'.join([f'{curr_label}', *asm])) - else: - label_text = curr_label - graph.node(curr_label, - shape='rect', - labeljust='l', - margin='0.01', - label=label_text) - - for curr_label in kernel.blocks: - block = kernel.blocks[curr_label] - for edge in block.edges: - graph.edge(curr_label, edge) - - return graph + graph = graphviz.Digraph(f'{kernel.name}') + for curr_label in kernel.blocks: + block = kernel.blocks[curr_label] + asm = [line.strip() for line in block.code] + if options.verbose: + label_text = repr('\n'.join([f'{curr_label}', *asm])) + else: + label_text = curr_label + graph.node(curr_label, shape='rect', labeljust='l', margin='0.01', label=label_text) + + for curr_label in kernel.blocks: + block = kernel.blocks[curr_label] + for 
edge in block.edges: + graph.edge(curr_label, edge) + + return graph def main(options): - asm = [] - with open(options.input_file, 'r') as file: - context = file.readlines() - for line in context: - asm.append(line[:-1]) - - kernels = [] - last_end_index = 0 - while last_end_index != None: - func_name, kernel_asm, last_end_index = find_kernel(asm) - if kernel_asm == None: - break - - blocks = get_block_list(kernel_asm) - kernel = Kernel(func_name, blocks) - add_edges(kernel) - - cfg = generate_cfg(kernel, options) - kernel.cfg = cfg - kernels.append(kernel) - asm = asm[last_end_index+1:] - - for index, kernel in enumerate(kernels): - output_file_name = f'{options.output_file}.kernel-{index}' - if options.format == 'dot': - with open(f'{output_file_name}.dot', 'w') as file: - file.write(str(kernel.cfg)) - file.write('\n') - else: - kernel.cfg.render(filename=f'{output_file_name}', - format=options.format, - ).replace('\\', '/') + asm = [] + with open(options.input_file, 'r') as file: + context = file.readlines() + for line in context: + asm.append(line[:-1]) + + kernels = [] + last_end_index = 0 + while last_end_index != None: + func_name, kernel_asm, last_end_index = find_kernel(asm) + if kernel_asm == None: + break + + blocks = get_block_list(kernel_asm) + kernel = Kernel(func_name, blocks) + add_edges(kernel) + + cfg = generate_cfg(kernel, options) + kernel.cfg = cfg + kernels.append(kernel) + asm = asm[last_end_index + 1:] + + for index, kernel in enumerate(kernels): + output_file_name = f'{options.output_file}.kernel-{index}' + if options.format == 'dot': + with open(f'{output_file_name}.dot', 'w') as file: + file.write(str(kernel.cfg)) + file.write('\n') + else: + kernel.cfg.render( + filename=f'{output_file_name}', + format=options.format, + ).replace('\\', '/') if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="Generates Control Flow Graph (CFG) from amdgcn assembly file", - ) - parser.add_argument("-i", "--input", type=str, default=None, 
help="input file") - parser.add_argument("-o", "--output", type=str, default=None, help="output file prefix") - parser.add_argument("-v", "--verbose", action='store_true', help='verbose output') - parser.add_argument("-f", "--format", choices=['dot','svg', 'pdf'], - default="dot", - help="output format type") - args = parser.parse_args() - - options = Options(args.input, args.output, args.verbose, args.format) - - main(options) + parser = argparse.ArgumentParser(prog="Generates Control Flow Graph (CFG) from amdgcn assembly file", ) + parser.add_argument("-i", "--input", type=str, default=None, help="input file") + parser.add_argument("-o", "--output", type=str, default=None, help="output file prefix") + parser.add_argument("-v", "--verbose", action='store_true', help='verbose output') + parser.add_argument("-f", "--format", choices=['dot', 'svg', 'pdf'], default="dot", help="output format type") + args = parser.parse_args() + + options = Options(args.input, args.output, args.verbose, args.format) + + main(options) diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py index c2387905f3e0..74554b4c3f02 100755 --- a/python/perf-kernels/tools/plot-layout/plot_layout.py +++ b/python/perf-kernels/tools/plot-layout/plot_layout.py @@ -95,8 +95,7 @@ def draw_dot_layout_cmd(M, N, K, mfmaNonKDim, warpsPerCTA, trans, kpack): \\end{{document}}''' -def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, - order): +def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order): return f'''\\begin{{document}} \\begin{{tikzpicture}} \\def\\scale{{1}} @@ -107,8 +106,7 @@ def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, \\end{{document}}''' -def draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread, - threadsPerWarp): +def draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread, threadsPerWarp): if ldsLayout == 
'swizzle': hasSwizzle = 1 elif ldsLayout == 'padding': @@ -158,11 +156,7 @@ def draw_wmma_instr_cmd(waveSize): def run_bash_command(commandstring): - proc = subprocess.run(commandstring, - shell=True, - check=True, - executable='/bin/bash', - stdout=subprocess.PIPE) + proc = subprocess.run(commandstring, shell=True, check=True, executable='/bin/bash', stdout=subprocess.PIPE) return proc.stdout.splitlines() @@ -172,62 +166,27 @@ def parse_args(): allow_abbrev=False, ) ## tensor shapes - parser.add_argument("-shape", - type=int, - nargs=3, - default=(32, 128, 64), - help='Tensor shape in the form of M,N,K') - parser.add_argument("-plot", - type=str, - default="blocked", - choices=['blocked', 'dot', 'wmma', 'lds'], + parser.add_argument("-shape", type=int, nargs=3, default=(32, 128, 64), help='Tensor shape in the form of M,N,K') + parser.add_argument("-plot", type=str, default="blocked", choices=['blocked', 'dot', 'wmma', 'lds'], help='choose plot mode') - parser.add_argument( - "-nonKDim", - type=int, - default=32, - choices=[16, 32], - help='mfma instruction dim') + parser.add_argument("-nonKDim", type=int, default=32, choices=[16, 32], help='mfma instruction dim') ## blocked layout parameters parser.add_argument("-sizePerThread", type=int, nargs=2, default=(1, 4)) parser.add_argument("-threadsPerWarp", type=int, nargs=2, default=(16, 4)) parser.add_argument("-warpsPerCTA", type=int, nargs=2, default=(1, 4)) parser.add_argument("-order", type=int, nargs=2, default=(1, 0)) ## LDS access parameters - parser.add_argument("-kWidth", - type=int, - default=4, - choices=[4, 8, 16], - help='number of elements per thread') - parser.add_argument("-lds_layout", - type=str, - default="none", - choices=['swizzle', 'padding', 'none'], + parser.add_argument("-kWidth", type=int, default=4, choices=[4, 8, 16], help='number of elements per thread') + parser.add_argument("-lds_layout", type=str, default="none", choices=['swizzle', 'padding', 'none'], help='choose the LDS data layout') 
- parser.add_argument("-lds_access", - type=str, - default="none", - choices=['read', 'write', 'none'], + parser.add_argument("-lds_access", type=str, default="none", choices=['read', 'write', 'none'], help='choose LDS access mode') ## wmma instruction layout parameter - parser.add_argument("-wave_size", - type=int, - default=32, - choices=[32, 64], - help='choose the wmma instruction mode') - - parser.add_argument("-o", - type=str, - default="myplot", - help='output pdf file name (without surfix)') - parser.add_argument("-mfmaTrans", - action='store_true', - default=False, - help='If set, then use mfma.trans layout') - parser.add_argument("-keep", - action='store_true', - default=False, - help='If set, keep the generated .tex file') + parser.add_argument("-wave_size", type=int, default=32, choices=[32, 64], help='choose the wmma instruction mode') + + parser.add_argument("-o", type=str, default="myplot", help='output pdf file name (without surfix)') + parser.add_argument("-mfmaTrans", action='store_true', default=False, help='If set, then use mfma.trans layout') + parser.add_argument("-keep", action='store_true', default=False, help='If set, keep the generated .tex file') args = parser.parse_args() @@ -279,24 +238,19 @@ def main(): if plot_mode == 'blocked' or plot_mode == 'dot': print(f"CTAShape={CTAShape}") - assert M != 0 and CTAShape[ - 0] <= M and M % CTAShape[0] == 0, "bad tensor dimension M" + assert M != 0 and CTAShape[0] <= M and M % CTAShape[0] == 0, "bad tensor dimension M" if plot_mode == 'blocked': - assert K != 0 and CTAShape[ - 1] <= K and K % CTAShape[1] == 0, "bad tensor dimension K" + assert K != 0 and CTAShape[1] <= K and K % CTAShape[1] == 0, "bad tensor dimension K" if plot_mode == 'dot': - assert N != 0 and CTAShape[ - 1] <= N and N % CTAShape[1] == 0, "bad tensor dimension N" + assert N != 0 and CTAShape[1] <= N and N % CTAShape[1] == 0, "bad tensor dimension N" assert K != 0 and K % (2 * kpack) == 0, "bad tensor dimension K" if plot_mode == 
'lds': print(f"Plotting LDS access for tensor M={M},K={K} with vec={kpack}") if ldsAccess == 'write': - print( - f"sizePerThread={sizePerThread}, threadsPerWarp={threadsPerWarp}" - ) + print(f"sizePerThread={sizePerThread}, threadsPerWarp={threadsPerWarp}") with open("myplot.tex", 'w') as f_plot: with open("tikzplot.tex") as file: @@ -304,14 +258,11 @@ def main(): preamble_str = draw_preamble_cmd() - draw_blockedLayout_str = draw_blocked_layout_cmd( - M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order) + draw_blockedLayout_str = draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order) - draw_dotLayout_str = draw_dot_layout_cmd(M, N, K, mfmaNonKDim, - warpsPerCTA, trans, kpack) + draw_dotLayout_str = draw_dot_layout_cmd(M, N, K, mfmaNonKDim, warpsPerCTA, trans, kpack) - draw_lds_str = draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, - sizePerThread, threadsPerWarp) + draw_lds_str = draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread, threadsPerWarp) draw_wmma_str = draw_wmma_instr_cmd(waveSize) From 8405b6bb66778c4a1c6451b22a9f280259d51a38 Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 6 Sep 2024 09:00:24 -0500 Subject: [PATCH 3/6] More formats --- .../tools/amdgcn-cfg/amdgcn-cfg.py | 10 +-- .../perf-kernels/tools/plot-layout/README.md | 12 +-- .../tools/plot-layout/plot_layout.py | 2 - .../tools/plot-layout/tikzplot.tex | 80 +++++++++---------- 4 files changed, 51 insertions(+), 53 deletions(-) diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py index 9c3bcbea9d70..570fc3399602 100644 --- a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py +++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py @@ -48,7 +48,7 @@ def find_kernel(text): start = None for index, line in enumerate(text): match = re.search(func_name_expr, line) - if not match is None: + if match is not None: func_name = match[1] start = index break @@ -57,7 +57,7 @@ def 
find_kernel(text): end = None for index, line in enumerate(text): - if not re.search(r's_endpgm', line) is None: + if re.search(r's_endpgm', line) is not None: end = index break @@ -72,7 +72,7 @@ def find_label(kernel): index = None for index, line in enumerate(kernel): match = re.search(r'^\.(\w+):', line) - if not match is None: + if match is not None: label = match[1] break return label, index @@ -106,14 +106,14 @@ def find_terminators(code): terminator_labels = [] for line in code: branch = re.search(r'(c)?branch.*\s+\.?(.*)', line) - if not branch is None: + if branch is not None: is_condional = True if len(branch.groups()) == 2 else False label_idx = 2 if is_condional else 1 terminator_labels.append(branch[label_idx]) if not is_condional: return terminator_labels, True end = re.search(r's_endpgm', line) - if not end is None: + if end is not None: terminator_labels.append(end_label) return terminator_labels, True diff --git a/python/perf-kernels/tools/plot-layout/README.md b/python/perf-kernels/tools/plot-layout/README.md index e12cf9441d37..40de35bdb3aa 100644 --- a/python/perf-kernels/tools/plot-layout/README.md +++ b/python/perf-kernels/tools/plot-layout/README.md @@ -31,7 +31,7 @@ options: ``` ## Installation -This script does not require torch or triton to be installed. The only package +This script does not require torch or triton to be installed. The only package it depends on is latex. On Ubuntu, do ```bash sudo apt install texlive-full @@ -60,8 +60,8 @@ Notes out of the boundary of the tensor dimensions. This means - For M: sizePerThread[0] * threadsPerWarps[0] * warpsPerCTA[0] <= M - For K: sizePerThread[1] * threadsPerWarps[1] * warpsPerCTA[1] <= K - - + + ## Draw mfma operand and result layouts (`-plot dot`) Examples: @@ -78,15 +78,15 @@ This mode draws two graphs: 2. The layout of a single mfma block, operands and results of one or more mfma instructions that share the same accumulating VGPRs. 
This view has thread distributions among tensor elements. - + Knobs - `-kWidth`: the number of elements that will be loaded into one thread at once - `-nonKDim`: 16 ot 32, which is used to control the mfma instruction size - `-mfmaTrans`: if set, the transposed mfma layout will be plotted. Notes -- The layout shows the mapping from the threads/wave to the elements in the - original tensor. It does not care if the elements are arranged in LDS, like +- The layout shows the mapping from the threads/wave to the elements in the + original tensor. It does not care if the elements are arranged in LDS, like swizzling to avoid bank conflicts. - The script does not allow settings for data type or k dim of the mfma instruction. This can be controled by the `-kWidth` flag. diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py index 74554b4c3f02..599f92c790e4 100755 --- a/python/perf-kernels/tools/plot-layout/plot_layout.py +++ b/python/perf-kernels/tools/plot-layout/plot_layout.py @@ -1,8 +1,6 @@ import argparse import sys -import yaml import os -import glob import subprocess diff --git a/python/perf-kernels/tools/plot-layout/tikzplot.tex b/python/perf-kernels/tools/plot-layout/tikzplot.tex index e6292f7002e9..d8441b042f02 100755 --- a/python/perf-kernels/tools/plot-layout/tikzplot.tex +++ b/python/perf-kernels/tools/plot-layout/tikzplot.tex @@ -25,7 +25,7 @@ \pgfmathsetmacro{\tidN}{mod(\tid,\threadsPerWarpN)} \coordinate (Thread TL) at ($(Wave TL)+(\tidN*\sizePerThreadN*\elem, -\tidM*\sizePerThreadM*\elem)$); \pgfmathsetmacro{\ratio}{\tidM*10} - + \ifthenelse{\tid = 0}{ \draw [line width = 0.01mm, fill=red] (Thread TL) rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem); @@ -80,12 +80,12 @@ \pgfmathsetmacro{\waveCoordN}{int(\waveId/\warpsPerCTAM)} \pgfmathsetmacro{\rot}{90} } - + \coordinate (Wave TL) at ($(CTA TL)+(\waveCoordN*\waveSizeN*\elem, -\waveCoordM*\waveSizeM*\elem)$); \draw [ultra thin] 
(Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem) node [pos=.5, scale=.6*\scale, inner sep=0, fill=white, rotate=\rot] {wave\waveId}; } - + \draw [thick] (CTA TL) rectangle ++(\CTASizeN*\elem, -\CTASizeM*\elem); } @@ -108,7 +108,7 @@ %% Note that threadsPerWarp[1] is calculated by 64/threadsPerWarp[0] %% #6: warpsPerCTA[0] --> warpsPerCTAM %% #7: warpsPerCTA[1] --> warpsPerCTAN - %% #8: fastest changing dim --> order + %% #8: fastest changing dim --> order \pgfmathsetmacro{\M}{#1} \pgfmathsetmacro{\N}{#2} @@ -148,7 +148,7 @@ \node [scale=.6*\scale, left] at ($(zoomin BL)+(0, .5*\sizePerThreadM*\elem*\zoomR)$) {$t_0$}; \node [scale=.6*\scale, right] at ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, .5*\sizePerThreadM*\elem*\zoomR)$) {\sizePerThreadM$\times$\sizePerThreadN}; - + \draw [densely dotted] (TL) -- (zoomin BL); \draw [densely dotted] ($(TL)+(\sizePerThreadN*\elem, 0)$) -- ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, 0)$); \draw [fill=red] (TL) rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem); @@ -164,19 +164,19 @@ %% #1: 1 for mfma.trans, 0 for normal mfma %% #2: mfmaNonKDim %% #3: verbose. 
1 means draw tid in each vec; 0 means draw nothing - + \pgfmathsetmacro{\trans}{#1} \pgfmathsetmacro{\nonTrans}{1-#1} - \pgfmathsetmacro{\nonKDim}{#2} - \pgfmathsetmacro{\maxTID}{\nonKDim-1} - \pgfmathsetmacro{\groups}{64/\nonKDim} - \pgfmathsetmacro{\maxGID}{\groups-1} - \pgfmathsetmacro{\maxIVec}{\nonKDim*\nonKDim/256-1} + \pgfmathsetmacro{\nonKDim}{#2} + \pgfmathsetmacro{\maxTID}{\nonKDim-1} + \pgfmathsetmacro{\groups}{64/\nonKDim} + \pgfmathsetmacro{\maxGID}{\groups-1} + \pgfmathsetmacro{\maxIVec}{\nonKDim*\nonKDim/256-1} \pgfmathsetmacro{\verbose}{#3} \foreach \iVec in {0,...,\maxIVec} { \coordinate (wave TL) at ($(block TL)+(\trans*\iVec*\groups*4*\elem, -\nonTrans*\iVec*\groups*4*\elem)$); \foreach \tg in {0,...,\maxGID}{ - \pgfmathsetmacro{\colID}{\tg+4} + \pgfmathsetmacro{\colID}{\tg+4} \pgfmathsetmacro{\col}{\Colors[\colID]} \foreach \tid in {0,...,\maxTID} { \pgfmathsetmacro{\ratio}{\tid*2.5*\groups+15} @@ -228,7 +228,7 @@ \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAH*\warpsPerCTAW-1} \pgfmathsetmacro{\CTASizeH}{\warpsPerCTAH*\mfmaNonKDim} \pgfmathsetmacro{\CTASizeW}{\warpsPerCTAW*\mfmaNonKDim} - + \foreach \ctaId in {0,...,\maxCTAId}{ \pgfmathsetmacro{\ctaCoordH}{int(\ctaId/\CTARepW)} @@ -237,7 +237,7 @@ %% Draw a detailed view of wave0 in each CTA \coordinate (block TL) at (CTA TL); \drawBlockMFMALayoutLarge{\mfmaTrans}{\mfmaNonKDim}{0} - + \foreach \waveId in {0,...,\maxWaveId}{ \pgfmathsetmacro{\waveCoordH}{int(\waveId/\warpsPerCTAW)} \pgfmathsetmacro{\waveCoordW}{mod(\waveId,\warpsPerCTAW)} @@ -246,7 +246,7 @@ \draw [ultra thin] (block TL) rectangle ++(\mfmaNonKDim*\elem, -\mfmaNonKDim*\elem) node [scale=.7*\mfmaNonKDim/32*\scale, pos=.5, fill=white, inner sep=0] {wave\waveId}; } - + %% Draw the outline of each CTA rep \draw [ultra thick] (CTA TL) rectangle ++(\CTASizeW*\elem, -\CTASizeH*\elem); } @@ -289,7 +289,7 @@ rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA) node [pos=.5, scale=.35*\scale, 
rotate=90*\opIdxA] {t\drawTid}; } - } + } } } @@ -311,9 +311,9 @@ \pgfmathsetmacro{\kpack}{#3} \pgfmathsetmacro{\opIdx}{#4} \pgfmathsetmacro{\opIdxOther}{1-\opIdx} - + \coordinate (TL) at (Op TL); - + \pgfmathsetmacro{\numKRep}{\K/\kpack/\groups} \pgfmathsetmacro{\maxKRepId}{\numKRep-1} @@ -417,7 +417,7 @@ \pgfmathsetmacro{\kdim}{int(\groups*\kpack)} \pgfmathsetmacro{\gap}{\elem*20} - \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$); + \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$); \coordinate (B TL) at ($(C TL)+(0, \gap+\K*\elem)$); \drawDotOperands{\M}{\N}{\K}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\kpack} @@ -483,11 +483,11 @@ \draw (TL) rectangle ++(\K*\elem, -\drawM*\elem); %% Draw detailed vec view of the tensor \foreach \vecId in {0,...,\maxVecId}{ - + \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)} \pgfmathsetmacro{\vecCoordK}{mod(\vecId,\numVecK)} \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$); - + \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))} \pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)} \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]} @@ -495,7 +495,7 @@ \draw [ultra thin, fill=\vecColor!\ratio!white] (vec TL) rectangle ++(\vec*\elem, -\elem) node [pos=.5, scale=.6*\scale, white] {m\vecCoordM}; - + } %% M and K dim \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem-8*\elem)$) {M=\M}; @@ -510,15 +510,15 @@ } \draw [densely dotted] (TL) -- ($(vec TL)+(0, -\elem*\vecR)$); \draw [densely dotted] ($(TL)+(\vec*\elem, 0)$) -- ($(vec TL)+(\vec*\elem*\vecR, -\elem*\vecR)$); - \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec}; + \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec}; } \newcommand{\drawLDSLayoutTritonSwizzling}[2]{ - %% + %% %% Draw tensor layout in LDS with swizzling - %% + %% %% TL: pre defined top-left coordinates of the tensor in global memory %% \elem: per defined variable %% \Colors: a pre defined array 
of 16 colors @@ -540,7 +540,7 @@ \pgfmathsetmacro{\hasSwizzle}{#1} \pgfmathsetmacro{\accessMode}{#2} \pgfmathsetmacro{\numVecK}{\K/\vec} - + %% Assuming fp16 data type \pgfmathsetmacro{\LDSK}{64} \pgfmathsetmacro{\numLDSVec}{\LDSK/\vec} @@ -568,10 +568,10 @@ \pgfmathsetmacro{\maxPhase}{1} }{ %% When vec is small enough, we want 16/perPhase different swizzling patterns - %% When vec is large, we can only have 64 / \vec different swizzling pattern at most + %% When vec is large, we can only have 64 / \vec different swizzling pattern at most \pgfmathsetmacro{\maxPhase}{min(16/\perPhase,64/\vec)} } - + %% Draw the LDS \draw (TL) rectangle ++(\LDSK*\elem, -\drawM*\elem); @@ -598,7 +598,7 @@ \pgfmathsetmacro{\vecLDSM}{floor(\vecCoordM/\perPhase)} \pgfmathsetmacro{\vecLDSK}{int(\vecCoordK+mod(\vecCoordM,\perPhase)*\numVecK)} } - %% + %% \pgfmathsetmacro{\phase}{int(mod(\rawPhase, \maxPhase))} %% Compute the swizzled col id \pgfmathsetmacro{\vecLDSKSwizzled}{\bitwiseXor{\vecLDSK}{\phase}} @@ -631,14 +631,14 @@ node [pos=.5, scale=.6*\scale, white] {m\vecCoordM}; } - %% ds_read + %% ds_read %% Highlight the elements the first 16 threads access in the first cycle %% This is used to visualize bank conflicts \ifthenelse{\accessMode = 1}{ \ifthenelse{\vecCoordK = 0}{ \draw [fill=white] (new vec TL) rectangle ++(\elem, -\elem); - \draw (new vec TL) -- ++(\elem, -\elem); - \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem); + \draw (new vec TL) -- ++(\elem, -\elem); + \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem); }{} }{} @@ -652,8 +652,8 @@ \ifthenelse{\vecInThread=0}{ \ifthenelse{\vecCoordK<\covK \AND \vecCoordM<\covM}{ \draw [fill=white] (new vec TL) rectangle ++(\elem, -\elem); - \draw (new vec TL) -- ++(\elem, -\elem); - \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem); + \draw (new vec TL) -- ++(\elem, -\elem); + \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem); }{} }{} }{} @@ -665,7 +665,7 @@ \draw [ultra thin] ($(new vec TL)+(\vec*\elem, 
-.5*\elem)$) -- ++(\elem, 0) node [scale=.6*\scale, right] {\phase}; }{} - } + } } %% Draw boundary of 32 banks @@ -701,7 +701,7 @@ \pgfmathsetmacro{\kpack}{#2} \pgfmathsetmacro{\mfmaTrans}{#3} \pgfmathsetmacro{\nonTrans}{1-#3} - + \pgfmathsetmacro{\gap}{\elem*5} \coordinate (mfma opA TL) at ($(C TL)+(-.5*\gap-1.2*\nonTrans*\gap-\groups*\kpack*\elem, 0)$); \coordinate (mfma op TL) at (mfma opA TL); @@ -825,7 +825,7 @@ \draw [line width=0.005mm, fill=\vecColor!60!white] (vec TL) rectangle ++(\elem, -\elem) node [scale=.4*\scale, pos=.5] {t\tid}; } - + } @@ -838,11 +838,11 @@ %% %% C TL: pre defined top-left coordinates of output matrix %% \elem: pre defined element size - + \pgfmathsetmacro{\isWLarge}{#1} \pgfmathsetmacro{\verbose}{#2} - + \pgfmathsetmacro{\gap}{\elem*2} \coordinate (wmma op TL) at ($(C TL)+(-\gap-16*\elem, 0)$); \coordinate (wmma opA TL) at (wmma op TL); @@ -873,7 +873,7 @@ \draw [->, >=stealth] (m dim.west) -- ($(C TL)+(0, -16*\elem-\gap)$); \draw [->, >=stealth] (m dim.east) -- ($(C TL)+(16*\elem, -16*\elem-\gap)$); - %% C N dim + %% C N dim \node [scale=.8*\scale, rotate=-90] (n dim) at ($(C TL)+(16*\elem+\gap, -8*\elem)$) {16}; \draw [->, >=stealth] (n dim.west) -- ($(C TL)+(16*\elem+\gap, 0)$); \draw [->, >=stealth] (n dim.east) -- ($(C TL)+(16*\elem+\gap, -16*\elem)$); From 370f916dfefd7c37ebc8bad46e607f66b9bfdaa3 Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 6 Sep 2024 09:01:30 -0500 Subject: [PATCH 4/6] remove executablility of plot_layout.py --- python/perf-kernels/tools/plot-layout/plot_layout.py | 0 python/perf-kernels/tools/plot-layout/tikzplot.tex | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 python/perf-kernels/tools/plot-layout/plot_layout.py mode change 100755 => 100644 python/perf-kernels/tools/plot-layout/tikzplot.tex diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py old mode 100755 new mode 100644 diff --git 
a/python/perf-kernels/tools/plot-layout/tikzplot.tex b/python/perf-kernels/tools/plot-layout/tikzplot.tex old mode 100755 new mode 100644 From 731ea351a349878b46d8bd3f232ab300eeb4a96c Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 6 Sep 2024 09:05:46 -0500 Subject: [PATCH 5/6] Address ruff complains --- python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py index 570fc3399602..ae2f65830766 100644 --- a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py +++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py @@ -52,7 +52,7 @@ def find_kernel(text): func_name = match[1] start = index break - if start == None: + if start is None: return None, None, None end = None @@ -61,7 +61,7 @@ def find_kernel(text): end = index break - if end == None: + if end is None: return None, None, None return func_name, text[start:end + 1], end @@ -85,7 +85,7 @@ def get_block_list(kernel): if (index > 1): blocks[begin_label] = Block(begin_label, kernel[:index - 1]) - while label != None: + while label is not None: kernel = kernel[index + 1:] next_label, next_index = find_label(kernel) if next_label is None: @@ -182,9 +182,9 @@ def main(options): kernels = [] last_end_index = 0 - while last_end_index != None: + while last_end_index is not None: func_name, kernel_asm, last_end_index = find_kernel(asm) - if kernel_asm == None: + if kernel_asm is None: break blocks = get_block_list(kernel_asm) From fca8ae24f0f4f3dcf4196ce65e42331b7519fa00 Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Fri, 6 Sep 2024 12:03:48 -0500 Subject: [PATCH 6/6] Move tune_gemm to tools --- python/perf-kernels/{ => tools}/tune_gemm/README.md | 0 python/perf-kernels/{ => tools}/tune_gemm/icache_flush.py | 0 python/perf-kernels/{ => tools}/tune_gemm/matmul_kernel.py | 0 python/perf-kernels/{ => tools}/tune_gemm/one_config.py | 0 
python/perf-kernels/{ => tools}/tune_gemm/tune_gemm.py | 0 python/perf-kernels/{ => tools}/tune_gemm/utils/file_generator.py | 0 python/perf-kernels/{ => tools}/tune_gemm/utils/utils.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename python/perf-kernels/{ => tools}/tune_gemm/README.md (100%) rename python/perf-kernels/{ => tools}/tune_gemm/icache_flush.py (100%) rename python/perf-kernels/{ => tools}/tune_gemm/matmul_kernel.py (100%) rename python/perf-kernels/{ => tools}/tune_gemm/one_config.py (100%) rename python/perf-kernels/{ => tools}/tune_gemm/tune_gemm.py (100%) rename python/perf-kernels/{ => tools}/tune_gemm/utils/file_generator.py (100%) rename python/perf-kernels/{ => tools}/tune_gemm/utils/utils.py (100%) diff --git a/python/perf-kernels/tune_gemm/README.md b/python/perf-kernels/tools/tune_gemm/README.md similarity index 100% rename from python/perf-kernels/tune_gemm/README.md rename to python/perf-kernels/tools/tune_gemm/README.md diff --git a/python/perf-kernels/tune_gemm/icache_flush.py b/python/perf-kernels/tools/tune_gemm/icache_flush.py similarity index 100% rename from python/perf-kernels/tune_gemm/icache_flush.py rename to python/perf-kernels/tools/tune_gemm/icache_flush.py diff --git a/python/perf-kernels/tune_gemm/matmul_kernel.py b/python/perf-kernels/tools/tune_gemm/matmul_kernel.py similarity index 100% rename from python/perf-kernels/tune_gemm/matmul_kernel.py rename to python/perf-kernels/tools/tune_gemm/matmul_kernel.py diff --git a/python/perf-kernels/tune_gemm/one_config.py b/python/perf-kernels/tools/tune_gemm/one_config.py similarity index 100% rename from python/perf-kernels/tune_gemm/one_config.py rename to python/perf-kernels/tools/tune_gemm/one_config.py diff --git a/python/perf-kernels/tune_gemm/tune_gemm.py b/python/perf-kernels/tools/tune_gemm/tune_gemm.py similarity index 100% rename from python/perf-kernels/tune_gemm/tune_gemm.py rename to python/perf-kernels/tools/tune_gemm/tune_gemm.py diff --git 
a/python/perf-kernels/tune_gemm/utils/file_generator.py b/python/perf-kernels/tools/tune_gemm/utils/file_generator.py similarity index 100% rename from python/perf-kernels/tune_gemm/utils/file_generator.py rename to python/perf-kernels/tools/tune_gemm/utils/file_generator.py diff --git a/python/perf-kernels/tune_gemm/utils/utils.py b/python/perf-kernels/tools/tune_gemm/utils/utils.py similarity index 100% rename from python/perf-kernels/tune_gemm/utils/utils.py rename to python/perf-kernels/tools/tune_gemm/utils/utils.py