From 4cc007a4010418e6b9d4625b03bfdc33751e864e Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 6 Sep 2024 08:51:52 -0500
Subject: [PATCH 1/6] Move utility tools from triton-mlir to main_perf branch

- Plot layout script
- occ.sh
- amdgcn-cfg
---
 .../perf-kernels/tools/amdgcn-cfg/README.md   |  14 +
 .../tools/amdgcn-cfg/amdgcn-cfg.py            | 226 +++++
 python/perf-kernels/tools/occ.sh              |  71 ++
 .../perf-kernels/tools/plot-layout/README.md  | 117 +++
 .../tools/plot-layout/plot_layout.py          | 341 +++++++
 .../tools/plot-layout/tikzplot.tex            | 880 ++++++++++++++++++
 6 files changed, 1649 insertions(+)
 create mode 100644 python/perf-kernels/tools/amdgcn-cfg/README.md
 create mode 100644 python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
 create mode 100755 python/perf-kernels/tools/occ.sh
 create mode 100644 python/perf-kernels/tools/plot-layout/README.md
 create mode 100755 python/perf-kernels/tools/plot-layout/plot_layout.py
 create mode 100755 python/perf-kernels/tools/plot-layout/tikzplot.tex
diff --git a/python/perf-kernels/tools/amdgcn-cfg/README.md b/python/perf-kernels/tools/amdgcn-cfg/README.md
new file mode 100644
index 000000000000..bea420ea530c
--- /dev/null
+++ b/python/perf-kernels/tools/amdgcn-cfg/README.md
@@ -0,0 +1,14 @@
+# Control Flow Graph Generator from AMDGCN assembly
+
+The script reads an assembly file and generates a Control Flow Graph (CFG) for each function in the file. The graph can be saved in `dot`, `svg` and `pdf` formats. The nodes of a graph can be represented with 1) just labels or 2) the corresponding assembly code. The edges of a graph can help to identify cycles and, thus, to provide a better navigation through the code.
+
+
+### Basic usage
+
+```
+python ./amdgcn-cfg.py -i <path to assembly file>  -o <output directory>/<output prefix> -f [dot|svg|pdf]
+```
+
+`dot`-files can be visualized with [this](https://dreampuf.github.io/GraphvizOnline) online tool. You just need to copy and paste the content of a generated `dot`-file.
+
+By default, the nodes are named with basic block labels. Use `-v` or `--verbose` option to add assembly source code to corresponding nodes.
diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
new file mode 100644
index 000000000000..4100528f28db
--- /dev/null
+++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
@@ -0,0 +1,226 @@
+import os
+import argparse
+import re
+from collections import OrderedDict
+import graphviz
+
+
+class Options:
+  """Validated command-line settings: input asm path, output prefix, verbosity, output format."""
+  def __init__(self, input_file, output_file, verbose, format):
+    # `not input_file` catches a missing -i (None) before os.path.exists() raises TypeError.
+    if not input_file or not os.path.exists(input_file):
+      raise RuntimeError('input file is not provided')
+    if not output_file:
+      raise RuntimeError('output file prefix is not provided')
+    # A bare prefix such as "cfg" has an empty dirname; it means the current directory.
+    output_dir = os.path.dirname(output_file) or '.'
+    if not os.path.isdir(output_dir):
+      raise RuntimeError('output directory does not exist')
+    self.input_file, self.output_file = input_file, output_file
+    self.verbose, self.format, self.output_dir = verbose, format, output_dir
+
+
+class Block:  # One labelled run of assembly lines of a kernel plus its outgoing CFG edges.
+  def __init__(self, label, code):
+    self.label = label  # label text of the block
+    self.code = code  # list of assembly lines that belong to the block
+    self.edges = []  # successor labels; empty until add_edges() assigns them
+
+
+class Kernel:  # One function found in the assembly file, with its blocks and (later) its graph.
+  def __init__(self, kernel_name, blocks):
+    self.name = kernel_name  # function label text as captured by find_kernel()
+    self.blocks = blocks  # mapping label -> Block, as built by get_block_list()
+    self.cfg = None  # main() stores the result of generate_cfg() here
+
+
+begin_label = 'Begin'  # key of the synthetic block holding the lines ahead of the first label
+end_label = 'End'  # key of the synthetic, empty block that `s_endpgm` edges point to
+
+
+def find_kernel(text):
+  """Locate the first kernel in `text` (a list of assembly lines).
+
+  Returns (name, lines, end): the function label, the lines from that label through
+  the first `s_endpgm` after it, and the index of that `s_endpgm` line in `text`.
+  Returns (None, None, None) when no function label or no terminating `s_endpgm` exists.
+  """
+  func_name_expr = re.compile(r'^([^\s^\.]\w.+):')
+  start = None
+  for index, line in enumerate(text):
+    match = func_name_expr.search(line)
+    if match is not None:
+      func_name, start = match[1], index
+      break
+  if start is None:
+    return None, None, None
+
+  # Search from the label onwards: an `s_endpgm` located before the label (e.g. left
+  # over from preceding text) would otherwise yield an empty, mis-indexed kernel.
+  for end in range(start, len(text)):
+    if 's_endpgm' in text[end]:
+      return func_name, text[start:end+1], end
+  return None, None, None
+
+
+def find_label(kernel):
+  # Returns (label, index) of the first line shaped like ".name:".  When nothing matches
+  # the label is None and index is that of the last line scanned (None for empty input).
+  pos = None
+  for pos, text in enumerate(kernel):
+    hit = re.match(r'\.(\w+):', text)
+    if hit:
+      return hit.group(1), pos
+  return None, pos
+
+
+def get_block_list(kernel):
+  """Split the lines of one kernel into an ordered mapping label -> Block.
+
+  Code ahead of the first label goes into a synthetic Begin block and an empty
+  End block is always appended, so every block has a successor to fall into.
+  """
+  labels = [(i, m[1]) for i, m in
+            ((i, re.search(r'^\.(\w+):', line)) for i, line in enumerate(kernel)) if m]
+  first = labels[0][0] if labels else len(kernel)
+
+  blocks = OrderedDict()
+  if first > 0:
+    # Everything up to (not including) the first label, i.e. kernel[:first].  The
+    # previous slice kernel[:index-1] also lost the line right before that label.
+    blocks[begin_label] = Block(begin_label, kernel[:first])
+  bounds = [i for i, _ in labels[1:]] + [len(kernel)]
+  for (start, label), stop in zip(labels, bounds):
+    # Each block owns the lines between its label and the next label (or the end);
+    # the last block used to be cut short by a stale index (kernel[index:]).
+    blocks[label] = Block(label, kernel[start+1:stop])
+  blocks[end_label] = Block(end_label, [])
+  return blocks
+
+
+def find_terminators(code):
+  """Collect the labels that the lines of one block can transfer control to.
+
+  Returns (labels, ends_unconditionally).  Scanning stops at the first `s_endpgm`
+  or unconditional branch, because nothing after such a line can be reached; the
+  flag is False when control may fall through to the next block.
+  """
+  terminator_labels = []
+  for line in code:
+    # Group 1 is 'c' only for s_cbranch_*; group 2 is the target without the leading
+    # dot.  `\S*`/`\w+` keep a trailing comment from being taken for the target.
+    branch = re.search(r'(c)?branch\S*\s+\.?(\w+)', line)
+    if branch is not None:
+      terminator_labels.append(branch[2])
+      # groups() always has two entries, so test the captured value, not the count.
+      if branch[1] is None:
+        return terminator_labels, True
+    if 's_endpgm' in line:
+      terminator_labels.append(end_label)
+      return terminator_labels, True
+
+  return terminator_labels, False
+
+
+def add_edges(kernel):
+  """Fill `edges` of every block of `kernel` except the synthetic End block.
+
+  Successors are the branch targets found in the block plus, unless the block ends
+  in an unconditional transfer, the block that follows it in layout order.
+  """
+  keys = list(kernel.blocks.keys())
+  for index, curr_label in enumerate(keys):
+    if curr_label == end_label:
+      continue
+
+    # Scan the whole block at once: an empty block no longer raises IndexError, and
+    # an unconditional jump followed by e.g. `.p2align` no longer aborts the loop
+    # (the old `break` left every later block without edges).
+    code = kernel.blocks[curr_label].code
+    terminators, is_unconditional = find_terminators(code)
+
+    if not is_unconditional and index + 1 < len(keys):
+      # Fall-through edge, added once even if a branch already targets that block.
+      next_block_label = keys[index + 1]
+      if next_block_label not in terminators:
+        terminators.append(next_block_label)
+
+    # End is appended last by get_block_list, so a successor always exists here.
+    assert terminators, f'block {curr_label} has no successors'
+    kernel.blocks[curr_label].edges = terminators
+
+
+def generate_cfg(kernel, options):
+  """Build a graphviz Digraph for `kernel`: one rectangular node per block, one edge per successor.
+
+  With `options.verbose` a node shows its label followed by the stripped assembly
+  lines of the block; otherwise it shows the label only.
+  """
+  cfg = graphviz.Digraph(f'{kernel.name}')
+  node_style = dict(shape='rect', labeljust='l', margin='0.01')
+
+  # Pass 1: declare all nodes.
+  for name, blk in kernel.blocks.items():
+    caption = name
+    if options.verbose:
+      caption = repr('\n'.join([f'{name}'] + [text.strip() for text in blk.code]))
+    cfg.node(name, label=caption, **node_style)
+
+  # Pass 2: connect each block with its successors.
+  for name, blk in kernel.blocks.items():
+    for successor in blk.edges:
+      cfg.edge(name, successor)
+  return cfg
+
+
+def main(options):
+  """Write one CFG file per kernel found in `options.input_file`.
+
+  Output names are `<options.output_file>.kernel-<n>`; `dot` output is written as
+  text, any other format is rendered through graphviz.
+  """
+  with open(options.input_file, 'r') as file:
+    # rstrip('\n') instead of line[:-1]: a final line without a trailing newline
+    # must not lose its last character.
+    asm = [line.rstrip('\n') for line in file]
+
+  kernels = []
+  while True:
+    func_name, kernel_asm, last_end_index = find_kernel(asm)
+    if kernel_asm is None:
+      break
+
+    kernel = Kernel(func_name, get_block_list(kernel_asm))
+    add_edges(kernel)
+    kernel.cfg = generate_cfg(kernel, options)
+    kernels.append(kernel)
+    # Continue the search right after the `s_endpgm` of the kernel just handled.
+    asm = asm[last_end_index+1:]
+
+  # Emit the files once, after all kernels are known.  This loop used to sit inside
+  # the `while` above and rewrote every earlier kernel's file on each iteration.
+  for index, kernel in enumerate(kernels):
+    output_file_name = f'{options.output_file}.kernel-{index}'
+    if options.format == 'dot':
+      with open(f'{output_file_name}.dot', 'w') as file:
+        file.write(f'{kernel.cfg}\n')
+    else:
+      kernel.cfg.render(filename=output_file_name, format=options.format)
+
+
+if __name__ == "__main__":
+  # The sentence belongs in `description`; `prog` is the program name shown in the usage line.
+  parser = argparse.ArgumentParser(
+    description="Generates Control Flow Graph (CFG) from amdgcn assembly file",
+  )
+  # -i and -o are mandatory: omitting them used to fail with a TypeError inside Options.
+  parser.add_argument("-i", "--input", type=str, required=True, help="input file")
+  parser.add_argument("-o", "--output", type=str, required=True, help="output file prefix")
+  parser.add_argument("-v", "--verbose", action='store_true', help='verbose output')
+  parser.add_argument("-f", "--format", choices=['dot', 'svg', 'pdf'], default="dot",
+                      help="output format type")
+  args = parser.parse_args()
+
+  options = Options(args.input, args.output, args.verbose, args.format)
+  main(options)
diff --git a/python/perf-kernels/tools/occ.sh b/python/perf-kernels/tools/occ.sh
new file mode 100755
index 000000000000..51c8f9095907
--- /dev/null
+++ b/python/perf-kernels/tools/occ.sh
@@ -0,0 +1,71 @@
+#! /bin/bash
+
+## $1: input script that contains one kernel
+
+rm -rf ~/.triton/cache/  ## presumably forces a fresh compile so the dumps are produced -- confirm
+
+export MLIR_ENABLE_DUMP=1  ## exported so that the command run as $1 inherits it
+export AMDGCN_ENABLE_DUMP=1  ## exported so that the command run as $1 inherits it
+## Assume CDNA arch
+SIMD=4  ## factor between per-SIMD (EU) and per-CU occupancy in the formulas of this script
+LDS_SIZE=65536  ## dividend of the LDS-limited occupancy (presumably bytes of LDS per CU -- confirm)
+TOTAL_VGPR=512  ## presumably the VGPR budget per SIMD lane of the assumed arch -- confirm
+
+get_occ_per_CU() {
+    ## $1: vgpr count (empty counts as 0). Reads globals SIMD and num_warps, prints workgroups per CU.
+    ## The old TOTAL_VGPR/vgpr division was dead (always overwritten) and failed for an empty/zero count.
+    local vgpr=${1:-0}
+    if [[ $vgpr -gt 256 ]]; then
+        occPerEU=1
+    elif [[ $vgpr -gt 168 ]]; then
+        occPerEU=2
+    elif [[ $vgpr -gt 128 ]]; then
+        occPerEU=3
+    elif [[ $vgpr -gt 96 ]]; then
+        occPerEU=4
+    elif [[ $vgpr -gt 80 ]]; then
+        occPerEU=5
+    elif [[ $vgpr -gt 72 ]]; then
+        occPerEU=6
+    elif [[ $vgpr -gt 64 ]]; then
+        occPerEU=7
+    else
+        occPerEU=8
+    fi
+
+    occPerCU=$((occPerEU*SIMD/num_warps))
+    echo $occPerCU
+}
+
+$1 > output.mlir 2>&1
+
+LDS_line=$(sed -n '/triton_gpu\.shared\ /p' output.mlir | tail -n 1 | grep -o 'triton_gpu.shared = [0-9]*')
+numWarps_line=$(sed -n '/triton_gpu\.num-warps/p' output.mlir | tail -n 1 | grep -o 'triton_gpu.num-warps. = [0-9]*')
+
+LDS=${LDS_line##*=}
+num_warps=${numWarps_line##*=}
+echo "LDS: $LDS, num_warps: $num_warps"
+
+VGPRs=$(sed -n '/vgpr_count/p' output.mlir | tail -n 1 | awk '{print $2}')
+SPILLs=$(sed -n '/vgpr_spill/p' output.mlir | tail -n 1 | awk '{print $2}')
+
+echo "VGPRS: $VGPRs (spill: $SPILLs)"
+
+## A kernel that uses no LDS reports 0 bytes: dividing by it aborts the arithmetic,
+## so such a kernel is treated as not limited by LDS (occLDSPerCU stays at LDS_SIZE).
+occLDSPerCU=$LDS_SIZE
+[[ ${LDS:-0} -gt 0 ]] && occLDSPerCU=$((LDS_SIZE/LDS))
+occVgprPerCU=$(get_occ_per_CU $VGPRs)
+occPerCU=$(( occLDSPerCU < occVgprPerCU ? occLDSPerCU : occVgprPerCU ))
+occPerEU=$((occPerCU*num_warps/SIMD))
+echo "occupancy: $occPerEU waves/SIMD or $occPerCU workgroups/CU (occLDSPerCU: $occLDSPerCU, occVgprPerCU: $occVgprPerCU)"
+
+perf=$(tail -n 2 output.mlir)
+echo "$perf"
+
+## remove distracting info from the assembly
+sed -i '/local_/! {/\.loc/d}' output.mlir
+sed -i '/\.Ltmp.*:/d' output.mlir
+sed -i '/AMD clang version/d' output.mlir
+
+sed -n '/AMDGCN/, $p' output.mlir > output.amdgcn
diff --git a/python/perf-kernels/tools/plot-layout/README.md b/python/perf-kernels/tools/plot-layout/README.md
new file mode 100644
index 000000000000..e12cf9441d37
--- /dev/null
+++ b/python/perf-kernels/tools/plot-layout/README.md
@@ -0,0 +1,117 @@
+# Plot script for triton layouts
+
+This script is used to draw triton layouts in the context of matmul.
+Here is the help info from the script.
+
+```bash
+>$ python3 plot_layout.py -h
+usage: Draw triton layouts [-h] [-shape SHAPE SHAPE SHAPE] [-plot {blocked,dot,wmma,lds}] [-nonKDim {16,32}] [-sizePerThread SIZEPERTHREAD SIZEPERTHREAD] [-threadsPerWarp THREADSPERWARP THREADSPERWARP]
+                           [-warpsPerCTA WARPSPERCTA WARPSPERCTA] [-order ORDER ORDER] [-kWidth {4,8,16}] [-lds_layout {swizzle,padding,none}] [-lds_access {read,write,none}] [-wave_size {32,64}] [-o O] [-mfmaTrans] [-keep]
+
+options:
+  -h, --help            show this help message and exit
+  -shape SHAPE SHAPE SHAPE
+                        Tensor shape in the form of M,N,K
+  -plot {blocked,dot,wmma,lds}
+                        choose plot mode
+  -nonKDim {16,32}      mfma instruction dim
+  -sizePerThread SIZEPERTHREAD SIZEPERTHREAD
+  -threadsPerWarp THREADSPERWARP THREADSPERWARP
+  -warpsPerCTA WARPSPERCTA WARPSPERCTA
+  -order ORDER ORDER
+  -kWidth {4,8,16}      number of elements per thread
+  -lds_layout {swizzle,padding,none}
+                        choose the LDS data layout
+  -lds_access {read,write,none}
+                        choose LDS access mode
+  -wave_size {32,64}    choose the wmma instruction mode
+  -o O                  output pdf file name (without surfix)
+  -mfmaTrans            If set, then use mfma.trans layout
+  -keep                 If set, keep the generated .tex file
+```
+
+## Installation
+This script does not require torch or triton to be installed. The only package 
+it depends on is latex. On Ubuntu, do
+```bash
+sudo apt install texlive-full
+```
+
+## Draw blocked layout (`-plot blocked`)
+
+Examples:
+```bash
+python3 plot_layout.py -plot blocked -shape 128 128 64 -sizePerThread 1 8 -threadsPerWarp 8 8 -warpsPerCTA 4 1
+python3 plot_layout.py -plot blocked -shape 16 128 64 -sizePerThread 1 8 -threadsPerWarp 16 4 -warpsPerCTA 1 2
+python3 plot_layout.py -plot blocked -shape 32 128 64 -sizePerThread 8 1 -threadsPerWarp 4 16 -warpsPerCTA 1 2 -order 0 1
+```
+
+Blocked layouts are used during global load. They are used to describe the layout of the tensor
+for pointers and results.
+We can provide tensor shape (`-shape M N K`) and blocked layout parameters (
+`-sizePerThread x y`, `-threadsPerWarp x y`, and `-warpsPerCTA x y`).
+We can also provide the order of the tensor as `-order x y` to control which dim
+is the fastest changing dimension.
+
+Notes
+- All of the gemm dims (M, N, and K) are needed when providing the shape. But only
+  M and K will be used to plot the layout of the tensor.
+- The script does not support the case when threads are loading elements that are
+  out of the boundary of the tensor dimensions. This means
+  - For M: sizePerThread[0] * threadsPerWarp[0] * warpsPerCTA[0] <= M
+  - For K: sizePerThread[1] * threadsPerWarp[1] * warpsPerCTA[1] <= K
+  
+  
+## Draw mfma operand and result layouts (`-plot dot`)
+
+Examples:
+```bash
+python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 32 -kWidth 4
+python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 32 -kWidth 8
+python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 32 -kWidth 8 -mfmaTrans
+python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 16 -kWidth 8
+python3 plot_layout.py -plot dot -shape 128 128 64 -warpsPerCTA 2 4 -nonKDim 16 -kWidth 16
+```
+
+This mode draws two graphs:
+1. The layout of the whole tile for tile A, B, and C
+2. The layout of a single mfma block, operands and results of one or more mfma
+   instructions that share the same accumulating VGPRs.
+   This view has thread distributions among tensor elements.
+   
+Knobs
+- `-kWidth`: the number of elements that will be loaded into one thread at once
+- `-nonKDim`: 16 or 32, which is used to control the mfma instruction size
+- `-mfmaTrans`: if set, the transposed mfma layout will be plotted.
+
+Notes
+- The layout shows the mapping from the threads/wave to the elements in the 
+  original tensor. It does not care if the elements are arranged in LDS, like 
+  swizzling to avoid bank conflicts.
+- The script does not allow settings for data type or k dim of the mfma instruction.
+  This can be controlled by the `-kWidth` flag.
+  - For example, if we want `mfma_32x32x8xf16`, we can set `-nonKDim 32` and `-kWidth 4`.
+  - If we want `mfma_32x32x16xf8`, we can set `-nonKDim 32` and `-kWidth 8`.
+
+
+## Draw LDS access (`-plot lds`)
+
+Examples:
+```bash
+python3 plot_layout.py -plot lds -lds_layout none -lds_access none -shape 128 128 64 -kWidth 8
+```
+
+Knobs
+- `kWidth` here means the vector size when accessing LDS
+- Three options for `-lds_layout`:
+  - `none`: no swizzling, no padding
+  - `padding`: padding at every 128B
+  - `swizzle`: apply the swizzling pattern, which is derived from tensor shape and kWidth.
+- Three options for `-lds_access`:
+  - `none`: do not plot access pattern
+  - `read`: plot accessed elements during ds_read
+  - `write`: plot accessed elements during ds_write. Note that this needs some information from
+    global load. Therefore, we need to provide `-sizePerThread` and `-threadsPerWarp`.
+
+Notes
+- This mode is rarely used. If you have any questions, please contact Lixun Zhang directly.
diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py
new file mode 100755
index 000000000000..c2387905f3e0
--- /dev/null
+++ b/python/perf-kernels/tools/plot-layout/plot_layout.py
@@ -0,0 +1,341 @@
+import argparse
+import sys
+import yaml
+import os
+import glob
+import subprocess
+
+
+def draw_preamble_cmd():  # Returns the fixed LaTeX preamble: document class, TikZ libraries and an expl3 \bitwiseXor command.
+    return '''\\documentclass[tikz, border=1mm, dvipsnames]{standalone}
+\\usepackage{ifthen}
+\\usepackage{tikz}
+\\usetikzlibrary{arrows.meta,arrows}
+\\usetikzlibrary{intersections}
+\\usetikzlibrary{calc, quotes}
+\\usetikzlibrary{patterns}
+\\usepackage{xparse}
+
+\\ExplSyntaxOn
+\\NewExpandableDocumentCommand{\\bitwiseXor}{mm}
+ {
+  \\recuenco_bitwise_xor:nn { #1 } { #2 }
+ }
+
+\\cs_new:Nn \\recuenco_bitwise_xor:nn
+ {
+  \\int_from_bin:e
+   {
+    \\__recuenco_bitwise_xor:ee { \\int_to_bin:n { #1 } } { \\int_to_bin:n { #2 } }
+   }
+ }
+\\cs_generate_variant:Nn \\int_from_bin:n { e }
+
+\\cs_new:Nn \\__recuenco_bitwise_xor:nn
+ {
+  \\__recuenco_bitwise_xor_binary:ee
+   {
+    \\prg_replicate:nn
+     {
+      \\int_max:nn { \\tl_count:n { #1 } } { \\tl_count:n { #2 } } - \\tl_count:n { #1 }
+     }
+     { 0 }
+     #1
+   }
+   {
+    \\prg_replicate:nn
+     {
+      \\int_max:nn { \\tl_count:n { #1 } } { \\tl_count:n { #2 } } - \\tl_count:n { #2 }
+     }
+     { 0 }
+     #2
+   }
+ }
+\\cs_generate_variant:Nn \\__recuenco_bitwise_xor:nn { ee }
+
+\\cs_new:Nn \\__recuenco_bitwise_xor_binary:nn
+ {
+  \\__recuenco_bitwise_xor_binary:w #1;#2;
+ }
+\\cs_generate_variant:Nn \\__recuenco_bitwise_xor_binary:nn { ee }
+
+\\cs_new:Npn \\__recuenco_bitwise_xor_binary:w #1#2;#3#4;
+ {
+  \\int_abs:n { #1-#3 }
+  \\tl_if_empty:nF { #2 } { \\__recuenco_bitwise_xor_binary:w #2;#4; }
+ }
+
+\\ExplSyntaxOff'''
+
+
+def draw_dot_layout_cmd(M, N, K, mfmaNonKDim, warpsPerCTA, trans, kpack):  # Returns the LaTeX document body for the dot plot; every backslash is doubled so the f-string holds no invalid escape (`\elem` used to be one).
+    return f'''\\begin{{document}}
+  \\begin{{tikzpicture}}
+    \\def\\scale{{1}}
+    \\def\\elem{{0.04}}
+    \\coordinate (C TL) at (0,0);
+    \\def\\opColorAL{{magenta}}
+    \\def\\opColorAR{{cyan}}
+    \\def\\opColorBL{{Maroon}}
+    \\def\\opColorBR{{BlueGreen}}
+    \\drawDot{{{M}}}{{{N}}}{{{K}}}{{{mfmaNonKDim}}}{{{warpsPerCTA[0]}}}{{{warpsPerCTA[1]}}}{{{trans}}}{{{kpack}}}
+
+    \\coordinate (C TL) at ($(C TL)+({N}*\\elem+32*\\elem, 0)$);
+    \\def\\mfmaTrans{{{trans}}}
+
+    %% Draw zoomed in view of mfma
+    \\def\\elem{{.16}}
+    \\pgfmathsetmacro{{\\gap}}{{\\elem*5}}
+    \\pgfmathsetmacro{{\\nonTrans}}{{1-\\mfmaTrans}}
+    \\pgfmathsetmacro{{\\groups}}{{64/{mfmaNonKDim}}}
+    \\coordinate (C TL) at ($(C TL)+(.5*\\gap+1.2*\\nonTrans*\\gap+\\groups*{kpack}*\\elem, 0)$);
+    \\drawMFMAInstr{{{mfmaNonKDim}}}{{{kpack}}}{{\\mfmaTrans}}
+
+  \\end{{tikzpicture}}
+\\end{{document}}'''
+
+
+def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA,  # Returns the LaTeX document body that calls \drawBlockedTensor with eight arguments.
+                            order):  # NOTE(review): threadsPerWarp[1] and order[1] are not forwarded -- confirm \drawBlockedTensor derives them.
+    return f'''\\begin{{document}}
+  \\begin{{tikzpicture}}
+    \\def\\scale{{1}}
+    \\def\\elem{{0.06}}
+    \\coordinate (TL) at (0,0);
+    \\drawBlockedTensor{{{M}}}{{{K}}}{{{sizePerThread[0]}}}{{{sizePerThread[1]}}}{{{threadsPerWarp[0]}}}{{{warpsPerCTA[0]}}}{{{warpsPerCTA[1]}}}{{{order[0]}}}
+  \\end{{tikzpicture}}
+\\end{{document}}'''
+
+
+def draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread,
+                        threadsPerWarp):
+    """Return the LaTeX document body for the LDS plot of an M x K tensor.
+
+    The two string options are turned into the integer switches that are
+    handed to \\drawLDSLayoutTritonSwizzling:
+      ldsLayout: 'swizzle' -> 1, 'padding' -> 2, anything else -> 0
+      ldsAccess: 'read' -> 1, 'write' -> 2, anything else -> 0
+    kpack is emitted as the TikZ macro \\vec; sizePerThread[0], sizePerThread[1]
+    and threadsPerWarp[1] are emitted as \\sizePerThreadM, \\sizePerThreadK and
+    \\threadsPerWarpK.
+    """
+    hasSwizzle = 1 if ldsLayout == 'swizzle' else (2 if ldsLayout == 'padding' else 0)
+    accessMode = 1 if ldsAccess == 'read' else (2 if ldsAccess == 'write' else 0)
+
+    return f'''\\begin{{document}}
+  \\begin{{tikzpicture}}
+    \\def\\scale{{1}}
+    \\def\\M{{{M}}}
+    \\def\\K{{{K}}}
+    \\def\\vec{{{kpack}}}
+    \\def\\hasSwizzle{{{hasSwizzle}}}
+    \\def\\accessMode{{{accessMode}}}
+
+    \\def\\sizePerThreadK{{{sizePerThread[1]}}}
+    \\def\\sizePerThreadM{{{sizePerThread[0]}}}
+    \\def\\threadsPerWarpK{{{threadsPerWarp[1]}}}
+
+    \\def\\elem{{0.18}}
+    \\coordinate (TL) at (0,0);
+    \\drawTensorLayoutGlobalMem
+    \\coordinate (TL) at ($(TL)+(0, -24*\\elem-10*\\elem)$);
+    \\drawLDSLayoutTritonSwizzling{{\\hasSwizzle}}{{\\accessMode}}
+  \\end{{tikzpicture}}
+\\end{{document}}'''
+
+
+def draw_wmma_instr_cmd(waveSize):  # Returns the LaTeX document body that calls \drawWMMAInstr for the given wave size.
+    wmma_mode = int(waveSize != 32)  # 0 when waveSize is 32, 1 for any other value
+    return f'''\\begin{{document}}
+  \\begin{{tikzpicture}}
+    \\def\\scale{{1}}
+    \\coordinate (C TL) at (0,0);
+    \\def\\elem{{0.25}}
+    \\drawWMMAInstr{{{wmma_mode}}}{{1}}
+  \\end{{tikzpicture}}
+\\end{{document}}'''
+
+
+def run_bash_command(commandstring):
+    # Runs the string through /bin/bash; a non-zero exit status raises
+    # subprocess.CalledProcessError.  Returns captured stdout as a list of bytes lines.
+    completed = subprocess.run(commandstring, shell=True, check=True,
+                               executable='/bin/bash', stdout=subprocess.PIPE)
+    output = completed.stdout
+    return output.splitlines()
+
+
+def parse_args():  # Declares the command-line options of the plot script and returns the parsed namespace.
+    parser = argparse.ArgumentParser(
+        prog="Draw triton layouts",
+        allow_abbrev=False,
+    )
+    ## tensor shapes
+    parser.add_argument("-shape",
+                        type=int,
+                        nargs=3,
+                        default=(32, 128, 64),
+                        help='Tensor shape in the form of M,N,K')
+    parser.add_argument("-plot",
+                        type=str,
+                        default="blocked",
+                        choices=['blocked', 'dot', 'wmma', 'lds'],
+                        help='choose plot mode')
+    parser.add_argument(
+        "-nonKDim",
+        type=int,
+        default=32,
+        choices=[16, 32],
+        help='mfma instruction dim')
+    ## blocked layout parameters
+    parser.add_argument("-sizePerThread", type=int, nargs=2, default=(1, 4))
+    parser.add_argument("-threadsPerWarp", type=int, nargs=2, default=(16, 4))
+    parser.add_argument("-warpsPerCTA", type=int, nargs=2, default=(1, 4))
+    parser.add_argument("-order", type=int, nargs=2, default=(1, 0))
+    ## LDS access parameters
+    parser.add_argument("-kWidth",
+                        type=int,
+                        default=4,
+                        choices=[4, 8, 16],
+                        help='number of elements per thread')
+    parser.add_argument("-lds_layout",
+                        type=str,
+                        default="none",
+                        choices=['swizzle', 'padding', 'none'],
+                        help='choose the LDS data layout')
+    parser.add_argument("-lds_access",
+                        type=str,
+                        default="none",
+                        choices=['read', 'write', 'none'],
+                        help='choose LDS access mode')
+    ## wmma instruction layout parameter
+    parser.add_argument("-wave_size",
+                        type=int,
+                        default=32,
+                        choices=[32, 64],
+                        help='choose the wmma instruction mode')
+    ## output file name and on/off switches
+    parser.add_argument("-o",
+                        type=str,
+                        default="myplot",
+                        help='output pdf file name (without surfix)')
+    parser.add_argument("-mfmaTrans",
+                        action='store_true',
+                        default=False,
+                        help='If set, then use mfma.trans layout')
+    parser.add_argument("-keep",
+                        action='store_true',
+                        default=False,
+                        help='If set, keep the generated .tex file')
+    ## no argument list is passed, so argparse reads the process command line
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """Write myplot.tex for the selected plot mode, compile it with pdflatex and clean up."""
+    args = parse_args()
+    shape = args.shape
+    M = shape[0]
+    N = shape[1]
+    K = shape[2]
+    plot_mode = args.plot
+    mfmaNonKDim = args.nonKDim
+    kpack = args.kWidth
+    trans = 1 if args.mfmaTrans else 0
+    ofilename = args.o
+    keepSrc = args.keep
+
+    ldsLayout = args.lds_layout
+    ldsAccess = args.lds_access
+
+    waveSize = args.wave_size
+
+    sizePerThread = args.sizePerThread
+    threadsPerWarp = args.threadsPerWarp
+    warpsPerCTA = args.warpsPerCTA
+    order = args.order
+
+    CTAShape = []
+    if plot_mode == 'blocked':
+        print(f"Plotting tensor M={M},K={K} with blocked layout:")
+        print(f"sizePerThread={sizePerThread}", end=" ")
+        print(f"threadsPerWarp={threadsPerWarp}", end=" ")
+        print(f"warpsPerCTA={warpsPerCTA}", end=" ")
+        print(f"order={order}", end=" ")
+        CTAShape.append(sizePerThread[0] * threadsPerWarp[0] * warpsPerCTA[0])
+        CTAShape.append(sizePerThread[1] * threadsPerWarp[1] * warpsPerCTA[1])
+
+    if plot_mode == 'dot':
+        mfma_inst_str = "mfma_32x32" if mfmaNonKDim == 32 else "mfma_16x16"
+        mfma_trans_str = ".trans" if trans else ""
+        print(f"Plotting dot operation with shapes M={M},N={N},K={K}")
+        print("MFMA: " + mfma_inst_str + mfma_trans_str + f" kWidth = {kpack}", end=" ")
+        print(f"warpsPerCTA={warpsPerCTA}", end=" ")
+        CTAShape.append(mfmaNonKDim * warpsPerCTA[0])
+        CTAShape.append(mfmaNonKDim * warpsPerCTA[1])
+
+    if plot_mode == 'blocked' or plot_mode == 'dot':
+        print(f"CTAShape={CTAShape}")
+        assert M != 0 and CTAShape[
+            0] <= M and M % CTAShape[0] == 0, "bad tensor dimension M"
+
+    if plot_mode == 'blocked':
+        assert K != 0 and CTAShape[
+            1] <= K and K % CTAShape[1] == 0, "bad tensor dimension K"
+
+    if plot_mode == 'dot':
+        assert N != 0 and CTAShape[
+            1] <= N and N % CTAShape[1] == 0, "bad tensor dimension N"
+        assert K != 0 and K % (2 * kpack) == 0, "bad tensor dimension K"
+
+    if plot_mode == 'lds':
+        print(f"Plotting LDS access for tensor M={M},K={K} with vec={kpack}")
+        if ldsAccess == 'write':
+            print(
+                f"sizePerThread={sizePerThread}, threadsPerWarp={threadsPerWarp}"
+            )
+    # tikzplot.tex ships next to this script; resolve it from there, not from the cwd.
+    tikz_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tikzplot.tex")
+    with open("myplot.tex", 'w') as f_plot:
+        with open(tikz_path) as file:
+            tikz_code = file.read()
+        preamble_str = draw_preamble_cmd()
+
+        draw_blockedLayout_str = draw_blocked_layout_cmd(
+            M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order)
+
+        draw_dotLayout_str = draw_dot_layout_cmd(M, N, K, mfmaNonKDim,
+                                                 warpsPerCTA, trans, kpack)
+
+        draw_lds_str = draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess,
+                                           sizePerThread, threadsPerWarp)
+
+        draw_wmma_str = draw_wmma_instr_cmd(waveSize)
+
+        f_plot.write(preamble_str + "\n")
+        f_plot.write(tikz_code)
+        if plot_mode == 'blocked':
+            f_plot.write(draw_blockedLayout_str)
+        elif plot_mode == 'dot':
+            f_plot.write(draw_dotLayout_str)
+        elif plot_mode == 'lds':
+            f_plot.write(draw_lds_str)
+        elif plot_mode == 'wmma':
+            f_plot.write(draw_wmma_str)
+
+    run_bash_command(f"pdflatex -jobname {ofilename} myplot.tex")
+    print(f"plot saved in {ofilename}.pdf")
+
+    ## Remove the auxiliary files that pdflatex leaves next to the pdf
+    os.remove(f"{ofilename}.aux")
+    os.remove(f"{ofilename}.log")
+    if not keepSrc:
+        os.remove("myplot.tex")
+        run_bash_command("rm -rf ./auto")
+
+
+if __name__ == '__main__':
+    sys.exit(main())  # main() has no return statement, so a run without exceptions exits with status 0
diff --git a/python/perf-kernels/tools/plot-layout/tikzplot.tex b/python/perf-kernels/tools/plot-layout/tikzplot.tex
new file mode 100755
index 000000000000..e6292f7002e9
--- /dev/null
+++ b/python/perf-kernels/tools/plot-layout/tikzplot.tex
@@ -0,0 +1,880 @@
+\newcommand{\drawBlockedWave}[5]{
+  %%
+  %% Draw the coverage of one wave with blocked layout, one rectangle per thread
+  %% Always iterates over 64 threads; thread 0 is red, others are blue shaded by \tidM
+  %% Wave TL: pre defined top-left coordinate of the wave
+  %% \elem: pre defined variable
+  %%
+  %% #1: sizePerThread[0] --> sizePerThreadM
+  %% #2: sizePerThread[1] --> sizePerThreadN
+  %% #3: threadsPerWarp[0] --> threadsPerWarpM
+  %% #4: threadsPerWarp[1] --> threadsPerWarpN
+  %% #5: fastest changing dim --> order (stored in \order, not used in this macro)
+
+  \pgfmathsetmacro{\sizePerThreadM}{#1}
+  \pgfmathsetmacro{\sizePerThreadN}{#2}
+  \pgfmathsetmacro{\threadsPerWarpM}{#3}
+  \pgfmathsetmacro{\threadsPerWarpN}{#4}
+  \pgfmathsetmacro{\order}{#5}
+
+  \pgfmathsetmacro{\waveSizeM}{\sizePerThreadM*\threadsPerWarpM}
+  \pgfmathsetmacro{\waveSizeN}{\sizePerThreadN*\threadsPerWarpN}
+
+  \foreach \tid in {0,...,63}{
+    \pgfmathsetmacro{\tidM}{int(\tid/\threadsPerWarpN)}
+    \pgfmathsetmacro{\tidN}{mod(\tid,\threadsPerWarpN)}
+    \coordinate (Thread TL) at ($(Wave TL)+(\tidN*\sizePerThreadN*\elem, -\tidM*\sizePerThreadM*\elem)$);
+    \pgfmathsetmacro{\ratio}{\tidM*10}
+    
+    \ifthenelse{\tid = 0}{
+      \draw [line width = 0.01mm, fill=red] (Thread TL)
+      rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
+    }{
+      \draw [line width = 0.01mm, fill=blue!\ratio!white] (Thread TL)
+      rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
+    }
+  }
+  \draw (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem);
+}
+
+\newcommand{\drawBlockedCTA}[7]{
+  %%
+  %% Draw a CTA coverage with blocked layout
+  %% Only the wave at CTA TL gets the per-thread view; every wave gets an outline and a label
+  %% CTA TL: pre defined top-left coordinate of the CTA
+  %% \elem: pre defined variable
+  %%
+  %% #1: sizePerThread[0] --> sizePerThreadM
+  %% #2: sizePerThread[1] --> sizePerThreadN
+  %% #3: threadsPerWarp[0] --> threadsPerWarpM
+  %% #4: threadsPerWarp[1] --> threadsPerWarpN
+  %% #5: warpsPerCTA[0] --> warpsPerCTAM
+  %% #6: warpsPerCTA[1] --> warpsPerCTAN
+  %% #7: fastest changing dim --> order (1: wave ids advance along N first; otherwise along M first, labels rotated 90)
+
+  \pgfmathsetmacro{\sizePerThreadM}{#1}
+  \pgfmathsetmacro{\sizePerThreadN}{#2}
+  \pgfmathsetmacro{\threadsPerWarpM}{#3}
+  \pgfmathsetmacro{\threadsPerWarpN}{#4}
+  \pgfmathsetmacro{\warpsPerCTAM}{#5}
+  \pgfmathsetmacro{\warpsPerCTAN}{#6}
+  \pgfmathsetmacro{\order}{#7}
+
+  \pgfmathsetmacro{\CTASizeM}{\sizePerThreadM*\threadsPerWarpM*\warpsPerCTAM}
+  \pgfmathsetmacro{\CTASizeN}{\sizePerThreadN*\threadsPerWarpN*\warpsPerCTAN}
+  \pgfmathsetmacro{\waveSizeM}{\sizePerThreadM*\threadsPerWarpM}
+  \pgfmathsetmacro{\waveSizeN}{\sizePerThreadN*\threadsPerWarpN}
+
+  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAM*\warpsPerCTAN-1}
+
+  \coordinate (Wave TL) at (CTA TL);
+  \drawBlockedWave{\sizePerThreadM}{\sizePerThreadN}{\threadsPerWarpM}{\threadsPerWarpN}{\order}
+  \foreach \waveId in {0,...,\maxWaveId}{
+    \ifthenelse{\order=1}
+    {
+      \pgfmathsetmacro{\waveCoordM}{int(\waveId/\warpsPerCTAN)}
+      \pgfmathsetmacro{\waveCoordN}{mod(\waveId,\warpsPerCTAN)}
+      \pgfmathsetmacro{\rot}{0}
+    }{
+      \pgfmathsetmacro{\waveCoordM}{mod(\waveId,\warpsPerCTAM)}
+      \pgfmathsetmacro{\waveCoordN}{int(\waveId/\warpsPerCTAM)}
+      \pgfmathsetmacro{\rot}{90}
+    }
+    
+    \coordinate (Wave TL) at ($(CTA TL)+(\waveCoordN*\waveSizeN*\elem, -\waveCoordM*\waveSizeM*\elem)$);
+    \draw [ultra thin] (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem)
+    node [pos=.5, scale=.6*\scale, inner sep=0, fill=white, rotate=\rot] {wave\waveId};
+  }
+  
+  \draw [thick] (CTA TL) rectangle ++(\CTASizeN*\elem, -\CTASizeM*\elem);
+}
+
+\newcommand{\drawBlockedTensor}[8]{
+  %%
+  %% Draw a tensor with blocked layout of the following parameters
+  %% sizePerThread[2]
+  %% threadsPerWarp[2]
+  %% warpsPerCTA[2]
+  %% order[2]
+  %% Also draws a zoomed-in view of thread 0's sizePerThread block above TL
+  %% TL: pre defined top-left coordinate of the tensor
+  %% \elem: pre defined variable
+  %%
+  %% #1: tensorShape[0] --> M
+  %% #2: tensorShape[1] --> N (labelled "K" in the drawing)
+  %% #3: sizePerThread[0] --> sizePerThreadM
+  %% #4: sizePerThread[1] --> sizePerThreadN
+  %% #5: threadsPerWarp[0] --> threadsPerWarpM
+  %%     Note that threadsPerWarp[1] is calculated by 64/threadsPerWarp[0]
+  %% #6: warpsPerCTA[0] --> warpsPerCTAM
+  %% #7: warpsPerCTA[1] --> warpsPerCTAN
+  %% #8: fastest changing dim --> order
+
+  \pgfmathsetmacro{\M}{#1}
+  \pgfmathsetmacro{\N}{#2}
+  \pgfmathsetmacro{\sizePerThreadM}{#3}
+  \pgfmathsetmacro{\sizePerThreadN}{#4}
+  \pgfmathsetmacro{\threadsPerWarpM}{#5}
+  \pgfmathsetmacro{\warpsPerCTAM}{#6}
+  \pgfmathsetmacro{\warpsPerCTAN}{#7}
+  \pgfmathsetmacro{\order}{#8}
+
+  \pgfmathsetmacro{\threadsPerWarpN}{64/\threadsPerWarpM}
+  \pgfmathsetmacro{\CTASizeM}{\sizePerThreadM*\threadsPerWarpM*\warpsPerCTAM}
+  \pgfmathsetmacro{\CTASizeN}{\sizePerThreadN*\threadsPerWarpN*\warpsPerCTAN}
+  \pgfmathsetmacro{\CTARepM}{\M/\CTASizeM}
+  \pgfmathsetmacro{\CTARepN}{\N/\CTASizeN}
+  \pgfmathsetmacro{\maxCTAId}{\CTARepM*\CTARepN-1}
+
+  \foreach \ctaId in {0,...,\maxCTAId}{
+    \pgfmathsetmacro{\ctaCoordM}{int(\ctaId/\CTARepN)}
+    \pgfmathsetmacro{\ctaCoordN}{mod(\ctaId,\CTARepN)}
+    \coordinate (CTA TL) at ($(TL)+(\ctaCoordN*\CTASizeN*\elem, -\ctaCoordM*\CTASizeM*\elem)$);
+    \drawBlockedCTA{\sizePerThreadM}{\sizePerThreadN}{\threadsPerWarpM}{\threadsPerWarpN}{\warpsPerCTAM}{\warpsPerCTAN}{\order}
+  }
+
+  \node [scale=.7*\scale, above, rotate=90] at ($(TL)+(0, -.5*\M*\elem)$) {M=\M};
+  \node [scale=.7*\scale, above] at ($(TL)+(.5*\N*\elem, 0)$) {K=\N};
+
+  \def\zoomR{1.5}
+  \coordinate (zoomin BL) at ($(TL)+(0, .3)$);
+
+  \foreach \hl in {0,...,\sizePerThreadM}{
+    \draw ($(zoomin BL)+(0, \hl*\elem*\zoomR)$) -- ++(\sizePerThreadN*\elem*\zoomR,0);
+  }
+  \foreach \vl in {0,...,\sizePerThreadN}{
+    \draw ($(zoomin BL)+(\vl*\elem*\zoomR, 0)$) -- ++(0, \sizePerThreadM*\elem*\zoomR);
+  }
+
+  \node [scale=.6*\scale, left] at ($(zoomin BL)+(0, .5*\sizePerThreadM*\elem*\zoomR)$) {$t_0$};
+  \node [scale=.6*\scale, right] at ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, .5*\sizePerThreadM*\elem*\zoomR)$) {\sizePerThreadM$\times$\sizePerThreadN};
+  
+  \draw [densely dotted] (TL) -- (zoomin BL);
+  \draw [densely dotted] ($(TL)+(\sizePerThreadN*\elem, 0)$) -- ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, 0)$);
+  \draw [fill=red] (TL) rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
+}
+
+\newcommand{\drawBlockMFMALayoutLarge}[3]{
+  %%
+  %% Draw a single block of MFMA_32x32x8xf16 or MFMA_16x16x16xf16
+  %% Thread groups are colored with \Colors entries 4, 5, ... (\Colors is defined elsewhere in this file)
+  %% block TL: pre-defined top-left coordinate of the block
+  %% \elem: pre defined variable
+  %%
+  %% #1: 1 for mfma.trans, 0 for normal mfma
+  %% #2: mfmaNonKDim
+  %% #3: verbose. 1 means draw tid in each vec; 0 means draw nothing
+  
+  \pgfmathsetmacro{\trans}{#1}
+  \pgfmathsetmacro{\nonTrans}{1-#1}
+  \pgfmathsetmacro{\nonKDim}{#2}  
+  \pgfmathsetmacro{\maxTID}{\nonKDim-1}  
+  \pgfmathsetmacro{\groups}{64/\nonKDim}  
+  \pgfmathsetmacro{\maxGID}{\groups-1}  
+  \pgfmathsetmacro{\maxIVec}{\nonKDim*\nonKDim/256-1}  
+  \pgfmathsetmacro{\verbose}{#3}
+  \foreach \iVec in {0,...,\maxIVec} {
+    \coordinate (wave TL) at ($(block TL)+(\trans*\iVec*\groups*4*\elem, -\nonTrans*\iVec*\groups*4*\elem)$);
+    \foreach \tg in {0,...,\maxGID}{
+      \pgfmathsetmacro{\colID}{\tg+4}  
+      \pgfmathsetmacro{\col}{\Colors[\colID]}
+      \foreach \tid in {0,...,\maxTID} {
+        \pgfmathsetmacro{\ratio}{\tid*2.5*\groups+15}
+        \ifthenelse{\verbose=0}{
+          \draw [line width=0.005mm, fill=\col!\ratio!white]
+          ($(wave TL)+(\nonTrans*\tid*\elem+\tg*\trans*4*\elem, -\trans*\tid*\elem-\tg*\nonTrans*4*\elem)$)
+          rectangle ++(\nonTrans*\elem+\trans*4*\elem, -\nonTrans*4*\elem-\trans*\elem);
+        }{
+          \pgfmathsetmacro{\drawTid}{int(\tid+\tg*\nonKDim)}
+          \draw [line width=0.005mm, fill=\col!\ratio!white]
+          ($(wave TL)+(\nonTrans*\tid*\elem+\tg*\trans*4*\elem, -\trans*\tid*\elem-\tg*\nonTrans*4*\elem)$)
+          rectangle ++(\nonTrans*\elem+\trans*4*\elem, -\nonTrans*4*\elem-\trans*\elem)
+          node [pos=.5, scale=.35*\scale, rotate=90*\nonTrans] {t\drawTid};
+        }
+      }
+    }
+  }
+  \draw [thick] (block TL) rectangle ++(\nonKDim*\elem, -\nonKDim*\elem);
+}
+
+
+\newcommand{\drawTensorMFMALayout}[6]{
+  %%
+  %% Draw a tensor with mfma layout.
+  %% The coordinate TL is temporarily pointed at C TL and restored before returning.
+  %% C TL: pre defined top-left coordinates of the tensor
+  %%
+  %% #1: M
+  %% #2: N
+  %% #3: MFMA nonKDim
+  %% #4: warpsPerCTA[0]
+  %% #5: warpsPerCTA[1]
+  %% #6: 1 for mfma.trans, 0 for normal mfma
+
+  \pgfmathsetmacro{\tensorShapeH}{#1}
+  \pgfmathsetmacro{\tensorShapeW}{#2}
+  \pgfmathsetmacro{\mfmaNonKDim}{#3}
+  \pgfmathsetmacro{\warpsPerCTAH}{#4}
+  \pgfmathsetmacro{\warpsPerCTAW}{#5}
+  \pgfmathsetmacro{\mfmaTrans}{#6}
+
+  \coordinate (old TL) at (TL);
+  \coordinate (TL) at (C TL);
+
+
+  \pgfmathsetmacro{\CTARepH}{\tensorShapeH/\mfmaNonKDim/\warpsPerCTAH}
+  \pgfmathsetmacro{\CTARepW}{\tensorShapeW/\mfmaNonKDim/\warpsPerCTAW}
+  \pgfmathsetmacro{\maxCTAId}{\CTARepH*\CTARepW-1}
+  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAH*\warpsPerCTAW-1}
+  \pgfmathsetmacro{\CTASizeH}{\warpsPerCTAH*\mfmaNonKDim}
+  \pgfmathsetmacro{\CTASizeW}{\warpsPerCTAW*\mfmaNonKDim}
+  
+
+  \foreach \ctaId in {0,...,\maxCTAId}{
+    \pgfmathsetmacro{\ctaCoordH}{int(\ctaId/\CTARepW)}
+    \pgfmathsetmacro{\ctaCoordW}{mod(\ctaId,\CTARepW)}
+    \coordinate (CTA TL) at ($(TL)+(\ctaCoordW*\CTASizeW*\elem, -\ctaCoordH*\CTASizeH*\elem)$);
+    %% Draw a detailed view of wave0 in each CTA
+    \coordinate (block TL) at (CTA TL);
+    \drawBlockMFMALayoutLarge{\mfmaTrans}{\mfmaNonKDim}{0}
+    
+    \foreach \waveId in {0,...,\maxWaveId}{
+      \pgfmathsetmacro{\waveCoordH}{int(\waveId/\warpsPerCTAW)}
+      \pgfmathsetmacro{\waveCoordW}{mod(\waveId,\warpsPerCTAW)}
+      \coordinate (block TL) at ($(CTA TL)+(\waveCoordW*\mfmaNonKDim*\elem, -\waveCoordH*\mfmaNonKDim*\elem)$);
+      %% Inside the loop, only draw a rectangle
+      \draw [ultra thin] (block TL) rectangle ++(\mfmaNonKDim*\elem, -\mfmaNonKDim*\elem)
+      node [scale=.7*\mfmaNonKDim/32*\scale, pos=.5, fill=white, inner sep=0] {wave\waveId};
+    }
+    
+    %% Draw the outline of each CTA rep
+    \draw [ultra thick] (CTA TL) rectangle ++(\CTASizeW*\elem, -\CTASizeH*\elem);
+  }
+
+  \coordinate (TL) at (old TL);
+}
+
+\newcommand{\drawMFMAOperand}[4]{
+  %%
+  %% Draw one mfma operand
+  %% Each thread's kpack-element vector is one rectangle, filled per thread group from \Colors
+  %% mfma op TL: pre defined coordinates of the top-left
+  %% \elem: pre defined variable
+  %%
+  %% #1: mfmaNonKDim
+  %% #2: kpack
+  %% #3: 0 for opA and 1 for opB
+  %% #4: verbose. 1 means draw tid in each vec; 0 means draw nothing
+
+  \pgfmathsetmacro{\nonKDim}{#1}
+  \pgfmathsetmacro{\maxGID}{64/\nonKDim-1}
+  \pgfmathsetmacro{\maxTID}{\nonKDim-1}
+  \pgfmathsetmacro{\kpack}{#2}
+  \pgfmathsetmacro{\opIdxA}{#3}
+  \pgfmathsetmacro{\opIdxB}{1-\opIdxA}
+  \pgfmathsetmacro{\verbose}{#4}
+
+  \foreach \col/\tg in {0,...,\maxGID}{
+    \pgfmathsetmacro{\col}{\Colors[\tg]}
+    \foreach \tid in {0,...,\maxTID} {
+      % \pgfmathsetmacro{\ratio}{\tid*2.5+15}
+      \ifthenelse{\verbose=0}{
+        \draw [line width=0.005mm, fill=\col]
+        ($(mfma op TL)+(\tg*\kpack*\elem*\opIdxB+\tid*\elem*\opIdxA, -\tid*\elem*\opIdxB-\tg*\kpack*\elem*\opIdxA)$)
+        rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA);
+      }{
+        \pgfmathsetmacro{\drawTid}{int(\tid+\tg*\nonKDim)}
+        \draw [line width=0.005mm, fill=\col]
+        ($(mfma op TL)+(\tg*\kpack*\elem*\opIdxB+\tid*\elem*\opIdxA, -\tid*\elem*\opIdxB-\tg*\kpack*\elem*\opIdxA)$)
+        rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA)
+        node [pos=.5, scale=.35*\scale, rotate=90*\opIdxA] {t\drawTid};
+      }
+    }   
+  }
+}
+
+\newcommand{\drawWaveOperand}[4]{
+  %%
+  %% Draw the part of the tensor that is one operand of the wave
+  %% Repeats \drawMFMAOperand K/kpack/groups times along K (groups = 64/nonKDim); overwrites coordinate TL
+  %% Op TL: pre defined coordinates of the top-left of the operand
+  %% \elem: pre defined variable
+  %%
+  %% #1: K
+  %% #2: mfmaNonKDim
+  %% #3: kpack
+  %% #4: 0 for opA and 1 for opB
+
+  \pgfmathsetmacro{\K}{#1}
+  \pgfmathsetmacro{\nonKDim}{#2}
+  \pgfmathsetmacro{\groups}{64/\nonKDim}
+  \pgfmathsetmacro{\kpack}{#3}
+  \pgfmathsetmacro{\opIdx}{#4}
+  \pgfmathsetmacro{\opIdxOther}{1-\opIdx}
+  
+  \coordinate (TL) at (Op TL);
+  
+  \pgfmathsetmacro{\numKRep}{\K/\kpack/\groups}
+  \pgfmathsetmacro{\maxKRepId}{\numKRep-1}
+
+  \foreach \repId in {0,...,\maxKRepId}{
+    \coordinate (mfma op TL) at ($(TL)+(\repId*\groups*\kpack*\elem*\opIdxOther, -\repId*\groups*\kpack*\elem*\opIdx)$);
+    \drawMFMAOperand{\nonKDim}{\kpack}{\opIdx}{0}
+    \draw [thick] (mfma op TL) rectangle
+    ++(\groups*\kpack*\elem*\opIdxOther+\nonKDim*\opIdx*\elem, -\nonKDim*\opIdxOther*\elem-\groups*\kpack*\elem*\opIdx);
+  }
+}
+
+\newcommand{\drawDotOperands}[7]{
+  %%
+  %% Draw operand tensors of dot
+  %% A is drawn K wide and M tall at A TL; B is drawn N wide and K tall at B TL
+  %% A TL and B TL: pre defined top-left coordinates of A and B tensor
+  %% \elem: pre defined variable
+  %%
+  %% #1: M
+  %% #2: N
+  %% #3: K
+  %% #4: MFMA nonKDim
+  %% #5: warpsPerCTA[0]
+  %% #6: warpsPerCTA[1]
+  %% #7: kpack
+
+  \pgfmathsetmacro{\M}{#1}
+  \pgfmathsetmacro{\N}{#2}
+  \pgfmathsetmacro{\K}{#3}
+  \pgfmathsetmacro{\mfmaNonKDim}{#4}
+  \pgfmathsetmacro{\warpsPerCTAM}{#5}
+  \pgfmathsetmacro{\warpsPerCTAN}{#6}
+  \pgfmathsetmacro{\kpack}{#7}
+
+  %% operand A
+  \pgfmathsetmacro{\CTARepM}{\M/\warpsPerCTAM/\mfmaNonKDim}
+  \pgfmathsetmacro{\maxCTAIdM}{\CTARepM-1}
+  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAM-1}
+  \foreach \ctaId in {0,...,\maxCTAIdM}{
+    \coordinate (CTA TL) at ($(A TL)+(0, -\ctaId*\warpsPerCTAM*\mfmaNonKDim*\elem)$);
+    \foreach \waveId in {0,...,\maxWaveId}{
+      \coordinate (wave TL) at ($(CTA TL)+(0, -\waveId*\mfmaNonKDim*\elem)$);
+      \draw [ultra thin] (wave TL) rectangle ++(\K*\elem, -\mfmaNonKDim*\elem);
+    }
+    %% Only draw the detailed view of the first wave in CTA
+    \coordinate (Op TL) at (CTA TL);
+    \drawWaveOperand{\K}{\mfmaNonKDim}{\kpack}{0}
+
+    %% Draw the outline of each CTA rep
+    \draw [ultra thick] (CTA TL) rectangle ++(\K*\elem, -\warpsPerCTAM*\mfmaNonKDim*\elem);
+  }
+  \draw [ultra thin] (A TL) rectangle ++(\K*\elem, -\M*\elem);
+
+
+  %% operand B
+  \pgfmathsetmacro{\CTARepN}{\N/\warpsPerCTAN/\mfmaNonKDim}
+  \pgfmathsetmacro{\maxCTAIdN}{\CTARepN-1}
+  \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAN-1}
+  \foreach \ctaId in {0,...,\maxCTAIdN}{
+    \coordinate (CTA TL) at ($(B TL)+(\ctaId*\warpsPerCTAN*\mfmaNonKDim*\elem, 0)$);
+    \foreach \waveId in {0,...,\maxWaveId}{
+      \coordinate (wave TL) at ($(CTA TL)+(\waveId*\mfmaNonKDim*\elem ,0)$);
+      \draw [ultra thin] (wave TL) rectangle ++(\mfmaNonKDim*\elem, -\K*\elem);
+    }
+    %% Only draw the detailed view of the first wave in CTA
+    \coordinate (Op TL) at (CTA TL);
+    \drawWaveOperand{\K}{\mfmaNonKDim}{\kpack}{1}
+
+    %% Draw the outline of each CTA rep
+    \draw [ultra thick] (CTA TL) rectangle ++(\warpsPerCTAN*\mfmaNonKDim*\elem, -\K*\elem);
+  }
+  \draw [ultra thin] (B TL) rectangle ++(\N*\elem, -\K*\elem);
+}
+
+
+\newcommand{\drawDot}[8]{
+  %%
+  %% Draw C = dot A, B
+  %% A is placed left of C and B above C, each separated from C by a gap of 20 elements
+  %% C TL: pre defined top-left coordinates of the result tensor
+  %% \elem: pre defined variable
+  %%
+  %% #1: M
+  %% #2: N
+  %% #3: K
+  %% #4: MFMA nonKDim
+  %% #5: warpsPerCTA[0]
+  %% #6: warpsPerCTA[1]
+  %% #7: 1 for mfma.trans, 0 for normal mfma
+  %% #8: kpack
+
+  \pgfmathsetmacro{\M}{#1}
+  \pgfmathsetmacro{\N}{#2}
+  \pgfmathsetmacro{\K}{#3}
+  \pgfmathsetmacro{\mfmaNonKDim}{#4}
+  \pgfmathsetmacro{\groups}{64/\mfmaNonKDim}
+  \pgfmathsetmacro{\warpsPerCTAM}{#5}
+  \pgfmathsetmacro{\warpsPerCTAN}{#6}
+  \pgfmathsetmacro{\mfmaTrans}{#7}
+  \pgfmathsetmacro{\kpack}{#8}
+  \pgfmathsetmacro{\kdim}{int(\groups*\kpack)}
+
+  \pgfmathsetmacro{\gap}{\elem*20}
+  \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$); 
+  \coordinate (B TL) at ($(C TL)+(0, \gap+\K*\elem)$);
+
+  \drawDotOperands{\M}{\N}{\K}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\kpack}
+
+  \drawTensorMFMALayout{\M}{\N}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\mfmaTrans}
+
+  %% Draw labels
+  \node [scale=\scale, above] at ($(A TL)+(.5*\K*\elem, 0)$) {K=\K};
+  \node [scale=\scale, above, rotate=90] at ($(A TL)+(0, -.5*\M*\elem)$) {M=\M};
+
+  \node [scale=\scale, above, rotate=90] at ($(B TL)+(0, -.5*\K*\elem)$) {K=\K};
+  \node [scale=\scale, above] at ($(B TL)+(.5*\N*\elem, 0)$) {N=\N};
+
+  \node [scale=\scale, above left] at (A TL) {A};
+  \node [scale=\scale, above left] at (B TL) {B};
+  \node [scale=\scale, above left] at (C TL) {C};
+
+  %% label nonKDim
+  \node [scale=.8*\scale, left] at ($(A TL)+(0, -.5*\mfmaNonKDim*\elem)$) {\mfmaNonKDim};
+  \node [scale=.8*\scale, above] at ($(B TL)+(.5*\mfmaNonKDim*\elem, 0)$) {\mfmaNonKDim};
+  %% label kdim = groups*kpack at the middle of the first K tile of A and of B (same offset on both)
+  \node [scale=.8*\scale, above] at ($(A TL)+(0.5*\groups*\kpack*\elem, 0)$) {\kdim};
+  \node [scale=.8*\scale, left] at ($(B TL)+(0, -0.5*\groups*\kpack*\elem)$) {\kdim};
+}
+
+\newcommand{\Colors}{{
+    "red",
+    "YellowGreen",
+    "blue",
+    "Maroon",
+    "orange",
+    "cyan",
+    "magenta",
+    "brown",
+    "teal",
+    "purple",
+    "gray",
+    "Green",
+    "BlueGreen",
+    "violet",
+    "olive",
+    "darkgray",
+  }}
+
+\newcommand{\drawTensorLayoutGlobalMem}{
+  %%
+  %% Draw tensor layout in global memory without any swizzling
+  %%
+  %% TL: pre defined top-left coordinates of the tensor in global memory
+  %% \elem: pre defined variable
+  %% \Colors: a pre defined array of 16 colors
+  %%
+  %% This macro takes no arguments; the following macros must also be pre defined
+  %% \M: M
+  %% \K: K
+  %% \vec: number of elements in a group
+
+  \pgfmathsetmacro{\numVecK}{\K/\vec}
+  \pgfmathsetmacro{\maxVecId}{16*\numVecK-1}
+  \pgfmathsetmacro{\drawM}{20}
+
+  %% Draw the tensor outline, \drawM (20) rows tall
+  \draw (TL) rectangle ++(\K*\elem, -\drawM*\elem);
+  %% Draw detailed vec view of the first 16 rows of the tensor
+  \foreach \vecId in {0,...,\maxVecId}{
+    
+    \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)}
+    \pgfmathsetmacro{\vecCoordK}{mod(\vecId,\numVecK)}
+    \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$);
+    
+    \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))}
+    \pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)}
+    \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]}
+    \pgfmathsetmacro{\ratio}{100-floor(\vecCoordK/16)*40}
+
+    \draw [ultra thin, fill=\vecColor!\ratio!white] (vec TL) rectangle ++(\vec*\elem, -\elem)
+    node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
+    
+  }
+  %% M and K dim
+  \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem-8*\elem)$) {M=\M};
+  \node [scale=.8*\scale, left] at ($(TL)+(0, -.5*16*\elem)$) {16};
+  \node [scale=\scale, above] at ($(TL)+(.5*\K*\elem, 0)$) {K=\K};
+  %% label for vecSize
+  \def\vecR{1.5}
+  \coordinate (vec TL) at ($(TL)+(-.25*\vec*\elem, 3*\elem*\vecR)$);
+  \pgfmathsetmacro{\maxVec}{\vec-1}
+  \foreach \vecId in {0,...,\maxVec}{
+    \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
+  }
+  \draw [densely dotted] (TL) -- ($(vec TL)+(0, -\elem*\vecR)$);
+  \draw [densely dotted] ($(TL)+(\vec*\elem, 0)$) -- ($(vec TL)+(\vec*\elem*\vecR, -\elem*\vecR)$);
+  \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec};  
+}
+
+
+
+\newcommand{\drawLDSLayoutTritonSwizzling}[2]{
+  %%
+  %% Draw tensor layout in LDS with swizzling
+  %%
+  %% TL: pre defined top-left coordinates of the LDS drawing
+  %% \elem: pre defined variable
+  %% \Colors: a pre defined array of 16 colors
+  %%
+  %% The following three macros are expected to be pre defined
+  %% \M: M
+  %% \K: K
+  %% \vec: number of elements in a group
+  %% \bitwiseXor{a}{b} must also be defined elsewhere; it computes the swizzled column id
+  %% #1: hasSwizzle, 0 means no swizzling and no padding,
+  %%                 1 means optimal swizzling
+  %%                 2 means padding
+  %% #2: access mode, 0 means draw nothing, 1 means ds_read, 2 means ds_write
+  %% For ds_write access, the following variables are assumed to be pre defined
+  %% \sizePerThreadK
+  %% \sizePerThreadM
+  %% \threadsPerWarpK
+
+  \pgfmathsetmacro{\hasSwizzle}{#1}
+  \pgfmathsetmacro{\accessMode}{#2}
+  \pgfmathsetmacro{\numVecK}{\K/\vec}
+  
+  %% Assuming fp16 data type
+  \pgfmathsetmacro{\LDSK}{64}
+  \pgfmathsetmacro{\numLDSVec}{\LDSK/\vec}
+  \pgfmathsetmacro{\swizzleK}{max(\LDSK, \K)}
+  \pgfmathsetmacro{\LDSM}{int(\M/\LDSK*\K)}
+
+  \ifthenelse{\accessMode = 2}{
+    %% \accessMode == 2, draw 8 rows
+    \pgfmathsetmacro{\maxVecId}{8*\numVecK-1}
+    \pgfmathsetmacro{\drawM}{8*\K/\LDSK+4}
+  }{
+    %% \accessMode == 0 or 1, draw 16 rows
+    \pgfmathsetmacro{\maxVecId}{16*\numVecK-1}
+    \pgfmathsetmacro{\drawM}{16*\K/\LDSK+4}
+  }
+
+  %% Parameters used for swizzling
+  \pgfmathsetmacro{\numVecSwizzleK}{\swizzleK/\vec}
+  %% perPhase = ceil(LDSK / K)
+  %% The number of the rows of the tensor that can share the same swizzling pattern
+  \pgfmathsetmacro{\perPhase}{ceil(\LDSK/\K)}
+  %% maxPhase: the total number of different swizzling patterns
+  \ifthenelse{\hasSwizzle=0}{
+    %% When swizzling is disabled
+    \pgfmathsetmacro{\maxPhase}{1}
+  }{
+    %% When vec is small enough, we want 16/perPhase different swizzling patterns
+    %% When vec is large, we can only have 64 / \vec different swizzling patterns at most
+    \pgfmathsetmacro{\maxPhase}{min(16/\perPhase,64/\vec)}
+  }
+  
+  %% Draw the LDS
+  \draw (TL) rectangle ++(\LDSK*\elem, -\drawM*\elem);
+
+  %% Draw detailed vec view of LDS
+  \foreach \vecId in {0,...,\maxVecId}{
+    \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)}
+    \pgfmathsetmacro{\vecCoordK}{int(mod(\vecId,\numVecK))}
+    \pgfmathsetmacro{\rawPhase}{floor(\vecId/\numVecSwizzleK)}
+    %% vec color
+    \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))}
+    \pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)}
+    \pgfmathsetmacro{\ratio}{100-floor(\vecCoordK/16)*40}
+    \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]}
+
+    %% old vec coordinates
+    \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$);
+
+    %% new vec coordinates in LDS by swizzling
+    %% The following two conditions correspond to the relation between \LDSK and \K
+    \ifthenelse{\LDSK < \K}{
+      \pgfmathsetmacro{\vecLDSM}{\vecCoordM*\K/\LDSK+floor(\vecCoordK*\vec/\LDSK)}
+      \pgfmathsetmacro{\vecLDSK}{int(mod(\vecCoordK, \LDSK/\vec))}
+    }{
+      \pgfmathsetmacro{\vecLDSM}{floor(\vecCoordM/\perPhase)}
+      \pgfmathsetmacro{\vecLDSK}{int(\vecCoordK+mod(\vecCoordM,\perPhase)*\numVecK)}
+    }
+    %% phase of this vec: raw phase wrapped into the number of available patterns
+    \pgfmathsetmacro{\phase}{int(mod(\rawPhase, \maxPhase))}
+    %% Compute the swizzled col id
+    \pgfmathsetmacro{\vecLDSKSwizzled}{\bitwiseXor{\vecLDSK}{\phase}}
+
+    %% new vec coordinates in LDS by padding
+    \pgfmathsetmacro{\numPads}{floor(\vecId/\numLDSVec)}
+    \pgfmathsetmacro{\bankId}{\vec/2*\vecId+\numPads}
+    \pgfmathsetmacro{\vecPadM}{int(\bankId/32)}
+    \pgfmathsetmacro{\vecPadK}{int(mod(\bankId,32))}
+
+    \ifthenelse{\hasSwizzle = 2}{
+      %% vec coordinates by padding
+      \coordinate (new vec TL) at ($(TL)+(\vecPadK*2*\elem, -\vecPadM*\elem)$);
+      \pgfmathsetmacro{\tailBankId}{int(\vecPadK+\vec/2-1)}
+    }{
+      %% vec coordinates by swizzling
+      \coordinate (new vec TL) at ($(TL)+(\vecLDSKSwizzled*\vec*\elem, -\vecLDSM*\elem)$);
+      \pgfmathsetmacro{\tailBankId}{0}
+    }
+
+    \ifthenelse{\hasSwizzle = 2 \AND \tailBankId > 31}{
+      \pgfmathsetmacro{\nextBanks}{\tailBankId-31}
+      \pgfmathsetmacro{\leftBanks}{\vec/2 - \nextBanks}
+      \draw [ultra thin, fill=\vecColor!\ratio!white] (new vec TL) rectangle ++(\leftBanks*2*\elem, -\elem)
+      node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
+      \draw [ultra thin, fill=\vecColor!\ratio!white] ($(TL)+(0, -\vecPadM*\elem-\elem)$)
+      rectangle ++(\nextBanks*2*\elem, -\elem) node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
+    }{
+      \draw [ultra thin, fill=\vecColor!\ratio!white] (new vec TL) rectangle ++(\vec*\elem, -\elem)
+      node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
+    }
+
+    %% ds_read
+    %% Highlight the elements the first 16 threads access in the first cycle
+    %% This is used to visualize bank conflicts
+    \ifthenelse{\accessMode = 1}{
+      \ifthenelse{\vecCoordK = 0}{
+        \draw [fill=white]  (new vec TL) rectangle ++(\elem, -\elem);
+        \draw (new vec TL) -- ++(\elem, -\elem);   
+        \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);   
+      }{}
+    }{}
+
+    %% Draw ds_write pattern
+    \ifthenelse{\accessMode = 2}{
+      %% First compute the coverage of the first 16 threads
+      \pgfmathsetmacro{\covK}{min(16, \threadsPerWarpK)*\sizePerThreadK/\vec}
+      \pgfmathsetmacro{\covM}{ceil(16/\threadsPerWarpK)*\sizePerThreadM}
+      %% Check conditions for the first 16 threads
+      \pgfmathsetmacro{\vecInThread}{int(mod(\vecCoordK, \sizePerThreadK/\vec))}
+      \ifthenelse{\vecInThread=0}{
+        \ifthenelse{\vecCoordK<\covK \AND \vecCoordM<\covM}{
+          \draw [fill=white]  (new vec TL) rectangle ++(\elem, -\elem);
+          \draw (new vec TL) -- ++(\elem, -\elem);   
+          \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);   
+        }{}
+      }{}
+    }{}
+
+    %% Label the phase of each line if swizzling is used
+    \ifthenelse{\hasSwizzle = 2}{}{
+      \pgfmathsetmacro{\lastVecId}{int(64/\vec)-1}
+      \ifthenelse{\vecLDSKSwizzled = \lastVecId}{
+        \draw [ultra thin] ($(new vec TL)+(\vec*\elem, -.5*\elem)$) -- ++(\elem, 0)
+        node [scale=.6*\scale, right] {\phase};
+      }{}
+    } 
+  }
+
+  %% Draw boundary of 32 banks
+  %% Assume fp16 data type
+  \foreach \bank in {0,...,31}{
+    \draw [ultra thin, gray] ($(TL)+(\bank*2*\elem, 0)$) -- ++(0, 2*\elem)
+    node [scale=.6*\scale, right, black] {\bank};
+  }
+  \draw [ultra thin, gray] ($(TL)+(32*2*\elem, 0)$) -- ++(0, 2*\elem);
+  \node [scale=.6*\scale, left, black] at ($(TL)+(0, 2*\elem)$) {bank id};
+
+  \node [scale=\scale, above] at ($(TL)+(.5*\LDSK*\elem, 3*\elem)$) {LDS 32 banks};
+  \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem)$) {LDSM=\LDSM};
+
+  %% label phase if swizzling is used
+  \ifthenelse{\hasSwizzle = 2}{}{
+    \node [scale=.6*\scale, above right] at($(TL)+(32*2*\elem, 0)$) {phase};
+  }
+}
+
+\newcommand{\drawMFMAInstr}[3]{
+  %%
+  %% Draw layout of mfma instructions with tid labeled
+  %% The operand left of C is labelled opA and the one above C opB when mfmaTrans=0; the labels swap otherwise
+  %% C TL: pre defined top-left coordinates of the output matrix
+  %% \elem: pre defined variable
+  %%
+  %% #1: mfmaNonKDim
+  %% #2: kpack
+  %% #3: mfmaTrans
+  \pgfmathsetmacro{\mfmaNonKDim}{#1}
+  \pgfmathsetmacro{\groups}{64/\mfmaNonKDim}
+  \pgfmathsetmacro{\kpack}{#2}
+  \pgfmathsetmacro{\mfmaTrans}{#3}
+  \pgfmathsetmacro{\nonTrans}{1-#3}
+  
+  \pgfmathsetmacro{\gap}{\elem*5}
+  \coordinate (mfma opA TL) at ($(C TL)+(-.5*\gap-1.2*\nonTrans*\gap-\groups*\kpack*\elem, 0)$);
+  \coordinate (mfma op TL) at (mfma opA TL);
+  \drawMFMAOperand{\mfmaNonKDim}{\kpack}{0}{1}
+  \coordinate (mfma op TL) at ($(C TL)+(0, 1.5*\gap+.5*\mfmaTrans*\gap+\groups*\kpack*\elem)$);
+  \drawMFMAOperand{\mfmaNonKDim}{\kpack}{1}{1}
+
+  \coordinate (block TL) at (C TL);
+  \drawBlockMFMALayoutLarge{\mfmaTrans}{\mfmaNonKDim}{1}
+
+  %% Draw labels
+  \def\vecR{1.5}
+  \coordinate (vec TL) at ($(mfma opA TL)+(-.25*\kpack*\elem, 3*\elem*\vecR)$);
+  \pgfmathsetmacro{\maxVec}{\kpack-1}
+  \foreach \vecId in {0,...,\maxVec}{
+    \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
+  }
+  \draw [densely dotted] (mfma opA TL) -- ($(vec TL)+(0, -\elem*\vecR)$);
+  \draw [densely dotted] ($(mfma opA TL)+(\kpack*\elem, 0)$) -- ($(vec TL)+(\kpack*\elem*\vecR, -\elem*\vecR)$);
+  \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\kpack*\elem*\vecR, 0)$) {vec=\kpack};
+
+  \coordinate (vec TL) at ($(mfma op TL)+(-3*\elem*\vecR, .25*\kpack*\elem)$);
+  \foreach \vecId in {0,...,\maxVec}{
+    \draw ($(vec TL)+(0, -\vecId*\elem*\vecR)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
+  }
+  \draw [densely dotted] (mfma op TL) -- ($(vec TL)+(\elem*\vecR,0)$);
+  \draw [densely dotted] ($(mfma op TL)+(0, -\kpack*\elem)$) -- ($(vec TL)+(\elem*\vecR, -\kpack*\elem*\vecR)$);
+  \node [scale=.8*\scale, above, rotate=90] at ($(vec TL)+(0, -.5*\kpack*\elem*\vecR)$) {vec=\kpack};
+
+  \node [scale=\scale, below] at ($(block TL)+(.5*\mfmaNonKDim*\elem,-\mfmaNonKDim*\elem)$) {outC};
+  \ifthenelse{\mfmaTrans=0}{
+    \node [scale=\scale, below] at ($(mfma opA TL)+(\kpack*\elem, -\mfmaNonKDim*\elem)$) {opA};
+    \node [scale=\scale, above] at (mfma op TL) {opB};
+    \coordinate (vec TL) at ($(block TL)+(-3*\elem-\elem*\vecR, .25*4*\elem)$);
+    \foreach \vecId in {0,1,2,3}{
+      \draw ($(vec TL)+(0, -\vecId*\elem*\vecR)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
+    }
+    \draw [densely dotted] (block TL) -- ++(-3*\elem, .25*4*\elem);
+    \draw [densely dotted] ($(block TL)+(0, -4*\elem)$) -- ++(-3*\elem, -.25*4*\elem);
+    \node [scale=.8*\scale, above, rotate=90] at ($(vec TL)+(0, -.5*4*\elem*\vecR)$) {vec=4};
+    \node [scale=.8*\scale, above, align=center] at ($(block TL)+(.5*\mfmaNonKDim*\elem, 0)$) {mfmaLayout\\trans=False};
+  }{
+    \node [scale=\scale, below] at ($(mfma opA TL)+(\kpack*\elem, -\mfmaNonKDim*\elem)$) {opB};
+    \node [scale=\scale, above] at (mfma op TL) {opA};
+    \coordinate (vec TL) at ($(block TL)+(-.25*4*\elem, 3*\elem+\elem*\vecR)$);
+    \foreach \vecId in {0,1,2,3}{
+      \draw ($(vec TL)+(\vecId*\elem*\vecR, 0)$) rectangle ++(\elem*\vecR, -\elem*\vecR);
+    }
+    \draw [densely dotted] (block TL) -- ++(-.25*4*\elem, 3*\elem);
+    \draw [densely dotted] ($(block TL)+(4*\elem, 0)$) -- ++(.25*4*\elem, 3*\elem);
+    \node [scale=.8*\scale, above] at ($(vec TL)+(.5*4*\elem*\vecR, 0)$) {vec=4};
+    \node [scale=.8*\scale, above, align=center] at ($(block TL)+(16*\elem, 0)$) {mfmaLayout\\trans=True};
+  }
+}
+
+\newcommand{\drawWMMAOperand}[3]{
+  %%
+  %% Draw the layout of one operand of WMMA instruction
+  %% One 16-element strip per row index: horizontal strips for opA, vertical strips for opB
+  %% #1: opIdx. 0 for opA, 1 for opB
+  %% #2: verbose (stored in \verbose but not consulted here; tid labels are always drawn)
+  %% #3: mode. 0 for w32, 1 for w64
+  %%
+  %% wmma op TL: pre defined top-left coordinates of the operand matrix
+
+  \pgfmathsetmacro{\isOpB}{#1}
+  \pgfmathsetmacro{\isOpA}{1-\isOpB}
+  \pgfmathsetmacro{\verbose}{#2}
+  \pgfmathsetmacro{\isWLarge}{#3}
+
+  \foreach \row in {0,...,15}{
+    \pgfmathsetmacro{\ratio}{\row*5+15}
+    \coordinate (vec TL) at ($(wmma op TL)+(\row*\isOpB*\elem, -\row*\elem*\isOpA)$);
+    \ifthenelse{\isWLarge=1}{
+      \pgfmathsetmacro{\tidone}{int(\row+16)}
+      \pgfmathsetmacro{\tidtwo}{int(\row+32)}
+      \pgfmathsetmacro{\tidthree}{int(\row+48)}
+      \draw [line width=0.005mm, fill=brown!\ratio!white] (vec TL)
+      rectangle ++(16*\elem*\isOpA+\elem*\isOpB, -\elem*\isOpA-16*\elem*\isOpB)
+      node [scale=0.4*\scale, pos=.5, rotate=90*\isOpB] {t\row, t\tidone, t\tidtwo, t\tidthree};
+    }{
+      \pgfmathsetmacro{\tidone}{int(\row+16)}
+      \draw [line width=0.005mm, fill=brown!\ratio!white] (vec TL)
+      rectangle ++(16*\elem*\isOpA+\elem*\isOpB, -\elem*\isOpA-16*\elem*\isOpB)
+      node [scale=0.4*\scale, pos=.5, rotate=90*\isOpB] {t\row, t\tidone};
+    }
+  }
+}
+
+\newcommand{\drawWMMAResult}[2]{
+  %%
+  %% Draw layout of WMMA result tensor
+  %% Draws a 16x16 grid at C TL; each cell is labelled t<tid>, tid = elemId mod 64 (w64) or mod 32 (w32)
+  %% #1: verbose (stored in \verbose but not consulted here; tid labels are always drawn)
+  %% #2: mode. 0 for w32, 1 for w64
+
+  \pgfmathsetmacro{\verbose}{#1}
+  \pgfmathsetmacro{\isWLarge}{#2}
+
+  \pgfmathsetmacro{\numElem}{256}
+  \pgfmathsetmacro{\maxElemId}{\numElem-1}
+
+  \foreach \elemId in {0,...,\maxElemId}{
+    %% figure out the rowID
+    \pgfmathsetmacro{\rowId}{floor(\elemId/16)}
+    %% figure out the colID
+    \pgfmathsetmacro{\colId}{mod(\elemId,16)}
+    %% figure out the tid and color
+    \ifthenelse{\isWLarge=1}{
+      \pgfmathsetmacro{\tid}{int(mod(\elemId,64))}
+      \pgfmathsetmacro{\laneId}{mod(\elemId,64)}
+    }{
+      \pgfmathsetmacro{\tid}{int(mod(\elemId,32))}
+      \pgfmathsetmacro{\laneId}{mod(\elemId,32)}
+    }
+    %% figure out the color
+    \pgfmathsetmacro{\colorId}{floor(\laneId/16)}
+    \pgfmathsetmacro{\vecColor}{\Colors[\colorId]}
+    %% Coordinate
+    \coordinate (vec TL) at ($(C TL)+(\colId*\elem, -\rowId*\elem)$);
+    \draw [line width=0.005mm, fill=\vecColor!60!white] (vec TL) rectangle ++(\elem, -\elem)
+    node [scale=.4*\scale, pos=.5] {t\tid};
+  }
+  
+
+}
+
+\newcommand{\drawWMMAInstr}[2]{
+  %%
+  %% Draw wmma instruction layouts 16x16x16
+  %% A is drawn left of C and B above C; the result is always drawn with verbose fixed at 1
+  %% #1: mode. 0 for w32, 1 for w64
+  %% #2: verbose. 1 means draw tid in each vec; 0 means draw nothing
+  %%
+  %% C TL: pre defined top-left coordinates of output matrix
+  %% \elem: pre defined element size
+  
+
+  \pgfmathsetmacro{\isWLarge}{#1}
+  \pgfmathsetmacro{\verbose}{#2}
+  
+  \pgfmathsetmacro{\gap}{\elem*2}
+  \coordinate (wmma op TL) at ($(C TL)+(-\gap-16*\elem, 0)$);
+  \coordinate (wmma opA TL) at (wmma op TL);
+  \drawWMMAOperand{0}{\verbose}{\isWLarge}
+  \coordinate (wmma op TL) at ($(C TL)+(0, \gap+16*\elem)$);
+  \drawWMMAOperand{1}{\verbose}{\isWLarge}
+
+  \drawWMMAResult{1}{\isWLarge}
+
+  %% labels
+  \pgfmathsetmacro{\gap}{\elem}
+  \node [above left, scale=\scale] at (wmma opA TL) {A};
+  \node [above left, scale=\scale] at (wmma op TL) {B};
+  \node [above right, scale=\scale] at ($(C TL)+(16*\elem, 0)$) {C};
+
+  %% A k dim
+  \node [scale=.8*\scale] (k dim A) at ($(wmma opA TL)+(8*\elem,\gap)$) {16};
+  \draw [->, >=stealth] (k dim A.west) -- ($(wmma opA TL)+(0, \gap)$);
+  \draw [->, >=stealth] (k dim A.east) -- ($(wmma opA TL)+(16*\elem, \gap)$);
+
+  %% B K dim
+  \node [scale=.8*\scale, rotate=90] (k dim B) at ($(wmma op TL)+(-\gap, -8*\elem)$) {16};
+  \draw [->, >=stealth] (k dim B.east) -- ($(wmma op TL)+(-\gap, 0)$);
+  \draw [->, >=stealth] (k dim B.west) -- ($(wmma op TL)+(-\gap, -16*\elem)$);
+
+  %% C M dim
+  \node [scale=.8*\scale] (m dim) at ($(C TL)+(8*\elem,-16*\elem-\gap)$) {16};
+  \draw [->, >=stealth] (m dim.west) -- ($(C TL)+(0, -16*\elem-\gap)$);
+  \draw [->, >=stealth] (m dim.east) -- ($(C TL)+(16*\elem, -16*\elem-\gap)$);
+
+  %% C N dim
+  \node [scale=.8*\scale, rotate=-90] (n dim) at ($(C TL)+(16*\elem+\gap, -8*\elem)$) {16};
+  \draw [->, >=stealth] (n dim.west) -- ($(C TL)+(16*\elem+\gap, 0)$);
+  \draw [->, >=stealth] (n dim.east) -- ($(C TL)+(16*\elem+\gap, -16*\elem)$);
+}

From 5880a6b96bf7aec807c2fb45323a762f61bed1fd Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 6 Sep 2024 08:57:48 -0500
Subject: [PATCH 2/6] yapf format

---
 .../tools/amdgcn-cfg/amdgcn-cfg.py            | 338 +++++++++---------
 .../tools/plot-layout/plot_layout.py          |  91 ++---
 2 files changed, 188 insertions(+), 241 deletions(-)

diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
index 4100528f28db..9c3bcbea9d70 100644
--- a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
+++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
@@ -6,33 +6,36 @@
 
 
 class Options:
-  def __init__(self, input_file, output_file, verbose, format):
-    if not os.path.exists(input_file):
-      raise RuntimeError('input file is not provided')
 
-    output_dir = os.path.dirname(output_file)
-    if not os.path.exists(output_dir):
-      raise RuntimeError('output directory does not exist')
+    def __init__(self, input_file, output_file, verbose, format):
+        if not os.path.exists(input_file):
+            raise RuntimeError('input file is not provided')
 
-    self.input_file = input_file
-    self.output_file = output_file
-    self.verbose = verbose
-    self.format = format
-    self.output_dir = output_dir
+        output_dir = os.path.dirname(output_file)
+        if not os.path.exists(output_dir):
+            raise RuntimeError('output directory does not exist')
+
+        self.input_file = input_file
+        self.output_file = output_file
+        self.verbose = verbose
+        self.format = format
+        self.output_dir = output_dir
 
 
 class Block:
-  def __init__(self, label, code):
-    self.label = label
-    self.code = code
-    self.edges = []
+
+    def __init__(self, label, code):
+        self.label = label
+        self.code = code
+        self.edges = []
 
 
 class Kernel:
-  def __init__(self, kernel_name, blocks):
-    self.name = kernel_name
-    self.blocks = blocks
-    self.cfg = None
+
+    def __init__(self, kernel_name, blocks):
+        self.name = kernel_name
+        self.blocks = blocks
+        self.cfg = None
 
 
 begin_label = 'Begin'
@@ -40,187 +43,180 @@ def __init__(self, kernel_name, blocks):
 
 
 def find_kernel(text):
-  func_name_expr = r'^([^\s^\.]\w.+):'
-  func_name = None
-  start = None
-  for index, line in enumerate(text):
-    match = re.search(func_name_expr, line)
-    if not match is None:
-      func_name = match[1]
-      start = index
-      break
-  if start == None:
-    return None, None, None
-
-  end = None
-  for index, line in enumerate(text):
-    if not re.search(r's_endpgm', line) is None:
-      end = index
-      break
-
-  if end == None:
-    return None, None, None
-
-  return func_name, text[start:end+1], end
+    func_name_expr = r'^([^\s^\.]\w.+):'
+    func_name = None
+    start = None
+    for index, line in enumerate(text):
+        match = re.search(func_name_expr, line)
+        if not match is None:
+            func_name = match[1]
+            start = index
+            break
+    if start == None:
+        return None, None, None
+
+    end = None
+    for index, line in enumerate(text):
+        if not re.search(r's_endpgm', line) is None:
+            end = index
+            break
+
+    if end == None:
+        return None, None, None
+
+    return func_name, text[start:end + 1], end
 
 
 def find_label(kernel):
-  label = None
-  index = None
-  for index, line in enumerate(kernel):
-    match = re.search(r'^\.(\w+):', line)
-    if not match is None:
-      label = match[1]
-      break
-  return label, index
+    label = None
+    index = None
+    for index, line in enumerate(kernel):
+        match = re.search(r'^\.(\w+):', line)
+        if not match is None:
+            label = match[1]
+            break
+    return label, index
 
 
 def get_block_list(kernel):
-  label, index = find_label(kernel)
+    label, index = find_label(kernel)
 
-  blocks = OrderedDict()
-  if (index > 1):
-    blocks[begin_label] = Block(begin_label, kernel[:index-1])
+    blocks = OrderedDict()
+    if (index > 1):
+        blocks[begin_label] = Block(begin_label, kernel[:index - 1])
 
-  while label != None:
-    kernel = kernel[index+1:]
-    next_label, next_index = find_label(kernel)
-    if next_label is None:
-      code = kernel[index:]
-    else:
-      code = kernel[:next_index]
-    blocks[label] = Block(label, code)
+    while label != None:
+        kernel = kernel[index + 1:]
+        next_label, next_index = find_label(kernel)
+        if next_label is None:
+            code = kernel[index:]
+        else:
+            code = kernel[:next_index]
+        blocks[label] = Block(label, code)
 
-    label = next_label 
-    index = next_index
+        label = next_label
+        index = next_index
 
-  blocks[end_label] = Block(end_label, [])
+    blocks[end_label] = Block(end_label, [])
 
-  return blocks
+    return blocks
 
 
 def find_terminators(code):
-  terminator_labels = []
-  for line in code:
-    branch = re.search(r'(c)?branch.*\s+\.?(.*)', line)
-    if not branch is None:
-      is_condional = True if len(branch.groups()) == 2 else False
-      label_idx = 2 if is_condional else 1
-      terminator_labels.append(branch[label_idx])
-      if not is_condional:
-        return terminator_labels, True
-    end = re.search(r's_endpgm', line)
-    if not end is None:
-      terminator_labels.append(end_label)
-      return terminator_labels, True
-
-  return terminator_labels, False
+    terminator_labels = []
+    for line in code:
+        branch = re.search(r'(c)?branch.*\s+\.?(.*)', line)
+        if not branch is None:
+            is_condional = True if len(branch.groups()) == 2 else False
+            label_idx = 2 if is_condional else 1
+            terminator_labels.append(branch[label_idx])
+            if not is_condional:
+                return terminator_labels, True
+        end = re.search(r's_endpgm', line)
+        if not end is None:
+            terminator_labels.append(end_label)
+            return terminator_labels, True
+
+    return terminator_labels, False
 
 
 def add_edges(kernel):
-  keys = list(kernel.blocks.keys())
-  for index, curr_label in enumerate(keys):
-    if curr_label == end_label:
-      continue
+    keys = list(kernel.blocks.keys())
+    for index, curr_label in enumerate(keys):
+        if curr_label == end_label:
+            continue
 
-    code = kernel.blocks[curr_label].code
-    terminators, is_last_unconditional = find_terminators(code[:-1])
+        code = kernel.blocks[curr_label].code
+        terminators, is_last_unconditional = find_terminators(code[:-1])
 
-    if is_last_unconditional:
-      # unconditional jump in the middle of the block
-      break
+        if is_last_unconditional:
+            # unconditional jump in the middle of the block
+            break
 
-    # handle the last terminator in the current BB
-    last_terminator, is_unconditional = find_terminators([code[-1]])
+        # handle the last terminator in the current BB
+        last_terminator, is_unconditional = find_terminators([code[-1]])
 
-    is_conditional = not is_unconditional
-    next_block_label = keys[index + 1]
-    is_next_covered = next_block_label in terminators
-
-    if last_terminator:
-      terminators.extend(last_terminator)
-      if is_conditional and not is_next_covered:
-        next_block_label = keys[index + 1]
-        terminators.append(next_block_label)
-    else:
-      if not is_next_covered:
+        is_conditional = not is_unconditional
         next_block_label = keys[index + 1]
-        terminators.append(next_block_label)
+        is_next_covered = next_block_label in terminators
 
-    assert(len(terminators))
-    kernel.blocks[curr_label].edges = terminators
+        if last_terminator:
+            terminators.extend(last_terminator)
+            if is_conditional and not is_next_covered:
+                next_block_label = keys[index + 1]
+                terminators.append(next_block_label)
+        else:
+            if not is_next_covered:
+                next_block_label = keys[index + 1]
+                terminators.append(next_block_label)
+
+        assert (len(terminators))
+        kernel.blocks[curr_label].edges = terminators
 
 
 def generate_cfg(kernel, options):
-  graph = graphviz.Digraph(f'{kernel.name}')
-  for curr_label in kernel.blocks:
-    block = kernel.blocks[curr_label]
-    asm = [line.strip() for line in block.code]
-    if options.verbose:
-      label_text = repr('\n'.join([f'{curr_label}', *asm]))
-    else:
-      label_text = curr_label
-    graph.node(curr_label,
-               shape='rect',
-               labeljust='l',
-               margin='0.01',
-               label=label_text)
-
-  for curr_label in kernel.blocks:
-    block = kernel.blocks[curr_label]
-    for edge in block.edges:
-      graph.edge(curr_label, edge)
-
-  return graph
+    graph = graphviz.Digraph(f'{kernel.name}')
+    for curr_label in kernel.blocks:
+        block = kernel.blocks[curr_label]
+        asm = [line.strip() for line in block.code]
+        if options.verbose:
+            label_text = repr('\n'.join([f'{curr_label}', *asm]))
+        else:
+            label_text = curr_label
+        graph.node(curr_label, shape='rect', labeljust='l', margin='0.01', label=label_text)
+
+    for curr_label in kernel.blocks:
+        block = kernel.blocks[curr_label]
+        for edge in block.edges:
+            graph.edge(curr_label, edge)
+
+    return graph
 
 
 def main(options):
-  asm = []
-  with open(options.input_file, 'r') as file:
-    context = file.readlines()
-    for line in context:
-      asm.append(line[:-1])
-
-  kernels = []
-  last_end_index = 0
-  while last_end_index != None:
-    func_name, kernel_asm, last_end_index = find_kernel(asm)
-    if kernel_asm == None:
-      break
-
-    blocks = get_block_list(kernel_asm)
-    kernel = Kernel(func_name, blocks)
-    add_edges(kernel)
-
-    cfg = generate_cfg(kernel, options)
-    kernel.cfg = cfg
-    kernels.append(kernel)
-    asm = asm[last_end_index+1:]
-
-    for index, kernel in enumerate(kernels):
-      output_file_name = f'{options.output_file}.kernel-{index}'
-      if options.format == 'dot':
-        with open(f'{output_file_name}.dot', 'w') as file:
-          file.write(str(kernel.cfg))
-          file.write('\n')
-      else:
-        kernel.cfg.render(filename=f'{output_file_name}',
-                          format=options.format,
-                          ).replace('\\', '/')
+    asm = []
+    with open(options.input_file, 'r') as file:
+        context = file.readlines()
+        for line in context:
+            asm.append(line[:-1])
+
+    kernels = []
+    last_end_index = 0
+    while last_end_index != None:
+        func_name, kernel_asm, last_end_index = find_kernel(asm)
+        if kernel_asm == None:
+            break
+
+        blocks = get_block_list(kernel_asm)
+        kernel = Kernel(func_name, blocks)
+        add_edges(kernel)
+
+        cfg = generate_cfg(kernel, options)
+        kernel.cfg = cfg
+        kernels.append(kernel)
+        asm = asm[last_end_index + 1:]
+
+        for index, kernel in enumerate(kernels):
+            output_file_name = f'{options.output_file}.kernel-{index}'
+            if options.format == 'dot':
+                with open(f'{output_file_name}.dot', 'w') as file:
+                    file.write(str(kernel.cfg))
+                    file.write('\n')
+            else:
+                kernel.cfg.render(
+                    filename=f'{output_file_name}',
+                    format=options.format,
+                ).replace('\\', '/')
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser(
-    prog="Generates Control Flow Graph (CFG) from amdgcn assembly file",
-  )
-  parser.add_argument("-i", "--input", type=str, default=None, help="input file")
-  parser.add_argument("-o", "--output", type=str, default=None, help="output file prefix")
-  parser.add_argument("-v", "--verbose", action='store_true', help='verbose output')
-  parser.add_argument("-f", "--format", choices=['dot','svg', 'pdf'],
-                      default="dot",
-                      help="output format type")
-  args = parser.parse_args()
-
-  options = Options(args.input, args.output, args.verbose, args.format)
-
-  main(options)
+    parser = argparse.ArgumentParser(prog="Generates Control Flow Graph (CFG) from amdgcn assembly file", )
+    parser.add_argument("-i", "--input", type=str, default=None, help="input file")
+    parser.add_argument("-o", "--output", type=str, default=None, help="output file prefix")
+    parser.add_argument("-v", "--verbose", action='store_true', help='verbose output')
+    parser.add_argument("-f", "--format", choices=['dot', 'svg', 'pdf'], default="dot", help="output format type")
+    args = parser.parse_args()
+
+    options = Options(args.input, args.output, args.verbose, args.format)
+
+    main(options)
diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py
index c2387905f3e0..74554b4c3f02 100755
--- a/python/perf-kernels/tools/plot-layout/plot_layout.py
+++ b/python/perf-kernels/tools/plot-layout/plot_layout.py
@@ -95,8 +95,7 @@ def draw_dot_layout_cmd(M, N, K, mfmaNonKDim, warpsPerCTA, trans, kpack):
 \\end{{document}}'''
 
 
-def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA,
-                            order):
+def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order):
     return f'''\\begin{{document}}
   \\begin{{tikzpicture}}
     \\def\\scale{{1}}
@@ -107,8 +106,7 @@ def draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA,
 \\end{{document}}'''
 
 
-def draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread,
-                        threadsPerWarp):
+def draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread, threadsPerWarp):
     if ldsLayout == 'swizzle':
         hasSwizzle = 1
     elif ldsLayout == 'padding':
@@ -158,11 +156,7 @@ def draw_wmma_instr_cmd(waveSize):
 
 
 def run_bash_command(commandstring):
-    proc = subprocess.run(commandstring,
-                          shell=True,
-                          check=True,
-                          executable='/bin/bash',
-                          stdout=subprocess.PIPE)
+    proc = subprocess.run(commandstring, shell=True, check=True, executable='/bin/bash', stdout=subprocess.PIPE)
     return proc.stdout.splitlines()
 
 
@@ -172,62 +166,27 @@ def parse_args():
         allow_abbrev=False,
     )
     ## tensor shapes
-    parser.add_argument("-shape",
-                        type=int,
-                        nargs=3,
-                        default=(32, 128, 64),
-                        help='Tensor shape in the form of M,N,K')
-    parser.add_argument("-plot",
-                        type=str,
-                        default="blocked",
-                        choices=['blocked', 'dot', 'wmma', 'lds'],
+    parser.add_argument("-shape", type=int, nargs=3, default=(32, 128, 64), help='Tensor shape in the form of M,N,K')
+    parser.add_argument("-plot", type=str, default="blocked", choices=['blocked', 'dot', 'wmma', 'lds'],
                         help='choose plot mode')
-    parser.add_argument(
-        "-nonKDim",
-        type=int,
-        default=32,
-        choices=[16, 32],
-        help='mfma instruction dim')
+    parser.add_argument("-nonKDim", type=int, default=32, choices=[16, 32], help='mfma instruction dim')
     ## blocked layout parameters
     parser.add_argument("-sizePerThread", type=int, nargs=2, default=(1, 4))
     parser.add_argument("-threadsPerWarp", type=int, nargs=2, default=(16, 4))
     parser.add_argument("-warpsPerCTA", type=int, nargs=2, default=(1, 4))
     parser.add_argument("-order", type=int, nargs=2, default=(1, 0))
     ## LDS access parameters
-    parser.add_argument("-kWidth",
-                        type=int,
-                        default=4,
-                        choices=[4, 8, 16],
-                        help='number of elements per thread')
-    parser.add_argument("-lds_layout",
-                        type=str,
-                        default="none",
-                        choices=['swizzle', 'padding', 'none'],
+    parser.add_argument("-kWidth", type=int, default=4, choices=[4, 8, 16], help='number of elements per thread')
+    parser.add_argument("-lds_layout", type=str, default="none", choices=['swizzle', 'padding', 'none'],
                         help='choose the LDS data layout')
-    parser.add_argument("-lds_access",
-                        type=str,
-                        default="none",
-                        choices=['read', 'write', 'none'],
+    parser.add_argument("-lds_access", type=str, default="none", choices=['read', 'write', 'none'],
                         help='choose LDS access mode')
     ## wmma instruction layout parameter
-    parser.add_argument("-wave_size",
-                        type=int,
-                        default=32,
-                        choices=[32, 64],
-                        help='choose the wmma instruction mode')
-
-    parser.add_argument("-o",
-                        type=str,
-                        default="myplot",
-                        help='output pdf file name (without surfix)')
-    parser.add_argument("-mfmaTrans",
-                        action='store_true',
-                        default=False,
-                        help='If set, then use mfma.trans layout')
-    parser.add_argument("-keep",
-                        action='store_true',
-                        default=False,
-                        help='If set, keep the generated .tex file')
+    parser.add_argument("-wave_size", type=int, default=32, choices=[32, 64], help='choose the wmma instruction mode')
+
+    parser.add_argument("-o", type=str, default="myplot", help='output pdf file name (without surfix)')
+    parser.add_argument("-mfmaTrans", action='store_true', default=False, help='If set, then use mfma.trans layout')
+    parser.add_argument("-keep", action='store_true', default=False, help='If set, keep the generated .tex file')
 
     args = parser.parse_args()
 
@@ -279,24 +238,19 @@ def main():
 
     if plot_mode == 'blocked' or plot_mode == 'dot':
         print(f"CTAShape={CTAShape}")
-        assert M != 0 and CTAShape[
-            0] <= M and M % CTAShape[0] == 0, "bad tensor dimension M"
+        assert M != 0 and CTAShape[0] <= M and M % CTAShape[0] == 0, "bad tensor dimension M"
 
     if plot_mode == 'blocked':
-        assert K != 0 and CTAShape[
-            1] <= K and K % CTAShape[1] == 0, "bad tensor dimension K"
+        assert K != 0 and CTAShape[1] <= K and K % CTAShape[1] == 0, "bad tensor dimension K"
 
     if plot_mode == 'dot':
-        assert N != 0 and CTAShape[
-            1] <= N and N % CTAShape[1] == 0, "bad tensor dimension N"
+        assert N != 0 and CTAShape[1] <= N and N % CTAShape[1] == 0, "bad tensor dimension N"
         assert K != 0 and K % (2 * kpack) == 0, "bad tensor dimension K"
 
     if plot_mode == 'lds':
         print(f"Plotting LDS access for tensor M={M},K={K} with vec={kpack}")
         if ldsAccess == 'write':
-            print(
-                f"sizePerThread={sizePerThread}, threadsPerWarp={threadsPerWarp}"
-            )
+            print(f"sizePerThread={sizePerThread}, threadsPerWarp={threadsPerWarp}")
 
     with open("myplot.tex", 'w') as f_plot:
         with open("tikzplot.tex") as file:
@@ -304,14 +258,11 @@ def main():
 
         preamble_str = draw_preamble_cmd()
 
-        draw_blockedLayout_str = draw_blocked_layout_cmd(
-            M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order)
+        draw_blockedLayout_str = draw_blocked_layout_cmd(M, K, sizePerThread, threadsPerWarp, warpsPerCTA, order)
 
-        draw_dotLayout_str = draw_dot_layout_cmd(M, N, K, mfmaNonKDim,
-                                                 warpsPerCTA, trans, kpack)
+        draw_dotLayout_str = draw_dot_layout_cmd(M, N, K, mfmaNonKDim, warpsPerCTA, trans, kpack)
 
-        draw_lds_str = draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess,
-                                           sizePerThread, threadsPerWarp)
+        draw_lds_str = draw_lds_access_cmd(M, K, kpack, ldsLayout, ldsAccess, sizePerThread, threadsPerWarp)
 
         draw_wmma_str = draw_wmma_instr_cmd(waveSize)
 

From 8405b6bb66778c4a1c6451b22a9f280259d51a38 Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 6 Sep 2024 09:00:24 -0500
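Subject: [PATCH 3/6] More formatting fixes

Use `is not None` comparisons in amdgcn-cfg.py, strip trailing
whitespace in the plot-layout README and tikzplot.tex, and drop the
yaml and glob imports from plot_layout.py.
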
---
 .../tools/amdgcn-cfg/amdgcn-cfg.py            | 10 +--
 .../perf-kernels/tools/plot-layout/README.md  | 12 +--
 .../tools/plot-layout/plot_layout.py          |  2 -
 .../tools/plot-layout/tikzplot.tex            | 80 +++++++++----------
 4 files changed, 51 insertions(+), 53 deletions(-)

diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
index 9c3bcbea9d70..570fc3399602 100644
--- a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
+++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
@@ -48,7 +48,7 @@ def find_kernel(text):
     start = None
     for index, line in enumerate(text):
         match = re.search(func_name_expr, line)
-        if not match is None:
+        if match is not None:
             func_name = match[1]
             start = index
             break
@@ -57,7 +57,7 @@ def find_kernel(text):
 
     end = None
     for index, line in enumerate(text):
-        if not re.search(r's_endpgm', line) is None:
+        if re.search(r's_endpgm', line) is not None:
             end = index
             break
 
@@ -72,7 +72,7 @@ def find_label(kernel):
     index = None
     for index, line in enumerate(kernel):
         match = re.search(r'^\.(\w+):', line)
-        if not match is None:
+        if match is not None:
             label = match[1]
             break
     return label, index
@@ -106,14 +106,14 @@ def find_terminators(code):
     terminator_labels = []
     for line in code:
         branch = re.search(r'(c)?branch.*\s+\.?(.*)', line)
-        if not branch is None:
+        if branch is not None:
             is_condional = True if len(branch.groups()) == 2 else False
             label_idx = 2 if is_condional else 1
             terminator_labels.append(branch[label_idx])
             if not is_condional:
                 return terminator_labels, True
         end = re.search(r's_endpgm', line)
-        if not end is None:
+        if end is not None:
             terminator_labels.append(end_label)
             return terminator_labels, True
 
diff --git a/python/perf-kernels/tools/plot-layout/README.md b/python/perf-kernels/tools/plot-layout/README.md
index e12cf9441d37..40de35bdb3aa 100644
--- a/python/perf-kernels/tools/plot-layout/README.md
+++ b/python/perf-kernels/tools/plot-layout/README.md
@@ -31,7 +31,7 @@ options:
 ```
 
 ## Installation
-This script does not require torch or triton to be installed. The only package 
+This script does not require torch or triton to be installed. The only package
 it depends on is latex. On Ubuntu, do
 ```bash
 sudo apt install texlive-full
@@ -60,8 +60,8 @@ Notes
   out of the boundary of the tensor dimensions. This means
   - For M: sizePerThread[0] * threadsPerWarps[0] * warpsPerCTA[0] <= M
   - For K: sizePerThread[1] * threadsPerWarps[1] * warpsPerCTA[1] <= K
-  
-  
+
+
 ## Draw mfma operand and result layouts (`-plot dot`)
 
 Examples:
@@ -78,15 +78,15 @@ This mode draws two graphs:
 2. The layout of a single mfma block, operands and results of one or more mfma
    instructions that share the same accumulating VGPRs.
    This view has thread distributions among tensor elements.
-   
+
 Knobs
 - `-kWidth`: the number of elements that will be loaded into one thread at once
 - `-nonKDim`: 16 ot 32, which is used to control the mfma instruction size
 - `-mfmaTrans`: if set, the transposed mfma layout will be plotted.
 
 Notes
-- The layout shows the mapping from the threads/wave to the elements in the 
-  original tensor. It does not care if the elements are arranged in LDS, like 
+- The layout shows the mapping from the threads/wave to the elements in the
+  original tensor. It does not care if the elements are arranged in LDS, like
   swizzling to avoid bank conflicts.
 - The script does not allow settings for data type or k dim of the mfma instruction.
   This can be controled by the `-kWidth` flag.
diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py
index 74554b4c3f02..599f92c790e4 100755
--- a/python/perf-kernels/tools/plot-layout/plot_layout.py
+++ b/python/perf-kernels/tools/plot-layout/plot_layout.py
@@ -1,8 +1,6 @@
 import argparse
 import sys
-import yaml
 import os
-import glob
 import subprocess
 
 
diff --git a/python/perf-kernels/tools/plot-layout/tikzplot.tex b/python/perf-kernels/tools/plot-layout/tikzplot.tex
index e6292f7002e9..d8441b042f02 100755
--- a/python/perf-kernels/tools/plot-layout/tikzplot.tex
+++ b/python/perf-kernels/tools/plot-layout/tikzplot.tex
@@ -25,7 +25,7 @@
     \pgfmathsetmacro{\tidN}{mod(\tid,\threadsPerWarpN)}
     \coordinate (Thread TL) at ($(Wave TL)+(\tidN*\sizePerThreadN*\elem, -\tidM*\sizePerThreadM*\elem)$);
     \pgfmathsetmacro{\ratio}{\tidM*10}
-    
+
     \ifthenelse{\tid = 0}{
       \draw [line width = 0.01mm, fill=red] (Thread TL)
       rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
@@ -80,12 +80,12 @@
       \pgfmathsetmacro{\waveCoordN}{int(\waveId/\warpsPerCTAM)}
       \pgfmathsetmacro{\rot}{90}
     }
-    
+
     \coordinate (Wave TL) at ($(CTA TL)+(\waveCoordN*\waveSizeN*\elem, -\waveCoordM*\waveSizeM*\elem)$);
     \draw [ultra thin] (Wave TL) rectangle ++(\waveSizeN*\elem, -\waveSizeM*\elem)
     node [pos=.5, scale=.6*\scale, inner sep=0, fill=white, rotate=\rot] {wave\waveId};
   }
-  
+
   \draw [thick] (CTA TL) rectangle ++(\CTASizeN*\elem, -\CTASizeM*\elem);
 }
 
@@ -108,7 +108,7 @@
   %%     Note that threadsPerWarp[1] is calculated by 64/threadsPerWarp[0]
   %% #6: warpsPerCTA[0] --> warpsPerCTAM
   %% #7: warpsPerCTA[1] --> warpsPerCTAN
-  %% #8: fastest changing dim --> order 
+  %% #8: fastest changing dim --> order
 
   \pgfmathsetmacro{\M}{#1}
   \pgfmathsetmacro{\N}{#2}
@@ -148,7 +148,7 @@
 
   \node [scale=.6*\scale, left] at ($(zoomin BL)+(0, .5*\sizePerThreadM*\elem*\zoomR)$) {$t_0$};
   \node [scale=.6*\scale, right] at ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, .5*\sizePerThreadM*\elem*\zoomR)$) {\sizePerThreadM$\times$\sizePerThreadN};
-  
+
   \draw [densely dotted] (TL) -- (zoomin BL);
   \draw [densely dotted] ($(TL)+(\sizePerThreadN*\elem, 0)$) -- ($(zoomin BL)+(\sizePerThreadN*\elem*\zoomR, 0)$);
   \draw [fill=red] (TL) rectangle ++(\sizePerThreadN*\elem, -\sizePerThreadM*\elem);
@@ -164,19 +164,19 @@
   %% #1: 1 for mfma.trans, 0 for normal mfma
   %% #2: mfmaNonKDim
   %% #3: verbose. 1 means draw tid in each vec; 0 means draw nothing
-  
+
   \pgfmathsetmacro{\trans}{#1}
   \pgfmathsetmacro{\nonTrans}{1-#1}
-  \pgfmathsetmacro{\nonKDim}{#2}  
-  \pgfmathsetmacro{\maxTID}{\nonKDim-1}  
-  \pgfmathsetmacro{\groups}{64/\nonKDim}  
-  \pgfmathsetmacro{\maxGID}{\groups-1}  
-  \pgfmathsetmacro{\maxIVec}{\nonKDim*\nonKDim/256-1}  
+  \pgfmathsetmacro{\nonKDim}{#2}
+  \pgfmathsetmacro{\maxTID}{\nonKDim-1}
+  \pgfmathsetmacro{\groups}{64/\nonKDim}
+  \pgfmathsetmacro{\maxGID}{\groups-1}
+  \pgfmathsetmacro{\maxIVec}{\nonKDim*\nonKDim/256-1}
   \pgfmathsetmacro{\verbose}{#3}
   \foreach \iVec in {0,...,\maxIVec} {
     \coordinate (wave TL) at ($(block TL)+(\trans*\iVec*\groups*4*\elem, -\nonTrans*\iVec*\groups*4*\elem)$);
     \foreach \tg in {0,...,\maxGID}{
-      \pgfmathsetmacro{\colID}{\tg+4}  
+      \pgfmathsetmacro{\colID}{\tg+4}
       \pgfmathsetmacro{\col}{\Colors[\colID]}
       \foreach \tid in {0,...,\maxTID} {
         \pgfmathsetmacro{\ratio}{\tid*2.5*\groups+15}
@@ -228,7 +228,7 @@
   \pgfmathsetmacro{\maxWaveId}{\warpsPerCTAH*\warpsPerCTAW-1}
   \pgfmathsetmacro{\CTASizeH}{\warpsPerCTAH*\mfmaNonKDim}
   \pgfmathsetmacro{\CTASizeW}{\warpsPerCTAW*\mfmaNonKDim}
-  
+
 
   \foreach \ctaId in {0,...,\maxCTAId}{
     \pgfmathsetmacro{\ctaCoordH}{int(\ctaId/\CTARepW)}
@@ -237,7 +237,7 @@
     %% Draw a detailed view of wave0 in each CTA
     \coordinate (block TL) at (CTA TL);
     \drawBlockMFMALayoutLarge{\mfmaTrans}{\mfmaNonKDim}{0}
-    
+
     \foreach \waveId in {0,...,\maxWaveId}{
       \pgfmathsetmacro{\waveCoordH}{int(\waveId/\warpsPerCTAW)}
       \pgfmathsetmacro{\waveCoordW}{mod(\waveId,\warpsPerCTAW)}
@@ -246,7 +246,7 @@
       \draw [ultra thin] (block TL) rectangle ++(\mfmaNonKDim*\elem, -\mfmaNonKDim*\elem)
       node [scale=.7*\mfmaNonKDim/32*\scale, pos=.5, fill=white, inner sep=0] {wave\waveId};
     }
-    
+
     %% Draw the outline of each CTA rep
     \draw [ultra thick] (CTA TL) rectangle ++(\CTASizeW*\elem, -\CTASizeH*\elem);
   }
@@ -289,7 +289,7 @@
         rectangle ++(\kpack*\elem*\opIdxB + \elem*\opIdxA, -\elem*\opIdxB-\kpack*\elem*\opIdxA)
         node [pos=.5, scale=.35*\scale, rotate=90*\opIdxA] {t\drawTid};
       }
-    }   
+    }
   }
 }
 
@@ -311,9 +311,9 @@
   \pgfmathsetmacro{\kpack}{#3}
   \pgfmathsetmacro{\opIdx}{#4}
   \pgfmathsetmacro{\opIdxOther}{1-\opIdx}
-  
+
   \coordinate (TL) at (Op TL);
-  
+
   \pgfmathsetmacro{\numKRep}{\K/\kpack/\groups}
   \pgfmathsetmacro{\maxKRepId}{\numKRep-1}
 
@@ -417,7 +417,7 @@
   \pgfmathsetmacro{\kdim}{int(\groups*\kpack)}
 
   \pgfmathsetmacro{\gap}{\elem*20}
-  \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$); 
+  \coordinate (A TL) at ($(C TL)+(-\gap-\K*\elem, 0)$);
   \coordinate (B TL) at ($(C TL)+(0, \gap+\K*\elem)$);
 
   \drawDotOperands{\M}{\N}{\K}{\mfmaNonKDim}{\warpsPerCTAM}{\warpsPerCTAN}{\kpack}
@@ -483,11 +483,11 @@
   \draw (TL) rectangle ++(\K*\elem, -\drawM*\elem);
   %% Draw detailed vec view of the tensor
   \foreach \vecId in {0,...,\maxVecId}{
-    
+
     \pgfmathsetmacro{\vecCoordM}{int(\vecId/\numVecK)}
     \pgfmathsetmacro{\vecCoordK}{mod(\vecId,\numVecK)}
     \coordinate (vec TL) at ($(TL)+(\vecCoordK*\vec*\elem, -\vecCoordM*\elem)$);
-    
+
     \pgfmathsetmacro{\colorIdxK}{int(mod(\vecCoordK,16))}
     \pgfmathsetmacro{\colorIdxM}{mod(\vecCoordM,16)}
     \pgfmathsetmacro{\vecColor}{\Colors[\colorIdxK]}
@@ -495,7 +495,7 @@
 
     \draw [ultra thin, fill=\vecColor!\ratio!white] (vec TL) rectangle ++(\vec*\elem, -\elem)
     node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
-    
+
   }
   %% M and K dim
   \node [scale=\scale, rotate=90, above] at ($(TL)+(0, -.5*\drawM*\elem-8*\elem)$) {M=\M};
@@ -510,15 +510,15 @@
   }
   \draw [densely dotted] (TL) -- ($(vec TL)+(0, -\elem*\vecR)$);
   \draw [densely dotted] ($(TL)+(\vec*\elem, 0)$) -- ($(vec TL)+(\vec*\elem*\vecR, -\elem*\vecR)$);
-  \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec};  
+  \node [scale=.8*\scale, above] at ($(vec TL)+(.5*\vec*\elem*\vecR, 0)$) {vec=\vec};
 }
 
 
 
 \newcommand{\drawLDSLayoutTritonSwizzling}[2]{
-  %% 
+  %%
   %% Draw tensor layout in LDS with swizzling
-  %% 
+  %%
   %% TL: pre defined top-left coordinates of the tensor in global memory
   %% \elem: per defined variable
   %% \Colors: a pre defined array of 16 colors
@@ -540,7 +540,7 @@
   \pgfmathsetmacro{\hasSwizzle}{#1}
   \pgfmathsetmacro{\accessMode}{#2}
   \pgfmathsetmacro{\numVecK}{\K/\vec}
-  
+
   %% Assuming fp16 data type
   \pgfmathsetmacro{\LDSK}{64}
   \pgfmathsetmacro{\numLDSVec}{\LDSK/\vec}
@@ -568,10 +568,10 @@
     \pgfmathsetmacro{\maxPhase}{1}
   }{
     %% When vec is small enough, we want 16/perPhase different swizzling patterns
-    %% When vec is large, we can only have 64 / \vec different swizzling pattern at most 
+    %% When vec is large, we can only have 64 / \vec different swizzling pattern at most
     \pgfmathsetmacro{\maxPhase}{min(16/\perPhase,64/\vec)}
   }
-  
+
   %% Draw the LDS
   \draw (TL) rectangle ++(\LDSK*\elem, -\drawM*\elem);
 
@@ -598,7 +598,7 @@
       \pgfmathsetmacro{\vecLDSM}{floor(\vecCoordM/\perPhase)}
       \pgfmathsetmacro{\vecLDSK}{int(\vecCoordK+mod(\vecCoordM,\perPhase)*\numVecK)}
     }
-    %% 
+    %%
     \pgfmathsetmacro{\phase}{int(mod(\rawPhase, \maxPhase))}
     %% Compute the swizzled col id
     \pgfmathsetmacro{\vecLDSKSwizzled}{\bitwiseXor{\vecLDSK}{\phase}}
@@ -631,14 +631,14 @@
       node [pos=.5, scale=.6*\scale, white] {m\vecCoordM};
     }
 
-    %% ds_read 
+    %% ds_read
     %% Highlight the elements the first 16 threads access in the first cycle
     %% This is used to visualize bank conflicts
     \ifthenelse{\accessMode = 1}{
       \ifthenelse{\vecCoordK = 0}{
         \draw [fill=white]  (new vec TL) rectangle ++(\elem, -\elem);
-        \draw (new vec TL) -- ++(\elem, -\elem);   
-        \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);   
+        \draw (new vec TL) -- ++(\elem, -\elem);
+        \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);
       }{}
     }{}
 
@@ -652,8 +652,8 @@
       \ifthenelse{\vecInThread=0}{
         \ifthenelse{\vecCoordK<\covK \AND \vecCoordM<\covM}{
           \draw [fill=white]  (new vec TL) rectangle ++(\elem, -\elem);
-          \draw (new vec TL) -- ++(\elem, -\elem);   
-          \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);   
+          \draw (new vec TL) -- ++(\elem, -\elem);
+          \draw ($(new vec TL)+(0, -\elem)$) -- ++(\elem, \elem);
         }{}
       }{}
     }{}
@@ -665,7 +665,7 @@
         \draw [ultra thin] ($(new vec TL)+(\vec*\elem, -.5*\elem)$) -- ++(\elem, 0)
         node [scale=.6*\scale, right] {\phase};
       }{}
-    } 
+    }
   }
 
   %% Draw boundary of 32 banks
@@ -701,7 +701,7 @@
   \pgfmathsetmacro{\kpack}{#2}
   \pgfmathsetmacro{\mfmaTrans}{#3}
   \pgfmathsetmacro{\nonTrans}{1-#3}
-  
+
   \pgfmathsetmacro{\gap}{\elem*5}
   \coordinate (mfma opA TL) at ($(C TL)+(-.5*\gap-1.2*\nonTrans*\gap-\groups*\kpack*\elem, 0)$);
   \coordinate (mfma op TL) at (mfma opA TL);
@@ -825,7 +825,7 @@
     \draw [line width=0.005mm, fill=\vecColor!60!white] (vec TL) rectangle ++(\elem, -\elem)
     node [scale=.4*\scale, pos=.5] {t\tid};
   }
-  
+
 
 }
 
@@ -838,11 +838,11 @@
   %%
   %% C TL: pre defined top-left coordinates of output matrix
   %% \elem: pre defined element size
-  
+
 
   \pgfmathsetmacro{\isWLarge}{#1}
   \pgfmathsetmacro{\verbose}{#2}
-  
+
   \pgfmathsetmacro{\gap}{\elem*2}
   \coordinate (wmma op TL) at ($(C TL)+(-\gap-16*\elem, 0)$);
   \coordinate (wmma opA TL) at (wmma op TL);
@@ -873,7 +873,7 @@
   \draw [->, >=stealth] (m dim.west) -- ($(C TL)+(0, -16*\elem-\gap)$);
   \draw [->, >=stealth] (m dim.east) -- ($(C TL)+(16*\elem, -16*\elem-\gap)$);
 
-  %% C N dim 
+  %% C N dim
   \node [scale=.8*\scale, rotate=-90] (n dim) at ($(C TL)+(16*\elem+\gap, -8*\elem)$) {16};
   \draw [->, >=stealth] (n dim.west) -- ($(C TL)+(16*\elem+\gap, 0)$);
   \draw [->, >=stealth] (n dim.east) -- ($(C TL)+(16*\elem+\gap, -16*\elem)$);

From 370f916dfefd7c37ebc8bad46e607f66b9bfdaa3 Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 6 Sep 2024 09:01:30 -0500
Subject: [PATCH 4/6] Remove executable bit from plot_layout.py and tikzplot.tex

---
 python/perf-kernels/tools/plot-layout/plot_layout.py | 0
 python/perf-kernels/tools/plot-layout/tikzplot.tex   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 python/perf-kernels/tools/plot-layout/plot_layout.py
 mode change 100755 => 100644 python/perf-kernels/tools/plot-layout/tikzplot.tex

diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py
old mode 100755
new mode 100644
diff --git a/python/perf-kernels/tools/plot-layout/tikzplot.tex b/python/perf-kernels/tools/plot-layout/tikzplot.tex
old mode 100755
new mode 100644

From 731ea351a349878b46d8bd3f232ab300eeb4a96c Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 6 Sep 2024 09:05:46 -0500
Subject: [PATCH 5/6] Address ruff complaints

---
 python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
index 570fc3399602..ae2f65830766 100644
--- a/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
+++ b/python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
@@ -52,7 +52,7 @@ def find_kernel(text):
             func_name = match[1]
             start = index
             break
-    if start == None:
+    if start is None:
         return None, None, None
 
     end = None
@@ -61,7 +61,7 @@ def find_kernel(text):
             end = index
             break
 
-    if end == None:
+    if end is None:
         return None, None, None
 
     return func_name, text[start:end + 1], end
@@ -85,7 +85,7 @@ def get_block_list(kernel):
     if (index > 1):
         blocks[begin_label] = Block(begin_label, kernel[:index - 1])
 
-    while label != None:
+    while label is not None:
         kernel = kernel[index + 1:]
         next_label, next_index = find_label(kernel)
         if next_label is None:
@@ -182,9 +182,9 @@ def main(options):
 
     kernels = []
     last_end_index = 0
-    while last_end_index != None:
+    while last_end_index is not None:
         func_name, kernel_asm, last_end_index = find_kernel(asm)
-        if kernel_asm == None:
+        if kernel_asm is None:
             break
 
         blocks = get_block_list(kernel_asm)

From fca8ae24f0f4f3dcf4196ce65e42331b7519fa00 Mon Sep 17 00:00:00 2001
From: Lixun Zhang <lixun.zhang@amd.com>
Date: Fri, 6 Sep 2024 12:03:48 -0500
Subject: [PATCH 6/6] Move tune_gemm to tools

---
 python/perf-kernels/{ => tools}/tune_gemm/README.md               | 0
 python/perf-kernels/{ => tools}/tune_gemm/icache_flush.py         | 0
 python/perf-kernels/{ => tools}/tune_gemm/matmul_kernel.py        | 0
 python/perf-kernels/{ => tools}/tune_gemm/one_config.py           | 0
 python/perf-kernels/{ => tools}/tune_gemm/tune_gemm.py            | 0
 python/perf-kernels/{ => tools}/tune_gemm/utils/file_generator.py | 0
 python/perf-kernels/{ => tools}/tune_gemm/utils/utils.py          | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 rename python/perf-kernels/{ => tools}/tune_gemm/README.md (100%)
 rename python/perf-kernels/{ => tools}/tune_gemm/icache_flush.py (100%)
 rename python/perf-kernels/{ => tools}/tune_gemm/matmul_kernel.py (100%)
 rename python/perf-kernels/{ => tools}/tune_gemm/one_config.py (100%)
 rename python/perf-kernels/{ => tools}/tune_gemm/tune_gemm.py (100%)
 rename python/perf-kernels/{ => tools}/tune_gemm/utils/file_generator.py (100%)
 rename python/perf-kernels/{ => tools}/tune_gemm/utils/utils.py (100%)

diff --git a/python/perf-kernels/tune_gemm/README.md b/python/perf-kernels/tools/tune_gemm/README.md
similarity index 100%
rename from python/perf-kernels/tune_gemm/README.md
rename to python/perf-kernels/tools/tune_gemm/README.md
diff --git a/python/perf-kernels/tune_gemm/icache_flush.py b/python/perf-kernels/tools/tune_gemm/icache_flush.py
similarity index 100%
rename from python/perf-kernels/tune_gemm/icache_flush.py
rename to python/perf-kernels/tools/tune_gemm/icache_flush.py
diff --git a/python/perf-kernels/tune_gemm/matmul_kernel.py b/python/perf-kernels/tools/tune_gemm/matmul_kernel.py
similarity index 100%
rename from python/perf-kernels/tune_gemm/matmul_kernel.py
rename to python/perf-kernels/tools/tune_gemm/matmul_kernel.py
diff --git a/python/perf-kernels/tune_gemm/one_config.py b/python/perf-kernels/tools/tune_gemm/one_config.py
similarity index 100%
rename from python/perf-kernels/tune_gemm/one_config.py
rename to python/perf-kernels/tools/tune_gemm/one_config.py
diff --git a/python/perf-kernels/tune_gemm/tune_gemm.py b/python/perf-kernels/tools/tune_gemm/tune_gemm.py
similarity index 100%
rename from python/perf-kernels/tune_gemm/tune_gemm.py
rename to python/perf-kernels/tools/tune_gemm/tune_gemm.py
diff --git a/python/perf-kernels/tune_gemm/utils/file_generator.py b/python/perf-kernels/tools/tune_gemm/utils/file_generator.py
similarity index 100%
rename from python/perf-kernels/tune_gemm/utils/file_generator.py
rename to python/perf-kernels/tools/tune_gemm/utils/file_generator.py
diff --git a/python/perf-kernels/tune_gemm/utils/utils.py b/python/perf-kernels/tools/tune_gemm/utils/utils.py
similarity index 100%
rename from python/perf-kernels/tune_gemm/utils/utils.py
rename to python/perf-kernels/tools/tune_gemm/utils/utils.py