Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move utility tools from triton-mlir to main_perf branch #635

Merged
merged 6 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions python/perf-kernels/tools/amdgcn-cfg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Control Flow Graph Generator from AMDGCN assembly

The script reads an assembly file and generates a Control Flow Graph (CFG) for each function in the file. The graph can be saved in `dot`, `svg` and `pdf` formats. The nodes of a graph can be represented with 1) just labels or 2) the corresponding assembly code. The edges of a graph can help to identify cycles and, thus, to provide a better navigation through the code.


### Basic usage

```
python ./amdgcn-cfg.py -i <path to assembly file> -o <output directory>/<output prefix> -f [dot|svg|pdf]
```

`dot`-files can be visualized with [this](https://dreampuf.github.io/GraphvizOnline) online tool. You just need to copy and paste the content of a generated `dot`-file.

By default, the nodes are named with basic block labels. Use `-v` or `--verbose` option to add assembly source code to corresponding nodes.
222 changes: 222 additions & 0 deletions python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
import os
import argparse
import re
from collections import OrderedDict
import graphviz


class Options:
    """Validated command-line options of the CFG generator.

    Attributes:
        input_file: path of the assembly file to read.
        output_file: output path prefix; per-kernel suffixes are appended
            by the code that writes the graphs.
        verbose: whether graph nodes should carry the assembly text.
        format: name of the output format.
        output_dir: directory component of ``output_file``; the current
            directory when the prefix has no directory component.

    Raises:
        RuntimeError: if the input file is missing or not provided, if the
            output prefix is not provided, or if the output directory does
            not exist.
    """

    def __init__(self, input_file, output_file, verbose, format):
        if input_file is None:
            raise RuntimeError('input file is not provided')
        if not os.path.exists(input_file):
            raise RuntimeError(f'input file does not exist: {input_file}')

        if output_file is None:
            raise RuntimeError('output file prefix is not provided')

        # A bare prefix such as "cfg" has an empty dirname, which
        # os.path.exists('') reports as missing; it really means "here".
        output_dir = os.path.dirname(output_file) or os.curdir
        if not os.path.isdir(output_dir):
            raise RuntimeError('output directory does not exist')

        self.input_file = input_file
        self.output_file = output_file
        self.verbose = verbose
        self.format = format
        self.output_dir = output_dir


class Block:
    """One basic block of a kernel: a label, its code lines and out-edges."""

    def __init__(self, label, code):
        # Every block starts without successors; `edges` is filled in later
        # with the labels of the blocks this one can transfer control to.
        self.label, self.code, self.edges = label, code, list()


class Kernel:
    """A named kernel together with its basic blocks and rendered graph."""

    def __init__(self, kernel_name, blocks):
        # `cfg` stays None until a graph object is attached by the caller.
        self.name, self.blocks, self.cfg = kernel_name, blocks, None


# Names of the two synthetic CFG nodes: `begin_label` names the block holding
# the code that precedes the first label of a kernel, `end_label` names the
# empty exit block that `s_endpgm` instructions point to.
begin_label = 'Begin'
end_label = 'End'


def find_kernel(text):
    """Locate the first kernel in a list of assembly lines.

    A kernel starts at the first line that looks like a function label
    (starts in column 0, not with whitespace or a dot, and contains a
    colon) and ends at the first ``s_endpgm`` found *after* that label.

    Args:
        text: list of assembly lines without trailing newlines.

    Returns:
        A tuple ``(name, lines, end)`` where ``lines`` is the slice of
        ``text`` from the label up to and including the ``s_endpgm`` line
        and ``end`` is the index of that line in ``text``; or
        ``(None, None, None)`` when no complete kernel is found.

    Note:
        A kernel with several ``s_endpgm`` instructions is still cut at the
        first one; only the search origin is corrected here.
    """
    func_name_expr = r'^([^\s^\.]\w.+):'
    func_name = None
    start = None
    for index, line in enumerate(text):
        match = re.search(func_name_expr, line)
        if match is not None:
            func_name = match[1]
            start = index
            break
    if start is None:
        return None, None, None

    # Search only past the label: an `s_endpgm` that precedes it (e.g. a
    # second exit of the previous kernel) must not close this kernel,
    # otherwise the returned slice would be empty.
    for end in range(start + 1, len(text)):
        if 's_endpgm' in text[end]:
            return func_name, text[start:end + 1], end

    return None, None, None


def find_label(kernel):
    """Return ``(label, index)`` of the first ``.name:`` line in ``kernel``.

    The label is returned without its leading dot and trailing colon.
    When no line matches, the label is ``None`` and the index is the index
    of the last line examined (``None`` for an empty input).
    """
    position = None
    for position, text in enumerate(kernel):
        found = re.search(r'^\.(\w+):', text)
        if found is not None:
            return found[1], position
    return None, position


def get_block_list(kernel):
    """Split the lines of one kernel into labelled basic blocks.

    Args:
        kernel: list of assembly lines of a single kernel.

    Returns:
        An ``OrderedDict`` mapping block label to ``Block`` in source order.
        Code preceding the first label goes into a block named
        ``begin_label``; an empty block named ``end_label`` is always
        appended last.
    """
    label, index = find_label(kernel)

    blocks = OrderedDict()
    if label is None:
        # No labels at all: the whole kernel is one straight-line block.
        # (`index` is not a label position in this case, so it is ignored.)
        if kernel:
            blocks[begin_label] = Block(begin_label, kernel[:])
    elif index > 1:
        # Everything up to, but excluding, the first label line. The line
        # right before the label is kept: it may be the branch that
        # terminates this block.
        blocks[begin_label] = Block(begin_label, kernel[:index])

    while label is not None:
        # Drop the consumed lines including the label line itself.
        kernel = kernel[index + 1:]
        next_label, next_index = find_label(kernel)
        if next_label is None:
            # Last block: `kernel` was already re-sliced above, so all of
            # the remaining lines belong to it.
            code = kernel
        else:
            code = kernel[:next_index]
        blocks[label] = Block(label, code)

        label = next_label
        index = next_index

    blocks[end_label] = Block(end_label, [])

    return blocks


def find_terminators(code):
    """Collect the control-flow targets named by the lines of one block.

    Lines are scanned in order. Each ``s_cbranch*`` contributes its target
    label and scanning continues; the first ``s_branch`` or ``s_endpgm``
    contributes its target (``end_label`` for ``s_endpgm``) and stops the
    scan, because nothing after it in the block can be reached by falling
    through.

    Args:
        code: list of assembly lines.

    Returns:
        ``(labels, is_unconditional)``: the target labels without their
        leading dot, and whether the scan stopped at an unconditional
        transfer of control.
    """
    terminator_labels = []
    for line in code:
        # Anchored at the mnemonic so that comments mentioning "branch" are
        # ignored, and the target is limited to one identifier so that a
        # trailing comment is not captured as part of the label.
        branch = re.search(r'^\s*s_(c)?branch\w*\s+\.?(\w+)', line)
        if branch is not None:
            # Group 1 is None when the optional 'c' did not match; the
            # number of groups is always 2 and says nothing about it.
            is_conditional = branch[1] is not None
            terminator_labels.append(branch[2])
            if not is_conditional:
                return terminator_labels, True
        end = re.search(r's_endpgm', line)
        if end is not None:
            terminator_labels.append(end_label)
            return terminator_labels, True

    return terminator_labels, False


def add_edges(kernel):
    """Fill in ``edges`` of every block in ``kernel.blocks``.

    A block gets one edge per distinct branch target found in its code and,
    unless the block ends its control flow unconditionally, a fall-through
    edge to the block that follows it in source order. The block named
    ``end_label`` is left without edges.

    Args:
        kernel: object whose ``blocks`` attribute maps labels to blocks
            with ``code`` and ``edges`` attributes.
    """
    keys = list(kernel.blocks.keys())
    for index, curr_label in enumerate(keys):
        if curr_label == end_label:
            continue

        code = kernel.blocks[curr_label].code
        # Scanning the whole block at once also covers empty blocks (two
        # consecutive labels) and an unconditional jump in mid-block.
        targets, is_unconditional = find_terminators(code)

        if not is_unconditional and index + 1 < len(keys):
            targets.append(keys[index + 1])

        # Keep the first occurrence of every target so that the graph does
        # not get parallel duplicate edges.
        edges = []
        for target in targets:
            if target not in edges:
                edges.append(target)

        kernel.blocks[curr_label].edges = edges


def generate_cfg(kernel, options):
    """Build a ``graphviz.Digraph`` for ``kernel``.

    One rectangular node is emitted per block. With ``options.verbose`` the
    node text is the block label followed by the block's stripped assembly
    lines; otherwise it is the label alone. One edge is emitted per entry
    of each block's ``edges`` list.
    """
    graph = graphviz.Digraph(f'{kernel.name}')

    for name, block in kernel.blocks.items():
        stripped = [text.strip() for text in block.code]
        node_text = repr('\n'.join([f'{name}', *stripped])) if options.verbose else name
        graph.node(name, shape='rect', labeljust='l', margin='0.01', label=node_text)

    for name, block in kernel.blocks.items():
        for target in block.edges:
            graph.edge(name, target)

    return graph


def main(options):
    """Generate and write one control flow graph per kernel of the input.

    Reads ``options.input_file``, extracts kernels one after another,
    builds the graph of each and writes it to
    ``<options.output_file>.kernel-<index>`` in ``options.format``.

    Args:
        options: object providing ``input_file``, ``output_file``,
            ``format`` and whatever ``generate_cfg`` reads from it.
    """
    with open(options.input_file, 'r') as file:
        # Strip only the newline: slicing off the last character would
        # damage a final line that has no trailing newline.
        asm = [line.rstrip('\n') for line in file]

    kernels = []
    while True:
        func_name, kernel_asm, last_end_index = find_kernel(asm)
        if kernel_asm is None:
            break

        # Continue the search right after the end of this kernel.
        asm = asm[last_end_index + 1:]
        if not kernel_asm:
            # Nothing to build a graph from; move on to the next kernel.
            continue

        blocks = get_block_list(kernel_asm)
        kernel = Kernel(func_name, blocks)
        add_edges(kernel)

        kernel.cfg = generate_cfg(kernel, options)
        kernels.append(kernel)

    for index, kernel in enumerate(kernels):
        output_file_name = f'{options.output_file}.kernel-{index}'
        if options.format == 'dot':
            with open(f'{output_file_name}.dot', 'w') as file:
                file.write(str(kernel.cfg))
                file.write('\n')
        else:
            # The path returned by render() is not needed here.
            kernel.cfg.render(
                filename=f'{output_file_name}',
                format=options.format,
            )


if __name__ == "__main__":
    # The summary belongs in `description`; `prog` is the program name shown
    # in the usage line and is left to argparse's default.
    parser = argparse.ArgumentParser(
        description="Generates Control Flow Graph (CFG) from amdgcn assembly file", )
    # Both paths are mandatory: let argparse report a missing one instead of
    # failing later on a None value.
    parser.add_argument("-i", "--input", type=str, required=True, help="input file")
    parser.add_argument("-o", "--output", type=str, required=True, help="output file prefix")
    parser.add_argument("-v", "--verbose", action='store_true', help='verbose output')
    parser.add_argument("-f", "--format", choices=['dot', 'svg', 'pdf'], default="dot", help="output format type")
    args = parser.parse_args()

    options = Options(args.input, args.output, args.verbose, args.format)

    main(options)
71 changes: 71 additions & 0 deletions python/perf-kernels/tools/occ.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#! /bin/bash

## Estimates kernel occupancy from the output of a script.
## $1: input script that contains one kernel

## Remove the Triton cache directory before the script is run
## (presumably to force a fresh compilation -- TODO confirm).
rm -rf ~/.triton/cache/

## Exported for the script run below; presumably they make the compiler
## dump the MLIR and the AMDGCN assembly that this script parses -- confirm.
export MLIR_ENABLE_DUMP=1
export AMDGCN_ENABLE_DUMP=1
## Assume CDNA arch
## SIMD: factor that converts waves per SIMD into waves per CU.
SIMD=4
## LDS_SIZE: total that is divided by the kernel's LDS usage
## (presumably bytes per CU -- confirm).
LDS_SIZE=65536
## TOTAL_VGPR: total number of VGPRs used as the dividend of the VGPR
## occupancy estimate.
TOTAL_VGPR=512

get_occ_per_CU() {
    ## $1: vgpr count
    ## Reads the globals SIMD and num_warps.
    ## Prints the number of workgroups per CU allowed by the VGPR count;
    ## returns 1 without printing a result when num_warps is not positive.
    local vgpr=${1:-0}
    local occPerEU
    local occPerCU

    if ! [[ $num_warps -gt 0 ]]; then
        echo "get_occ_per_CU: num_warps must be a positive integer (got '$num_warps')" >&2
        return 1
    fi

    ## Waves per SIMD as a step function of the VGPR count. The table alone
    ## decides the result, so no division by the VGPR count is needed (and a
    ## count of 0 cannot trigger a division by zero).
    if [[ $vgpr -gt 256 ]]; then
        occPerEU=1
    elif [[ $vgpr -gt 168 ]]; then
        occPerEU=2
    elif [[ $vgpr -gt 128 ]]; then
        occPerEU=3
    elif [[ $vgpr -gt 96 ]]; then
        occPerEU=4
    elif [[ $vgpr -gt 80 ]]; then
        occPerEU=5
    elif [[ $vgpr -gt 72 ]]; then
        occPerEU=6
    elif [[ $vgpr -gt 64 ]]; then
        occPerEU=7
    else
        occPerEU=8
    fi

    ## waves per SIMD -> waves per CU -> workgroups of num_warps waves per CU
    occPerCU=$((occPerEU*SIMD/num_warps))
    echo $occPerCU
}

if [[ -z $1 ]]; then
    echo "usage: $0 <script that contains one kernel>" >&2
    exit 1
fi

## Run the script and capture everything it prints (stdout and stderr).
## $1 is left unquoted on purpose so that it may carry arguments.
$1 > output.mlir 2>&1

## Take the last occurrence of each attribute in the captured output.
LDS_line=$(sed -n '/triton_gpu\.shared\ /p' output.mlir | tail -n 1 | grep -o 'triton_gpu.shared = [0-9]*')
numWarps_line=$(sed -n '/triton_gpu\.num-warps/p' output.mlir | tail -n 1 | grep -o 'triton_gpu.num-warps. = [0-9]*')

## Keep what follows the '=' sign.
LDS=${LDS_line##*=}
num_warps=${numWarps_line##*=}
echo "LDS: $LDS, num_warps: $num_warps"

VGPRs=$(sed -n '/vgpr_count/p' output.mlir | tail -n 1 | awk '{print $2}')
SPILLs=$(sed -n '/vgpr_spill/p' output.mlir | tail -n 1 | awk '{print $2}')

echo "VGPRS: $VGPRs (spill: $SPILLs)"

occVgprPerCU=$(get_occ_per_CU "$VGPRs")
occPerCU=$occVgprPerCU
if [[ ${LDS:-0} -gt 0 ]]; then
    occLDSPerCU=$((LDS_SIZE/LDS))
    if [ "$occLDSPerCU" -lt "$occVgprPerCU" ]; then
        occPerCU=$occLDSPerCU
    fi
else
    ## A kernel that uses no LDS is not limited by it; dividing by its
    ## usage would be a division by zero.
    occLDSPerCU="unlimited"
fi
occPerEU=$((occPerCU*num_warps/SIMD))
echo "occupancy: $occPerEU waves/SIMD or $occPerCU workgroups/CU (occLDSPerCU: $occLDSPerCU, occVgprPerCU: $occVgprPerCU)"

## The last two lines of the captured output are echoed as-is.
perf=$(tail -n 2 output.mlir)
echo "$perf"

## remove distracting info from the assembly
sed -i '/local_/! {/\.loc/d}' output.mlir
sed -i '/\.Ltmp.*:/d' output.mlir
sed -i '/AMD clang version/d' output.mlir

## Everything from the first line containing AMDGCN to the end of the file.
sed -n '/AMDGCN/, $p' output.mlir > output.amdgcn
Loading
Loading