Skip to content

Commit

Permalink
rocm reproduced
Browse files Browse the repository at this point in the history
  • Loading branch information
heheda12345 committed Apr 22, 2023
1 parent 186fe5b commit 48d0651
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 93 deletions.
7 changes: 5 additions & 2 deletions artifacts/ast_analyzer/tensor_opt/search_best_flags.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import importlib
from ast_analyzer.to_onnx.to_torch_func import DEFAULT_DEVICES, RT_DIRS
from ast_analyzer.to_onnx.to_torch_func import DEFAULT_DEVICES, RT_DIRS, SM_COUNT
from ast_analyzer.utils import config
import os
import stat
Expand Down Expand Up @@ -72,7 +72,10 @@ def gen_flags(onnx_model, model_name, platform, block_dim):
has_recursion = exists['Recursion']
has_conv = exists['Conv']

possible_grid_dims = [80, 128, 160, 240, 256, 320, 384, 400, 480]
possible_grid_dims = [128, 256, 384]
for i in [1, 2, 3, 4, 5, 6]:
possible_grid_dims.append(SM_COUNT[platform] * i)

if block_dim == -1: block_dim = 256

flags_to_try = []
Expand Down
5 changes: 5 additions & 0 deletions artifacts/ast_analyzer/to_onnx/to_torch_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
'MI100': 'rocm_codegen'
}

# Per-platform integer, keyed by platform name ('V100', 'MI100').
# search_best_flags.gen_flags multiplies the entry for the current platform
# by 1..6 to build candidate grid dimensions.
# NOTE(review): presumably the number of SMs / compute units on each GPU — confirm.
SM_COUNT = {
'V100': 80,
'MI100': 120,
}

def to_torch_func(
model_name, # type: str
n_in, # type: int
Expand Down
18 changes: 18 additions & 0 deletions artifacts/env/config.rocm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import sys
import getpass

# Paths below are derived from this file's real location: NNFUSION_ROOT is
# three directories above the directory that contains this file.
# config start
KERNELDB_REQUEST_FNAME="kerneldb_request.log"
NNFUSION_ROOT = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../..'))
TMP_DIR = os.path.join(NNFUSION_ROOT, "artifacts/models/tmp")
KERNELDB_PATH = os.path.expanduser("~/.cache/nnfusion/kernel_cache.db")
NUM_GPU = 1
# config end

# Export the root and put the nnfusion tool directory at the front of PATH so
# child processes started later resolve the freshly built binary first.
os.environ["NNFUSION_ROOT"] = NNFUSION_ROOT
os.environ["PATH"] = os.path.join(NNFUSION_ROOT, "build/src/tools/nnfusion") + os.pathsep + os.environ["PATH"]
sys.path.insert(1, os.path.abspath(os.path.join(NNFUSION_ROOT, "src/python")))

# TMP_DIR is made importable and created up front.
sys.path.insert(1, TMP_DIR)
try:
    # Create the directory without going through a shell, so paths containing
    # spaces or shell metacharacters are handled correctly.
    os.makedirs(TMP_DIR, exist_ok=True)
except OSError as err:
    # The previous shell-based mkdir never aborted the import on failure;
    # keep that behaviour but report the reason.
    print(f"warning: could not create {TMP_DIR}: {err}", file=sys.stderr)
2 changes: 2 additions & 0 deletions artifacts/env/install_grinder_rocm_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ cd /root/nnfusion
mkdir -p build && cd build && cmake .. && make -j

cd /root/nnfusion/artifacts
cp env/config.rocm.py ast_analyzer/utils/config.py
sed -i 's/"num_threads": 8/"num_threads": 1/g' kernel_db/test_config/get_func.py
pip install -e .
30 changes: 30 additions & 0 deletions artifacts/kernel_db/ansor_kernels_rocm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Reuse the final.cu kernel sources that were tuned on V100.

import os
from db import save_to_db
from test_config import *
from tvm import te, auto_scheduler

def _parse_dim3(line, prefix):
    """Return the integers of a ``<prefix>(a, b, c);`` line as a tuple."""
    fields = line[len(prefix):].strip().replace("(", "").replace(");", "").split(", ")
    return tuple(int(field) for field in fields)


def _find_dim3(lines, prefix, path):
    """Parse the single line in *lines* that starts with *prefix*."""
    matches = [line for line in lines if line.startswith(prefix)]
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one '{prefix}' line in {path}, found {len(matches)}"
        )
    return _parse_dim3(matches[0], prefix)


def save(identifier, kernel_dir):
    """Read ``ansor_kernels/<kernel_dir>/final.cu`` and pass it to ``save_to_db``.

    The kernel source is everything up to and including the first line that
    consists solely of ``}``, followed by one extra newline. The launch
    configuration is taken from the ``dim3 grid(...)`` and ``dim3 block(...)``
    lines of the same file. The entry is stored with
    ``device_type="ROCM_GPU"``.

    Args:
        identifier: key passed through unchanged to ``save_to_db``.
        kernel_dir: sub-directory of ``ansor_kernels`` holding ``final.cu``.

    Raises:
        ValueError: if the file has no ``}`` line, does not contain exactly
            one ``dim3 grid`` / ``dim3 block`` line, or a dimension is not an
            integer. These checks are explicit raises rather than ``assert``
            so they still run under ``python -O``.
    """
    path = os.path.join("ansor_kernels", kernel_dir, "final.cu")
    with open(path) as f:
        final = f.readlines()

    # Cut the source at the first line that is just the closing brace.
    try:
        end = final.index("}\n")
    except ValueError:
        raise ValueError(f"no line consisting solely of '}}' in {path}") from None
    best_source = "".join(final[:end + 1]) + "\n"

    best_grid_size = _find_dim3(final, "dim3 grid", path)
    best_block_size = _find_dim3(final, "dim3 block", path)
    save_to_db(identifier, best_source, best_grid_size, best_block_size, device_type="ROCM_GPU")

# Store the kernel found in the batch_matmul_4d_4d_expr_1_12_64_1_64_ansor
# directory under this BatchMatMul identifier.
save("BatchMatMul[1,12,64,64;1,12,64,1;1,12,64,1floatfloatfloat]", "batch_matmul_4d_4d_expr_1_12_64_1_64_ansor")
1 change: 0 additions & 1 deletion artifacts/kernel_db/manual_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
}

kernels_rocm = {
"BatchMatMul[1,256;4,256,256;4,1,256floatfloatfloat]": "bmm_4_1_256_rocm", # lstm bs1
"Dot[1,256;3797,256;1,3797floatfloatfloat01]": "dot_1_256_3797_256_1_3797_01_rocm", # seq2seq bs1
"BatchMatMul[1,12,1,64;12,64,64;1,12,1,64floatfloatfloat]": "bmm_12_1_64_64_rocm", # attention bs1
"BatchMatMul[1,12,1,64;1,12,64,64;1,12,1,64floatfloatfloat]": "bmm_12_1_64_64_rocm", # attention bs1
Expand Down
87 changes: 0 additions & 87 deletions artifacts/kernel_db/manual_kernels/bmm_4_1_256_rocm.cu

This file was deleted.

1 change: 1 addition & 0 deletions artifacts/kernel_db/reproduce_rocm_kernel_db.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run manual_kernels.py for the ROCM_GPU device.
python3 manual_kernels.py --device ROCM_GPU
3 changes: 1 addition & 2 deletions artifacts/models/manual_seq2seq/bs1.rocm/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def backward(ctx, _r0, _r1, _r2, _r3, _r4, _r5):
return output_tensors


prefix = "../data/seq2seq"
MAX_LENGTH = 50
OUTPUT_SIZE = 3797
HIDDEN_SIZE = 256
Expand Down Expand Up @@ -176,7 +175,7 @@ def read_bin(s, dtype=np.float32):
n_warmup = 200
n_run = 100
len_dataset = 6400
tokens = read_bin('../data/tatoeba-eng-fra/tokens', dtype=np.int64).cuda()
tokens = read_bin('../../../data/tatoeba-eng-fra/tokens', dtype=np.int64).cuda()
masks = gen_mask_from_sequence(tokens)
for i in range(0, len_dataset, batch_size):
if i >= n_warmup * batch_size: break
Expand Down
3 changes: 2 additions & 1 deletion src/nnfusion/engine/pass/graph/pattern_substitution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ DEFINE_bool(fpattern_substitution,
DEFINE_bool(fbiasadd_fix,
false,
"Fix biasadd shape for TVM Conv2d-Add fusion in pattern_substitution_pass");
DECLARE_string(fdefault_device);

// Only serial pattern supported in current implementation
// The substitution directly applied to computation graph, no back propagation involved
Expand Down Expand Up @@ -151,7 +152,7 @@ class PatternOptimizer
// Todo: more tags, more platform
identifier = generate_pattern_name() + "[" + identifier + "]";
std::set<std::string> tags = {};
auto fetched_kernel = kernel_db->fetch_all(identifier, "CUDA_GPU");
auto fetched_kernel = kernel_db->fetch_all(identifier, get_device_str(nnfusion::get_device_type(FLAGS_fdefault_device)));
if (fetched_kernel.size() > 0)
{
NNFUSION_CHECK(fetched_kernel[0]->function != "");
Expand Down

0 comments on commit 48d0651

Please sign in to comment.