From 256dcc228296c94e55528b3582eb7b64c121896d Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 16 Dec 2024 17:20:42 +0800 Subject: [PATCH 1/4] Add kernel example --- example/ck_tile/02_layernorm2d/CMakeLists.txt | 25 +- example/ck_tile/02_layernorm2d/README.md | 16 +- .../example_layernorm2d_fwd.cpp | 285 ++++++++++++++++++ 3 files changed, 310 insertions(+), 16 deletions(-) create mode 100644 example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index 1bf74bc055..795bf31727 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -23,19 +23,28 @@ add_custom_command( --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs ) -set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") +set(LAYERNORM2D_FWD "tile_layernorm2d_fwd") -message("adding example ${EXAMPLE_LAYERNORM2D_FWD}") -add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) -target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) +message("adding ${LAYERNORM2D_FWD}") +add_executable(${LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) +target_include_directories(${LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) -set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS) +set(LAYERNORM2D_FWD_COMPILE_OPTIONS) # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations -list(APPEND EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) +list(APPEND LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + +target_compile_options(${LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_COMPILE_OPTIONS}) + 
+# kernel example: stand-alone target built from example_layernorm2d_fwd.cpp only (no generated blobs are attached to it)
+set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
+
+message("adding ${EXAMPLE_LAYERNORM2D_FWD}")
+add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL example_layernorm2d_fwd.cpp)
+target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_compile_options(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_COMPILE_OPTIONS})
-target_compile_options(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS})
# TODO: we have to turn off this global prop, otherwise the progress bar generated # by cmake will print too many files, execvp: /bin/sh: Argument list too long diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 3573d70cd2..3251737331 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -24,8 +24,8 @@ we support smooth/dynamic quantization for `int8` output, by setting `-fquant=1` # assume output int8, hidden_states is [m, n] shape and in fp16/bf16 # [m, 1] per_token_amax, _ = torch.max( - input=torch.abs(hidden_states), - dim=-1, + input=torch.abs(hidden_states), + dim=-1, keepdim=True ) per_token_scale = per_token_amax.to(dtype=torch.float32) / 127.0 @@ -43,11 +43,11 @@ return hidden_states, per_token_scale # in the root of ck_tile mkdir build && cd build sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... -make tile_example_layernorm2d_fwd -j +make tile_layernorm2d_fwd -j ``` -This will result in an executable `build/bin/tile_example_layernorm2d_fwd` +This will result in an executable `build/bin/tile_layernorm2d_fwd` -## example +## argument ``` args: -m m dimension (default:3328) @@ -74,12 +74,12 @@ Note that `fquant=2`, `fadd=2`, `prec_sx/prec_sy` other than `fp32` are not by d ``` # some case # standard fp16 layernorm 2d, m=10. 
n=1024 -./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 +./build/bin/tile_layernorm2d_fwd -m=10 -n=1024 # standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant, output in int8 -./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 +./build/bin/tile_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 # standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8 -./build/bin/tile_example_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1 +./build/bin/tile_layernorm2d_fwd -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1 ``` diff --git a/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp new file mode 100644 index 0000000000..c900c956ca --- /dev/null +++ b/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp @@ -0,0 +1,285 @@ +#include "ck_tile/host.hpp" +#include +#include "ck_tile/ops/layernorm2d.hpp" +#include +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("x_stride", "-1", "x row_stride, if -1 then equal to n") + .insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n") + .insert("y_stride", "-1", "y row_stride, if -1 then equal to n") + .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n") + .insert("e", "1e-5", "epsilon") + .insert("v", "1", "cpu validation or not") + .insert("prec_i", "fp16", "input precision") + .insert("prec_o", "auto", "output precision, set auto will be the same as input") + .insert("prec_sx", + "auto", + "output quant scale type, 
set auto will use fp32. used when fquant=1") + .insert("prec_sy", + "auto", + "output quant scale type, set auto will use fp32. used when fquant=1 or 2") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t x_stride = arg_parser.get_int("x_stride"); + if(x_stride < 0) + x_stride = n; + ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride"); + if(xr_stride < 0) + xr_stride = n; + ck_tile::index_t y_stride = arg_parser.get_int("y_stride"); + if(y_stride < 0) + y_stride = n; + ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride"); + if(yr_stride < 0) + yr_stride = n; + float epsilon = arg_parser.get_float("e"); + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_sx = arg_parser.get_str("prec_sx"); + std::string prec_sy = arg_parser.get_str("prec_sy"); + if(prec_o == "auto") + { + prec_o = prec_i; + } + if(prec_sx == "auto") + { + prec_sx = "fp32"; + } + if(prec_sy == "auto") + { + prec_sy = "fp32"; + } + + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(x_stride >= n); + + using XDataType = InDataType; + using YDataType = InDataType; + using GammaDataType = InDataType; + using BetaDataType = InDataType; + using XResidualDataType = InDataType; + using YResidualDataType = InDataType; + + using ComputeDataType = float; + + // host verify + ck_tile::HostTensor x_host({m, n}, {x_stride, 1}); + ck_tile::HostTensor gamma_host({n}); + ck_tile::HostTensor beta_host({n}); + + ck_tile::HostTensor x_residual_host({m, n}, {xr_stride, 1}); + ck_tile::HostTensor y_residual_host({m, n}, {yr_stride, 1}); + + 
ck_tile::HostTensor y_host_ref({m, n}, {y_stride, 1}); + ck_tile::HostTensor y_host_dev({m, n}, {y_stride, 1}); + + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(x_residual_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(gamma_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(beta_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes()); + + ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + gamma_buf.ToDevice(gamma_host.data()); + beta_buf.ToDevice(beta_host.data()); + x_residual_buf.ToDevice(x_residual_host.data()); + + constexpr bool kTwoPass = false; + constexpr auto kFuseAdd = ck_tile::Layernorm2dFusedAddEnum::PRE_ADD_STORE; + constexpr auto kFuseQuant = ck_tile::Layernorm2dFusedQuantEnum::NO_SWEEP; + + using BlockWarps = ck_tile::sequence<1, 4>; + using BlockTile = ck_tile::sequence<1, 8192>; + using WarpTile = ck_tile::sequence<1, 512>; + using Vector = ck_tile::sequence<1, 8>; + + using Shape = ck_tile::Generic2dBlockShape; + using Trait = ck_tile::Layernorm2dFwdTraits; + using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem; + + using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass; + using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass; + using Pipeline = std::conditional_t; + + using EpilogueProblem = + ck_tile::Default2DEpilogueProblem; + using Epilogue = ck_tile::Default2DEpilogue; + using Kernel = ck_tile::Layernorm2dFwd; + + ck_tile::Layernorm2dFwdHostArgs args{x_buf.GetDeviceBuffer(), + x_residual_buf.GetDeviceBuffer(), + nullptr, // x_scale for quant + 
gamma_buf.GetDeviceBuffer(), + beta_buf.GetDeviceBuffer(), + y_buf.GetDeviceBuffer(), + y_residual_buf.GetDeviceBuffer(), + nullptr, // y_scale for quant + nullptr, // p_mean, unsupported yet + nullptr, // p_invStd, unsupported yet + epsilon, + m, + n, + x_stride, // x row_stride + xr_stride, // x residule row stride + y_stride, // y row stride + yr_stride}; // y residule row stride + + auto kargs = Kernel::MakeKargs(args); + + const dim3 grids = Kernel::GridSize(args); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + auto s = ck_tile::stream_config{nullptr, true, 1, warmup, repeat}; + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + + sizeof(BetaDataType) * n + sizeof(YDataType) * m * n; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << Kernel::GetName() << std::endl; + std::cout << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + std::transform(x_host.mData.cbegin(), + x_host.mData.cend(), + x_residual_host.mData.cbegin(), + x_host.mData.begin(), + [](auto x_, auto r_) { + auto o_ = ck_tile::type_convert(x_) + + ck_tile::type_convert(r_); + return ck_tile::type_convert(o_); + }); + + ck_tile::HostTensor dummy({m}); + + ck_tile::reference_layernorm2d_fwd( + x_host, gamma_host, beta_host, y_host_ref, dummy, dummy, epsilon); + + y_buf.FromDevice(y_host_dev.data()); + + ck_tile::HostTensor y_residual_host_dev({m, n}, {yr_stride, 1}); + y_residual_buf.FromDevice(y_residual_host_dev.data()); + + auto [rtol, atol] = get_elimit(); + + if(x_stride == n) + { + pass &= ck_tile::check_err(y_residual_host_dev, + x_host, + std::string(" ADD Error: Incorrect results!"), + rtol, + atol); + + pass &= ck_tile::check_err( + y_host_dev, y_host_ref, std::string(" OUT Error: Incorrect results!"), rtol, atol); + } + + 
std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_o = arg_parser.get_str("prec_o"); + std::string prec_sx = arg_parser.get_str("prec_sx"); + std::string prec_sy = arg_parser.get_str("prec_sy"); + + if(prec_o == "auto") + { + prec_o = prec_i; + } + if(prec_sx == "auto") + { + prec_sx = "fp32"; + } + if(prec_sy == "auto") + { + prec_sy = "fp32"; + } + + // no dynamic quant case + /*if(prec_i == "fp16" && prec_o == "fp16" && prec_sx == "fp32" && prec_sy == "fp32") + { + return run(arg_parser) ? 0 : -2; + } + else */ + if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} From c636337e51204df8f5eaadf352982e18a4d85b67 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 17 Dec 2024 02:30:19 +0800 Subject: [PATCH 2/4] Extract padN and RawStore --- .../02_layernorm2d/example_layernorm2d_fwd.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp index c900c956ca..7c94dcf514 100644 --- a/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/example_layernorm2d_fwd.cpp @@ -40,8 +40,8 @@ auto create_args(int argc, char* argv[]) .insert("prec_sy", "auto", "output quant scale type, set auto will use fp32. 
used when fquant=1 or 2") - .insert("warmup", "5", "cold iter") - .insert("repeat", "20", "hot iter"); + .insert("warmup", "10", "cold iter") + .insert("repeat", "40", "hot iter"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -129,9 +129,11 @@ bool run(const ck_tile::ArgParser& arg_parser) beta_buf.ToDevice(beta_host.data()); x_residual_buf.ToDevice(x_residual_host.data()); - constexpr bool kTwoPass = false; - constexpr auto kFuseAdd = ck_tile::Layernorm2dFusedAddEnum::PRE_ADD_STORE; - constexpr auto kFuseQuant = ck_tile::Layernorm2dFusedQuantEnum::NO_SWEEP; + constexpr bool kTwoPass = false; + constexpr bool kPadN = true; + constexpr bool UseRawStore = true; + constexpr auto kFuseAdd = ck_tile::Layernorm2dFusedAddEnum::PRE_ADD_STORE; + constexpr auto kFuseQuant = ck_tile::Layernorm2dFusedQuantEnum::NO_SWEEP; using BlockWarps = ck_tile::sequence<1, 4>; using BlockTile = ck_tile::sequence<1, 8192>; @@ -139,7 +141,7 @@ bool run(const ck_tile::ArgParser& arg_parser) using Vector = ck_tile::sequence<1, 8>; using Shape = ck_tile::Generic2dBlockShape; - using Trait = ck_tile::Layernorm2dFwdTraits; + using Trait = ck_tile::Layernorm2dFwdTraits; using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem; using EpilogueProblem = - ck_tile::Default2DEpilogueProblem; + ck_tile::Default2DEpilogueProblem; using Epilogue = ck_tile::Default2DEpilogue; using Kernel = ck_tile::Layernorm2dFwd; From 62fa36cf85a1a5d18b320663e3b24d0c75d84a8a Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 17 Dec 2024 02:56:51 +0800 Subject: [PATCH 3/4] Fix incorrect flag --- example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp index b49c04619d..798a5070e8 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp @@ -412,7 
+412,7 @@ int main(int argc, char* argv[]) else if(prec_i == "bf16" && prec_o == "bf16" && prec_sx == "fp32" && prec_sy == "fp32" && !save_mv) { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } // dynamic quant case, only in inference From 25ae3d1b155e3b14b110ee31c1cf9d653216b1ce Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 17 Dec 2024 03:06:02 +0800 Subject: [PATCH 4/4] use raw store to improve performance --- example/ck_tile/02_layernorm2d/generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index ca9e432a4f..1e9f360302 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -204,7 +204,7 @@ class layernorm_fwd_codegen: using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass; using Pipeline = std::conditional_t; - using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem; + using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem; using Default2DEpilogue = ck_tile::Default2DEpilogue; static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1; @@ -327,7 +327,7 @@ class k_problem: @dataclass class k_pipeline_one_pass: F_Problem : Any #k_problem - + @dataclass class k_pipeline_two_pass: F_Problem : Any #k_problem