From d2d4209d148c09356492a04000a878270896178c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 13 Nov 2024 11:27:26 +0800 Subject: [PATCH] Support Qwen2-MoE models (#2723) * add qwen2-moe * eliminate `inter_size_` from ffn layer * clean up * fix lint * clean up * Update config.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 2 + lmdeploy/turbomind/deploy/config.py | 2 + lmdeploy/turbomind/deploy/module.py | 21 +- .../turbomind/deploy/source_model/mixtral.py | 2 + .../turbomind/deploy/source_model/qwen.py | 61 +++ lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/gemm/moe_utils_v2.cu | 449 ++++++++++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 4 +- .../kernels/gemm/test/test_moe_utils.cu | 103 ++-- src/turbomind/kernels/gemm/test/testbed.h | 2 + .../models/llama/LlamaDecoderLayerWeight.cc | 52 +- src/turbomind/models/llama/LlamaDenseWeight.h | 29 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 47 +- src/turbomind/models/llama/LlamaFfnLayer.h | 11 +- src/turbomind/models/llama/llama_params.h | 8 +- src/turbomind/models/llama/moe_ffn_layer.cc | 78 +-- src/turbomind/models/llama/moe_ffn_layer.h | 6 +- src/turbomind/models/llama/unified_decoder.cc | 21 +- .../triton_backend/llama/LlamaTritonModel.cc | 2 + 19 files changed, 711 insertions(+), 190 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 9357e473bb..587ee6331b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -44,10 +44,12 @@ turbomind_chat_model: - Qwen/Qwen2-1.5B-Instruct - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ + - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen-VL-Chat - Qwen/Qwen2.5-0.5B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 7e8ebf7b47..a535b0d4c1 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -50,6 +50,8 @@ class ModelConfig: expert_num: int = 0 expert_inter_size: int = 0 experts_per_token: int = 0 + moe_shared_gate: int = False + moe_norm_topk: int = False def verify(self): invalid = {} diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index a9f7385376..8d998abe2b 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -140,14 +140,18 @@ class MoeFfn(Ffn): requires: r.moe_ffn_expert(e, i, kind) r.moe_ffn_gate(i) + r.moe_ffn_shared_gate(i) """ _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}' - _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}' + _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight' + _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight' def __init__(self, model: BaseOutputModel): super().__init__(model) self.expert_num = model.model_config.expert_num + self.inter_size = model.model_config.expert_inter_size + self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): for p in get_params(r.moe_ffn_expert()): @@ -157,7 +161,13 @@ def apply(self, i: int, r: BaseReader): i) gate = transpose(r.moe_ffn_gate(i)) - self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight')) + self.model.save_split(gate, self._moe_ffn_gate.format(i)) + + if self.shared_gate: + shared_gate = transpose(r.moe_ffn_shared_gate(i)) + # print(shared_gate) + 
self.model.save_split(shared_gate, + self._moe_ffn_shared_gate.format(i)) class Attn(Module): @@ -248,8 +258,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - ffn = MoeFfn if model.model_config.expert_num else Ffn - modules = [Attn, LayerNorm, ffn] + modules = [Attn, LayerNorm] + if model.model_config.inter_size: + modules.append(Ffn) + if model.model_config.expert_num: + modules.append(MoeFfn) self.modules = [c(model) for c in modules] self.misc = Misc(model) diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index 102ede29f2..ff9df2d409 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,4 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] + info['moe_norm_topk'] = True + info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 0ec0586a37..772bd03037 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,3 +120,64 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg + + +class Qwen2MoeReader(LlamaReader): + + ffn_pattern = r'shared_expert\.' + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[ + f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_shared_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.shared_expert_gate.weight') + + +@INPUT_MODELS.register_module(name='qwen2-moe') +class Qwen2MoeModel(LlamaModel): + + Reader = Qwen2MoeReader + + def tokenizer_info(self): + """https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con + fig.json.""" # noqa: E501 + n_words = 152064 + bos_id = 151643 + eos_id = 151645 + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_experts'] + info['expert_inter_size'] = cfg['moe_intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + info['inter_size'] = cfg['shared_expert_intermediate_size'] + info['moe_shared_gate'] = True + info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['attn_bias'] = 1 + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index fe0819d70f..f6772fddd5 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -20,6 +20,7 @@ QWenLMHeadModel='qwen', # Qwen2 Qwen2ForCausalLM='qwen2', + Qwen2MoeForCausalLM='qwen2-moe', # mistral MistralForCausalLM='llama', # llava diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu 
b/src/turbomind/kernels/gemm/moe_utils_v2.cu index acf6355856..5912c60a8a 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" @@ -19,7 +19,7 @@ namespace turbomind { template __global__ void MoeGateKernel_V2(float* scales, // [e,n] - int* masks, // [E,n], padded + int8_t* masks, // [E,n], padded int* accum, // [E,tiles] const float* logits, // [E,n] int log_tile, @@ -88,6 +88,8 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] const int lowbit = (mask & -mask); const int e = 31 - __clz(lowbit); + // printf("e = %d, ti = %d, idx = %d\n", e, ti, i); + masks[e * tokens_padded + ti] = i; atomicAdd(&shared_accum[e][ti >> log_tile], 1); top_val[i] = logits[ti * experts + e]; @@ -120,11 +122,11 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] } } -template -__global__ void MoeScanKernel_V2(int* f2n, // [e*n] +template +__global__ void MoeScanKernel_v2(int* f2n, // [e*n] int* en2f, // [e,n] int* offsets, // [E+1] - int* masks, // [E,n], padded + Mask* masks, // [E,n], padded const int* accum, // [E,tiles] int log_tile, int tiles, @@ -142,13 +144,15 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] constexpr int vec_size = kMoeGateVecSize; - using Vec = Array; + using Vec = Array; const int tile_id = blockIdx.x; const int ei = blockIdx.y; - const int global_tile_id = ei * tiles + tile_id; + const int global_tile_id = ei * tiles + tile_id; + const bool is_valid = global_tile_id <= experts * tiles; +#if 0 int vacc[4]{}; { int idx = threadIdx.x; @@ -162,6 +166,18 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); +#else + + int vacc = 0; + for (int i = threadIdx.x; i < global_tile_id; i += block_dim) { + if (is_valid && i < global_tile_id) { + vacc += accum[i]; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + +#endif __shared__ int shared_offset; @@ -200,7 +216,7 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] const bool pred = vi < tile_vec_end; Vec data; - fill(data, -1); + fill(data, Mask{-1}); if (pred) { Ldg(data, mask_ptr[vi].data()); } @@ -231,17 +247,328 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } } +template +__global__ void MoeGateKernel_v8(float* scales, // [e,n] + Mask* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k, + bool norm_topk) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + constexpr int tokens_per_cta = block_dim / threads_per_token; + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int bti = threadIdx.x / threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + + const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + 
+ float data[items_per_thread]; + int idxs[items_per_thread]; + +#if 0 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = threads_per_token * i + ei * access_size; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#elif 1 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + // idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + idxs[i] = ei * items_per_thread + i; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + // const int e = threads_per_token * i + ei * access_size; + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#else + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + constexpr int vecs_per_thread = items_per_thread / access_size; + + using Vec = Array; + constexpr int banks = 128 / sizeof(Vec); + constexpr int chunks = 4; // block_dim / WARP_SIZE; + + __shared__ union { + Vec shared_data[chunks][vecs_per_thread * WARP_SIZE / banks][banks + 1]; + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; + + __align__(16) Vec vecs[vecs_per_thread]; + + { + const int warp_end = min(warp_offset + WARP_SIZE / threads_per_token, token_num) * expert_num; + int p = warp_offset * expert_num + access_size * lane_id; + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + fill(vecs[i], -std::numeric_limits::infinity()); + // const int p = warp_offset * expert_num + access_size * (lane_id + i * WARP_SIZE); + if (p < warp_end) { + Ldg(vecs[i], &logits[p]); + } + p += access_size * WARP_SIZE; + } + } + + PRAGMA_UNROLL + for (int c = 0; c < block_dim / WARP_SIZE; c += chunks) { + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = i * WARP_SIZE + lane_id; + if (c <= warp_id && warp_id < c + chunks) { + Store(smem.shared_data[warp_id - c][p / banks][p % banks].data(), vecs[i]); + } + } + + __syncwarp(); + + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = lane_id * vecs_per_thread + i; + if (c <= warp_id && warp_id < c + chunks) { + Load(vecs[i], smem.shared_data[warp_id - c][p / banks][p % banks].data()); + } + } + + __syncthreads(); + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + idxs[i] = ei * items_per_thread + i; + } + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + (Array&)data[i * access_size] = vecs[i]; + } + +#endif + + constexpr float 
kLog2e = 1.4426950408889634074; + + unsigned mask = (unsigned)-1; + float max_logit; + + int count{}; + float sum_prob{}; + + const int warp_ti_offset = warp_ti * threads_per_token; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + if (k == 0) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] *= kLog2e; + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + ++count; + } + }; + + run(0); + + for (int k = 1; k < top_k; ++k) { + run(k); + } + + mask = ~mask; + + int used[items_per_thread]; + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + used[i] = (mask & bit) > 0; + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (!norm_topk || used[i]) { + data[i] = exp2f(data[i] - max_logit); + sum_prob += data[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; + + int idx{}; + WarpScan{temp_storage[bti]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + smem.shared_exp_id[idx][bti] = idxs[i]; + smem.shared_scales[idx][bti] = data[i] * sum_prob; + ++idx; + } + } + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + smem.shared_accum[t][e] = 0; + } + + __syncthreads(); + + constexpr int k_per_thread = cdiv(max_top_k, threads_per_token); + + const int bti2 = threadIdx.x % tokens_per_cta; + const int ei2 = threadIdx.x / tokens_per_cta; + const int ti2 = blockIdx.x * tokens_per_cta + bti2; + + PRAGMA_UNROLL + for (int i = 0; i < k_per_thread; ++i) { + const int idx = ei2 * k_per_thread + i; + const int expert_id = smem.shared_exp_id[idx][bti2]; + const float scale = smem.shared_scales[idx][bti2]; + + if (ti2 < token_num && idx < top_k) { + masks[expert_id * token_num_padded + ti2] = idx; + scales[idx * token_num + ti2] = scale; + atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); + + // printf("%d %d %f\n", idx, expert_id, scale); + } + } + + __syncthreads(); + + for (int i = 0; i < max_expert_num * max_tiles; i += block_dim) { + int t = (threadIdx.x + i) % max_tiles; + int e = (threadIdx.x + i) / max_tiles; + if (e < expert_num && t < tiles) { + atomicAdd(accum + e * tiles + t, smem.shared_accum[t][e]); + } + } +} + +template +inline constexpr std::integral_constant _Int{}; + void invokeMoeGate_V2(int* f2n, // [e*n] -> n int* en2f, // [e,n] -> n*e int* offsets, // [E+1] float* 
scales, // [e,n] - int* masks, // [E,n] + void* masks, // [E,n] int* accum, // [E] const float* logits, // [e,n] int tokens, // n int tokens_padded, // round_up(n, 4) int experts, // E int experts_per_token, + bool norm_topk, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -254,48 +581,64 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - { - constexpr int threads = 128; - const int blocks = ceil_div(tokens, threads); + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - auto invoke = [&](auto e) { - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // + MoeGateKernel_v8 + <<>>( // scales, - masks, + (int8_t*)masks, accum, logits, log_tile, tiles, tokens, tokens_padded, - experts); - }; + experts, + experts_per_token, + norm_topk); + }; - switch (experts_per_token) { - case 2: - invoke(std::integral_constant{}); - break; - // case 4: - // invoke(std::integral_constant{}); - // break; - default: - std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. " << std::endl; - std::abort(); + auto fail = [&] { + std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::abort(); + }; + + if (experts <= 8) { + if (experts_per_token <= 2) { + invoke(_Int<8>, _Int<2>, _Int<8>); + } + else { + invoke(_Int<8>, _Int<8>, _Int<8>); } } - - // return; + else if (experts <= 64) { + if (experts_per_token <= 4) { + invoke(_Int<64>, _Int<4>, _Int<16>); + } + else if (experts_per_token <= 8) { + invoke(_Int<64>, _Int<8>, _Int<16>); + } + else { + fail(); + } + } + else { + fail(); + } { - // Check: tiles * experts <= threads - constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; const dim3 blocks(tiles, experts + 1); - MoeScanKernel_V2<<>>(f2n, // + + MoeScanKernel_v2<<>>(f2n, // en2f, offsets, - masks, + (int8_t*)masks, accum, log_tile, tiles, @@ -338,10 +681,11 @@ void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int exper template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); template -__global__ void MoeReduceKernel(T* dst, // [ n, d] - const T* src, // [e*n, d] - const float* scales, // [ e, n] - const int* en2f, // [ e, n] :: (e,n) -> e*n +__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + const float* dst_scales, // [n] int dims, int tokens) { @@ -351,6 +695,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; + float dst_scale = 0; + if (dst_scales) { + dst_scale = dst_scales[ti]; + dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); + } + // Should be warp uniforms const Vec* src_ptr[exp_k]; float scale[exp_k]; @@ -362,6 +712,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] for (int i = threadIdx.x; i < dims; i += block_dim) { Array accum{}; + if (dst_scales) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = cast(v) * dst_scale; + } PRAGMA_UNROLL for (int e = 0; e < exp_k; ++e) { Vec v; @@ -379,6 +735,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int 
experts_per_token, int dims, @@ -395,6 +752,7 @@ void invokeMoeReduce(T* dst, src, scales, en2f, + dst_scales, dims / vec_size, tokens); }; @@ -404,19 +762,22 @@ void invokeMoeReduce(T* dst, return invoke(std::integral_constant{}); case 2: return invoke(std::integral_constant{}); - // case 4: - // return invoke(std::integral_constant{}); - // case 6: - // return invoke(std::integral_constant{}); + case 4: + return invoke(std::integral_constant{}); + case 6: + return invoke(std::integral_constant{}); + case 8: + return invoke(std::integral_constant{}); default: fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); std::abort(); } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +template void +invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 334e2de272..0e4c36af09 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -14,13 +14,14 @@ void invokeMoeGate_V2(int* f2n, int* en2f, int* offsets, float* scales, - int* masks, + void* masks, int* accum, const float* logits, int tokens, int tokens_padded, int experts, int exp_per_tok, + bool norm_topk, cudaStream_t st); template @@ -49,6 +50,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index a311162193..47e3bfdb16 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -26,6 +26,25 @@ void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) } } +template +void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) +{ + if (!msg.empty()) { + std::cout << msg << ": [" << m << ", " << k << "]\n"; + } + for (int mm = 0; mm < m; ++mm) { + std::cout << "m=" << mm << ": "; + for (int kk = 0; kk < k; ++kk) { + const auto& x = data[mm * k + kk]; + const auto& y = refs[mm * k + kk]; + if (x != y) { + std::cout << kk << "(" << x << ", " << y << ") "; + } + } + std::cout << "\n"; + } +} + #if 0 void func() { @@ -190,7 +209,7 @@ void moe_gate_ref(int tokens, } } -void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +void mask2eids(universal_vector& masks, universal_vector& eids, int tokens, int expert_num) { const int tokens_padded = masks.size() / expert_num; // std::cout << eids.size() << std::endl; @@ -228,13 +247,13 @@ bool test_moe_gate(int tokens, // const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); - universal_vector offsets(expert_num + 1); - universal_vector accum(expert_num * kMoeGateMaxTiles); - universal_vector masks(expert_num * tokens_padded); - universal_vector eids(experts_per_token * tokens); - 
universal_vector f2n(experts_per_token * tokens); - universal_vector en2f(experts_per_token * tokens); - universal_vector scales(experts_per_token * tokens); + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); // universal_vector coords(max_coords); // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); @@ -246,8 +265,16 @@ bool test_moe_gate(int tokens, // moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); - for (int i = 0; i < 10; ++i) { + cudaMemPrefetchAsync(f2n.data().get(), sizeof(int) * f2n.size(), 0); + cudaMemPrefetchAsync(en2f.data().get(), sizeof(int) * en2f.size(), 0); + cudaMemPrefetchAsync(offsets.data().get(), sizeof(int) * offsets.size(), 0); + cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); + cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + + for (int i = 0; i < 1; ++i) { + gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); + cudaMemset(masks.data().get(), -1, sizeof(int8_t) * masks.size()); invokeMoeGate_V2(f2n.data().get(), en2f.data().get(), offsets.data().get(), @@ -259,6 +286,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, + true, 0); } @@ -306,7 +334,10 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || false) { + if (!success && 1) { + + diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); @@ -322,32 +353,32 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); - print_vecs(accum.data().get(), expert_num, 1, "accum"); + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); - thrust::host_vector tile_offsets(tape.max_ctas); - std::cout << tape.max_ctas << std::endl; - cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - - std::cout << "coords:\n"; - int last = -1; - for (int i = 0; i < tape.max_ctas; ++i) { - auto& c = tile_offsets[i]; - if (last >= 0 && c.w != last) { - std::cout << "\n"; - } - if (c.w == -1) { - std::cout << i << "\n"; - break; - } - last = c.w; - std::stringstream ss; - ss << c.x << "," << c.y; - std::cout << std::setw(6) << ss.str(); - } - std::cout << "\n"; + // thrust::host_vector tile_offsets(tape.max_ctas); + // std::cout << tape.max_ctas << std::endl; + // cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + + // std::cout << "coords:\n"; + // int last = -1; + // for (int i = 0; i < tape.max_ctas; ++i) { + // auto& c = tile_offsets[i]; + // if (last >= 0 && c.w != last) { + // std::cout << "\n"; + // } + // if (c.w == -1) { + // std::cout << i << "\n"; + // break; + // } + // last = c.w; + // std::stringstream ss; + // ss << c.x << "," << c.y; + // std::cout << 
std::setw(6) << ss.str(); + // } + // std::cout << "\n"; } return success; @@ -358,7 +389,11 @@ int main() gemm::Tape tape{}; constexpr Tiling tiling{14336, 128, {128, 128, 32}}; - test_moe_gate(8192, 8, 2, tape, tiling); + // test_moe_gate(32768 * 4, 60, 4, tape, tiling); + // test_moe_gate(32768, 64, 8, tape, tiling); + // test_moe_gate(8, 60, 4, tape, tiling); + + test_moe_gate(65536, 8, 2, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6b1ec88f58..7a089fbdf2 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -514,6 +514,7 @@ class Testbed { c_e_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, @@ -523,6 +524,7 @@ class Testbed { c_e_ref_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 2d68ef3535..f6f9ab0efa 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -137,6 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, moe_param.inter_size, moe_param.expert_num, moe_param.method, + moe_param.shared_gate, tensor_para_size_, weight_type, group_size, @@ -349,18 +350,22 @@ void LlamaDecoderLayerWeight::mallocWeights() mallocWeights(self_attn_weights.qkv, attn_bias_); mallocWeights(self_attn_weights.output, attn_bias_); - if (moe_weights.experts.empty()) { + if (inter_size_) { mallocWeights(ffn_weights.gating, false); mallocWeights(ffn_weights.intermediate, false); mallocWeights(ffn_weights.output, false); } - else { + + if (!moe_weights.experts.empty()) { mallocWeights(moe_weights.gate, false); for (auto& e : moe_weights.experts) { mallocWeights(e.gating, false); mallocWeights(e.intermediate, false); mallocWeights(e.output, false); } + if (moe_weights.shared_gate.output_dims) { + mallocWeights(moe_weights.shared_gate, false); + } } } @@ -375,10 +380,25 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); + if (inter_size_) { + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); + freeWeights(ffn_weights.output); + } + + if (!moe_weights.experts.empty()) { + freeWeights(moe_weights.gate); + for (auto& e : moe_weights.experts) { + freeWeights(e.fused_gating_intermediate); + freeWeights(e.gating); + freeWeights(e.intermediate); + freeWeights(e.output); + } + if (moe_weights.shared_gate.kernel) { + freeWeights(moe_weights.shared_gate); + } + } } template @@ -428,23 +448,30 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - if (moe_weights.experts.empty()) { + if (inter_size_) { getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); getWeightTensor(ffn_weights.intermediate, false, 
get_prefix("feed_forward.w3"), output); getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); } - else { + + if (!moe_weights.experts.empty()) { output.insert( concat(prefix, "moe_ffn.gate.weight"), Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); auto& experts = moe_weights.experts; for (size_t i = 0; i < experts.size(); ++i) { const std::string name = "moe_ffn.experts." + std::to_string(i); - // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); } + if (moe_weights.shared_gate.kernel) { + output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), + Tensor{MEMORY_GPU, + getTensorType(), + {moe_weights.shared_gate.kernel_size()}, + moe_weights.shared_gate.kernel}); + } } return output; @@ -681,10 +708,13 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud convert(ffn.output, is_fused_moe, workspace, size, is_16xx); }; - if (moe_weights.experts.empty()) { + if (inter_size_) { + // std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } - else { + + if (!moe_weights.experts.empty()) { + // std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 9a895243bc..169fb53bcf 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -145,24 +145,28 @@ struct LlamaFfnWeight { LlamaFfnWeight( size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + inter_size /= tp; + + this->inter_size = inter_size; + gating.input_dims = hidden_dim; - gating.output_dims = inter_size / tp; + gating.output_dims = inter_size; gating.type = weight_type; gating.group_size = group_size; intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size / tp; + intermediate.output_dims = inter_size; intermediate.type = weight_type; intermediate.group_size = group_size; fused_gating_intermediate.input_dims = hidden_dim; - fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.output_dims = inter_size * 2; fused_gating_intermediate.type = weight_type; fused_gating_intermediate.group_size = group_size; is_fused_silu = fuse_silu_act; - output.input_dims = inter_size / tp; + output.input_dims = inter_size; output.output_dims = hidden_dim; output.type = weight_type; output.group_size = group_size; @@ -173,6 +177,7 @@ struct LlamaFfnWeight { LlamaDenseWeight output; LlamaDenseWeight fused_gating_intermediate; + int inter_size{}; bool is_fused_silu{}; }; @@ -185,11 +190,15 @@ struct MoeFfnWeight { int inter_size, int expert_num, int method, + bool has_shared_gate, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + + // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (expert_num == 0) { return; } @@ -208,11 +217,23 @@ struct MoeFfnWeight { // inter size is divided by tp in `FfnWeight` e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; } + + if (has_shared_gate) { + shared_gate.input_dims = hidden_dim; + shared_gate.output_dims = 1; + 
shared_gate.type = get_default_weight_type(); + gate.group_size = group_size; + } + else { + shared_gate = {}; + } } LlamaDenseWeight gate; std::vector> experts; + LlamaDenseWeight shared_gate; + LlamaFfnWeight block; int method{}; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index f9ee0c4ad4..8cce207203 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -28,10 +28,11 @@ namespace turbomind { template void LlamaFfnLayer::allocateBuffer(size_t token_num, + int inter_size, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - const size_t sz = token_num * inter_size_; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating->lora.r; const size_t sz_inter = token_num * inter->lora.r; @@ -51,24 +52,24 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int token_num, bool is_chunked) +void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) { NvtxScope scope("activation"); if (is_chunked) { + // gate & up are in the SAME buffer invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); sync_check_cuda_error(); } else { - invokeGenericActivation_v2( - gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + // gate & up are in separate buffers + invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); sync_check_cuda_error(); } } @@ -88,11 +89,11 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("ffn"); - const size_t num_token = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - // LOG(WARNING); + const size_t token_num = input_tensors->at("ffn_input").shape[0]; + const int layer_id = input_tensors->getVal("layer_id"); + const int inter_size = weights->inter_size; - allocateBuffer(num_token, &weights->gating, &weights->intermediate); + allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); @@ -103,50 +104,50 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type); sync_check_cuda_error(); if (!weights->is_fused_silu) { - activation(num_token, true); + activation(token_num, inter_size, true); } - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } else { { // w1(x) NvtxScope scope("w1"); - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->gating, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3); { // w3(x) NvtxScope scope("w3"); linear_->forward( - inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear::kGemm, lora_mask); + inter_buf_, ffn_input_data, token_num, weights->intermediate, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3); + count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3); // silu(w1(x)) * w3(x) - activation(num_token, false); + activation(token_num, inter_size, false); - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0; + const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; linear_->forward( - ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + ffn_output_data, {gating_buf_, pitch}, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); + count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); - ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); + ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 75ced5f9ac..2daca2cc95 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,12 +19,11 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/utils/custom_ar_comm.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nccl_utils.h" -#include namespace turbomind { @@ -32,7 +31,6 @@ template class LlamaFfnLayer { public: LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): - inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), @@ -50,13 +48,12 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); void freeBuffer(); - void activation(int token_num, bool is_chunked); + void activation(int token_num, int inter_size, bool is_chunked); - const size_t inter_size_; const size_t hidden_units_; const NcclParam tensor_para_; cudaStream_t const stream_; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 1c039ca66a..2ea63f0410 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -31,9 +31,11 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; + int expert_num; + int experts_per_token; + int inter_size; + bool norm_topk; + bool shared_gate; }; struct AttentionParam { diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index def6b04abb..1ad76839d1 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -30,6 +30,7 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); + alloc(&shared_scales_, tokens); return (char*)alloc.ptr() - (char*)base; }; @@ -69,7 +70,7 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama getCudaDataType(), hidden_dim_, &beta, - logits_, + logits, CUDA_R_32F, 
weight.output_dims, CUDA_R_32F, @@ -77,13 +78,13 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama } template -void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; AllocateBuffer(tokens, padded); - gate(logits_, inout, tokens, moe.gate); + gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); @@ -103,6 +104,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei padded, param_.expert_num, param_.experts_per_token, + param_.norm_topk, stream_); sync_check_cuda_error(); @@ -123,7 +125,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); sync_check_cuda_error(); check_cuda_error( @@ -155,28 +157,8 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei auto& block = moe.block; -#if 0 - FT_CHECK(!block.is_fused_silu); - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - inter_size_ * 2, - count, - hidden_dim_, - moe.experts[i].fused_gating_intermediate.kernel, - hidden_dim_, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2); - sync_check_cuda_error(); - } - } - auto mode = kCmpWrite; -#else linear_->forward_moe(inter_buf_, - {inout, (int)hidden_dim_}, + {input, (int)hidden_dim_}, f2n_, offsets_, tokens * param_.experts_per_token, @@ -185,7 +167,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_, // @@ -205,25 +186,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei sync_check_cuda_error(); } -#if 0 - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - hidden_dim_, - count, - inter_size_, - moe.experts[i].output.kernel, - inter_size_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_); - sync_check_cuda_error(); - } - } - auto mode1 = kCmpWrite; -#else linear_->forward_moe(inout_buf_, {inter_buf_, block.is_fused_silu ? 
(int)inter_size_ : (int)inter_size_ * 2}, nullptr, @@ -234,7 +196,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode1 = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_2_, // @@ -250,18 +211,29 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei // } } - invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + if (moe.shared_gate.kernel) { + gate(shared_scales_, input, tokens, moe.shared_gate); + } +} + +template +void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +{ + invokeMoeReduce(output, + inout_buf_, + scales_, + en2f_, + moe.shared_gate.kernel ? shared_scales_ : nullptr, + tokens, + param_.experts_per_token, + hidden_dim_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } - - // if (tensor_para_.rank_ == 0) { - // check_cuda_error(cudaStreamSynchronize(stream_)); - // std::abort(); - // } } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index ef65aaa464..0f1713f7b5 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -51,7 +51,9 @@ class MoeFfnLayer { FreeBuffer(); } - void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); + + void reduce(T* output, int tokens, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); @@ -85,6 +87,8 @@ class MoeFfnLayer { int* en2f_{}; float* scales_{}; + float* shared_scales_{}; + int* accum_{}; int* offsets_{}; }; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 68392215f6..28e8b5f649 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -26,9 +26,15 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx, true); - moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + + if (moe.expert_num) { + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + } + + if (model.inter_size) { + ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -190,9 +196,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con /// feed-forward network if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - else { + + if (ffn_layer_) { int layer_id = layer; // int is needed TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; @@ -203,6 +210,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, 
con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 8db13652f5..38552be0cf 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -301,6 +301,8 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size, moe_param_.expert_num = model_reader["expert_num"].as<int>(0); moe_param_.experts_per_token = model_reader["experts_per_token"].as<int>(0); moe_param_.inter_size = model_reader["expert_inter_size"].as<int>(0); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as<bool>(0); + moe_param_.norm_topk = model_reader["moe_norm_topk"].as<bool>(false); handleMissingParams()
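
Reviewer sketch (not part of the patch): a minimal NumPy reference of the routing and reduction math that the new MoeGateKernel_v8 / MoeReduceKernel paths implement, as read from the diff, to make the kernel changes easier to check. Function names and array layouts below are illustrative assumptions only, not the CUDA interfaces.

import numpy as np

def moe_route(logits, top_k, norm_topk):
    # logits: [tokens, experts] router output -> (expert_ids, scales), both [tokens, top_k]
    ids = np.argsort(-logits, axis=-1)[:, :top_k]
    if norm_topk:
        # norm_topk path: softmax normalized over the selected experts only
        top = np.take_along_axis(logits, ids, axis=-1)
        z = np.exp(top - top.max(axis=-1, keepdims=True))
        scales = z / z.sum(axis=-1, keepdims=True)
    else:
        # otherwise: softmax over all experts, then keep the top-k probabilities
        z = np.exp(logits - logits.max(axis=-1, keepdims=True))
        p = z / z.sum(axis=-1, keepdims=True)
        scales = np.take_along_axis(p, ids, axis=-1)
    return ids, scales

def moe_reduce(expert_out, scales, shared_out=None, shared_gate_logit=None):
    # expert_out: [tokens, top_k, dim], scales: [tokens, top_k]
    out = (expert_out * scales[..., None]).sum(axis=1)
    if shared_out is not None:
        # shared-expert output is blended in through a sigmoid gate,
        # matching the dst_scales handling in MoeReduceKernel
        out = out + shared_out * (1.0 / (1.0 + np.exp(-shared_gate_logit)))[:, None]
    return out

In the decoder this corresponds to the shared-expert FFN writing its output first, then MoeFfnLayer::reduce scaling it by sigmoid(shared_gate) and accumulating the top-k expert outputs weighted by the routing scales.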