From d2d4209d148c09356492a04000a878270896178c Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Wed, 13 Nov 2024 11:27:26 +0800 Subject: [PATCH] Support Qwen2-MoE models (#2723) * add qwen2-moe * eliminate `inter_size_` from ffn layer * clean up * fix lint * clean up * Update config.yaml --------- Co-authored-by: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> --- autotest/config.yaml | 2 + lmdeploy/turbomind/deploy/config.py | 2 + lmdeploy/turbomind/deploy/module.py | 21 +- .../turbomind/deploy/source_model/mixtral.py | 2 + .../turbomind/deploy/source_model/qwen.py | 61 +++ lmdeploy/turbomind/supported_models.py | 1 + src/turbomind/kernels/gemm/moe_utils_v2.cu | 449 ++++++++++++++++-- src/turbomind/kernels/gemm/moe_utils_v2.h | 4 +- .../kernels/gemm/test/test_moe_utils.cu | 103 ++-- src/turbomind/kernels/gemm/test/testbed.h | 2 + .../models/llama/LlamaDecoderLayerWeight.cc | 52 +- src/turbomind/models/llama/LlamaDenseWeight.h | 29 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 47 +- src/turbomind/models/llama/LlamaFfnLayer.h | 11 +- src/turbomind/models/llama/llama_params.h | 8 +- src/turbomind/models/llama/moe_ffn_layer.cc | 78 +-- src/turbomind/models/llama/moe_ffn_layer.h | 6 +- src/turbomind/models/llama/unified_decoder.cc | 21 +- .../triton_backend/llama/LlamaTritonModel.cc | 2 + 19 files changed, 711 insertions(+), 190 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 9357e473bb..587ee6331b 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -44,10 +44,12 @@ turbomind_chat_model: - Qwen/Qwen2-1.5B-Instruct - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ + - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen-VL-Chat - Qwen/Qwen2.5-0.5B-Instruct - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.3 - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index 7e8ebf7b47..a535b0d4c1 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -50,6 +50,8 @@ class ModelConfig: expert_num: int = 0 expert_inter_size: int = 0 experts_per_token: int = 0 + moe_shared_gate: int = False + moe_norm_topk: int = False def verify(self): invalid = {} diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index a9f7385376..8d998abe2b 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -140,14 +140,18 @@ class MoeFfn(Ffn): requires: r.moe_ffn_expert(e, i, kind) r.moe_ffn_gate(i) + r.moe_ffn_shared_gate(i) """ _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}' - _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}' + _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight' + _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight' def __init__(self, model: BaseOutputModel): super().__init__(model) self.expert_num = model.model_config.expert_num + self.inter_size = model.model_config.expert_inter_size + self.shared_gate = model.model_config.moe_shared_gate def apply(self, i: int, r: BaseReader): for p in get_params(r.moe_ffn_expert()): @@ -157,7 +161,13 @@ def apply(self, i: int, r: BaseReader): i) gate = transpose(r.moe_ffn_gate(i)) - self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight')) + self.model.save_split(gate, self._moe_ffn_gate.format(i)) + + if self.shared_gate: + shared_gate = transpose(r.moe_ffn_shared_gate(i)) + # print(shared_gate) + 
self.model.save_split(shared_gate, + self._moe_ffn_shared_gate.format(i)) class Attn(Module): @@ -248,8 +258,11 @@ class Transformer: def __init__(self, model: BaseOutputModel): self.model = model - ffn = MoeFfn if model.model_config.expert_num else Ffn - modules = [Attn, LayerNorm, ffn] + modules = [Attn, LayerNorm] + if model.model_config.inter_size: + modules.append(Ffn) + if model.model_config.expert_num: + modules.append(MoeFfn) self.modules = [c(model) for c in modules] self.misc = Misc(model) diff --git a/lmdeploy/turbomind/deploy/source_model/mixtral.py b/lmdeploy/turbomind/deploy/source_model/mixtral.py index 102ede29f2..ff9df2d409 100644 --- a/lmdeploy/turbomind/deploy/source_model/mixtral.py +++ b/lmdeploy/turbomind/deploy/source_model/mixtral.py @@ -33,4 +33,6 @@ def model_info(self): info['expert_num'] = cfg['num_local_experts'] info['expert_inter_size'] = cfg['intermediate_size'] info['experts_per_token'] = cfg['num_experts_per_tok'] + info['moe_norm_topk'] = True + info['inter_size'] = 0 return info diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 0ec0586a37..772bd03037 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -120,3 +120,64 @@ def model_info(self): cfg = super().model_info() cfg['attn_bias'] = 1 return cfg + + +class Qwen2MoeReader(LlamaReader): + + ffn_pattern = r'shared_expert\.' + + def moe_ffn_expert(self, e=None, i=None, kind=None): + if not kind: + return self.filter(r'experts') + result = [] + for key in ['gate', 'down', 'up']: + name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}' + tensor = self.params.get(name) + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_gate(self, i): + return self.params.get(f'model.layers.{i}.mlp.gate.weight') + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + if not kind: + return self.filter(self.ffn_pattern) + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[ + f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) + result.append(tensor) + return (*result, ) + + def moe_ffn_shared_gate(self, i): + return self.params.get( + f'model.layers.{i}.mlp.shared_expert_gate.weight') + + +@INPUT_MODELS.register_module(name='qwen2-moe') +class Qwen2MoeModel(LlamaModel): + + Reader = Qwen2MoeReader + + def tokenizer_info(self): + """https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con + fig.json.""" # noqa: E501 + n_words = 152064 + bos_id = 151643 + eos_id = 151645 + return n_words, bos_id, eos_id + + def model_info(self): + cfg = self.model_config + info = super().model_info() + info['expert_num'] = cfg['num_experts'] + info['expert_inter_size'] = cfg['moe_intermediate_size'] + info['experts_per_token'] = cfg['num_experts_per_tok'] + info['inter_size'] = cfg['shared_expert_intermediate_size'] + info['moe_shared_gate'] = True + info['moe_norm_topk_prob'] = cfg['norm_topk_prob'] + info['attn_bias'] = 1 + return info diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index fe0819d70f..f6772fddd5 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -20,6 +20,7 @@ QWenLMHeadModel='qwen', # Qwen2 Qwen2ForCausalLM='qwen2', + Qwen2MoeForCausalLM='qwen2-moe', # mistral MistralForCausalLM='llama', # llava diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu 
b/src/turbomind/kernels/gemm/moe_utils_v2.cu index acf6355856..5912c60a8a 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" @@ -19,7 +19,7 @@ namespace turbomind { template __global__ void MoeGateKernel_V2(float* scales, // [e,n] - int* masks, // [E,n], padded + int8_t* masks, // [E,n], padded int* accum, // [E,tiles] const float* logits, // [E,n] int log_tile, @@ -88,6 +88,8 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] const int lowbit = (mask & -mask); const int e = 31 - __clz(lowbit); + // printf("e = %d, ti = %d, idx = %d\n", e, ti, i); + masks[e * tokens_padded + ti] = i; atomicAdd(&shared_accum[e][ti >> log_tile], 1); top_val[i] = logits[ti * experts + e]; @@ -120,11 +122,11 @@ __global__ void MoeGateKernel_V2(float* scales, // [e,n] } } -template -__global__ void MoeScanKernel_V2(int* f2n, // [e*n] +template +__global__ void MoeScanKernel_v2(int* f2n, // [e*n] int* en2f, // [e,n] int* offsets, // [E+1] - int* masks, // [E,n], padded + Mask* masks, // [E,n], padded const int* accum, // [E,tiles] int log_tile, int tiles, @@ -142,13 +144,15 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] constexpr int vec_size = kMoeGateVecSize; - using Vec = Array; + using Vec = Array; const int tile_id = blockIdx.x; const int ei = blockIdx.y; - const int global_tile_id = ei * tiles + tile_id; + const int global_tile_id = ei * tiles + tile_id; + const bool is_valid = global_tile_id <= experts * tiles; +#if 0 int vacc[4]{}; { int idx = threadIdx.x; @@ -162,6 +166,18 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); +#else + + int vacc = 0; + for (int i = threadIdx.x; i < global_tile_id; i += block_dim) { + if (is_valid && i < global_tile_id) { + vacc += accum[i]; + } + } + + int offset = BlockReduce{temp_storage.reduce}.Sum(vacc); + +#endif __shared__ int shared_offset; @@ -200,7 +216,7 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] const bool pred = vi < tile_vec_end; Vec data; - fill(data, -1); + fill(data, Mask{-1}); if (pred) { Ldg(data, mask_ptr[vi].data()); } @@ -231,17 +247,328 @@ __global__ void MoeScanKernel_V2(int* f2n, // [e*n] } } +template +__global__ void MoeGateKernel_v8(float* scales, // [e,n] + Mask* masks, // [E,n], padded + int* accum, // [E,tiles] + const float* logits, // [n,E] + int log_tile, + int tiles, + int token_num, + int token_num_padded, + int expert_num, + int top_k, + bool norm_topk) +{ + constexpr int max_tiles = kMoeGateMaxTiles; + constexpr int threads_per_token = max_expert_num / items_per_thread; // 8 + constexpr int tokens_per_cta = block_dim / threads_per_token; + + // We use bits in a uint32_t to represent selected experts + static_assert(items_per_thread <= 32); + // We use warp-level primitives for reduction + static_assert(threads_per_token <= 32); + + static_assert((threads_per_token & (threads_per_token - 1)) == 0); + + const int thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + + const int ti = thread_idx / threads_per_token; + const int ei = thread_idx % threads_per_token; + + const int bti = threadIdx.x / threads_per_token; + + const int warp_ti = threadIdx.x % WARP_SIZE / threads_per_token; + + const int warp_offset = thread_idx / WARP_SIZE * WARP_SIZE / threads_per_token; + const int block_offset = thread_idx / block_dim * block_dim / threads_per_token; + 
+ float data[items_per_thread]; + int idxs[items_per_thread]; + +#if 0 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + const int e = threads_per_token * i + ei * access_size; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#elif 1 + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] = -std::numeric_limits::infinity(); + // idxs[i] = threads_per_token * (i / access_size * access_size) + i % access_size + ei * access_size; + idxs[i] = ei * items_per_thread + i; + } + if (ti < token_num) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; i += access_size) { + // const int e = threads_per_token * i + ei * access_size; + const int e = ei * items_per_thread + i; + if (e < expert_num) { + Ldg((Array&)data[i], &logits[ti * expert_num + e]); + } + } + } + + __shared__ union { + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; +#else + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + constexpr int vecs_per_thread = items_per_thread / access_size; + + using Vec = Array; + constexpr int banks = 128 / sizeof(Vec); + constexpr int chunks = 4; // block_dim / WARP_SIZE; + + __shared__ union { + Vec shared_data[chunks][vecs_per_thread * WARP_SIZE / banks][banks + 1]; + struct { + // +1 padding greatly reduced (-80%) bank conflicts + int shared_accum[max_tiles][max_expert_num + 1]; + float shared_scales[max_top_k][tokens_per_cta]; + int shared_exp_id[max_top_k][tokens_per_cta]; + }; + } smem; + + __align__(16) Vec vecs[vecs_per_thread]; + + { + const int warp_end = min(warp_offset + WARP_SIZE / threads_per_token, token_num) * expert_num; + int p = warp_offset * expert_num + access_size * lane_id; + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + fill(vecs[i], -std::numeric_limits::infinity()); + // const int p = warp_offset * expert_num + access_size * (lane_id + i * WARP_SIZE); + if (p < warp_end) { + Ldg(vecs[i], &logits[p]); + } + p += access_size * WARP_SIZE; + } + } + + PRAGMA_UNROLL + for (int c = 0; c < block_dim / WARP_SIZE; c += chunks) { + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = i * WARP_SIZE + lane_id; + if (c <= warp_id && warp_id < c + chunks) { + Store(smem.shared_data[warp_id - c][p / banks][p % banks].data(), vecs[i]); + } + } + + __syncwarp(); + + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + int p = lane_id * vecs_per_thread + i; + if (c <= warp_id && warp_id < c + chunks) { + Load(vecs[i], smem.shared_data[warp_id - c][p / banks][p % banks].data()); + } + } + + __syncthreads(); + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + idxs[i] = ei * items_per_thread + i; + } + PRAGMA_UNROLL + for (int i = 0; i < vecs_per_thread; ++i) { + (Array&)data[i * access_size] = vecs[i]; + } + +#endif + + constexpr float 
kLog2e = 1.4426950408889634074; + + unsigned mask = (unsigned)-1; + float max_logit; + + int count{}; + float sum_prob{}; + + const int warp_ti_offset = warp_ti * threads_per_token; + + auto run = [&](int k) { + unsigned bit = 1; + unsigned max_bit = 0; + float max_val = -std::numeric_limits::infinity(); + // local maximum + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if ((mask & bit) && data[i] > max_val) { + max_bit = bit; + max_val = data[i]; + } + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + + if (k == 0) { + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + data[i] *= kLog2e; + } + } + + int g_max_ei = ei; + float g_max_val = max_val; + if constexpr (threads_per_token > 1) { + // global maximum + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + g_max_val = fmaxf(g_max_val, __shfl_xor_sync((uint32_t)-1, g_max_val, m)); + } + // tie breaking + const auto active = __ballot_sync((uint32_t)-1, max_val == g_max_val); + g_max_ei = __ffs(active >> (unsigned)warp_ti_offset) - 1; + } + if (k == 0) { + max_logit = g_max_val; + } + if (ei == g_max_ei) { + mask -= max_bit; + ++count; + } + }; + + run(0); + + for (int k = 1; k < top_k; ++k) { + run(k); + } + + mask = ~mask; + + int used[items_per_thread]; + { + unsigned bit = 1; + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + used[i] = (mask & bit) > 0; + asm("shl.b32 %0, %1, 1;\n" : "=r"(bit) : "r"(bit)); + } + } + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (!norm_topk || used[i]) { + data[i] = exp2f(data[i] - max_logit); + sum_prob += data[i]; + } + } + + PRAGMA_UNROLL + for (int m = threads_per_token / 2; m >= 1; m /= 2) { + sum_prob += __shfl_xor_sync((uint32_t)-1, sum_prob, m); + } + + sum_prob = fdividef(1.f, sum_prob); + + using WarpScan = cub::WarpScan; + __shared__ typename WarpScan::TempStorage temp_storage[tokens_per_cta]; + + int idx{}; + WarpScan{temp_storage[bti]}.ExclusiveSum(count, idx); + + PRAGMA_UNROLL + for (int i = 0; i < items_per_thread; ++i) { + if (used[i]) { + smem.shared_exp_id[idx][bti] = idxs[i]; + smem.shared_scales[idx][bti] = data[i] * sum_prob; + ++idx; + } + } + + PRAGMA_UNROLL + for (int i = 0; i < max_tiles * max_expert_num; i += block_dim) { + int e = (i + threadIdx.x) % max_expert_num; + int t = (i + threadIdx.x) / max_expert_num; + smem.shared_accum[t][e] = 0; + } + + __syncthreads(); + + constexpr int k_per_thread = cdiv(max_top_k, threads_per_token); + + const int bti2 = threadIdx.x % tokens_per_cta; + const int ei2 = threadIdx.x / tokens_per_cta; + const int ti2 = blockIdx.x * tokens_per_cta + bti2; + + PRAGMA_UNROLL + for (int i = 0; i < k_per_thread; ++i) { + const int idx = ei2 * k_per_thread + i; + const int expert_id = smem.shared_exp_id[idx][bti2]; + const float scale = smem.shared_scales[idx][bti2]; + + if (ti2 < token_num && idx < top_k) { + masks[expert_id * token_num_padded + ti2] = idx; + scales[idx * token_num + ti2] = scale; + atomicAdd(&smem.shared_accum[ti2 >> log_tile][expert_id], 1); + + // printf("%d %d %f\n", idx, expert_id, scale); + } + } + + __syncthreads(); + + for (int i = 0; i < max_expert_num * max_tiles; i += block_dim) { + int t = (threadIdx.x + i) % max_tiles; + int e = (threadIdx.x + i) / max_tiles; + if (e < expert_num && t < tiles) { + atomicAdd(accum + e * tiles + t, smem.shared_accum[t][e]); + } + } +} + +template +inline constexpr std::integral_constant _Int{}; + void invokeMoeGate_V2(int* f2n, // [e*n] -> n int* en2f, // [e,n] -> n*e int* offsets, // [E+1] float* 
scales, // [e,n] - int* masks, // [E,n] + void* masks, // [E,n] int* accum, // [E] const float* logits, // [e,n] int tokens, // n int tokens_padded, // round_up(n, 4) int experts, // E int experts_per_token, + bool norm_topk, cudaStream_t st) { constexpr int base_log_tile = 9; @@ -254,48 +581,64 @@ void invokeMoeGate_V2(int* f2n, // [e*n] -> n // std::cout << log_tile << " " << tiles << "\n"; - { - constexpr int threads = 128; - const int blocks = ceil_div(tokens, threads); + auto invoke = [&](auto max_expert_num, auto top_k, auto items_per_thread) { + constexpr int thrs_per_tok = max_expert_num.value / items_per_thread.value; + constexpr int threads = 256; + const int blocks = ceil_div(tokens, threads / thrs_per_tok); + + cudaMemsetAsync(masks, -1, sizeof(int8_t) * experts * tokens_padded, st); - auto invoke = [&](auto e) { - static constexpr int top_k = decltype(e)::value; - MoeGateKernel_V2<<>>( // + MoeGateKernel_v8 + <<>>( // scales, - masks, + (int8_t*)masks, accum, logits, log_tile, tiles, tokens, tokens_padded, - experts); - }; + experts, + experts_per_token, + norm_topk); + }; - switch (experts_per_token) { - case 2: - invoke(std::integral_constant{}); - break; - // case 4: - // invoke(std::integral_constant{}); - // break; - default: - std::cerr << __FILE__ << ":" << __LINE__ << " Not implemented. " << std::endl; - std::abort(); + auto fail = [&] { + std::cerr << "unsupported moe config: expert_num=" << experts << ", top_k=" << experts_per_token << "\n"; + std::abort(); + }; + + if (experts <= 8) { + if (experts_per_token <= 2) { + invoke(_Int<8>, _Int<2>, _Int<8>); + } + else { + invoke(_Int<8>, _Int<8>, _Int<8>); } } - - // return; + else if (experts <= 64) { + if (experts_per_token <= 4) { + invoke(_Int<64>, _Int<4>, _Int<16>); + } + else if (experts_per_token <= 8) { + invoke(_Int<64>, _Int<8>, _Int<16>); + } + else { + fail(); + } + } + else { + fail(); + } { - // Check: tiles * experts <= threads - constexpr int threads = (1 << base_log_tile) / kMoeGateVecSize; const dim3 blocks(tiles, experts + 1); - MoeScanKernel_V2<<>>(f2n, // + + MoeScanKernel_v2<<>>(f2n, // en2f, offsets, - masks, + (int8_t*)masks, accum, log_tile, tiles, @@ -338,10 +681,11 @@ void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int exper template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); template -__global__ void MoeReduceKernel(T* dst, // [ n, d] - const T* src, // [e*n, d] - const float* scales, // [ e, n] - const int* en2f, // [ e, n] :: (e,n) -> e*n +__global__ void MoeReduceKernel(T* dst, // [ n, d] + const T* src, // [e*n, d] + const float* scales, // [ e, n] + const int* en2f, // [ e, n] :: (e,n) -> e*n + const float* dst_scales, // [n] int dims, int tokens) { @@ -351,6 +695,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] auto dst_ptr = (Vec*)dst + dims * ti; + float dst_scale = 0; + if (dst_scales) { + dst_scale = dst_scales[ti]; + dst_scale = fdividef(1.f, 1.f + expf(-dst_scale)); + } + // Should be warp uniforms const Vec* src_ptr[exp_k]; float scale[exp_k]; @@ -362,6 +712,12 @@ __global__ void MoeReduceKernel(T* dst, // [ n, d] for (int i = threadIdx.x; i < dims; i += block_dim) { Array accum{}; + if (dst_scales) { + Vec v; + Ldg(v, dst_ptr[i].data()); + using namespace ops; + accum = cast(v) * dst_scale; + } PRAGMA_UNROLL for (int e = 0; e < exp_k; ++e) { Vec v; @@ -379,6 +735,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int 
experts_per_token, int dims, @@ -395,6 +752,7 @@ void invokeMoeReduce(T* dst, src, scales, en2f, + dst_scales, dims / vec_size, tokens); }; @@ -404,19 +762,22 @@ void invokeMoeReduce(T* dst, return invoke(std::integral_constant{}); case 2: return invoke(std::integral_constant{}); - // case 4: - // return invoke(std::integral_constant{}); - // case 6: - // return invoke(std::integral_constant{}); + case 4: + return invoke(std::integral_constant{}); + case 6: + return invoke(std::integral_constant{}); + case 8: + return invoke(std::integral_constant{}); default: fprintf(stderr, "Unsupported experts_per_token %d\n", experts_per_token); std::abort(); } } -template void invokeMoeReduce(half*, const half*, const float*, const int*, int, int, int, cudaStream_t); +template void invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, cudaStream_t); #ifdef ENABLE_BF16 -template void invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, int, int, int, cudaStream_t); +template void +invokeMoeReduce(nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, cudaStream_t); #endif std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 334e2de272..0e4c36af09 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -14,13 +14,14 @@ void invokeMoeGate_V2(int* f2n, int* en2f, int* offsets, float* scales, - int* masks, + void* masks, int* accum, const float* logits, int tokens, int tokens_padded, int experts, int exp_per_tok, + bool norm_topk, cudaStream_t st); template @@ -49,6 +50,7 @@ void invokeMoeReduce(T* dst, const T* src, const float* scales, const int* en2f, + const float* dst_scales, int tokens, int experts_per_token, int dims, diff --git a/src/turbomind/kernels/gemm/test/test_moe_utils.cu b/src/turbomind/kernels/gemm/test/test_moe_utils.cu index a311162193..47e3bfdb16 100644 --- a/src/turbomind/kernels/gemm/test/test_moe_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_moe_utils.cu @@ -26,6 +26,25 @@ void print_vecs(const T* data, int m, int k, std::string msg, int width = 4) } } +template +void diff_vecs(const T* data, const T* refs, int m, int k, std::string msg) +{ + if (!msg.empty()) { + std::cout << msg << ": [" << m << ", " << k << "]\n"; + } + for (int mm = 0; mm < m; ++mm) { + std::cout << "m=" << mm << ": "; + for (int kk = 0; kk < k; ++kk) { + const auto& x = data[mm * k + kk]; + const auto& y = refs[mm * k + kk]; + if (x != y) { + std::cout << kk << "(" << x << ", " << y << ") "; + } + } + std::cout << "\n"; + } +} + #if 0 void func() { @@ -190,7 +209,7 @@ void moe_gate_ref(int tokens, } } -void mask2eids(const universal_vector& masks, universal_vector& eids, int tokens, int expert_num) +void mask2eids(universal_vector& masks, universal_vector& eids, int tokens, int expert_num) { const int tokens_padded = masks.size() / expert_num; // std::cout << eids.size() << std::endl; @@ -228,13 +247,13 @@ bool test_moe_gate(int tokens, // const int tokens_padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; // const int max_coords = get_max_coords(tokens, expert_num, experts_per_token, tiling); - universal_vector offsets(expert_num + 1); - universal_vector accum(expert_num * kMoeGateMaxTiles); - universal_vector masks(expert_num * tokens_padded); - universal_vector eids(experts_per_token * tokens); - 
universal_vector f2n(experts_per_token * tokens); - universal_vector en2f(experts_per_token * tokens); - universal_vector scales(experts_per_token * tokens); + universal_vector offsets(expert_num + 1); + universal_vector accum(expert_num * kMoeGateMaxTiles); + universal_vector masks(expert_num * tokens_padded); + universal_vector eids(experts_per_token * tokens); + universal_vector f2n(experts_per_token * tokens); + universal_vector en2f(experts_per_token * tokens); + universal_vector scales(experts_per_token * tokens); // universal_vector coords(max_coords); // thrust::fill(coords.begin(), coords.end(), int2{-1, 0}); @@ -246,8 +265,16 @@ bool test_moe_gate(int tokens, // moe_gate_ref(tokens, expert_num, experts_per_token, logits, offsets_ref, eids_ref, f2n_ref, en2f_ref, scales_ref); - for (int i = 0; i < 10; ++i) { + cudaMemPrefetchAsync(f2n.data().get(), sizeof(int) * f2n.size(), 0); + cudaMemPrefetchAsync(en2f.data().get(), sizeof(int) * en2f.size(), 0); + cudaMemPrefetchAsync(offsets.data().get(), sizeof(int) * offsets.size(), 0); + cudaMemPrefetchAsync(scales.data().get(), sizeof(float) * scales.size(), 0); + cudaMemPrefetchAsync(logits.data().get(), sizeof(float) * logits.size(), 0); + + for (int i = 0; i < 1; ++i) { + gemm::CacheFlushing::flush(); cudaMemset(accum.data().get(), 0, sizeof(int) * accum.size()); + cudaMemset(masks.data().get(), -1, sizeof(int8_t) * masks.size()); invokeMoeGate_V2(f2n.data().get(), en2f.data().get(), offsets.data().get(), @@ -259,6 +286,7 @@ bool test_moe_gate(int tokens, // tokens_padded, expert_num, experts_per_token, + true, 0); } @@ -306,7 +334,10 @@ bool test_moe_gate(int tokens, // success = false; } - if (!success || false) { + if (!success && 1) { + + diff_vecs(eids.data().get(), eids_ref.data().get(), experts_per_token, tokens, "eids"); + print_vecs(offsets_ref.data().get(), 1, expert_num + 1, "offsets_ref"); print_vecs(offsets.data().get(), 1, expert_num + 1, "offsets"); @@ -322,32 +353,32 @@ bool test_moe_gate(int tokens, // print_vecs(scales_ref.data().get(), experts_per_token, tokens, "scales_ref", 12); print_vecs(scales.data().get(), experts_per_token, tokens, "scales", 12); - print_vecs(accum.data().get(), expert_num, 1, "accum"); + // print_vecs(accum.data().get(), expert_num, 1, "accum"); // print_vecs(coords.data().get(), 1, max_coords, "coords"); - thrust::host_vector tile_offsets(tape.max_ctas); - std::cout << tape.max_ctas << std::endl; - cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); - cudaDeviceSynchronize(); - - std::cout << "coords:\n"; - int last = -1; - for (int i = 0; i < tape.max_ctas; ++i) { - auto& c = tile_offsets[i]; - if (last >= 0 && c.w != last) { - std::cout << "\n"; - } - if (c.w == -1) { - std::cout << i << "\n"; - break; - } - last = c.w; - std::stringstream ss; - ss << c.x << "," << c.y; - std::cout << std::setw(6) << ss.str(); - } - std::cout << "\n"; + // thrust::host_vector tile_offsets(tape.max_ctas); + // std::cout << tape.max_ctas << std::endl; + // cudaMemcpy(tile_offsets.data(), tape.tile_offsets, sizeof(int4) * tile_offsets.size(), cudaMemcpyDefault); + // cudaDeviceSynchronize(); + + // std::cout << "coords:\n"; + // int last = -1; + // for (int i = 0; i < tape.max_ctas; ++i) { + // auto& c = tile_offsets[i]; + // if (last >= 0 && c.w != last) { + // std::cout << "\n"; + // } + // if (c.w == -1) { + // std::cout << i << "\n"; + // break; + // } + // last = c.w; + // std::stringstream ss; + // ss << c.x << "," << c.y; + // std::cout << 
std::setw(6) << ss.str(); + // } + // std::cout << "\n"; } return success; @@ -358,7 +389,11 @@ int main() gemm::Tape tape{}; constexpr Tiling tiling{14336, 128, {128, 128, 32}}; - test_moe_gate(8192, 8, 2, tape, tiling); + // test_moe_gate(32768 * 4, 60, 4, tape, tiling); + // test_moe_gate(32768, 64, 8, tape, tiling); + // test_moe_gate(8, 60, 4, tape, tiling); + + test_moe_gate(65536, 8, 2, tape, tiling); return 0; for (int i = 1; i < 16384; ++i) { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 6b1ec88f58..7a089fbdf2 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -514,6 +514,7 @@ class Testbed { c_e_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, @@ -523,6 +524,7 @@ class Testbed { c_e_ref_.data().get(), moe_scales_.data().get(), moe_en2f_.data().get(), + nullptr, batch_size_, expert_ids_.size() / batch_size_, output_dims_, diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 2d68ef3535..f6f9ab0efa 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -137,6 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, moe_param.inter_size, moe_param.expert_num, moe_param.method, + moe_param.shared_gate, tensor_para_size_, weight_type, group_size, @@ -349,18 +350,22 @@ void LlamaDecoderLayerWeight::mallocWeights() mallocWeights(self_attn_weights.qkv, attn_bias_); mallocWeights(self_attn_weights.output, attn_bias_); - if (moe_weights.experts.empty()) { + if (inter_size_) { mallocWeights(ffn_weights.gating, false); mallocWeights(ffn_weights.intermediate, false); mallocWeights(ffn_weights.output, false); } - else { + + if (!moe_weights.experts.empty()) { mallocWeights(moe_weights.gate, false); for (auto& e : moe_weights.experts) { mallocWeights(e.gating, false); mallocWeights(e.intermediate, false); mallocWeights(e.output, false); } + if (moe_weights.shared_gate.output_dims) { + mallocWeights(moe_weights.shared_gate, false); + } } } @@ -375,10 +380,25 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - freeWeights(ffn_weights.fused_gating_intermediate); - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - freeWeights(ffn_weights.output); + if (inter_size_) { + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); + freeWeights(ffn_weights.output); + } + + if (!moe_weights.experts.empty()) { + freeWeights(moe_weights.gate); + for (auto& e : moe_weights.experts) { + freeWeights(e.fused_gating_intermediate); + freeWeights(e.gating); + freeWeights(e.intermediate); + freeWeights(e.output); + } + if (moe_weights.shared_gate.kernel) { + freeWeights(moe_weights.shared_gate); + } + } } template @@ -428,23 +448,30 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); - if (moe_weights.experts.empty()) { + if (inter_size_) { getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); getWeightTensor(ffn_weights.intermediate, false, 
get_prefix("feed_forward.w3"), output); getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); } - else { + + if (!moe_weights.experts.empty()) { output.insert( concat(prefix, "moe_ffn.gate.weight"), Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); auto& experts = moe_weights.experts; for (size_t i = 0; i < experts.size(); ++i) { const std::string name = "moe_ffn.experts." + std::to_string(i); - // std::cerr << "FUCK " << get_prefix(concat(name, "w1")) << "\n"; getWeightTensor(experts[i].gating, false, get_prefix(concat(name, "w1")), output); getWeightTensor(experts[i].intermediate, false, get_prefix(concat(name, "w3")), output); getWeightTensor(experts[i].output, false, get_prefix(concat(name, "w2")), output); } + if (moe_weights.shared_gate.kernel) { + output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), + Tensor{MEMORY_GPU, + getTensorType(), + {moe_weights.shared_gate.kernel_size()}, + moe_weights.shared_gate.kernel}); + } } return output; @@ -681,10 +708,13 @@ void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cud convert(ffn.output, is_fused_moe, workspace, size, is_16xx); }; - if (moe_weights.experts.empty()) { + if (inter_size_) { + // std::cerr << "process FFN\n"; process_ffn(ffn_weights, false); } - else { + + if (!moe_weights.experts.empty()) { + // std::cerr << "process MoE\n"; std::vector> fused_ptrs; std::vector> output_ptrs; std::vector> fused_param_ptrs; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 9a895243bc..169fb53bcf 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -145,24 +145,28 @@ struct LlamaFfnWeight { LlamaFfnWeight( size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + inter_size /= tp; + + this->inter_size = inter_size; + gating.input_dims = hidden_dim; - gating.output_dims = inter_size / tp; + gating.output_dims = inter_size; gating.type = weight_type; gating.group_size = group_size; intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size / tp; + intermediate.output_dims = inter_size; intermediate.type = weight_type; intermediate.group_size = group_size; fused_gating_intermediate.input_dims = hidden_dim; - fused_gating_intermediate.output_dims = inter_size / tp * 2; + fused_gating_intermediate.output_dims = inter_size * 2; fused_gating_intermediate.type = weight_type; fused_gating_intermediate.group_size = group_size; is_fused_silu = fuse_silu_act; - output.input_dims = inter_size / tp; + output.input_dims = inter_size; output.output_dims = hidden_dim; output.type = weight_type; output.group_size = group_size; @@ -173,6 +177,7 @@ struct LlamaFfnWeight { LlamaDenseWeight output; LlamaDenseWeight fused_gating_intermediate; + int inter_size{}; bool is_fused_silu{}; }; @@ -185,11 +190,15 @@ struct MoeFfnWeight { int inter_size, int expert_num, int method, + bool has_shared_gate, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) { + + // printf("%d %d %d\n", (int)hidden_dim, (int)inter_size, (int)expert_num); + if (expert_num == 0) { return; } @@ -208,11 +217,23 @@ struct MoeFfnWeight { // inter size is divided by tp in `FfnWeight` e = LlamaFfnWeight{hidden_dim, (size_t)inter_size, tp, weight_type, group_size, fuse_silu_act}; } + + if (has_shared_gate) { + shared_gate.input_dims = hidden_dim; + shared_gate.output_dims = 1; + 
shared_gate.type = get_default_weight_type(); + gate.group_size = group_size; + } + else { + shared_gate = {}; + } } LlamaDenseWeight gate; std::vector> experts; + LlamaDenseWeight shared_gate; + LlamaFfnWeight block; int method{}; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index f9ee0c4ad4..8cce207203 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -28,10 +28,11 @@ namespace turbomind { template void LlamaFfnLayer::allocateBuffer(size_t token_num, + int inter_size, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - const size_t sz = token_num * inter_size_; + const size_t sz = token_num * inter_size; const size_t sz_gate = token_num * gating->lora.r; const size_t sz_inter = token_num * inter->lora.r; @@ -51,24 +52,24 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int token_num, bool is_chunked) +void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) { NvtxScope scope("activation"); if (is_chunked) { + // gate & up are in the SAME buffer invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); sync_check_cuda_error(); } else { - invokeGenericActivation_v2( - gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + // gate & up are in separate buffers + invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); sync_check_cuda_error(); } } @@ -88,11 +89,11 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("ffn"); - const size_t num_token = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - // LOG(WARNING); + const size_t token_num = input_tensors->at("ffn_input").shape[0]; + const int layer_id = input_tensors->getVal("layer_id"); + const int inter_size = weights->inter_size; - allocateBuffer(num_token, &weights->gating, &weights->intermediate); + allocateBuffer(token_num, inter_size, &weights->gating, &weights->intermediate); const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); @@ -103,50 +104,50 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type); sync_check_cuda_error(); if (!weights->is_fused_silu) { - activation(num_token, true); + activation(token_num, inter_size, true); } - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } else { { // w1(x) NvtxScope scope("w1"); - linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); + linear_->forward(gating_buf_, ffn_input_data, token_num, weights->gating, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3); { // w3(x) NvtxScope scope("w3"); linear_->forward( - inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear::kGemm, lora_mask); + inter_buf_, ffn_input_data, token_num, weights->intermediate, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3); + count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3); // silu(w1(x)) * w3(x) - activation(num_token, false); + activation(token_num, inter_size, false); - count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3); + count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0; + const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; linear_->forward( - ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + ffn_output_data, {gating_buf_, pitch}, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } - count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); + count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); if (all_reduce_ && tensor_para_.world_size_ > 1) { NcclGuard nccl_guard(tensor_para_, stream_); - ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_); + ftNcclAllReduceSum(ffn_output_data, ffn_output_data, token_num * hidden_units_, tensor_para_, stream_); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 75ced5f9ac..2daca2cc95 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,12 +19,11 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" -#include "src/turbomind/utils/custom_ar_comm.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nccl_utils.h" -#include namespace turbomind { @@ -32,7 +31,6 @@ template class LlamaFfnLayer { public: LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx, bool all_reduce): - inter_size_(model.inter_size / tp.world_size_), hidden_units_(model.hidden_units), tensor_para_(tp), stream_(ctx.stream), @@ -50,13 +48,12 @@ class LlamaFfnLayer { void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); private: - void allocateBuffer(size_t token_num, const LlamaDenseWeight*, const LlamaDenseWeight*); + void allocateBuffer(size_t token_num, int inter_size, const LlamaDenseWeight*, const LlamaDenseWeight*); void freeBuffer(); - void activation(int token_num, bool is_chunked); + void activation(int token_num, int inter_size, bool is_chunked); - const size_t inter_size_; const size_t hidden_units_; const NcclParam tensor_para_; cudaStream_t const stream_; diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 1c039ca66a..2ea63f0410 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -31,9 +31,11 @@ struct MoeParam { kNaive, kFused } method; - int expert_num; - int experts_per_token; - int inter_size; + int expert_num; + int experts_per_token; + int inter_size; + bool norm_topk; + bool shared_gate; }; struct AttentionParam { diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index def6b04abb..1ad76839d1 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -30,6 +30,7 @@ void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded) alloc(&f2n_, param_.experts_per_token * tokens); alloc(&en2f_, param_.experts_per_token * tokens); alloc(&scales_, param_.experts_per_token * tokens); + alloc(&shared_scales_, tokens); return (char*)alloc.ptr() - (char*)base; }; @@ -69,7 +70,7 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama getCudaDataType(), hidden_dim_, &beta, - logits_, + logits, CUDA_R_32F, 
weight.output_dims, CUDA_R_32F, @@ -77,13 +78,13 @@ void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const Llama } template -void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) { const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; AllocateBuffer(tokens, padded); - gate(logits_, inout, tokens, moe.gate); + gate(logits_, input, tokens, moe.gate); sync_check_cuda_error(); check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * param_.expert_num * kMoeGateMaxTiles, stream_)); @@ -103,6 +104,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei padded, param_.expert_num, param_.experts_per_token, + param_.norm_topk, stream_); sync_check_cuda_error(); @@ -123,7 +125,7 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, inout, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); sync_check_cuda_error(); check_cuda_error( @@ -155,28 +157,8 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei auto& block = moe.block; -#if 0 - FT_CHECK(!block.is_fused_silu); - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - inter_size_ * 2, - count, - hidden_dim_, - moe.experts[i].fused_gating_intermediate.kernel, - hidden_dim_, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2); - sync_check_cuda_error(); - } - } - auto mode = kCmpWrite; -#else linear_->forward_moe(inter_buf_, - {inout, (int)hidden_dim_}, + {input, (int)hidden_dim_}, f2n_, offsets_, tokens * param_.experts_per_token, @@ -185,7 +167,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_, // @@ -205,25 +186,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei sync_check_cuda_error(); } -#if 0 - for (int i = 0; i < param_.expert_num; ++i) { - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - cublas_->Gemm(CUBLAS_OP_T, // (m, k) W - CUBLAS_OP_N, // (k, n) X - hidden_dim_, - count, - inter_size_, - moe.experts[i].output.kernel, - inter_size_, - inter_buf_ + h_offsets_[i] * inter_size_ * 2, - inter_size_ * 2, - inout_buf_ + h_offsets_[i] * hidden_dim_, - hidden_dim_); - sync_check_cuda_error(); - } - } - auto mode1 = kCmpWrite; -#else linear_->forward_moe(inout_buf_, {inter_buf_, block.is_fused_silu ? 
(int)inter_size_ : (int)inter_size_ * 2}, nullptr, @@ -234,7 +196,6 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei context_.get()); sync_check_cuda_error(); auto mode1 = kCmpRead; -#endif // if (tensor_para_.rank_ == 0) { // Compare(inter_buf_2_, // @@ -250,18 +211,29 @@ void MoeFfnLayer::forward(T* inout, int tokens, int layer_id, const MoeFfnWei // } } - invokeMoeReduce(inout, inout_buf_, scales_, en2f_, tokens, param_.experts_per_token, hidden_dim_, stream_); + if (moe.shared_gate.kernel) { + gate(shared_scales_, input, tokens, moe.shared_gate); + } +} + +template +void MoeFfnLayer::reduce(T* output, int tokens, const MoeFfnWeight& moe) +{ + invokeMoeReduce(output, + inout_buf_, + scales_, + en2f_, + moe.shared_gate.kernel ? shared_scales_ : nullptr, + tokens, + param_.experts_per_token, + hidden_dim_, + stream_); sync_check_cuda_error(); if (tensor_para_.world_size_ > 1) { - ftNcclAllReduceSum(inout, inout, tokens * hidden_dim_, tensor_para_, stream_); + ftNcclAllReduceSum(output, output, tokens * hidden_dim_, tensor_para_, stream_); sync_check_cuda_error(); } - - // if (tensor_para_.rank_ == 0) { - // check_cuda_error(cudaStreamSynchronize(stream_)); - // std::abort(); - // } } template diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index ef65aaa464..0f1713f7b5 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -51,7 +51,9 @@ class MoeFfnLayer { FreeBuffer(); } - void forward(T* inout, int tokens, int layer_id, const MoeFfnWeight& moe); + void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); + + void reduce(T* output, int tokens, const MoeFfnWeight& moe); void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); @@ -85,6 +87,8 @@ class MoeFfnLayer { int* en2f_{}; float* scales_{}; + float* shared_scales_{}; + int* accum_{}; int* offsets_{}; }; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 68392215f6..28e8b5f649 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -26,9 +26,15 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, dtype_(getTensorType()) { - attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); - ffn_layer_ = std::make_unique>(model, tp, ctx, true); - moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + + if (moe.expert_num) { + moe_ffn_layer_ = std::make_unique>(model, moe, tp, ctx); + } + + if (model.inter_size) { + ffn_layer_ = std::make_unique>(model, tp, ctx, !moe_ffn_layer_); + } check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); } @@ -190,9 +196,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con /// feed-forward network if (!weights->at(layer)->moe_weights.experts.empty()) { - moe_ffn_layer_->forward(decoder_output, token_num, layer, weights->at(layer)->moe_weights); + moe_ffn_layer_->forward(nullptr, decoder_output, token_num, layer, weights->at(layer)->moe_weights); } - else { + + if (ffn_layer_) { int layer_id = layer; // int is needed TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}, {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; @@ -203,6 +210,10 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, 
con ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); } + if (!weights->at(layer)->moe_weights.experts.empty()) { + moe_ffn_layer_->reduce(decoder_output, token_num, weights->at(layer)->moe_weights); + } + count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); const bool is_last_layer = layer == layer_num_ - 1; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 8db13652f5..38552be0cf 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -301,6 +301,8 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size, moe_param_.expert_num = model_reader["expert_num"].as<int>(0); moe_param_.experts_per_token = model_reader["experts_per_token"].as<int>(0); moe_param_.inter_size = model_reader["expert_inter_size"].as<int>(0); + moe_param_.shared_gate = model_reader["moe_shared_gate"].as<bool>(0); + moe_param_.norm_topk = model_reader["moe_norm_topk"].as<bool>(false); handleMissingParams()
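
Reviewer sketch (not part of the patch): a minimal NumPy reference of the routing and reduction math that the new MoeGateKernel_v8 / MoeReduceKernel paths implement, as read from the diff, to make the kernel changes easier to check. Function names and array layouts below are illustrative assumptions only, not the CUDA interfaces.

import numpy as np

def moe_route(logits, top_k, norm_topk):
    # logits: [tokens, experts] router output -> (expert_ids, scales), both [tokens, top_k]
    ids = np.argsort(-logits, axis=-1)[:, :top_k]
    if norm_topk:
        # norm_topk path: softmax normalized over the selected experts only
        top = np.take_along_axis(logits, ids, axis=-1)
        z = np.exp(top - top.max(axis=-1, keepdims=True))
        scales = z / z.sum(axis=-1, keepdims=True)
    else:
        # otherwise: softmax over all experts, then keep the top-k probabilities
        z = np.exp(logits - logits.max(axis=-1, keepdims=True))
        p = z / z.sum(axis=-1, keepdims=True)
        scales = np.take_along_axis(p, ids, axis=-1)
    return ids, scales

def moe_reduce(expert_out, scales, shared_out=None, shared_gate_logit=None):
    # expert_out: [tokens, top_k, dim], scales: [tokens, top_k]
    out = (expert_out * scales[..., None]).sum(axis=1)
    if shared_out is not None:
        # shared-expert output is blended in through a sigmoid gate,
        # matching the dst_scales handling in MoeReduceKernel
        out = out + shared_out * (1.0 / (1.0 + np.exp(-shared_gate_logit)))[:, None]
    return out

In the decoder this corresponds to the shared-expert FFN writing its output first, then MoeFfnLayer::reduce scaling it by sigmoid(shared_gate) and accumulating the top-k expert outputs weighted by the routing scales.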