From aaaad7612770db34d0646290946e9a4a2aa05eda Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Mon, 4 Nov 2024 08:48:03 +0000
Subject: [PATCH 1/2] fix tp

---
 src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 5fbd4287a8..23abd12c99 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -380,6 +380,10 @@ std::unique_ptr<ft::Engine<T>> LlamaTritonModel<T>::createSharedModelInstance(
                                                shared_state_,
                                                device_id);
 
+    // Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang
+    // due to concurrent kernel launch & cudaMallocHost
+    shared_state_->barrier->wait();
+
     engine->Start();
 
     return engine;

From 4f673171728ac782dc2218982430838ad319e405 Mon Sep 17 00:00:00 2001
From: Li Zhang
Date: Mon, 4 Nov 2024 10:43:59 +0000
Subject: [PATCH 2/2] use fused moe for all arches by default

---
 src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 23abd12c99..8db13652f5 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -336,12 +336,6 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
     }
     else {
         moe_param_.method = ft::MoeParam::kFused;
-        // Note: This will fail when GPUs of different SMs are mixed
-        if (weight_type_ != ft::WeightType::kINT4 && ft::getSMVersion() >= 90) {
-            // On sm90 the cuBLAS method may be faster as our grouped GEMM is not
-            // optimized for GMMA yet
-            moe_param_.method = ft::MoeParam::kNaive;
-        }
     }
 
     TM_LOG_INFO("%s", toString().c_str());