Skip to content

Commit

Permalink
fix turbomind TP for v0.6.2 (#2713)
Browse files (browse the repository at this point in the history)
  • Loading branch information
lzhangzz authored Nov 5, 2024
1 parent 522108c commit 5ea819f
Showing 1 changed file with 4 additions and 6 deletions.
10 changes: 4 additions & 6 deletions src/turbomind/triton_backend/llama/LlamaTritonModel.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -333,12 +333,6 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
}
else {
moe_param_.method = ft::MoeParam::kFused;
// Note: This will fail when GPUs of different SMs are mixed
if (weight_type_ != ft::WeightType::kINT4 && ft::getSMVersion() >= 90) {
// On sm90 the cuBLAS method may be faster as our grouped GEMM is not
// optimized for GMMA yet
moe_param_.method = ft::MoeParam::kNaive;
}
}

TM_LOG_INFO("%s", toString().c_str());
Expand Down Expand Up @@ -377,6 +371,10 @@ std::unique_ptr<ft::Engine<T>> LlamaTritonModel<T>::createSharedModelInstance(
shared_state_,
device_id);

// Wait for pinned buffers to be allocated for all ranks, otherwise tuning will hang
// due to concurrent kernel launch & cudaMallocHost
shared_state_->barrier->wait();

engine->Start();

return engine;
Expand Down

0 comments on commit 5ea819f

Please sign in to comment.