Support Qwen2-MoE models #2723
Conversation

lzhangzz commented on Nov 7, 2024
- Qwen1.5-MoE-A2.7B-Chat
- Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4
- Qwen2-57B-A14B-Instruct
- Qwen2-57B-A14B-Instruct-GPTQ-Int4
@zhulinJulia24 may update the TCs (test cases).
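For context, a minimal usage sketch with lmdeploy's high-level `pipeline` API; the model ID is one of the checkpoints listed above, and the prompt and printout are illustrative only:

```python
# Minimal sketch: run one of the newly supported Qwen MoE checkpoints
# through the TurboMind engine via lmdeploy's pipeline API.
from lmdeploy import pipeline

# Hugging Face model ID taken from the support list above.
pipe = pipeline('Qwen/Qwen1.5-MoE-A2.7B-Chat')

responses = pipe(['Briefly introduce yourself.'])
print(responses[0].text)
```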
@@ -50,6 +50,8 @@ class ModelConfig:
     expert_num: int = 0
     expert_inter_size: int = 0
     experts_per_token: int = 0
+    moe_shared_gate: int = False
`int` -> `bool`
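A sketch of the suggested fix, assuming `ModelConfig` is the dataclass shown in the hunk above:

```python
from dataclasses import dataclass

@dataclass
class ModelConfig:
    expert_num: int = 0
    expert_inter_size: int = 0
    experts_per_token: int = 0
    # The default is a bool, so the annotation should be `bool`, not `int`.
    moe_shared_gate: bool = False
```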
@@ -301,6 +301,8 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
     moe_param_.expert_num        = model_reader["expert_num"].as<int>(0);
     moe_param_.experts_per_token = model_reader["experts_per_token"].as<int>(0);
     moe_param_.inter_size        = model_reader["expert_inter_size"].as<int>(0);
+    moe_param_.shared_gate       = model_reader["moe_shared_gate"].as<int>(0);
`as<bool>(false)`: read the flag as a bool with a `false` default.
     info['experts_per_token'] = cfg['num_experts_per_tok']
     info['inter_size'] = cfg['shared_expert_intermediate_size']
+    info['moe_shared_gate'] = True
+    info['moe_norm_topk_prob'] = cfg['norm_topk_prob']
`moe_norm_topk_prob` is not defined in class `ModelConfig`.
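One way to resolve this, sketched on top of the dataclass above: declare the missing field so the converter's `moe_norm_topk_prob` key has a matching config attribute. The default value here is an assumption, not taken from the PR:

```python
from dataclasses import dataclass

@dataclass
class ModelConfig:
    expert_num: int = 0
    expert_inter_size: int = 0
    experts_per_token: int = 0
    moe_shared_gate: bool = False
    # Hypothetical addition: mirrors Qwen MoE's `norm_topk_prob` config key,
    # which controls whether the top-k routing weights are renormalized.
    moe_norm_topk_prob: bool = False
```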
         CUDA_R_32F,
         weight.output_dims,
         CUDA_R_32F,
         CUBLAS_GEMM_DEFAULT_TENSOR_OP);
 }

 template<class T>
-void MoeFfnLayer<T>::forward(T* inout, int tokens, int layer_id, const MoeFfnWeight<T>& moe)
+void MoeFfnLayer<T>::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight<T>& moe)
The added `output` parameter seems unused.