From 1b1a17712e0f968dff20ae4594f905e941251083 Mon Sep 17 00:00:00 2001
From: vince62s
Date: Thu, 13 Jun 2024 21:30:18 +0200
Subject: [PATCH] fix missing layers names

---
 docs/docusaurus_tsx/docs/FAQ/lora.md               | 4 ++--
 eole/models/model.py                               | 6 +++---
 eole/models/model_saver.py                         | 6 +++---
 recipes/llama2/llama-finetune.yaml                 | 2 +-
 recipes/llama2/llama-inference-tp-2gpu.yaml        | 2 +-
 recipes/llama2/llama-inference.yaml                | 2 +-
 recipes/llama3/llama-inference.yaml                | 2 +-
 recipes/llama3/llama-mmlu.yaml                     | 2 +-
 recipes/mistral/mistral-7b-awq-gemm-inference.yaml | 2 +-
 recipes/mixtral/mixtral-inference-awq.yaml         | 4 ++--
 10 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/docusaurus_tsx/docs/FAQ/lora.md b/docs/docusaurus_tsx/docs/FAQ/lora.md
index 39512356..04d67716 100644
--- a/docs/docusaurus_tsx/docs/FAQ/lora.md
+++ b/docs/docusaurus_tsx/docs/FAQ/lora.md
@@ -18,8 +18,8 @@ Also you can read the blog post here: https://huggingface.co/blog/hf-bitsandbyte
 
 You need to add the following option:
 
-* `quant_layers: ['w_1', 'w_2', 'linear_values', 'linear_query']`
+* `quant_layers: ['up_proj', 'down_proj', 'linear_values', 'linear_query']`
 * `quant_type: ['bnb_NF4']`
 
 You can for instance quantize the layers of the PositionWise Feed-Forward from the Encoder/Decoder and the key/query/values/final from the Multi-head attention.
-Choices for quantization are ["bnb_8bit", "bnb_FP4", "bnb_NF4"]
\ No newline at end of file
+Choices for quantization are ["bnb_8bit", "bnb_FP4", "bnb_NF4"]
diff --git a/eole/models/model.py b/eole/models/model.py
index 8c347b21..1fbba94c 100644
--- a/eole/models/model.py
+++ b/eole/models/model.py
@@ -553,8 +553,8 @@ def _load_param(self, name, module, param_name, param, buf_list, ckpt_t, offset)
             "linear_keys",
             "linear_values",
             "linear_query",
-            "w_1",
-            "w_3",
+            "gate_up_proj",
+            "up_proj",
         ]:
             col_slice_start = param.data.size(0) * offset
             col_slice_end = param.data.size(0) * (offset + 1)
@@ -562,7 +562,7 @@ def _load_param(self, name, module, param_name, param, buf_list, ckpt_t, offset)
             col_slice_start = 0
             col_slice_end = param.data.size(0)
         if param.data.dim() == 2:
-            if name.split(".")[-1] in ["final_linear", "w_2"]:
+            if name.split(".")[-1] in ["final_linear", "down_proj"]:
                 row_slice_start = param.data.size(1) * offset
                 row_slice_end = param.data.size(1) * (offset + 1)
             else:
diff --git a/eole/models/model_saver.py b/eole/models/model_saver.py
index 3181bba6..dd4b0bf9 100644
--- a/eole/models/model_saver.py
+++ b/eole/models/model_saver.py
@@ -162,10 +162,10 @@ def _tensor_parallel_state_dict(self, model_state_dict, world_size):
            "linear_keys",
            "linear_values",
            "linear_query",
-           "w_1",
-           "w_3",
+           "gate_up_proj",
+           "up_proj",
        }
-       cat_params = {"final_linear", "w_2"}
+       cat_params = {"final_linear", "down_proj"}
        # we probably should try and improve this to rely on dimensions instead of names
        match key_1, key_2:
            case "lora_A", _ if key_2 in averaged_params:
diff --git a/recipes/llama2/llama-finetune.yaml b/recipes/llama2/llama-finetune.yaml
index 3fb2f766..d4b25f5f 100755
--- a/recipes/llama2/llama-finetune.yaml
+++ b/recipes/llama2/llama-finetune.yaml
@@ -85,7 +85,7 @@ training:
   valid_steps: 100
 
   # 4/8bit
-  quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+  quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
   quant_type: "bnb_NF4"
 
   # LoRa
diff --git a/recipes/llama2/llama-inference-tp-2gpu.yaml b/recipes/llama2/llama-inference-tp-2gpu.yaml
index a960f529..146bdcf5 100755
--- a/recipes/llama2/llama-inference-tp-2gpu.yaml
+++ b/recipes/llama2/llama-inference-tp-2gpu.yaml
@@ -16,7 +16,7 @@ batch_size: 8
 world_size: 2
 gpu_ranks: [0, 1]
 parallel_mode: "tensor_parallel"
-quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
 quant_type: "bnb_NF4"
 precision: fp16
 random_sampling_topk: 5
diff --git a/recipes/llama2/llama-inference.yaml b/recipes/llama2/llama-inference.yaml
index 8544fc8d..3105987b 100755
--- a/recipes/llama2/llama-inference.yaml
+++ b/recipes/llama2/llama-inference.yaml
@@ -16,7 +16,7 @@ batch_size: 8
 world_size: 1
 gpu_ranks: [0]
 #parallel_mode: "tensor_parallel"
-quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
 quant_type: "bnb_NF4"
 precision: fp16
 #random_sampling_topk: 1
diff --git a/recipes/llama3/llama-inference.yaml b/recipes/llama3/llama-inference.yaml
index 309f91e3..d0f0611f 100755
--- a/recipes/llama3/llama-inference.yaml
+++ b/recipes/llama3/llama-inference.yaml
@@ -23,7 +23,7 @@ gpu_ranks: [0]
 # world_size: 2
 # gpu_ranks: [0, 1]
 # parallel_mode: "tensor_parallel"
-# quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+# quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
 # quant_type: "bnb_NF4"
 precision: fp16
 #random_sampling_topk: 1
diff --git a/recipes/llama3/llama-mmlu.yaml b/recipes/llama3/llama-mmlu.yaml
index 546beb6b..54ba12dc 100755
--- a/recipes/llama3/llama-mmlu.yaml
+++ b/recipes/llama3/llama-mmlu.yaml
@@ -23,7 +23,7 @@ gpu_ranks: [0]
 # world_size: 2
 # gpu_ranks: [0, 1]
 # parallel_mode: "tensor_parallel"
-# quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+# quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
 # quant_type: "bnb_NF4"
 precision: fp16
 #random_sampling_topk: 1
diff --git a/recipes/mistral/mistral-7b-awq-gemm-inference.yaml b/recipes/mistral/mistral-7b-awq-gemm-inference.yaml
index d0878241..b88f9534 100755
--- a/recipes/mistral/mistral-7b-awq-gemm-inference.yaml
+++ b/recipes/mistral/mistral-7b-awq-gemm-inference.yaml
@@ -18,7 +18,7 @@ gpu_ranks: [0]
 # world_size: 2
 # gpu_ranks: [0, 1]
 # parallel_mode: "tensor_parallel"
-#quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+#quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
 #quant_type: "bnb_NF4"
 precision: fp16
 #random_sampling_topk: 1
diff --git a/recipes/mixtral/mixtral-inference-awq.yaml b/recipes/mixtral/mixtral-inference-awq.yaml
index 79e3c752..b2daa4d5 100755
--- a/recipes/mixtral/mixtral-inference-awq.yaml
+++ b/recipes/mixtral/mixtral-inference-awq.yaml
@@ -16,8 +16,8 @@ batch_size: 1
 world_size: 2
 gpu_ranks: [0, 1]
 parallel_mode: "tensor_parallel"
-#quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
-#quant_layers: ['w_1', 'w_2', 'w_3']
+#quant_layers: ['gate_up_proj', 'down_proj', 'up_proj', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
+#quant_layers: ['gate_up_proj', 'down_proj', 'up_proj']
 #quant_type: "bnb_sparse"
 precision: fp16
 #random_sampling_topk: 1
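
For readers tracking what the rename means in practice: the new names keep the same tensor-parallel split rules as the old ones. `gate_up_proj` and `up_proj` (formerly `w_1`/`w_3`) are sliced along dim 0 together with the query/key/value projections, while `down_proj` (formerly `w_2`) is sliced along dim 1 like `final_linear`. The sketch below is a minimal standalone illustration of that slicing; the helper name, shapes, and structure are made up for the example and are not code from this patch.

```python
import torch

# Layers whose weights are split along dim 0 (each rank keeps a block of output rows).
COL_SPLIT = {"linear_keys", "linear_values", "linear_query", "gate_up_proj", "up_proj"}
# Layers whose weights are split along dim 1 (each rank keeps a block of input columns).
ROW_SPLIT = {"final_linear", "down_proj"}


def shard_for_rank(layer_name: str, weight: torch.Tensor, rank: int, world_size: int) -> torch.Tensor:
    """Return the slice of `weight` that tensor-parallel rank `rank` would hold."""
    if layer_name in COL_SPLIT:
        step = weight.size(0) // world_size
        return weight[rank * step : (rank + 1) * step]
    if layer_name in ROW_SPLIT and weight.dim() == 2:
        step = weight.size(1) // world_size
        return weight[:, rank * step : (rank + 1) * step]
    # Anything else (embeddings, norms, ...) stays replicated on every rank.
    return weight


# Hypothetical 8x4 weight split across 2 ranks.
w = torch.arange(32, dtype=torch.float32).reshape(8, 4)
print(shard_for_rank("down_proj", w, rank=0, world_size=2).shape)  # torch.Size([8, 2])
print(shard_for_rank("up_proj", w, rank=1, world_size=2).shape)    # torch.Size([4, 4])
```

Splitting the up/gate projections by output rows and the down/final projections by input columns is the usual Megatron-style layout: each rank can run its feed-forward or attention shard locally and only needs a reduction after the row-split matmul.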