
Commit

Merge branch 'update_transform' of https://github.com/LeiWang1999/MSBitBLAS into update_transform
LeiWang1999 committed Aug 9, 2024
2 parents 2327e14 + 72a1863 commit 40a9b04
Showing 13 changed files with 914 additions and 13 deletions.
1 change: 0 additions & 1 deletion bitblas/base/roller/policy/tensorcore.py
@@ -333,7 +333,6 @@ def _score(node, thread): # small is better
logger.info(info_message)
codegen_dict.shared_scope = "shared.dyn"

# Or assume we always use shared memory
codegen_dict.shared_scope = "shared.dyn"

codegen_dict.complete_config(node)
2 changes: 1 addition & 1 deletion install.sh
@@ -46,7 +46,7 @@ fi

echo "Download and extraction completed successfully."

LLVM_CONFIG_PATH="${EXTRACT_PATH}/$(basename ${FILE_NAME} .tar.xz)/bin/llvm-config"
LLVM_CONFIG_PATH="$(realpath ${EXTRACT_PATH}/$(basename ${FILE_NAME} .tar.xz)/bin/llvm-config)"
echo "LLVM config path: $LLVM_CONFIG_PATH"

# clone and build tvm
34 changes: 34 additions & 0 deletions integration/BitNet/README.md
@@ -2,8 +2,42 @@
license: mit
---


This is a BitBLAS implementation of the reproduced 1.58-bit model from [1bitLLM/bitnet_b1_58-3B](https://huggingface.co/1bitLLM/bitnet_b1_58-3B). We replaced the original simulated Int8x3bit quantized inference kernel with the BitBLAS INT8xINT2 kernel. We also evaluate the model's correctness and performance through `eval_correctness.py` and `benchmark_inference_latency.py`.
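
For reference, here is a minimal sketch of how such an INT8xINT2 operator can be instantiated through the BitBLAS operator API. The layer shapes below are placeholders rather than the model's actual dimensions, and the exact config fields should be checked against the installed BitBLAS version.

```python
import bitblas

# Placeholder shapes for a single BitNet linear layer (not the real 3B model sizes).
matmul_config = bitblas.MatmulConfig(
    M=1,                 # decoded tokens per step
    N=3200,              # output features (example value)
    K=8640,              # input features (example value)
    A_dtype="int8",      # activations quantized to INT8
    W_dtype="int2",      # ternary weights packed as INT2
    accum_dtype="int32",
    out_dtype="int32",
    layout="nt",
    with_bias=False,
)
matmul = bitblas.Matmul(config=matmul_config)
# matmul.transform_weight(...) repacks an int8 ternary weight tensor into the INT2 layout.
```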

## Latest News

- 08/09/2024 ✨: We provide a more efficient implementation of BitNet with vLLM, which requires special model checkpoints. To create the checkpoints and learn how to deploy them, please check out [Make Checkpoints for vLLM](#make-checkpoints-for-vllm).

## Make Checkpoints for vLLM

We provide two scripts to make the checkpoints for vLLM. The first script, `generate_bitnet_model_native_format.sh`, creates a checkpoint with fp16 uncompressed metadata; the main difference from the original checkpoint is the added `quantize_config.json`, which allows vLLM to load the model and execute it with a quantization extension.

```bash
# move to the integration directory
cd /root/to/BitBLAS/integration/BitNet
# make the checkpoint
./maint/generate_bitnet_model_native_format.sh
# the output checkpoint will be saved in the `./models/ckpt_bitnet_b1_58-3B` directory
```

The second script, `generate_bitnet_model_bitblas_format.sh`, creates a checkpoint with BitBLAS compressed metadata, which avoids the online dequantization stage during vLLM profiling and leads to more efficient memory utilization.

```bash
./maint/generate_bitnet_model_bitblas_format.sh ./models/ckpt_bitnet_b1_58-3B ./models/ckpt_bitnet_b1_58-3B_bitblas
# the output checkpoint will be saved in the `./models/ckpt_bitnet_b1_58-3B_bitblas` directory
```

Finally, you can run inference with the checkpoints in vLLM:

```bash
cd vllm_workspace
# inference with the ckpt with fp16 uncompressed metadata
python3 inference_with_native_format.py
# inference with the ckpt with BitBLAS compressed metadata
python3 inference_with_bitblas_format.py
```
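
For orientation, here is a rough sketch of what the native-format inference flow could look like through the standard vLLM `LLM` API; the checkpoint path and arguments are assumptions, and BitNet support itself relies on the vLLM quantization extension described above rather than upstream vLLM.

```python
from vllm import LLM, SamplingParams

# Assumed checkpoint path produced by the scripts above.
llm = LLM(
    model="./models/ckpt_bitnet_b1_58-3B",
    dtype="half",
    trust_remote_code=True,  # the BitNet model code is not part of upstream transformers
)

outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```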

## BitBLAS Results

### Performance
4 changes: 0 additions & 4 deletions integration/BitNet/eval_correctness.py
@@ -18,9 +18,6 @@ def generate_text(model, tokenizer, prompt, max_length=100):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    # position_embeddings = model.embed_positions(position_ids)
    # cos = position_embeddings[:, :, 0::2].cos()
    # sin = position_embeddings[:, :, 1::2].sin()

    generation_config = GenerationConfig(
        max_length=max_length,
@@ -32,7 +29,6 @@ def generate_text(model, tokenizer, prompt, max_length=100):

    start_time = time.time()
    output_ids = model.generate(input_ids, generation_config=generation_config)
    # output_ids = model.generate(input_ids, generation_config=generation_config, cos=cos, sin=sin)
    end_time = time.time()

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
@@ -1,24 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
import torch
import bitblas
from modeling_bitnet import BitnetForCausalLM
from tokenization_bitnet import BitnetTokenizer
from transformers.utils.hub import cached_file
import os
from transformers import GenerationConfig
import time
import json

import sys

sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + "/../")
from modeling_bitnet import BitnetForCausalLM
from tokenization_bitnet import BitnetTokenizer

filepath = os.path.abspath(__file__)
dirpath = os.path.dirname(filepath)

torch.set_grad_enabled(False)
bitblas.set_log_level("INFO")

model_name_or_path = "BitBLASModel/open_llama_3b_1.58bits"
saved_model_path = os.path.join(dirpath, "models", f"{model_name_or_path}_bitblas")
parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="1bitLLM/bitnet_b1_58-3B")
parser.add_argument("--saved_model_path", type=str, default=None)
args = parser.parse_args()

model_name_or_path = args.model_name_or_path
saved_model_path = os.path.join(
    dirpath, "models",
    f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path


def generate_text(model, tokenizer, prompt, max_length=100):
33 changes: 33 additions & 0 deletions integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
@@ -0,0 +1,33 @@
#!/bin/bash

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# retrieve the native model input and saved model directory
MODEL_DIR=$1
SAVED_MODEL_DIR=$2

# check if the model directory exists
if [ ! -d "$MODEL_DIR" ]; then
echo "Model directory does not exist!"
exit 1
fi

# if the saved model directory does not exist, create it
# if SAVED_MODEL_DIR is not provided, we do not pass it to the script
if [ -z "$SAVED_MODEL_DIR" ]; then
python ./maint/create_bitblas_ckpt.py --model_name_or_path $MODEL_DIR
else
python ./maint/create_bitblas_ckpt.py --model_name_or_path $MODEL_DIR --saved_model_path $SAVED_MODEL_DIR
fi

# get the realpath of the saved model directory
SAVED_MODEL_DIR=$(realpath $SAVED_MODEL_DIR)

# cp files
cp $MODEL_DIR/quantize_config.json $SAVED_MODEL_DIR/
cp $MODEL_DIR/tokenizer.json $SAVED_MODEL_DIR/
cp $MODEL_DIR/tokenizer.model $SAVED_MODEL_DIR/
cp $MODEL_DIR/tokenizer_config.json $SAVED_MODEL_DIR/

echo "Model has been converted and save to $SAVED_MODEL_DIR"
27 changes: 27 additions & 0 deletions integration/BitNet/maint/generate_bitnet_model_native_format.sh
@@ -0,0 +1,27 @@
#!/bin/bash

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# require git lfs
if ! command -v git-lfs &> /dev/null; then
echo "Please install git-lfs first by running 'sudo apt install git-lfs'"
exit 1
fi

mkdir -p models

cd models

# download the model
git clone https://huggingface.co/1bitLLM/bitnet_b1_58-3B ckpt_bitnet_b1_58-3B --depth 1

# copy quantized config into the model directory
cp ../maint/quantize_config.json ckpt_bitnet_b1_58-3B

# get the realpath of the model directory
MODEL_DIR=$(realpath ckpt_bitnet_b1_58-3B)

cd ..

echo "Model has been converted and save to $MODEL_DIR"
10 changes: 10 additions & 0 deletions integration/BitNet/maint/quantize_config.json
@@ -0,0 +1,10 @@
{
"bits": 2,
"desc_act": false,
"static_groups": false,
"sym": true,
"lm_head": false,
"model_name_or_path": "1bitLLM/bitnet_b1_58-3B",
"quant_method": "bitnet",
"checkpoint_format": "bitnet"
}
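
As a hypothetical illustration (not part of this commit), an inference script could read this file to decide how to load and dequantize the weights; `load_quant_config` below is an invented helper, not an existing API.

```python
import json
import os


def load_quant_config(ckpt_dir: str) -> dict:
    """Read the quantization config that the maint scripts copy next to the checkpoint."""
    with open(os.path.join(ckpt_dir, "quantize_config.json")) as f:
        return json.load(f)


cfg = load_quant_config("./models/ckpt_bitnet_b1_58-3B")
assert cfg["quant_method"] == "bitnet" and cfg["bits"] == 2
```
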
13 changes: 10 additions & 3 deletions integration/BitNet/utils_quant.py
@@ -138,6 +138,7 @@ def weight_quant(weight):
        result = (weight * s).round().clamp(-1, 1)
        return result.type(torch.int8)

    @torch.compile
    def activation_quant(self, x, num_bits=8):
        x = x.float()
        Qn = -(2**(num_bits - 1))
@@ -146,6 +147,13 @@ def activation_quant(self, x, num_bits=8):
        result = (x * s).round().clamp(Qn, Qp)
        return result.type(torch.int8)

    @torch.compile
    def post_quant_process(self, input, si, sw):
        out = input / si
        out = out / sw
        out = out.half()
        return out

    # for the correctness evaluation.
    def native_forward(self, input):
        quant_input = (input + (activation_quant(input, self.input_bits) - input).detach())
@@ -184,9 +192,8 @@ def forward(self, input):
        Qp = self.Qp
        si = Qp / input.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        # dividing by (si * sw) in a single step can overflow to inf in some cases
        out = fp32_out / si
        out = out / sw
        out = out.half()
        out = self.post_quant_process(fp32_out, si, sw)

        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out
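
For context, here is a standalone sketch of the dequantization epilogue added above; the assumed intent of `@torch.compile` is to fuse the two divisions and the fp16 cast into a single kernel rather than three separate elementwise launches.

```python
import torch


@torch.compile
def post_quant_process(x: torch.Tensor, si: torch.Tensor, sw: torch.Tensor) -> torch.Tensor:
    out = x / si    # undo the per-token activation scale
    out = out / sw  # undo the weight scale
    return out.half()


# Toy usage on CPU; in the real layer, x is the fp32 view of the INT8xINT2 GEMM output.
x = torch.randn(4, 8) * 100
si = torch.full((4, 1), 127.0)
sw = torch.tensor(2.0)
print(post_quant_process(x, si, sw).dtype)  # torch.float16
```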
