From 0e1e3663d3096ad4bee7ee0c119d0662c69b0b3e Mon Sep 17 00:00:00 2001
From: Lingxiao Ma
Date: Fri, 9 Aug 2024 22:10:34 +0800
Subject: [PATCH] fix BitNet integration for vLLM (#139)

* fix BitNet integration for vLLM

* update ckpt name of BitNet integration for vLLM

* format code

* fix BitNet integration for vLLM native version

---
 .../vllm_workspace/inference_with_compress_format.py      | 4 +++-
 .../BitNet/vllm_workspace/inference_with_native_format.py | 7 +++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/integration/BitNet/vllm_workspace/inference_with_compress_format.py b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
index 9e60fa974..55a24543e 100644
--- a/integration/BitNet/vllm_workspace/inference_with_compress_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_compress_format.py
@@ -35,7 +35,9 @@
         ckpt_path,
         dtype="half",
         quantization="bitblas",
-        enforce_eager=True,  # set False to enable cuda graph
+        # set enforce_eager = False to enable cuda graph
+        # set enforce_eager = True to disable cuda graph
+        enforce_eager=False,
 ) as bitnet_model:
     bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"],
                                                    max_tokens=1024)
diff --git a/integration/BitNet/vllm_workspace/inference_with_native_format.py b/integration/BitNet/vllm_workspace/inference_with_native_format.py
index 579c5e17d..4f5f87f6f 100644
--- a/integration/BitNet/vllm_workspace/inference_with_native_format.py
+++ b/integration/BitNet/vllm_workspace/inference_with_native_format.py
@@ -18,7 +18,7 @@
 # get the path of the current file
 current_file_path = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file_path)
-ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B_bitblas")
+ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B")
 
 parser = argparse.ArgumentParser(description="Inference with BitNet")
 parser.add_argument(
@@ -35,8 +35,11 @@
 with VllmRunner(
         ckpt_path,
         dtype="half",
-        quantization="bitnet",
+        quantization="bitnet_bitblas",
         gpu_memory_utilization=0.5,
+        # set enforce_eager = False to enable cuda graph
+        # set enforce_eager = True to disable cuda graph
+        enforce_eager=False,
 ) as bitnet_model:
     bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128)
     print("bitnet inference output:")