Skip to content

Commit

Permalink
fix BitNet integration for vLLM (#139)
Browse files (browse the repository at this point in the history)
* fix BitNet integration for vLLM

* update ckpt name of BitNet integration for vLLM

* format code

* fix BitNet integration for vLLM native version
  • Loading branch information
xysmlx committed Aug 9, 2024
1 parent 22b5262 commit 0e1e366
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
ckpt_path,
dtype="half",
quantization="bitblas",
- enforce_eager=True, # set False to enable cuda graph
+ # set enforce_eager = False to enable cuda graph
+ # set enforce_eager = True to disable cuda graph
+ enforce_eager=False,
) as bitnet_model:
bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"],
max_tokens=1024)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# get the path of the current file
current_file_path = os.path.realpath(__file__)
current_dir = os.path.dirname(current_file_path)
- ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B_bitblas")
+ ckpt_path = os.path.join(current_dir, "../models/ckpt_bitnet_b1_58-3B")

parser = argparse.ArgumentParser(description="Inference with BitNet")
parser.add_argument(
Expand All @@ -35,8 +35,11 @@
with VllmRunner(
ckpt_path,
dtype="half",
- quantization="bitnet",
+ quantization="bitnet_bitblas",
gpu_memory_utilization=0.5,
+ # set enforce_eager = False to enable cuda graph
+ # set enforce_eager = True to disable cuda graph
+ enforce_eager=False,
) as bitnet_model:
bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128)
print("bitnet inference output:")
Expand Down

0 comments on commit 0e1e366

Please sign in to comment.