Exception during saving cache to db #31
We should take a look at the BitBLAS operator config under these BitLinear settings; let me take a look.
@ostix360 Would you mind providing the whole test code instead of a snippet to reproduce?
# Licensed under the MIT License.
# pylint: disable=missing-docstring, invalid-name
"""This is modified from https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/main/utils_quant.py to work with BitBLAS."""
import torch
from torch import nn
import bitblas
from bitblas.cache import global_operator_cache, get_database_path
from bitblas import Matmul, MatmulConfig
from bitblas import auto_detect_nvidia_target
from logging import getLogger

logger = getLogger(__name__)
bitblas.set_log_level("DEBUG")

BITBLAS_TARGET = auto_detect_nvidia_target()
BITBLAS_DATABASE_PATH = get_database_path()


def weight_quant(weight, num_bits=1):
    dtype = weight.dtype
    weight = weight.float()
    s = 1 / weight.abs().mean().clamp(min=1e-5)
    result = (weight * s).round().clamp(-1, 1) / s
    return result.type(dtype)


def activation_quant(x, num_bits=8):
    dtype = x.dtype
    x = x.float()
    Qn = -(2**(num_bits - 1))
    Qp = 2**(num_bits - 1) - 1
    s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
    result = (x * s).round().clamp(Qn, Qp) / s
    return result.type(dtype)
# BitBLAS BitLinear
class BitLinear(nn.Linear):

    def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs):
        super(BitLinear, self).__init__(*kargs, **kwargs)
        """
        RMSNorm is placed outside BitLinear
        """
        self.weight_bits = weight_bits
        self.input_bits = input_bits
        matmul_config = MatmulConfig(
            N=self.out_features,  # N dimension
            K=self.in_features,  # K dimension
            A_dtype="int8",  # activation A dtype
            W_dtype="int2",  # weight W dtype
            accum_dtype="int32",  # accumulation dtype
            out_dtype="float32",  # output dtype
            layout="nt",  # matrix layout; "nt" means A is non-transposed and W is transposed
            with_bias=False,  # bias
            # configs for weight-only quantization
            group_size=None,  # setting for grouped quantization
            with_scaling=False,  # setting for scaling factor
            with_zeros=False,  # setting for zeros
            zeros_mode=None,  # setting for how to calculate zeros
        )
        ENABLE_TUNING = True
        self.bitblas_matmul = self._get_or_create_bitblas_operator(matmul_config, ENABLE_TUNING)
        self.Qp = 2**(self.input_bits - 1) - 1

    def _get_or_create_bitblas_operator(self, config, enable_tuning):
        if global_operator_cache.size() == 0:
            global_operator_cache.load_from_database(BITBLAS_DATABASE_PATH, BITBLAS_TARGET)
            logger.info(f"Loaded {global_operator_cache.size()} operators from database.")

        bitblas_matmul = global_operator_cache.get(config)
        if bitblas_matmul is None:
            # tuning is disabled at construction time because the operator may first be loaded from the database.
            bitblas_matmul = Matmul(config, target=BITBLAS_TARGET, enable_tuning=False)
            if enable_tuning:
                bitblas_matmul.hardware_aware_finetune(topk=20)
                global_operator_cache.add(config, bitblas_matmul)
                global_operator_cache.save_into_database(BITBLAS_DATABASE_PATH, BITBLAS_TARGET)
                print("BitBLAS Tuning done, appended operator to global_operator_cache.")
            else:
                print("BitBLAS Operator created.")
        else:
            print("BitBLAS Operator found in global_operator_cache.")
        return bitblas_matmul
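    # Note: post_process_weights() below must be called once before forward() is used:
    # forward() reads self.sw and expects self.weight to already be in the
    # BitBLAS-transformed int8 layout produced by transform_weight().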
    def post_process_weights(self):
        sw = 1 / self.weight.abs().mean().clamp(min=1e-5)
        self.sw = sw
        quant_weight = self.weight_quant(self.weight).detach()
        quant_weight = self.bitblas_matmul.transform_weight(quant_weight)
        self.weight = nn.Parameter(quant_weight, requires_grad=False)

    def weight_quant(self, weight):
        weight = weight.float()
        s = 1 / weight.abs().mean().clamp(min=1e-5)
        result = (weight * s).round().clamp(-1, 1)
        return result.type(torch.int8)

    def activation_quant(self, x, num_bits=8):
        x = x.float()
        Qn = -(2**(num_bits - 1))
        Qp = 2**(num_bits - 1) - 1
        s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        result = (x * s).round().clamp(Qn, Qp)
        return result.type(torch.int8)

    # for the correctness evaluation.
    def native_forward(self, input):
        quant_input = (input + (activation_quant(input, self.input_bits) - input).detach())
        quant_weight = (
            self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach())
        out = nn.functional.linear(quant_input, quant_weight)
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out

    def forward_fp32_simulated(self, input):
        print("input: ", input)
        quant_input = self.activation_quant(input, self.input_bits).detach()
        quant_weight = self.weight_quant(self.weight).detach()
        fp32_simulated_input = quant_input.float()
        fp32_simulated_weight = quant_weight.float()
        fp32_simulated_out = nn.functional.linear(fp32_simulated_input, fp32_simulated_weight)
        sw = 1 / self.weight.abs().mean().clamp(min=1e-5)
        Qp = 2**(self.input_bits - 1) - 1
        si = Qp / input.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        # dividing by (si * sw) in a single step can overflow to inf in some cases
        out = fp32_simulated_out / si
        out = out / sw
        out = out.half()
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out

    def forward(self, input):
        quant_input = self.activation_quant(input, self.input_bits).detach()
        fp32_out = self.bitblas_matmul(quant_input, self.weight)
        sw = self.sw
        Qp = self.Qp
        si = Qp / input.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        # dividing by (si * sw) in a single step can overflow to inf in some cases
        out = fp32_out / si
        out = out / sw
        out = out.half()
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out
# # Naive BitLinear from HuggingFace
# class BitLinear(nn.Linear):
# def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs):
# super(BitLinear, self).__init__(*kargs, **kwargs)
# """
# RMSNorm is placed outside BitLinear
# """
# self.weight_bits = weight_bits
# self.input_bits = input_bits
# def forward(self, input):
# quant_input = input + (activation_quant(input, self.input_bits) - input).detach()
# quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) -
# self.weight).detach()
# out = nn.functional.linear(quant_input, quant_weight)
# if not self.bias is None:
# out += self.bias.view(1, -1).expand_as(out)
# return out
my_linear = BitLinear(
    2048, 6000, bias=False,
    weight_bits=1, input_bits=8,
)

The output with the cache empty:

It's the first time I'm debugging this, and I do have nvcc, if you're asking. And here is the output after the first execution, so the cache is no longer empty: Sorry for such a long message, but I want to give you everything that I think can help. By the way, in the BitBLAS cache folder there is a folder nvidia, and inside it geforce-rtx-3070-ti, but I have a 4070 Ti. Thank you for your help.
@ostix360 Hi, that's the reason. BitBLAS automatically detects your device by analyzing your nvidia-smi information. It seems to have detected the wrong device, which also affects the compilation of SM-architecture-related items. You can disable it by setting
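The exact setting suggested above is cut off in the thread. As a hedged workaround sketch (not the confirmed fix), the auto-detection can be bypassed by choosing the target string yourself, since Matmul already takes a target argument in the snippet earlier in this issue; the generic "cuda" target here is only an illustration:

from bitblas import auto_detect_nvidia_target

print(auto_detect_nvidia_target())  # shows which tag BitBLAS picked, e.g. a 3070-ti tag when detection goes wrong

# Illustrative override inside BitLinear.__init__ from the snippet above:
# pass an explicit TVM target string instead of the auto-detected BITBLAS_TARGET.
# "cuda" is the generic fallback; a device-specific tag is preferable when one exists.
# self.bitblas_matmul = Matmul(matmul_config, target="cuda", enable_tuning=False)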
How can we fix the auto-detection so it detects the device correctly? The same thing happens for multiple GPUs, including the Tesla T4 and Tesla P100.
I ran the nvidia-smi command and the 4070 Ti is recognized. I also set TVM_TARGET="cuda", but it doesn't change anything. After some investigation of what you said: the BitLinear uses a BITBLAS_TARGET that is defined by auto_detect_nvidia_target() in the target detector, and when debugging, the all_tags variable doesn't contain any 4070-ti tag (that may be why it chooses 3070-ti as the best-matching tag). @DanFosing There is also no T4 in this tag list... The tag list seems to come from the TVM library... Is there something we can do?
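To check whether a given GPU has a tag in the TVM build that ships with BitBLAS, here is a minimal sketch (assuming the list_tags helper is exposed, as it is in mainline TVM):

from tvm.target import list_tags

all_tags = list_tags()  # dict mapping tag names (e.g. "nvidia/...") to Target objects
nvidia_tags = [t for t in all_tags if t.startswith("nvidia/")]
print(len(nvidia_tags), any("4070" in t for t in nvidia_tags))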
@ostix360 I just appended the device in https://github.com/LeiWang1999/tvm/blob/bitblas/src/target/tag.cc. You can pull the latest code and try again :) The auto detector compares the SM version with the default target.
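For reference, the SM version the detector has to match against can be checked from PyTorch; this is only a diagnostic aid, not part of BitBLAS:

import torch

major, minor = torch.cuda.get_device_capability(0)
print(torch.cuda.get_device_name(0), f"sm_{major}{minor}")  # a 4070 Ti reports sm_89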
@DanFosing set
The compilation takes too long on my CPU, so I can't try it.
I tested the scripts you provided; there is indeed a bug in the tensor-core codegen for small shapes. I just made a PR to fix it, and it passes on my 4090. #32
OK, thanks.
I still have an error, and I don't know why I'm getting it:

2024-05-06 19:43:16 [BitBLAS:DEBUG]: LocalBuilder: An exception occurred
Traceback (most recent call last):
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/3rdparty/tvm/python/tvm/exec/popen_worker.py", line 87, in main
result = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/base/utils.py", line 211, in _build
rt_mod = tvm.build(mod, target=arch.target)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/3rdparty/tvm/python/tvm/driver/build_module.py", line 297, in build
rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/3rdparty/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__
raise_last_ffi_error()
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/3rdparty/tvm/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
raise py_err
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/3rdparty/tvm/python/tvm/contrib/nvcc.py", line 204, in tvm_callback_cuda_compile
ptx = compile_cuda(code, target_format="fatbin")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ostix/.virtualenvs/optimized-LLM/lib/python3.11/site-packages/bitblas/3rdparty/tvm/python/tvm/contrib/nvcc.py", line 120, in compile_cuda
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ostix/miniconda3/lib/python3.11/subprocess.py", line 1026, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "/home/ostix/miniconda3/lib/python3.11/subprocess.py", line 1950, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'nvcc'

This error is repeated many times. I don't understand it, because nvcc is installed and on the PATH environment variable. And I get the same error as before when I change the dimensions of the linear layer from in=2048, out=6000 to in=3200, out=12800. If you want, I can repost it.
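One way to narrow down the 'nvcc not found' error is to check whether the Python process itself (and therefore the subprocess it spawns) can see nvcc; a small diagnostic sketch using only the standard library:

import os
import shutil

print(shutil.which("nvcc"))        # None means this interpreter's PATH does not include nvcc
print(os.environ.get("PATH", ""))  # compare against the PATH of the shell where nvcc works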
After reinstalling CUDA and conda, it works well.
When running benchmark_inference_latency for BitNet, I got this exception:
When restarting, Python throws the same error but for a different layer, and the log indicates that the previous matmul has been loaded from the cache (but it wasn't saved correctly in the previous run).
After debugging a bit, it seems that the Operator has optimized_func=None, which causes rt_mod=None and makes get_source() return None.
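A hedged workaround sketch, not a confirmed fix: based on the names in the snippet above, the cache save could be skipped when the operator has no compiled source, so a failed build does not leave a broken entry in the database. The helper name below is hypothetical:

def _save_operator_if_valid(config, bitblas_matmul):
    # Only persist operators that actually produced kernel source; an operator with
    # optimized_func=None has rt_mod=None, and its get_source() returns None.
    if bitblas_matmul.get_source() is None:
        logger.warning("Skipping cache save: operator has no compiled source.")
        return
    global_operator_cache.add(config, bitblas_matmul)
    global_operator_cache.save_into_database(BITBLAS_DATABASE_PATH, BITBLAS_TARGET)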
Python: 3.11.5
CUDA version: V12.3.107
Bitblas version: 0.0.1.dev3
OS: Linux WSL
What does this error mean, and is it bad?
Test code executed to debug:
Thanks