CUDA setup cleanup #996

Merged

69 changes: 23 additions & 46 deletions bitsandbytes/__main__.py
@@ -1,5 +1,5 @@
+import glob
 import os
-from os.path import isdir
 import sys
 from warnings import warn
 
@@ -8,17 +8,9 @@
 HEADER_WIDTH = 60
 
 
-def find_file_recursive(folder, filename):
-    import glob
-    outs = []
-    try:
-        for ext in ["so", "dll", "dylib"]:
-            out = glob.glob(os.path.join(folder, "**", filename + ext))
-            outs.extend(out)
-    except Exception as e:
-        raise RuntimeError('Error: Something when wrong when trying to find file.') from e
-
-    return outs
+def find_dynamic_library(folder, filename):
+    for ext in ("so", "dll", "dylib"):
+        yield from glob.glob(os.path.join(folder, "**", filename + ext))
 
 
 def generate_bug_report_information():
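
Note that the new helper is a generator, so callers materialize it with list(...). Also worth knowing: glob only expands "**" across nested directories when recursive=True is passed, which neither the old nor the new code does, so the search stays one level deep. A minimal usage sketch, with a hypothetical folder:

    import glob
    import os

    def find_dynamic_library(folder, filename):
        for ext in ("so", "dll", "dylib"):
            # without recursive=True, "**" behaves like a plain "*", so this
            # only searches one directory level below `folder`
            yield from glob.glob(os.path.join(folder, "**", filename + ext))

    print(list(find_dynamic_library("/usr/local", "*cuda*")))  # hypothetical folder
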
@@ -27,40 +19,25 @@ def generate_bug_report_information():
     print_header("")
     print('')
 
-    if 'CONDA_PREFIX' in os.environ:
-        paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*')
-        print_header("ANACONDA CUDA PATHS")
-        print(paths)
-        print('')
-    if isdir('/usr/local/'):
-        paths = find_file_recursive('/usr/local', '*cuda*')
-        print_header("/usr/local CUDA PATHS")
-        print(paths)
-        print('')
-    if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']):
-        paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*')
-        print_header("CUDA PATHS")
-        print(paths)
-        print('')
-
-    if isdir(os.getcwd()):
-        paths = find_file_recursive(os.getcwd(), '*cuda*')
-        print_header("WORKING DIRECTORY CUDA PATHS")
-        print(paths)
-        print('')
-
-    print_header("LD_LIBRARY CUDA PATHS")
-    if 'LD_LIBRARY_PATH' in os.environ:
-        lib_path = os.environ['LD_LIBRARY_PATH'].strip()
-        for path in set(lib_path.split(os.pathsep)):
-            try:
-                if isdir(path):
-                    print_header(f"{path} CUDA PATHS")
-                    paths = find_file_recursive(path, '*cuda*')
-                    print(paths)
-            except Exception as e:
-                print(f'Could not read LD_LIBRARY_PATH: {path} ({e})')
-    print('')
+    path_sources = [
+        ("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")),
+        ("/usr/local CUDA PATHS", "/usr/local"),
+        ("CUDA PATHS", os.environ.get("CUDA_PATH")),
+        ("WORKING DIRECTORY CUDA PATHS", os.getcwd()),
+    ]
+    try:
+        ld_library_path = os.environ.get("LD_LIBRARY_PATH")
+        if ld_library_path:
+            for path in set(ld_library_path.strip().split(os.pathsep)):
+                path_sources.append((f"LD_LIBRARY_PATH {path} CUDA PATHS", path))
+    except Exception as e:
+        print(f"Could not parse LD_LIBRARY_PATH: {e}")
+
+    for name, path in path_sources:
+        if path and os.path.isdir(path):
+            print_header(name)
+            print(list(find_dynamic_library(path, '*cuda*')))
+            print("")
 
 
 def print_header(
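
The rewrite collapses four near-identical print blocks into one data-driven loop. The same pattern in miniature, with hypothetical sources; since os.environ.get returns None for unset variables, the single `if path` guard replaces the old per-source `in os.environ` checks:

    import os

    path_sources = [
        ("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")),  # may be None
        ("WORKING DIRECTORY CUDA PATHS", os.getcwd()),
    ]
    for name, path in path_sources:
        if path and os.path.isdir(path):  # unset env vars are skipped here
            print(name, "->", path)
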
83 changes: 43 additions & 40 deletions bitsandbytes/cuda_setup/main.py
@@ -28,19 +28,17 @@
 
 from .env_vars import get_potentially_lib_path_containing_env_vars
 
-# these are the most common libs names
-# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
-# we have libcudart.so.11.0 which causes a lot of errors before
-# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
-system = platform.system()
-if system == 'Windows':
+if platform.system() == 'Windows':  # Windows
     CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
-else: # Linux or other
-    CUDA_RUNTIME_LIBS = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2']
+    DYNAMIC_LIBRARY_SUFFIX = ".dll"
+else:  # Linux or other
+    # these are the most common libs names
+    # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
+    # we have libcudart.so.11.0 which causes a lot of errors before
+    # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
+    CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"]
+    DYNAMIC_LIBRARY_SUFFIX = ".so"
 
-# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths
-backup_paths = []
-backup_paths.append('$CONDA_PREFIX/lib/libcudart.so.11.0')
 
 class CUDASetup:
     _instance = None
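
The new module-level DYNAMIC_LIBRARY_SUFFIX constant replaces the `suffix = ".so" if os.name != "nt" else ".dll"` expression that several functions below used to recompute. A quick sketch of what it evaluates to:

    import platform

    # platform.system() returns "Windows", "Linux" or "Darwin"
    suffix = ".dll" if platform.system() == "Windows" else ".so"
    print(f"libbitsandbytes_cpu{suffix}")  # libbitsandbytes_cpu.so on Linux
    # note: macOS falls into the "Linux or other" branch and also gets ".so"
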
@@ -108,22 +106,30 @@ def initialize(self):
         self.error = False
 
     def manual_override(self):
-        if torch.cuda.is_available():
-            if 'BNB_CUDA_VERSION' in os.environ:
-                if len(os.environ['BNB_CUDA_VERSION']) > 0:
-                    warn(
-                        f'\n\n{"=" * 80}\n'
-                        'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
-                        'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
-                        'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
-                        'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
-                        'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
-                        f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
-                        f'\n{"=" * 80}\n\n'
-                    )
-                    binary_name = self.binary_name.rsplit(".", 1)[0]
-                    suffix = ".so" if os.name != "nt" else ".dll"
-                    self.binary_name = binary_name[:-3] + f'{os.environ["BNB_CUDA_VERSION"]}.{suffix}'
+        if not torch.cuda.is_available():
+            return
+        override_value = os.environ.get('BNB_CUDA_VERSION')
+        if not override_value:
+            return
+
+        binary_name_stem, _, binary_name_ext = self.binary_name.rpartition(".")
+        # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda118`;
+        # let's remove any trailing numbers:
+        binary_name_stem = binary_name_stem.rstrip("0123456789")
+        # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda`;
+        # let's tack the new version number and the original extension back on.
+        self.binary_name = f"{binary_name_stem}{override_value}.{binary_name_ext}"
+
+        warn(
+            f'\n\n{"=" * 80}\n'
+            'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
+            'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
+            'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
+            'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
+            'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
+            f'Loading: {self.binary_name}'
+            f'\n{"=" * 80}\n\n'
+        )
 
     def run_cuda_setup(self):
         self.initialized = True
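
A worked example of the override rename, with hypothetical values. Note that the old `binary_name[:-3] + f'{version}.{suffix}'` concatenation produced a doubled dot (e.g. `libbitsandbytes_cuda122..so`, since `suffix` already began with a dot) and assumed exactly three trailing version digits; rpartition plus rstrip avoids both problems:

    binary_name = "libbitsandbytes_cuda117.so"   # hypothetical current name
    stem, _, ext = binary_name.rpartition(".")   # "libbitsandbytes_cuda117", ".", "so"
    stem = stem.rstrip("0123456789")             # "libbitsandbytes_cuda"
    override_value = "122"                       # as if BNB_CUDA_VERSION=122
    print(f"{stem}{override_value}.{ext}")       # libbitsandbytes_cuda122.so
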
@@ -140,11 +146,10 @@ def run_cuda_setup(self):
         package_dir = Path(__file__).parent.parent
         binary_path = package_dir / self.binary_name
 
-        suffix = ".so" if os.name != "nt" else ".dll"
         try:
             if not binary_path.exists():
                 self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
-                legacy_binary_name = f"libbitsandbytes_cpu{suffix}"
+                legacy_binary_name = f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}"
                 self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
                 binary_path = package_dir / legacy_binary_name
                 if not binary_path.exists() or torch.cuda.is_available():
@@ -348,19 +353,18 @@ def get_compute_capabilities():
 
 def evaluate_cuda_setup():
     cuda_setup = CUDASetup.get_instance()
-    suffix = ".so" if os.name != "nt" else ".dll"
     if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
         cuda_setup.add_log_entry('')
         cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35)
         cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
                                  ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
         cuda_setup.add_log_entry('='*80)
-    if not torch.cuda.is_available(): return f'libbitsandbytes_cpu{suffix}', None, None, None
+
+    if not torch.cuda.is_available():
+        return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None
 
     cudart_path = determine_cuda_runtime_lib_path()
-    ccs = get_compute_capabilities()
-    ccs.sort()
-    cc = ccs[-1]  # we take the highest capability
+    cc = get_compute_capabilities()[-1]  # we take the highest capability
     cuda_version_string = get_cuda_version()
 
     cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.")
@@ -380,12 +384,11 @@ def evaluate_cuda_setup():
     # we use ls -l instead of nvcc to determine the cuda version
     # since most installations will have the libcudart.so installed, but not the compiler
 
-    if has_cublaslt:
-        binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
-    else:
-        "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt"
-        binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt"
+    binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
+    if not has_cublaslt:
+        # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
+        binary_name += "_nocublaslt"
 
-    binary_name = f"{binary_name}{suffix}"
+    binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}"
 
     return binary_name, cudart_path, cc, cuda_version_string
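
Putting it together, a sketch of the binary names this logic produces for hypothetical inputs:

    DYNAMIC_LIBRARY_SUFFIX = ".so"  # as on Linux
    cuda_version_string = "118"     # hypothetical CUDA 11.8
    for has_cublaslt in (True, False):
        binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
        if not has_cublaslt:
            binary_name += "_nocublaslt"
        print(f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}")
    # libbitsandbytes_cuda118.so
    # libbitsandbytes_cuda118_nocublaslt.so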