From fcf496a7165c34b810e5ebfad3dd2dc02b62eec0 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 29 Jan 2024 14:34:01 +0200 Subject: [PATCH 1/7] Diagnostics: streamline debug printing code --- bitsandbytes/__main__.py | 69 ++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 46 deletions(-) diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index af5c1c523..61b42e78f 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -1,5 +1,5 @@ +import glob import os -from os.path import isdir import sys from warnings import warn @@ -8,17 +8,9 @@ HEADER_WIDTH = 60 -def find_file_recursive(folder, filename): - import glob - outs = [] - try: - for ext in ["so", "dll", "dylib"]: - out = glob.glob(os.path.join(folder, "**", filename + ext)) - outs.extend(out) - except Exception as e: - raise RuntimeError('Error: Something when wrong when trying to find file.') from e - - return outs +def find_dynamic_library(folder, filename): + for ext in ("so", "dll", "dylib"): + yield from glob.glob(os.path.join(folder, "**", filename + ext)) def generate_bug_report_information(): @@ -27,40 +19,25 @@ def generate_bug_report_information(): print_header("") print('') - if 'CONDA_PREFIX' in os.environ: - paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*') - print_header("ANACONDA CUDA PATHS") - print(paths) - print('') - if isdir('/usr/local/'): - paths = find_file_recursive('/usr/local', '*cuda*') - print_header("/usr/local CUDA PATHS") - print(paths) - print('') - if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']): - paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*') - print_header("CUDA PATHS") - print(paths) - print('') - - if isdir(os.getcwd()): - paths = find_file_recursive(os.getcwd(), '*cuda*') - print_header("WORKING DIRECTORY CUDA PATHS") - print(paths) - print('') - - print_header("LD_LIBRARY CUDA PATHS") - if 'LD_LIBRARY_PATH' in os.environ: - lib_path = os.environ['LD_LIBRARY_PATH'].strip() - for path in set(lib_path.split(os.pathsep)): - try: - if isdir(path): - print_header(f"{path} CUDA PATHS") - paths = find_file_recursive(path, '*cuda*') - print(paths) - except Exception as e: - print(f'Could not read LD_LIBRARY_PATH: {path} ({e})') - print('') + path_sources = [ + ("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")), + ("/usr/local CUDA PATHS", "/usr/local"), + ("CUDA PATHS", os.environ.get("CUDA_PATH")), + ("WORKING DIRECTORY CUDA PATHS", os.getcwd()), + ] + try: + ld_library_path = os.environ.get("LD_LIBRARY_PATH") + if ld_library_path: + for path in set(ld_library_path.strip().split(os.pathsep)): + path_sources.append((f"LD_LIBRARY_PATH {path} CUDA PATHS", path)) + except Exception as e: + print(f"Could not parse LD_LIBRARY_PATH: {e}") + + for name, path in path_sources: + if path and os.path.isdir(path): + print_header(name) + print(list(find_dynamic_library(path, '*cuda*'))) + print("") def print_header( From bae3eabae4ba0c9898595669838e60d1b6dac041 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 29 Jan 2024 14:37:45 +0200 Subject: [PATCH 2/7] CUDA setup: Remove unused `backup_paths` --- bitsandbytes/cuda_setup/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a34385b1f..04990a2d2 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -38,9 +38,6 @@ else: # Linux or other CUDA_RUNTIME_LIBS = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] -# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths -backup_paths = [] -backup_paths.append('$CONDA_PREFIX/lib/libcudart.so.11.0') class CUDASetup: _instance = None From e7c695f2efde09491d32a2412a86cedeab8fea6c Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jan 2024 08:39:09 +0200 Subject: [PATCH 3/7] CUDA setup: DRY OS detection --- bitsandbytes/cuda_setup/main.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 04990a2d2..cb3c31502 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -28,15 +28,16 @@ from .env_vars import get_potentially_lib_path_containing_env_vars -# these are the most common libs names -# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead -# we have libcudart.so.11.0 which causes a lot of errors before -# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt -system = platform.system() -if system == 'Windows': +if platform.system() == 'Windows': # Windows CUDA_RUNTIME_LIBS = ["nvcuda.dll"] -else: # Linux or other - CUDA_RUNTIME_LIBS = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] + DYNAMIC_LIBRARY_SUFFIX = ".dll" +else: # Linux or other + # these are the most common libs names + # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead + # we have libcudart.so.11.0 which causes a lot of errors before + # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt + CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"] + DYNAMIC_LIBRARY_SUFFIX = ".so" class CUDASetup: @@ -119,8 +120,7 @@ def manual_override(self): f'\n{"=" * 80}\n\n' ) binary_name = self.binary_name.rsplit(".", 1)[0] - suffix = ".so" if os.name != "nt" else ".dll" - self.binary_name = binary_name[:-3] + f'{os.environ["BNB_CUDA_VERSION"]}.{suffix}' + self.binary_name = binary_name[:-3] + f'{os.environ["BNB_CUDA_VERSION"]}{DYNAMIC_LIBRARY_SUFFIX}' def run_cuda_setup(self): self.initialized = True @@ -137,11 +137,10 @@ def run_cuda_setup(self): package_dir = Path(__file__).parent.parent binary_path = package_dir / self.binary_name - suffix = ".so" if os.name != "nt" else ".dll" try: if not binary_path.exists(): self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?") - legacy_binary_name = f"libbitsandbytes_cpu{suffix}" + legacy_binary_name = f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}" self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...") binary_path = package_dir / legacy_binary_name if not binary_path.exists() or torch.cuda.is_available(): @@ -345,14 +344,15 @@ def get_compute_capabilities(): def evaluate_cuda_setup(): cuda_setup = CUDASetup.get_instance() - suffix = ".so" if os.name != "nt" else ".dll" if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': cuda_setup.add_log_entry('') cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35) cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'), ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')) cuda_setup.add_log_entry('='*80) - if not torch.cuda.is_available(): return f'libbitsandbytes_cpu{suffix}', None, None, None + + if not torch.cuda.is_available(): + return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None cudart_path = determine_cuda_runtime_lib_path() ccs = get_compute_capabilities() @@ -383,6 +383,6 @@ def evaluate_cuda_setup(): "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt" binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt" - binary_name = f"{binary_name}{suffix}" + binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}" return binary_name, cudart_path, cc, cuda_version_string From d3f8930eece0a78e53b84116ffc48ed2763d2f7b Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jan 2024 08:39:34 +0200 Subject: [PATCH 4/7] CUDA setup: Streamline `manual_override()` --- bitsandbytes/cuda_setup/main.py | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index cb3c31502..acaec77cf 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -106,21 +106,26 @@ def initialize(self): self.error = False def manual_override(self): - if torch.cuda.is_available(): - if 'BNB_CUDA_VERSION' in os.environ: - if len(os.environ['BNB_CUDA_VERSION']) > 0: - warn( - f'\n\n{"=" * 80}\n' - 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' - 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' - 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' - 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' - 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Date: Tue, 30 Jan 2024 08:45:02 +0200 Subject: [PATCH 5/7] CUDA setup: Use comment instead of string literal, simplify --- bitsandbytes/cuda_setup/main.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index acaec77cf..a4ef60d5b 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -382,11 +382,10 @@ def evaluate_cuda_setup(): # we use ls -l instead of nvcc to determine the cuda version # since most installations will have the libcudart.so installed, but not the compiler - if has_cublaslt: - binary_name = f"libbitsandbytes_cuda{cuda_version_string}" - else: - "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt" - binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt" + binary_name = f"libbitsandbytes_cuda{cuda_version_string}" + if not has_cublaslt: + # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt + binary_name += "_nocublaslt" binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}" From 5adf65340927a6206fc6ed5eaf811e97d764ccd5 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jan 2024 08:55:09 +0200 Subject: [PATCH 6/7] CUDA setup: remove duplicate sort The "sort compute capabilities" fix from #703 (#527) would actually do nothing due to this. --- bitsandbytes/cuda_setup/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a4ef60d5b..760d557a4 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -360,9 +360,7 @@ def evaluate_cuda_setup(): return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None cudart_path = determine_cuda_runtime_lib_path() - ccs = get_compute_capabilities() - ccs.sort() - cc = ccs[-1] # we take the highest capability + cc = get_compute_capabilities()[-1] # we take the highest capability cuda_version_string = get_cuda_version() cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.") @@ -384,7 +382,7 @@ def evaluate_cuda_setup(): binary_name = f"libbitsandbytes_cuda{cuda_version_string}" if not has_cublaslt: - # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt + # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt binary_name += "_nocublaslt" binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}" From a724c05f31f0bf39b8da30e0beaca5c05385197f Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jan 2024 13:23:45 +0200 Subject: [PATCH 7/7] CUDA setup: make version number replacement logic more obvious --- bitsandbytes/cuda_setup/main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 760d557a4..1669b08e1 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -112,9 +112,13 @@ def manual_override(self): if not override_value: return - binary_name = self.binary_name.rsplit(".", 1)[0] - # TODO: what's the magic value `-3` here? - self.binary_name = binary_name[:-3] + f'{override_value}{DYNAMIC_LIBRARY_SUFFIX}' + binary_name_stem, _, binary_name_ext = self.binary_name.rpartition(".") + # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda118`; + # let's remove any trailing numbers: + binary_name_stem = binary_name_stem.rstrip("0123456789") + # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda`; + # let's tack the new version number and the original extension back on. + self.binary_name = f"{binary_name_stem}{override_value}.{binary_name_ext}" warn( f'\n\n{"=" * 80}\n'