From b263fc6d2e16b150930b67491bb0602a88412ffc Mon Sep 17 00:00:00 2001 From: Rickard Date: Tue, 2 Jan 2024 14:48:54 +0100 Subject: [PATCH 01/52] Make native code portable and add GitHub workflow for building --- .github/workflows/python-package.yml | 201 ++++++++++++++++++ .gitignore | 22 +- CMakeLists.txt | 121 +++++++++++ Makefile | 141 ------------ csrc/common.cpp | 24 +-- csrc/common.h | 2 +- csrc/cpu_ops.cpp | 42 ++-- csrc/kernels.cu | 12 +- csrc/mps_kernels.metal | 117 ++++++++++ csrc/ops.cuh | 2 + ...{pythonInterface.c => pythonInterface.cpp} | 4 + include/Algo-Direct2.h | 2 + include/Portable.h | 35 ++- include/SIMD.h | 67 ++++-- include/Type.h | 2 +- requirements.txt | 8 +- setup.py | 8 +- 17 files changed, 593 insertions(+), 217 deletions(-) create mode 100644 .github/workflows/python-package.yml create mode 100644 CMakeLists.txt delete mode 100644 Makefile create mode 100644 csrc/mps_kernels.metal rename csrc/{pythonInterface.c => pythonInterface.cpp} (99%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 000000000..f50091980 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,201 @@ +name: Python package + +on: + push: + branches: [ "*" ] + pull_request: + branches: [ master ] + release: + types: [ published ] + +jobs: + + ## + # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. + ## + build-shared-libs: + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v3 + # On Linux we use CMake within Docker + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.13 + with: + cmake-version: '3.26.x' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) + if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then + # Allow cross-compile om aarch64 + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu + fi + if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF . + else + cmake -DENABLE_CUDA=OFF . 
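+            # Hedged note: ENABLE_CUDA=OFF selects the CPU-only path in the new
+            # CMakeLists.txt, so the target built here is bitsandbytes_cpu. A
+            # local sketch of this step (assuming cmake >= 3.22 and a C++14
+            # compiler on PATH) would be:
+            #   cmake -DENABLE_CUDA=OFF . && make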
+            fi
+            if [ ${build_os:0:7} == windows ]; then
+              pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+            else
+              make
+            fi
+            mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+            ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+        - name: Upload build artifact
+          uses: actions/upload-artifact@v3
+          with:
+            name: shared_library
+            path: output/*
+            retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        cuda_version: ['12.1.0']
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+      - uses: actions/checkout@v3
+      # Linux: we use Docker to build cross-platform CUDA (aarch64 is built in emulation)
+      - name: Set up Docker multiarch
+        if: startsWith(matrix.os, 'ubuntu')
+        uses: docker/setup-qemu-action@v2
+      # On Linux we use CMake within Docker
+      - name: Setup cmake
+        if: ${{ !startsWith(matrix.os, 'linux') }}
+        uses: jwlawson/actions-setup-cmake@v1.13
+        with:
+          cmake-version: '3.26.x'
+      # Windows: we install CUDA on the agent (slow)
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        if: startsWith(matrix.os, 'windows')
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda_version }}
+          method: 'local'
+          #sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.1
+        if: ${{ startsWith(matrix.os, 'windows') }}
+      # Compile C++ code
+      - name: Build C++
+        shell: bash
+        run: |
+          set -ex
+          build_os=${{ matrix.os }}
+          build_arch=${{ matrix.arch }}
+          ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 )
+          if [ ${build_os:0:6} == ubuntu ]; then
+            image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
+            echo "Using image $image"
+            docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
+              "apt-get update \
+              && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+              && cmake -DENABLE_CUDA=ON . \
+              && make"
+          else
+            cmake -DENABLE_CUDA=ON . 
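+            # Hedged note: invoking msbuild on the next line works because CMake
+            # defaults to a Visual Studio generator on Windows runners. A
+            # generator-agnostic equivalent, using the standard CMake CLI, is:
+            #   cmake --build . --config Release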
+ pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: shared_library + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} + steps: + # Check out code + - uses: actions/checkout@v3 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: shared_library + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Compile C++ code + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + # + - name: Install Python dependencies + shell: bash + run: | + pip install -r requirements.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: | + python setup.py bdist_wheel + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: bdist_wheel + path: dist/bitsandbytes-*.whl + retention-days: 7 + publish: + needs: build-wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build dist + run: | + python setup.py sdist + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: bdist_wheel + path: dist/ + - run: | + ls -lR dist/ + - name: Publish to PyPi + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.pypi }} diff --git a/.gitignore b/.gitignore index 2f929968b..202dcb13d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,26 @@ __pycache__/ *.py[cod] *$py.class - -# C extensions *.so +*.dll +*.dylib +*.o +*.obj +*.air +*.metallib + +# CMake generated files +CMakeCache.txt +CMakeScripts/ +cmake_install.cmake +Makefile +CMakeFiles/ +*.sln +*.vcxproj* +*.xcodeproj/ +bitsandbytes.dir/ +Debug/ +Release/ # Distribution / packaging .Python @@ -133,4 +150,5 @@ dmypy.json dependencies cuda_build +output/ .vscode/* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..d6e269d15 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,121 @@ +cmake_minimum_required(VERSION 3.22.1) + +option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) +option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) + +if(ENABLE_CUDA) + if(APPLE) + message(FATAL_ERROR "CUDA is not supported on macOS" ) + endif() + option(NO_CUBLASLT "Don't use CUBLAST" OFF) + if(NO_CUBLASLT) + set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) + else() + set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + endif() +endif() + +if(ENABLE_CUDA) + message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") + # Find CUDA tools if we are compiling with CUDA + find_package(CUDAToolkit REQUIRED) + if(NO_CUBLASLT) + set(LIBSUFFIX 
"cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") + else() + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + endif() + + project(bitsandbytes LANGUAGES CXX CUDA) + add_compile_definitions(BUILD_CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) +elseif(ENABLE_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) + endif() + message("Building MPS support") + set(LIBSUFFIX "mps") + project(bitsandbytes LANGUAGES CXX OBJCXX) + add_compile_definitions(BUILD_MPS) + set(METAL_SOURCES csrc/mps_kernels.metal) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_SOURCES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") + set(GPU_SOURCES csrc/mps_ops.mm) +else() + message("Building with CPU only") + set(LIBSUFFIX "cpu") + + project(bitsandbytes LANGUAGES CXX) + set(GPU_SOURCES) +endif() + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +if(WIN32) + # Mute warnings + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") + + # Enable fast math on VC++ + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + + # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") +endif() + +# Add csrc files +add_library(bitsandbytes SHARED + ${GPU_SOURCES} + csrc/common.cpp + csrc/cpu_ops.cpp + csrc/pythonInterface.cpp) + +target_include_directories(bitsandbytes PUBLIC + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/csrc + ${CMAKE_CURRENT_SOURCE_DIR}/include) + +if(ENABLE_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + + set_target_properties( + bitsandbytes + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) +endif() +if(ENABLE_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() + +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") +# Set the output name of the CUDA library +if(MSVC) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() + +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile deleted file mode 100644 index 5f997a122..000000000 --- a/Makefile +++ /dev/null @@ -1,141 +0,0 @@ 
-MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) - -GPP:= /usr/bin/g++ -#GPP:= /sw/gcc/11.2.0/bin/g++ -ifeq ($(CUDA_HOME),) - CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) -endif - -ifndef CUDA_VERSION -ifneq ($(MAKECMDGOALS),clean) -$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) -CUDA_VERSION:= -endif -endif - - - -NVCC := $(CUDA_HOME)/bin/nvcc - -########################################### - -CSRC := $(ROOT_DIR)/csrc -BUILD_DIR:= $(ROOT_DIR)/build - -FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c - -INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib - -# NVIDIA NVCC compilation flags -COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta - -CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler -CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler - -# Later versions of CUDA support the new architectures -CC_CUDA11x := -gencode arch=compute_75,code=sm_75 -CC_CUDA11x += -gencode arch=compute_80,code=sm_80 -CC_CUDA11x += -gencode arch=compute_86,code=sm_86 - - -CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 - -CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 - -CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 -CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 - - -all: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda110_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA 
-shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - - -cuda110_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda118_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda12x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda110: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda11x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda118: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' 
--use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda12x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so - -env: - @echo "ENVIRONMENT" - @echo "============================" - @echo "CUDA_VERSION: $(CUDA_VERSION)" - @echo "============================" - @echo "NVCC path: $(NVCC)" - @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" - @echo "CUDA_HOME: $(CUDA_HOME)" - @echo "CONDA_PREFIX: $(CONDA_PREFIX)" - @echo "PATH: $(PATH)" - @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" - @echo "============================" - -$(BUILD_DIR): - mkdir -p build - mkdir -p dependencies - -$(ROOT_DIR)/dependencies/cub: - git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub - cd dependencies/cub; git checkout 1.11.0 - -clean: - rm -rf build/* *.egg* - rm -f bitsandbytes/libbitsandbytes*.so diff --git a/csrc/common.cpp b/csrc/common.cpp index 52f029917..0a9601689 100644 --- a/csrc/common.cpp +++ b/csrc/common.cpp @@ -1,39 +1,35 @@ #include #include -void *quantize_block(void *arguments) { +void quantize_block(const quantize_block_args& args) { // 1. find absmax in block // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value // 4. check minimal distance // 5. store index - struct quantize_block_args *args = (quantize_block_args *) arguments; - // 1. find absmax in block float absmax_block = -FLT_MAX; - for (long long i = args->block_idx; i < args->block_end; i++) - absmax_block = fmax(absmax_block, fabs(args->A[i])); + for (long long i = args.block_idx; i < args.block_end; i++) + absmax_block = fmax(absmax_block, fabs(args.A[i])); - args->absmax[args->block_idx / args->blocksize] = absmax_block; + args.absmax[args.block_idx / args.blocksize] = absmax_block; - for (long long i = args->block_idx; i < args->block_end; i++) { + for (long long i = args.block_idx; i < args.block_end; i++) { // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value - float normed_value = args->A[i] / absmax_block; - long long idx = args->bin_searcher->scalar(normed_value); + float normed_value = args.A[i] / absmax_block; + long long idx = args.bin_searcher->scalar(normed_value); // 4. 
check minimal distance // The binary search returns always the value to the left, which might not be the closest value if (idx < 255) { - float dist_left = fabs(normed_value - (args->code[idx])); - float dist_right = fabs(normed_value - (args->code[idx + 1])); + float dist_left = fabs(normed_value - (args.code[idx])); + float dist_right = fabs(normed_value - (args.code[idx + 1])); if (dist_right < dist_left) { idx += 1; } } // 5. store index - args->out[i] = (unsigned char) idx; + args.out[i] = (unsigned char) idx; } - - return NULL; } diff --git a/csrc/common.h b/csrc/common.h index c99034e78..e513f2875 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -20,6 +20,6 @@ struct quantize_block_args { }; -void *quantize_block(void *arguments); +void quantize_block(const quantize_block_args& args); #endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e28e7b2c2..478c1f4ff 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include using namespace BinSearch; @@ -31,12 +31,8 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset; - pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); - - struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); - - for(long long i = 0; i < valid_chunks; i++) - args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args)); + std::vector threads(valid_chunks); + std::vector args(valid_chunks); int chunks_processed = 0; for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize) @@ -44,30 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long long valid_items = n - block_idx >= blocksize ? 
blocksize : n - block_idx; long long block_end = block_idx + valid_items; - struct quantize_block_args *arg = args[chunks_processed]; - arg->bin_searcher = &bin_searcher; - arg->code = code; - arg->A = A; - arg->absmax = absmax; - arg->out = out; - arg->block_end = block_end; - arg->block_idx = block_idx; - arg->threadidx = block_idx / blocksize; - arg->blocksize = blocksize; - - pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); + struct quantize_block_args& arg = args[chunks_processed]; + arg.bin_searcher = &bin_searcher; + arg.code = code; + arg.A = A; + arg.absmax = absmax; + arg.out = out; + arg.block_end = block_end; + arg.block_idx = block_idx; + arg.threadidx = block_idx / blocksize; + arg.blocksize = blocksize; + + threads[chunks_processed] = std::thread([arg] { quantize_block(arg); }); chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) - int err = pthread_join(threads[i], NULL); - - free(threads); - for (int i = 0; i < valid_chunks; i++) - free(args[i]); - free(args); - + threads[i].join(); } } diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 1ab8aa242..c2e2d7da7 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive(int M, int N template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void 
kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal new file mode 100644 index 000000000..a5c8e35b2 --- /dev/null +++ b/csrc/mps_kernels.metal @@ -0,0 +1,117 @@ +#include +using namespace metal; + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +template +static unsigned char quantize_scalar( + float rand, + device float* code, + float x) +{ + int pivot = 127; + int upper_pivot = 255; + int lower_pivot = 0; + + float lower = -1.0f; + float upper = 1.0f; + + float val = code[pivot]; + // i>>=1 = {32, 16, 8, 4, 2, 1} + for(int i = 64; i > 0; i>>=1) + { + if(x > val) + { + lower_pivot = pivot; + lower = val; + pivot+=i; + } + else + { + upper_pivot = pivot; + upper = val; + pivot-=i; + } + val = code[pivot]; + } + + if(upper_pivot == 255) + upper = code[upper_pivot]; + if(lower_pivot == 0) + lower = code[lower_pivot]; + + if(!STOCHASTIC) + { + if(x > val) + { + float midpoint = (upper+val)*0.5f; + if(x > midpoint) + { + return upper_pivot; + } + else + return pivot; + } + else + { + float midpoint = (lower+val)*0.5f; + if(x < midpoint) + return lower_pivot; + else + return pivot; + } + } + else + { + if(x > val) + { + float dist_to_upper = fabs(upper-x); + float dist_full = upper-val; + if(rand >= dist_to_upper/dist_full) return upper_pivot; + else return pivot; + } + else + { + float dist_to_lower = fabs(lower-x); + float dist_full = val-lower; + if(rand >= dist_to_lower/dist_full) return lower_pivot; + else return pivot; + } + } +} + +kernel void quantize(device float* code [[buffer(0)]], + device float* A [[buffer(1)]], + device uchar* out [[buffer(2)]], + constant uint& n [[buffer(3)]], + uint id [[thread_position_in_grid]]) { + const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK); + uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK; + const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK); + + float vals[NUM]; + uchar qvals[NUM]; + + for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) { + valid_items = n - i > NUM_BLOCK ? 
NUM_BLOCK : n - i; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + vals[j] = A[i + j]; + } + + for (uint j = 0; j < valid_items; j++) { + qvals[j] = quantize_scalar(0.0f, code, vals[j]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + out[i + j] = qvals[j]; + } + } +} diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f37b3b3af..cc7b59505 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,7 +9,9 @@ #include #include +#ifndef _MSC_VER #include +#endif #include #include diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp similarity index 99% rename from csrc/pythonInterface.c rename to csrc/pythonInterface.cpp index 865e4b6d5..a6b348ca6 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.cpp @@ -6,6 +6,9 @@ #if BUILD_CUDA #include #endif +#if BUILD_MPS +// #include +#endif #include // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. @@ -412,6 +415,7 @@ extern "C" { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); } #endif + void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } } diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index d5fa58d12..7f52fce14 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -52,6 +52,7 @@ struct AlgoVecBase::val private: typedef AlgoScalarBase base_t; +#ifdef USE_SSE2 FORCE_INLINE //NO_INLINE void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const @@ -135,6 +136,7 @@ struct AlgoVecBase::val pr[0] = u.ui32[0]; pr[1] = u.ui32[2]; } +#endif // USE_SSE2 #ifdef USE_AVX diff --git a/include/Portable.h b/include/Portable.h index 1710b0502..78599944e 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -4,10 +4,40 @@ #include #include +#if defined(__aarch64__) +#ifdef __CUDACC__ +#undef USE_NEON // Doesn't work with nvcc, undefined symbols +#else +#include +#undef USE_NEON // Not yet implemented +#endif +#undef USE_AVX // x86_64 only +#undef USE_AVX2 // x86_64 only +#undef USE_SSE2 // x86_64 only +#undef USE_SSE41 // x86_64 only +#undef USE_SSE42 // x86_64 only +#undef USE_FMA // x86_64 only +#ifdef USE_NEON +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef float64x2_t __m128d; +#else +typedef struct {float a; float b; float c; float d;} __m128; +typedef struct {int a; int b; int c; int d;} __m128i; +typedef struct {double a; double b;} __m128d; +#endif +#else +#undef USE_NEON // ARM64 only #ifdef __FMA__ #define USE_FMA #endif +#if !defined(__SSE2__) && !defined(_MSC_VER) +#error Compiler must support SSE2 +#endif +#define USE_SSE2 +#if defined(__aarch64__) +#else #ifdef __AVX2__ #define USE_AVX2 #endif @@ -24,7 +54,8 @@ #ifdef __SSE4_2__ #define USE_SSE42 #endif - +#endif +#endif #ifndef _MSC_VER #include @@ -50,7 +81,7 @@ typedef unsigned __int64 uint64; namespace Details { -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } +#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw 
std::invalid_argument(os.str()); } // log2 is not defined in VS2008 #if defined(_MSC_VER) diff --git a/include/SIMD.h b/include/SIMD.h index a2ac1a9ae..18a38dbfd 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -2,6 +2,46 @@ #include "Portable.h" +#ifdef USE_SSE2 +#include +#if defined(USE_AVX) || defined(USE_AVX2) +#include +#else +#ifdef USE_SSE41 +#include +#endif +#endif +#endif + +namespace BinSearch { +namespace Details { + +template +struct FTOITraits{}; + +template +struct FVec; + +template +struct IVec; + +template +struct FVec1; + +template <> struct InstrFloatTraits +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef __m128d vec_t; +}; + +} +} + +#if !defined(__aarch64__) #ifdef USE_SSE42 #ifndef _MSC_VER #include @@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32) } // namespace #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#include -#ifdef USE_SSE41 -#include -#endif -#endif - #include "Type.h" namespace BinSearch { namespace Details { -template -struct FVec; - -template -struct IVec; - -template -struct FVec1; - template <> struct InstrIntTraits { typedef __m128i vec_t; @@ -64,8 +86,8 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; -template -struct FTOITraits +template <> +struct FTOITraits { typedef IVec vec_t; }; @@ -285,9 +307,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } #endif @@ -339,9 +363,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c ) { return _mm_fmsub_pd(a, b, c); } #endif @@ -560,3 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec Date: Tue, 2 Jan 2024 14:56:05 +0100 Subject: [PATCH 02/52] Removed deprecated Python versions --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f50091980..004d0b7e7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -135,7 +135,7 @@ jobs: strategy: matrix: os: 
[ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] arch: [x86_64, aarch64] exclude: - os: windows-latest # This probably requres arm64 Windows agents From aae5ff7055746082f7e1dc28adfeddc0139af717 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 10:09:40 +0100 Subject: [PATCH 03/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 004d0b7e7..6408ad971 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -186,7 +186,8 @@ jobs: - uses: actions/checkout@v2 - name: Build dist run: | - python setup.py sdist + pip install build + python -m build -s . - name: Download build artifact uses: actions/download-artifact@v3 with: From b06590d7ea7156f02ba2d80f2bb2e27254e763d6 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 10:18:56 +0100 Subject: [PATCH 04/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 6408ad971..73d8e95e0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -172,7 +172,8 @@ jobs: - name: Build wheel shell: bash run: | - python setup.py bdist_wheel + pip install build + python -m build . - name: Upload build artifact uses: actions/upload-artifact@v3 with: From 03744cba3e21a688aa8de8f25654c68b5000d038 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 10:57:03 +0100 Subject: [PATCH 05/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 73d8e95e0..bfffd0c3d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,7 +24,7 @@ jobs: runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # On Linux we use CMake within Docker - name: Setup cmake uses: jwlawson/actions-setup-cmake@v1.13 From 648e2f5d6c4aae007aa838d306d968aa0c3bee0b Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 11:03:12 +0100 Subject: [PATCH 06/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bfffd0c3d..501d9bfef 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -78,7 +78,7 @@ jobs: runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) - name: Set up Docker multiarch if: startsWith(matrix.os, 'ubuntu') From cba2b1a0767b25de874877812269b72a7bb3106b Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 11:05:34 +0100 Subject: [PATCH 07/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 501d9bfef..3ce5078d4 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -143,7 +143,7 @@ jobs: runs-on: ${{ matrix.os }} steps: # Check out code - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Download shared libraries - name: Download build artifact uses: actions/download-artifact@v3 From 6f70a5e9a516416859ce3f1261a101a57b002696 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 15:28:01 +0100 Subject: [PATCH 08/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3ce5078d4..e2d11ab9d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -156,7 +156,7 @@ jobs: cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ # Compile C++ code - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} # From 90fa8b1d680e2f07f435b306de3d8cc43e85414a Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 15:28:56 +0100 Subject: [PATCH 09/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e2d11ab9d..fe1071a09 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -184,7 +184,7 @@ jobs: needs: build-wheels runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build dist run: | pip install build From c815ca09e27049eb6bbef912067b6cdab2c4144e Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 17:58:28 +0100 Subject: [PATCH 10/52] Update python-package.yml --- .github/workflows/python-package.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index fe1071a09..b560cd5db 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,10 +1,9 @@ name: Python package on: - push: - branches: [ "*" ] + push: {} pull_request: - branches: [ master ] + branches: [ main ] release: types: [ published ] From 36b1ef203722d53ac578f095aeed14336471db05 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 18:08:11 +0100 Subject: [PATCH 11/52] Do not test on Python 3.13 until released --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 
b560cd5db..1200a5ca5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -134,7 +134,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12"] arch: [x86_64, aarch64] exclude: - os: windows-latest # This probably requres arm64 Windows agents From 44e3f174d241150f3d74b372dd6944f46c59ede1 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 18:16:39 +0100 Subject: [PATCH 12/52] Update python-package.yml --- .github/workflows/python-package.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1200a5ca5..c36e0c618 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,6 +32,13 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} + # Check out code + - uses: actions/checkout@v4 + name: Check out NVidia cub + with: + repository: nvidia/cub + ref: 1.11.0 + path: dependencies/cub # Compile C++ code - name: Build C++ shell: bash From 6fe8d0cf69a5f6d204f346d8149455dda2f3f6e9 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 26 Jan 2024 13:39:43 +0100 Subject: [PATCH 13/52] Update python-package.yml --- .github/workflows/python-package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c36e0c618..aa73eea18 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -46,7 +46,6 @@ jobs: set -ex build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} - ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then # Allow cross-compile om aarch64 sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu From 572225e8e9486fc2f9ab92a1c995043907ff6a87 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 26 Jan 2024 13:51:39 +0100 Subject: [PATCH 14/52] Update python-package.yml --- .github/workflows/python-package.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index aa73eea18..3c25cf4af 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,7 +32,7 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} - # Check out code + # Check out dependencies code - uses: actions/checkout@v4 name: Check out NVidia cub with: @@ -105,6 +105,13 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} + # Check out dependencies code + - uses: actions/checkout@v4 + name: Check out NVidia cub + with: + repository: nvidia/cub + ref: 1.11.0 + path: dependencies/cub # Compile C++ code - name: Build C++ shell: bash @@ -112,7 +119,6 @@ jobs: set -ex build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} - ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) if [ ${build_os:0:6} == ubuntu ]; then image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 echo "Using image $image" From 7a8676e18f814dd92eacbacb697e35b04bede8b9 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 26 Jan 2024 14:00:52 
+0100 Subject: [PATCH 15/52] Update python-package.yml --- .github/workflows/python-package.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3c25cf4af..4e9fa0f73 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/checkout@v4 # On Linux we use CMake within Docker - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.13 + uses: jwlawson/actions-setup-cmake@v1.14 with: cmake-version: '3.26.x' - name: Add msbuild to PATH @@ -91,7 +91,7 @@ jobs: # On Linux we use CMake within Docker - name: Setup cmake if: ${{ !startsWith(matrix.os, 'linux') }} - uses: jwlawson/actions-setup-cmake@v1.13 + uses: jwlawson/actions-setup-cmake@v1.14 with: cmake-version: '3.26.x' # Windows: We install Cuda on the agent (slow) @@ -134,7 +134,7 @@ jobs: mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - name: Upload build artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: shared_library path: output/* @@ -186,7 +186,7 @@ jobs: pip install build python -m build . - name: Upload build artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: bdist_wheel path: dist/bitsandbytes-*.whl @@ -201,7 +201,7 @@ jobs: pip install build python -m build -s . - name: Download build artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: bdist_wheel path: dist/ From 8dd8d6333d5b7c232de2b952f61fb09a2e9ae354 Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 15:31:18 +0100 Subject: [PATCH 16/52] Refactor build stage --- .github/workflows/python-package.yml | 25 ++++++++++--------------- pyproject.toml | 8 ++++++-- requirements.txt => requirements.ci.txt | 1 + 3 files changed, 17 insertions(+), 17 deletions(-) rename requirements.txt => requirements.ci.txt (66%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4e9fa0f73..8b285218d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library + name: shared_library_cuda path: output/* retention-days: 7 build-wheels: @@ -157,24 +157,25 @@ jobs: - uses: actions/checkout@v4 # Download shared libraries - name: Download build artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: shared_library path: output/ - name: Copy correct platform shared library shell: bash run: | cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - # Compile C++ code + # Set up the Python version needed - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - # - - name: Install Python dependencies + cache: pip + - name: Install build package shell: bash - run: | - pip install -r requirements.txt + run: pip install build + - name: Install Python test dependencies + shell: bash + run: pip install -r requirements.ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents @@ -182,9 +183,7 @@ jobs: # PYTHONPATH=. 
pytest --log-cli-level=DEBUG tests - name: Build wheel shell: bash - run: | - pip install build - python -m build . + run: python -m build . - name: Upload build artifact uses: actions/upload-artifact@v4 with: @@ -196,10 +195,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Build dist - run: | - pip install build - python -m build -s . - name: Download build artifact uses: actions/download-artifact@v4 with: diff --git a/pyproject.toml b/pyproject.toml index 74d17dd90..c028ca265 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,11 @@ [build-system] requires = [ - "setuptools>=42", - "wheel" + "setuptools>=63", + "pytest~=7.2.2", + "einops~=0.6.0", + "wheel~=0.40.0", + "lion-pytorch~=0.0.6", + "scipy~=1.11.4" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.ci.txt similarity index 66% rename from requirements.txt rename to requirements.ci.txt index ad64af67a..e9d86293d 100644 --- a/requirements.txt +++ b/requirements.ci.txt @@ -1,3 +1,4 @@ +# Requirements used for GitHub actions pytest==7.2.2 einops==0.6.0 wheel==0.40.0 From 8b1ceb7aecacd92c7f492aa60c9c28c42c43072b Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:01:16 +0100 Subject: [PATCH 17/52] Fixed breaking actions change --- .github/workflows/python-package.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8b285218d..ded1b5484 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -65,7 +65,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v3 with: - name: shared_library + name: shared_library/${{ matrix.os }}/${{ matrix.arch }} path: output/* retention-days: 7 ## @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_cuda + name: shared_library_cuda/${{ matrix.os }}/${{ matrix.arch }} path: output/* retention-days: 7 build-wheels: @@ -187,7 +187,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel + name: bdist_wheel/${{ matrix.os }}/${{ matrix.arch }} path: dist/bitsandbytes-*.whl retention-days: 7 publish: @@ -198,7 +198,6 @@ jobs: - name: Download build artifact uses: actions/download-artifact@v4 with: - name: bdist_wheel path: dist/ - run: | ls -lR dist/ From e11867bcacdb7f9ab1844e843b40e3e1dc33ccab Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:04:10 +0100 Subject: [PATCH 18/52] Slim down Windows cuda --- .github/workflows/python-package.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ded1b5484..07b74c851 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -65,7 +65,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v3 with: - name: shared_library/${{ matrix.os }}/${{ matrix.arch }} + name: shared_library_${{ matrix.os }}_${{ matrix.arch }} path: output/* retention-days: 7 ## @@ -95,13 +95,13 @@ jobs: with: cmake-version: '3.26.x' # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.10 + - uses: Jimver/cuda-toolkit@v0.2.14 if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: cuda: ${{ matrix.cuda_version }} method: 'local' - #sub-packages: 
'["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev"]' - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_cuda/${{ matrix.os }}/${{ matrix.arch }} + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }} path: output/* retention-days: 7 build-wheels: @@ -187,7 +187,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel/${{ matrix.os }}/${{ matrix.arch }} + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} path: dist/bitsandbytes-*.whl retention-days: 7 publish: From 57625dba9161fec458384046d69a5b7374f69970 Mon Sep 17 00:00:00 2001 From: Rickard Date: Sun, 28 Jan 2024 16:27:10 +0100 Subject: [PATCH 19/52] Create dependabot.yml --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..8fb637562 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: "/" + schedule: + interval: "weekly" From ada2e9aa1e06ade396a013ffd1ffd7fd25cf5d7c Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:27:54 +0100 Subject: [PATCH 20/52] Bespoke local dev requirements.txt --- .github/workflows/python-package.yml | 2 +- .github/workflows/stale.yml.disabled | 27 ---------------------- requirements.ci.txt => requirements-ci.txt | 1 + requirements-dev.txt | 9 ++++++++ 4 files changed, 11 insertions(+), 28 deletions(-) delete mode 100644 .github/workflows/stale.yml.disabled rename requirements.ci.txt => requirements-ci.txt (89%) create mode 100644 requirements-dev.txt diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 07b74c851..92c49a196 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -175,7 +175,7 @@ jobs: run: pip install build - name: Install Python test dependencies shell: bash - run: pip install -r requirements.ci.txt + run: pip install -r requirements-ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. 
Wait for real aarch64 agents diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled deleted file mode 100644 index ec011c7fb..000000000 --- a/.github/workflows/stale.yml.disabled +++ /dev/null @@ -1,27 +0,0 @@ -name: Stale Bot - -on: - schedule: - - cron: "0 15 * * *" - -jobs: - close_stale_issues: - name: Close Stale Issues - if: github.repository == 'TimDettmers/bitsandbytes' - runs-on: ubuntu-latest - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v3 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - - name: Install requirements - run: | - pip install PyGithub - - name: Close stale issues - run: | - python scripts/stale.py \ No newline at end of file diff --git a/requirements.ci.txt b/requirements-ci.txt similarity index 89% rename from requirements.ci.txt rename to requirements-ci.txt index e9d86293d..46bd5b9cd 100644 --- a/requirements.ci.txt +++ b/requirements-ci.txt @@ -4,3 +4,4 @@ einops==0.6.0 wheel==0.40.0 lion-pytorch==0.0.6 scipy==1.11.4 +pandas==2.2.0 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..7ede5b061 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +# Requirements used for local development +setuptools>=63 +pytest~=7.2.2 +einops~=0.6.0 +wheel~=0.40.0 +lion-pytorch~=0.0.6 +scipy~=1.11.4 +pandas~=2.2.0 +matplotlib~=3.8.2 From e0093e9cc473749b8861a7723018c6f537c6eaaa Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:33:04 +0100 Subject: [PATCH 21/52] Enable VS integration --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 92c49a196..c85224292 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -101,7 +101,7 @@ jobs: with: cuda: ${{ matrix.cuda_version }} method: 'local' - sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev"]' + sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} @@ -175,7 +175,7 @@ jobs: run: pip install build - name: Install Python test dependencies shell: bash - run: pip install -r requirements-ci.txt + run: pip install -r requirements.ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. 
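      # (Context for this "Enable VS integration" patch: the CUDA
      # `visual_studio_integration` sub-package ships the MSBuild .props/.targets
      # files that let `msbuild bitsandbytes.vcxproj` drive nvcc, so the Windows
      # CUDA build breaks without it.)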
Wait for real aarch64 agents From 23bdf05b69a57fa26238fd8a37428c5c49bc1b1a Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:35:20 +0100 Subject: [PATCH 22/52] Group Dependabot updates --- .github/dependabot.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 8fb637562..616a1f98e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -4,3 +4,8 @@ updates: directory: "/" schedule: interval: "weekly" + groups: + major: + update-types: [major] + minor-patch: + update-types: [minor, patch] \ No newline at end of file From 87414c3148600f44eabd15f5003a01de6dafa712 Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:42:29 +0100 Subject: [PATCH 23/52] Cleanup --- pyproject.toml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c028ca265..0ddfaa0e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,5 @@ [build-system] -requires = [ - "setuptools>=63", - "pytest~=7.2.2", - "einops~=0.6.0", - "wheel~=0.40.0", - "lion-pytorch~=0.0.6", - "scipy~=1.11.4" -] +requires = [ "setuptools", "wheel" ] build-backend = "setuptools.build_meta" [tool.ruff] From 0ee8f7f8790c103bfea22fc43f567c68d69d5a9d Mon Sep 17 00:00:00 2001 From: Rickard Date: Sun, 28 Jan 2024 23:27:26 +0100 Subject: [PATCH 24/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c85224292..2c2ea64a2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -101,7 +101,7 @@ jobs: with: cuda: ${{ matrix.cuda_version }} method: 'local' - sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + # sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} From 816eee0a7bfe6f1d47f70741e2c4603065fb5f13 Mon Sep 17 00:00:00 2001 From: Rickard Date: Wed, 31 Jan 2024 23:10:17 +0100 Subject: [PATCH 25/52] Reinstate file that was wrongly merged --- .github/workflows/stale.yml.disabled | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/stale.yml.disabled diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled new file mode 100644 index 000000000..ec011c7fb --- /dev/null +++ b/.github/workflows/stale.yml.disabled @@ -0,0 +1,27 @@ +name: Stale Bot + +on: + schedule: + - cron: "0 15 * * *" + +jobs: + close_stale_issues: + name: Close Stale Issues + if: github.repository == 'TimDettmers/bitsandbytes' + runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.8 + + - name: Install requirements + run: | + pip install PyGithub + - name: Close stale issues + run: | + python scripts/stale.py \ No newline at end of file From 05283245675b5d4fb85f51ae61dcc157b68b0d1b Mon Sep 17 00:00:00 2001 From: Rickard Date: Wed, 31 Jan 2024 23:18:06 +0100 Subject: [PATCH 26/52] Fixed regression caused by new version of download-artifact --- .github/workflows/python-package.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2c2ea64a2..3891a4b88 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -63,7 +63,7 @@ jobs: mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - name: Upload build artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: shared_library_${{ matrix.os }}_${{ matrix.arch }} path: output/* @@ -159,10 +159,13 @@ jobs: - name: Download build artifact uses: actions/download-artifact@v4 with: + merge-multiple: true + pattern: "shared_library_*_${{ matrix.os }}_${{ matrix.arch }}" path: output/ - name: Copy correct platform shared library shell: bash run: | + ls -lR output/ cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ # Set up the Python version needed - name: Set up Python ${{ matrix.python-version }} @@ -199,6 +202,8 @@ jobs: uses: actions/download-artifact@v4 with: path: dist/ + merge-multiple: true + pattern: "bdist_wheel_*" - run: | ls -lR dist/ - name: Publish to PyPi From 8152e213c5b4b6cd7ded6be329a436922aeec469 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 1 Feb 2024 01:40:50 +0100 Subject: [PATCH 27/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3891a4b88..db71b357e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -160,7 +160,7 @@ jobs: uses: actions/download-artifact@v4 with: merge-multiple: true - pattern: "shared_library_*_${{ matrix.os }}_${{ matrix.arch }}" + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}" path: output/ - name: Copy correct platform shared library shell: bash From 9aad25a70bd1cb7ab93c841ed9c5216ab6643583 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 1 Feb 2024 08:25:35 +0100 Subject: [PATCH 28/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index db71b357e..243b0008c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -178,7 +178,7 @@ jobs: run: pip install build - name: Install Python test dependencies shell: bash - run: pip install -r requirements.ci.txt + run: pip install -r requirements-ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. 
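      # (Context for the patch-26 regression fix: in actions/*-artifact@v4,
      # artifacts are immutable and each upload needs a unique name, so jobs can
      # no longer share one `shared_library` artifact across the matrix. The
      # wheel and publish jobs now select the related uploads with a `pattern:`
      # glob and flatten them into one directory via `merge-multiple: true`.)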
Wait for real aarch64 agents From bcc67808070fc798afc046facf211a5236b8d399 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 2 Feb 2024 00:35:52 +0100 Subject: [PATCH 29/52] Fix matrix --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 243b0008c..3fcb1b55f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }} + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} path: output/* retention-days: 7 build-wheels: @@ -190,7 +190,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}}_${{ matrix.python-version }} path: dist/bitsandbytes-*.whl retention-days: 7 publish: From 2951e2c7f5c33826184f435e7bb8a646935ac7f4 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 2 Feb 2024 01:40:45 +0100 Subject: [PATCH 30/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3fcb1b55f..56fefe7da 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -160,7 +160,7 @@ jobs: uses: actions/download-artifact@v4 with: merge-multiple: true - pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}" + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" path: output/ - name: Copy correct platform shared library shell: bash From 9867392b16714e143475fb6bb36ee91345c44328 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 2 Feb 2024 09:06:07 +0100 Subject: [PATCH 31/52] Merge --- .github/ISSUE_TEMPLATE/bug-report.yml | 8 +- .github/ISSUE_TEMPLATE/feature-request.yml | 4 +- .github/workflows/build_pr_documentation.yml | 3 +- .github/workflows/cmake.yml | 159 ++++++ .github/workflows/stale.yml.disabled | 2 +- .github/workflows/upload_pr_documentation.yml | 16 + .pre-commit-config.yaml | 11 + .style.yapf | 2 +- CMakeLists.txt | 217 +++++--- README.md | 4 +- benchmarking/switchback/README.md | 2 +- .../switchback/make_plot_with_jsonl.py | 9 +- benchmarking/switchback/speed_benchmark.py | 4 +- bitsandbytes/__init__.py | 2 +- bitsandbytes/cuda_setup/main.py | 2 +- bitsandbytes/optim/adamw.py | 1 - bitsandbytes/research/autograd/_functions.py | 4 +- bitsandbytes/triton/dequantize_rowwise.py | 2 +- .../triton/int8_matmul_mixed_dequantize.py | 2 +- .../triton/int8_matmul_rowwise_dequantize.py | 2 +- .../quantize_columnwise_and_transpose.py | 3 +- bitsandbytes/triton/quantize_global.py | 17 +- bitsandbytes/triton/quantize_rowwise.py | 3 +- compile_from_source.md | 5 +- csrc/cpu_ops.cpp | 2 +- csrc/kernels.cu | 70 +-- csrc/mps_ops.h | 0 csrc/mps_ops.mm | 67 +++ csrc/ops.cuh | 3 - csrc/pythonInterface.cpp | 2 +- docs/source/_toctree.yml | 4 +- docs/source/index.mdx | 6 +- docs/source/installation.mdx | 42 +- docs/source/quickstart.mdx | 4 +- environment-bnb.yml | 21 + environment.yml | 2 +- examples/int8_inference_huggingface.py | 3 - how_to_use_nonpytorch_cuda.md | 2 +- include/SIMD.h | 20 +- install_cuda.py | 8 +- pytest.ini | 5 +- scripts/stale.py | 2 +- setup.py | 4 +- tests/conftest.py | 4 + tests/helpers.py 
| 51 ++ tests/test_autograd.py | 213 ++----- tests/test_cuda_setup_evaluator.py | 8 - tests/test_functional.py | 527 ++++++------------ tests/test_generation.py | 23 +- tests/test_linear4bit.py | 10 +- tests/test_linear8bitlt.py | 8 +- tests/test_modules.py | 38 +- tests/test_optim.py | 99 ++-- tests/test_triton.py | 4 +- 54 files changed, 914 insertions(+), 822 deletions(-) create mode 100644 .github/workflows/cmake.yml create mode 100644 .github/workflows/upload_pr_documentation.yml create mode 100644 csrc/mps_ops.h create mode 100644 csrc/mps_ops.mm create mode 100644 environment-bnb.yml create mode 100644 tests/helpers.py diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index ac8e9de00..6ae3c7c0a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -18,15 +18,15 @@ body: label: Reproduction description: | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. - Please provide the simplest reproducer as possible so that we can quickly fix the issue. + Please provide the simplest reproducer as possible so that we can quickly fix the issue. placeholder: | - Reproducer: - + Reproducer: + - type: textarea id: expected-behavior validations: required: true attributes: label: Expected behavior - description: "A clear and concise description of what you would expect to happen." \ No newline at end of file + description: "A clear and concise description of what you would expect to happen." diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 4e75c2a64..c39f346b9 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -18,7 +18,7 @@ body: attributes: label: Motivation description: | - Please outline the motivation for the proposal. Is your feature request related to a problem? + Please outline the motivation for the proposal. Is your feature request related to a problem? - type: textarea id: contribution @@ -27,4 +27,4 @@ body: attributes: label: Your contribution description: | - Is there any way that you could help, e.g. by submitting a PR? \ No newline at end of file + Is there any way that you could help, e.g. by submitting a PR? diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index dace206b1..d6455fd11 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,9 +9,10 @@ concurrency: jobs: build: + if: github.repository == 'TimDettmers/bitsandbytes' uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: bitsandbytes - repo_owner: TimDettmers \ No newline at end of file + repo_owner: TimDettmers diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 000000000..728dd09fb --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,159 @@ +name: CMake on multiple platforms + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +concurrency: + group: cmake-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. 
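+      # (For example: with the default fail-fast=true, a compile error in the
+      # (windows-latest, cuda 11.8) job would cancel the still-running
+      # (ubuntu-latest, cuda 12.1) job before it could upload artifacts.)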
+ fail-fast: false + + matrix: + os: [ubuntu-latest, windows-latest] + cuda-version: ['11.8', '12.1'] + build_type: [Release] + + steps: + - uses: actions/checkout@v4 + + - name: Set up MSVC + if: matrix.os == 'windows-latest' + uses: ilammy/msvc-dev-cmd@v1.12.1 + with: + arch: amd64 + + - name: Setup Mambaforge + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: bnb-env + use-mamba: true + + - uses: conda-incubator/setup-miniconda@v3.0.1 + with: + auto-update-conda: true + activate-environment: bnb-env + environment-file: environment-bnb.yml + use-only-tar-bz2: false + auto-activate-base: true + python-version: "3.10" + mamba-version: "*" + + - name: Set reusable strings + # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. + id: strings + shell: bash + run: | + echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" + + - name: CUDA Toolkit + shell: bash -el {0} + run: | + if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then + # to prepare space + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + fi + addon="" + cuda_version=${{ matrix.cuda-version }} + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc" + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc" + [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0" + [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1" + + conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime + conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version" + + [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge + + CUDA_HOME="${{ env.CONDA }}/envs/bnb-env" + echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV" + echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV" + + if [ "${{ matrix.os }}" = "windows-latest" ]; then + echo CXX_COMPILER=cl >> "$GITHUB_ENV" + echo C_COMPILER=cl >> "$GITHUB_ENV" + # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8 + echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV" + else + echo CXX_COMPILER=g++ >> "$GITHUB_ENV" + echo C_COMPILER=gcc >> "$GITHUB_ENV" + fi + + nvcc --version + + - name: Update environment + run: mamba env update -n bnb-env -f environment-bnb.yml + + - name: Prep build + run: python -m pip install cmake==3.27.9 ninja setuptools wheel + + # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel + + - name: Configure CUDA + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -S ${{ github.workspace }} + + - name: Build CUDA + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure NOBLASLT + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ 
env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -DNO_CUBLASLT=ON + -S ${{ github.workspace }} + + - name: Build NOBLASLT + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure CPU + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DNO_CUBLASLT=ON + -DBUILD_CUDA=OFF + -S ${{ github.workspace }} + + - name: Build CPU + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Build dist + shell: bash -el {0} + run: | + python -m pip install build + python -m build --wheel + mkdir dist/cu${{ matrix.cuda-version }} + mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/ + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v4.3.0 + with: + name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }} + path: | + ${{ github.workspace }}/dist/ diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled index ec011c7fb..0b4f789ea 100644 --- a/.github/workflows/stale.yml.disabled +++ b/.github/workflows/stale.yml.disabled @@ -24,4 +24,4 @@ jobs: pip install PyGithub - name: Close stale issues run: | - python scripts/stale.py \ No newline at end of file + python scripts/stale.py diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 000000000..6497caf2d --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: bitsandbytes + secrets: + hf_token: ${{ secrets.HUGGINGFACE_PUSH }} + comment_bot_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d568a849f..039139b95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,3 +6,14 @@ repos: args: - --fix # - id: ruff-format # TODO: enable when the time is right + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-merge-conflict + - id: check-yaml + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: trailing-whitespace + - id: mixed-line-ending + args: + - --fix=lf diff --git a/.style.yapf b/.style.yapf index a185235cf..e60ac16e5 100644 --- a/.style.yapf +++ b/.style.yapf @@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True SPLIT_BEFORE_FIRST_ARGUMENT = True SPLIT_BEFORE_LOGICAL_OPERATOR = True SPLIT_BEFORE_NAMED_ASSIGNS = True -SPLIT_COMPLEX_COMPREHENSION = True \ No newline at end of file +SPLIT_COMPLEX_COMPREHENSION = True diff --git a/CMakeLists.txt b/CMakeLists.txt index d6e269d15..b9a55ae5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,74 +1,148 @@ +# This CMake config hopefully makes it easier to compile. +# Ensure the CUDA Toolkit is available on your path. Then run: +# For GCC: `cmake -B build . && cmake --build build` +# For MSVC: `cmake -B build . 
&& cmake --build build --config Release` +# You can also use the following options and variables +# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend +# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support +# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version +# is whatever CMake finds on your path. +# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC. +# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90` +# Check your compute capability here: https://developer.nvidia.com/cuda-gpus +# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler cmake_minimum_required(VERSION 3.22.1) -option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) -option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) +project(bitsandbytes LANGUAGES CXX) -if(ENABLE_CUDA) +# Define included source files +set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) +set(CUDA_FILES csrc/ops.cu csrc/kernels.cu) +set(MPS_FILES csrc/mps_ops.mm) +set(METAL_FILES csrc/mps_kernels.metal) +# C++ sources are always included +list(APPEND SRC_FILES ${CPP_FILES}) + +set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)") +set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps) +option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF) + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +set(BNB_OUTPUT_NAME "bitsandbytes") + +message(STATUS "Building with backend ${COMPUTE_BACKEND}") + +if(${COMPUTE_BACKEND} STREQUAL "cuda") if(APPLE) message(FATAL_ERROR "CUDA is not supported on macOS" ) endif() - option(NO_CUBLASLT "Don't use CUBLAST" OFF) - if(NO_CUBLASLT) - set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) - else() - set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + option(NO_CUBLASLT "Disable CUBLAS" OFF) + set(BUILD_CUDA ON) + set(BUILD_MPS OFF) + message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}") +elseif(${COMPUTE_BACKEND} STREQUAL "mps") + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) endif() + set(BUILD_CUDA OFF) + set(BUILD_MPS ON) +else() + set(BUILD_CUDA OFF) + set(BUILD_MPS OFF) endif() -if(ENABLE_CUDA) - message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") - # Find CUDA tools if we are compiling with CUDA - find_package(CUDAToolkit REQUIRED) - if(NO_CUBLASLT) - set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") - else() - set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + +if(BUILD_CUDA) + enable_language(CUDA) # This will fail if CUDA is not found + + # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this + string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}") + string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}") + + # Expose a cache variable that the user can set to ensure the correct version of CUDA is found + set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode") + + message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})") + message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}") + + # It should match the discovered version + if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}") + message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}." 
+ " Ensure the desired CUDA compiler is the first one available on your PATH." + ) + endif() + + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0") + message(FATAL_ERROR "CUDA Version < 11 is not supported") + elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") + message(FATAL_ERROR "CUDA Version > 12 is not supported") + endif() + + string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") + if(PTXAS_VERBOSE) + # Verbose? Outputs register usage information, and other things... + string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") endif() - project(bitsandbytes LANGUAGES CXX CUDA) + foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL}) + # Most of the items here are like: `xx-real`, so we just extract the `xx` portion + string(REGEX MATCH "[0-9]+" capability_id "${capability}") + if(capability_id GREATER 0) + list(APPEND POSSIBLE_CAPABILITIES ${capability_id}) + endif() + endforeach() + + # This can be changed via -D argument to CMake + # By default all possible capabilities are compiled + set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted") + + message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}") + message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}") + + foreach(capability ${COMPUTE_CAPABILITY}) + string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}") + endforeach() + + message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}") + + list(APPEND SRC_FILES ${CUDA_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}") + if(NO_CUBLASLT) + string(APPEND BNB_OUTPUT_NAME "_nocublaslt") + endif() add_compile_definitions(BUILD_CUDA) - set(CMAKE_CUDA_STANDARD 14) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) - set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) -elseif(ENABLE_MPS) +elseif(BUILD_MPS) if(NOT APPLE) message(FATAL_ERROR "MPS is only supported on macOS" ) endif() - message("Building MPS support") - set(LIBSUFFIX "mps") - project(bitsandbytes LANGUAGES CXX OBJCXX) + + enable_language(OBJCXX) + + list(APPEND SRC_FILES ${MPS_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_mps") add_compile_definitions(BUILD_MPS) - set(METAL_SOURCES csrc/mps_kernels.metal) file(MAKE_DIRECTORY "build") add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" - COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} - COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" - DEPENDS "${METAL_SOURCES}" - COMMENT "Compiling Metal kernels" - VERBATIM) + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_FILES}" + COMMENT "Compiling Metal kernels" + VERBATIM) add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") - set(GPU_SOURCES csrc/mps_ops.mm) else() - message("Building with CPU only") set(LIBSUFFIX "cpu") - - project(bitsandbytes LANGUAGES CXX) set(GPU_SOURCES) endif() -if(APPLE) - set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) -endif() -set(CMAKE_CXX_STANDARD 14) -set(CXX_STANDARD_REQUIRED C++14) if(WIN32) - # Mute warnings - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") - - # Enable fast math on VC++ - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") - # Export all symbols set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() @@ -76,46 +150,43 @@ endif() # Weird MSVC hacks if(MSVC) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast") endif() -# Add csrc files -add_library(bitsandbytes SHARED - ${GPU_SOURCES} - csrc/common.cpp - csrc/cpu_ops.cpp - csrc/pythonInterface.cpp) - -target_include_directories(bitsandbytes PUBLIC - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} - ${CMAKE_CURRENT_SOURCE_DIR}/csrc - ${CMAKE_CURRENT_SOURCE_DIR}/include) +set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) +add_library(bitsandbytes SHARED ${SRC_FILES}) +target_include_directories(bitsandbytes PUBLIC csrc include) -if(ENABLE_CUDA) - target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") +if(BUILD_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse) + if(NO_CUBLASLT) + target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT) + else() + target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt) + endif() - set_target_properties( - bitsandbytes + set_target_properties(bitsandbytes PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) - - target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) + CUDA_SEPARABLE_COMPILATION ON + ) endif() -if(ENABLE_MPS) +if(BUILD_MPS) add_dependencies(bitsandbytes metallib) target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") endif() -set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") -# Set the output name of the CUDA library +if(WIN32) + set_target_properties(bitsandbytes PROPERTIES PREFIX "lib") +endif() +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME}) if(MSVC) -set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) -set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) -set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) -set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) endif() set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/README.md b/README.md index a4586d6ca..61dede8c1 100644 --- a/README.md +++ b/README.md @@ -153,10 +153,10 @@ To compile from source, you need an installation of CUDA. 
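With this CMake configuration, a typical configure-and-build is `cmake -B build -DCOMPUTE_BACKEND=cuda .` followed by `cmake --build build`; optionally pass `-DCOMPUTE_CAPABILITY="75;86"` to narrow the targeted GPU architectures, or `-DNO_CUBLASLT=ON` to skip cuBLASLt, per the options documented in the CMakeLists.txt header.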
If `nvcc` is not insta wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md index bb33b5bbd..b73569030 100644 --- a/benchmarking/switchback/README.md +++ b/benchmarking/switchback/README.md @@ -1,4 +1,4 @@ Steps: 1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling). -2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. \ No newline at end of file +2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py index 3ef87d6b2..177270346 100644 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ b/benchmarking/switchback/make_plot_with_jsonl.py @@ -33,7 +33,7 @@ ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'), ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'), - + ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'), @@ -55,7 +55,7 @@ y_ += df_[k_].values[0] ys.append(y_ * 0.5) - + ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.) 
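As a sketch of how the two benchmarking scripts connect (the record fields here are assumptions inferred from the plotting code, not taken verbatim from the patch):

```python
import pandas as pd

# Each line of the jsonl written by speed_benchmark.py is assumed to be one
# record of per-op timings keyed by names like 'standard_fwd'/'standard_gw'.
rdf = pd.read_json("speed_benchmark/info_a100_py2.jsonl", lines=True)
df = rdf[rdf.batch_size == 256 * 32]
# Sum the per-op times for one line spec, mirroring the plot script's inner loop.
total = sum(df[k].values[0] for k in ("standard_fwd", "standard_gw", "standard_gx"))
print(f"standard fp16 total: {total:.5f}")
```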
@@ -67,7 +67,7 @@ ax.set_xscale('log') if logscale_plot1: ax.set_yscale('log') - + ax.tick_params(axis='x', labelsize=11) ax.tick_params(axis='y', labelsize=11) @@ -91,7 +91,7 @@ ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'), ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), ]: - + xs, ys = [], [] df = rdf[rdf.batch_size == batch_size] for embed_dim in dims_to_consider: @@ -133,4 +133,3 @@ plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight') - diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py index d70df0386..c4f3cd4c6 100644 --- a/benchmarking/switchback/speed_benchmark.py +++ b/benchmarking/switchback/speed_benchmark.py @@ -42,7 +42,7 @@ def get_time(k, fn, info_dict): for dim in [1024, 1280, 1408, 1664, 2048, 4096]: # note "batch_size" is actually "batch_size * embed_dim", which is why it's large for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]: - + # switch switches dim_in and dim_out for switch in [False, True]: @@ -62,7 +62,7 @@ def get_time(k, fn, info_dict): x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda() - + x_int8 = x.clone().to(torch.int8) g_int8 = g.clone().to(torch.int8) w_int8 = w.clone().to(torch.int8) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 87307a9d2..e54e933d9 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -24,6 +24,6 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.42.0" +__version__ = "0.43.0.dev" PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a34385b1f..0db9df343 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -210,7 +210,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: if path.exists(): existent_directories.add(path) except PermissionError: - # Handle the PermissionError first as it is a subtype of OSError + # Handle the PermissionError first as it is a subtype of OSError # https://docs.python.org/3/library/exceptions.html#exception-hierarchy pass except OSError as exc: diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 9ea5812ea..17383eed5 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -35,4 +35,3 @@ class PagedAdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) - diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index e515bfeff..7d869e39a 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -83,7 +83,7 @@ def backward(ctx, grad_output): # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. 
TODO: create work-around - if req_gradA: + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: @@ -167,7 +167,7 @@ def backward(ctx, grad_output): # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. TODO: create work-around - if req_gradA: + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py index daa59da9c..3d7529852 100644 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -50,7 +50,7 @@ def _dequantize_rowwise( max_val = tl.load(state_x + pid) output = max_val * x * inv_127 tl.store(output_ptr + offsets, output, mask=row_mask) - + def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py index 1b80ab1a0..dc3047d7e 100644 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py @@ -120,7 +120,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - + acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py index 1f28b0d10..4881e1468 100644 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -119,7 +119,7 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - + acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py index fcadaba3e..e7961cf53 100644 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -54,7 +54,7 @@ def _quantize_columnwise_and_transpose( max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) output = tl.libdevice.llrint(127. 
* (x / max_val)) - new_start = pid * M + new_start = pid * M new_offsets = new_start + p2_arange tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) tl.store(output_maxs + pid, max_val) @@ -71,4 +71,3 @@ def quantize_columnwise_and_transpose(x: torch.Tensor): grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) return output, output_maxs - diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py index a73a5bbaa..5cf194744 100644 --- a/bitsandbytes/triton/quantize_global.py +++ b/bitsandbytes/triton/quantize_global.py @@ -59,27 +59,27 @@ def quantize_global(x: torch.Tensor): key=['M', 'N'] ) @triton.jit - def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, - BLOCK_M : tl.constexpr, - BLOCK_N : tl.constexpr, + def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, + BLOCK_M : tl.constexpr, + BLOCK_N : tl.constexpr, GROUP_M : tl.constexpr): pid = tl.program_id(0) grid_m = (M + BLOCK_M - 1) // BLOCK_M grid_n = (N + BLOCK_N - 1) // BLOCK_N - + width = GROUP_M * grid_n group_id = pid // width group_size = min(grid_m - group_id * GROUP_M, GROUP_M) pid_m = group_id * GROUP_M + (pid % group_size) pid_n = (pid % width) // group_size - + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) mask = (rm < M)[:, None] & (rn < N)[None, :] a = tl.load(A, mask=mask) absmax_inv = tl.load(absmax_inv_ptr) - + # rematerialize to save registers rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) @@ -95,12 +95,11 @@ def quantize_global_transpose(input): absmax_inv = 1./ absmax M, N = input.shape out = torch.empty(N, M, device='cuda', dtype=torch.int8) - + assert out.size(0) == N and out.size(1) == M assert input.stride(0) == 1 or input.stride(1) == 1 assert out.stride(0) == 1 or out.stride(1) == 1 - + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) return out, absmax - diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py index fce464b19..078f4aa2d 100644 --- a/bitsandbytes/triton/quantize_rowwise.py +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -46,7 +46,7 @@ def _quantize_rowwise( offsets = block_start + arange row_mask = arange < BLOCK_SIZE x = tl.load(x_ptr + offsets, mask=row_mask) - + abs_x = tl.abs(x) max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) output = tl.libdevice.llrint(127. 
* (x / max_val)) @@ -64,4 +64,3 @@ def quantize_rowwise(x: torch.Tensor): grid = lambda meta: (x.shape[0],) _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) return output, output_maxs - diff --git a/compile_from_source.md b/compile_from_source.md index 23afe1591..6310fd6c6 100644 --- a/compile_from_source.md +++ b/compile_from_source.md @@ -12,10 +12,10 @@ You can install CUDA locally without sudo by following the following steps: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler. @@ -37,4 +37,3 @@ If you have problems compiling the library with these instructions from source, ## Compilation with Kepler Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler` - diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 478c1f4ff..e67135360 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -26,7 +26,7 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long BinAlgo bin_searcher(code, elements_code); int thread_wave_size = 256; - // we chunk the thresds into waves of 256 since the max limit is + // we chunk the threads into waves of 256 since the max limit is // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size) for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 0fff83665..f117547ed 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax) return 1.00000000f*absmax*sign; // 1011 else return 0.66666667f*absmax*sign; // 1010 - else + else if((val & 0b0001) == 1) // 100 return 5.208333333e-03f*absmax*sign; // 1001 else @@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return 
-0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps); //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val; g_val *= gnorm_scale; - + s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val)); s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE]; @@ -3069,7 +3069,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// use k warps per thread block //// 1. threadblock use read-only cache to read in register tile for A into shared memory //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments -//// 3. each warp reads a segment of values 16x32 from B +//// 3. each warp reads a segment of values 16x32 from B //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. 
matmul aggregate into fragment C @@ -3531,7 +3531,7 @@ template __global__ void kgemm_4bit_inference(int M, i template __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize) { - // per threadblock: + // per threadblock: // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps] // 4 warps -> 4 loads per iter // 1x32 * 32x4 -> 1x4 outputs per thread block @@ -3764,7 +3764,7 @@ template __global__ void kfunc(T *A, T *B, T value, long { switch(FUNC) { - case FILL: + case FILL: A[i] = (T)value; break; case ARANGE: diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h new file mode 100644 index 000000000..e69de29bb diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm new file mode 100644 index 000000000..5e3adeebe --- /dev/null +++ b/csrc/mps_ops.mm @@ -0,0 +1,67 @@ +#import + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +static inline MPSGraph* get_graph() +{ + static MPSGraph* cur = nil; + if(!cur) { + cur = [[MPSGraph alloc] init]; + } + return cur; +} + +static inline id get_device() +{ + NSError *error = nil; + static id device = nil; + if(!device) { + device = MTLCreateSystemDefaultDevice(); + } + if(!device) { + NSLog(@"Failed to get MPS device"); + abort(); + } + return device; +} + +static inline id get_library() +{ + NSError *error = nil; + static id library = nil; + if(!library) { + library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error]; + } + if(!library) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + return library; +} + +/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"]; + return out; +}*/ + + +// MPSGraph function for quantize +extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id device = get_device(); + id library = get_library(); + static id kernel = nil; + if(!kernel) { + kernel = [library newFunctionWithName:@"quantize"]; + if(!kernel) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + } + NSLog(@"Not implemented"); + return nil; +} \ No newline at end of file diff --git a/csrc/ops.cuh b/csrc/ops.cuh index cc7b59505..da9df6af0 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,9 +9,6 @@ #include #include -#ifndef _MSC_VER -#include -#endif #include #include diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp index a6b348ca6..ea2283504 100644 --- a/csrc/pythonInterface.cpp +++ b/csrc/pythonInterface.cpp @@ -392,7 +392,7 @@ extern "C" int hasPrefetch = 0; CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead if (hasPrefetch == 0) return; - + CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0)); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 28da69eb0..043597177 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -1,8 +1,8 @@ -- sections: +- sections: - local: index title: Bits & Bytes - local: quickstart title: Quickstart - local: installation title: Installation - title: Get started \ No newline at end of file + title: Get started diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 
68ad433e6..67c928309 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: @@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow journal={9th International Conference on Learning Representations, ICLR}, year={2022} } -``` \ No newline at end of file +``` diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 035e3e70d..50031acf7 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,3 +1,43 @@ # Installation -... work in progress ... \ No newline at end of file +Note currently `bitsandbytes` is only supported on CUDA GPU hardwares, support for AMD GPUs and M1 chips (MacOS) is coming soon. + + + + +## Linux + +### From Pypi + +```bash +pip install bitsandbytes +``` + +### From source + +```bash +git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +CUDA_VERSION=XXX make cuda12x +python setup.py install +``` + +with `XXX` being your CUDA version, for <12.0 call `make cuda 11x` + + + + +## Windows + +Currently for Windows users, you need to build bitsandbytes from source + +```bash +git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +cmake -B build -DBUILD_CUDA=ON -S . +cmake --build build --config Release +python -m build --wheel +``` + +Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. + + + diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 4dff2ba46..d1028c655 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -6,7 +6,7 @@ ## Minimal example -The following code illustrates the steps above. +The following code illustrates the steps above. 
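One natural minimal example, shown here only as a sketch (it assumes a CUDA-enabled build of bitsandbytes and swaps a stock torch optimizer for its 8-bit counterpart):

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
# Drop-in replacement for torch.optim.Adam, with 8-bit optimizer state.
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)

x = torch.randn(16, 1024, device="cuda")
loss = model(x).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```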
 ```python
-```
\ No newline at end of file
+```
diff --git a/environment-bnb.yml b/environment-bnb.yml
new file mode 100644
index 000000000..92c7761bb
--- /dev/null
+++ b/environment-bnb.yml
@@ -0,0 +1,21 @@
+# for cmake build
+name: bnb
+channels:
+  - pytorch
+  - nvidia
+  - conda-forge
+
+dependencies:
+  - python
+  - accelerate
+  - einops
+  - scipy
+  - transformers
+  - pytest
+  - pytest-cases
+  - ipython
+  - debugpy
+  - yapf
+  - monkeytype
+  - rich
+  - pytest-sugar
diff --git a/environment.yml b/environment.yml
index c0e07f153..9ab48dedc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -42,4 +42,4 @@ dependencies:
 ## ENV UPDATE:
 #
 # add new packages to environment.yml, then:
-# mamba env update -n bnb -f environment.yml
\ No newline at end of file
+# mamba env update -n bnb -f environment.yml
diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
index dc80a44db..2cee48e8e 100644
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
@@ -22,6 +22,3 @@
 )
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-
-
-
diff --git a/how_to_use_nonpytorch_cuda.md b/how_to_use_nonpytorch_cuda.md
index b5f01fbe5..566b0170e 100644
--- a/how_to_use_nonpytorch_cuda.md
+++ b/how_to_use_nonpytorch_cuda.md
@@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
 
diff --git a/include/SIMD.h b/include/SIMD.h
index 18a38dbfd..0cd8d13d6 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -86,12 +86,28 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
-template <>
-struct FTOITraits<SSE, float>
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef float vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef double vec_t;
+};
+
+template <InstrSet I>
+struct FTOITraits
 {
     typedef IVec<SSE, float> vec_t;
 };
 
+template <>
+struct FTOITraits<Scalar>
+{
+    typedef IVec<Scalar, float> vec_t;
+};
+
 #ifdef USE_AVX
 
 template <>
diff --git a/install_cuda.py b/install_cuda.py
index 77e258609..4b041b8d0 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path):
     # Install CUDA
     print(f"Installing CUDA version {version}...")
     install_command = [
-        "bash", filepath, 
-        "--no-drm", "--no-man-page", "--override", 
+        "bash", filepath,
+        "--no-drm", "--no-man-page", "--override",
         "--toolkitpath=" + install_path,
         "--toolkit", "--silent"
     ]
 
     print(f"Running command: {' '.join(install_command)}")
-    
+
     try:
         subprocess.run(install_command, check=True)
     except subprocess.CalledProcessError as e:
@@ -99,4 +99,4 @@ def main():
         sys.exit(1)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/pytest.ini b/pytest.ini
index 9902b98fa..ac6d72e63 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -7,4 +7,7 @@ addopts = -rP
 log_cli = True
 log_cli_level = INFO
-log_file = logs/pytest.log
\ No newline at end of file
+log_file = logs/pytest.log
+markers =
+    benchmark: mark test as benchmark
+    slow: mark test as slow
diff --git a/scripts/stale.py b/scripts/stale.py index
c299643ae..613f5b7cb 100644 --- a/scripts/stale.py +++ b/scripts/stale.py @@ -55,4 +55,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/setup.py b/setup.py index 68c6058cd..0ee090ba5 100644 --- a/setup.py +++ b/setup.py @@ -19,12 +19,12 @@ def read(fname): # Tested with wheel v0.29.0 class BinaryDistribution(Distribution): - def has_ext_modules(foo): + def has_ext_modules(self): return True setup( name="bitsandbytes", - version="0.42.0", + version="0.43.0.dev0", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", diff --git a/tests/conftest.py b/tests/conftest.py index 0b4b91225..7aee8c922 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,10 @@ def pytest_runtest_call(item): try: item.runtest() + except NotImplementedError as nie: + if "NO_CUBLASLT" in str(nie): + pytest.skip("CUBLASLT not available") + raise except AssertionError as ae: if str(ae) == "Torch not compiled with CUDA enabled": pytest.skip("Torch not compiled with CUDA enabled") diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 000000000..46c6ef93d --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,51 @@ +from itertools import product +import random +from typing import Any + +import torch + +test_dims_rng = random.Random(42) + + +def get_test_dims(min: int, max: int, *, n: int) -> list[int]: + return [test_dims_rng.randint(min, max) for _ in range(n)] + + +def format_with_label(label: str, value: Any) -> str: + if isinstance(value, bool): + formatted = "T" if value else "F" + elif isinstance(value, (list, tuple)) and all(isinstance(v, bool) for v in value): + formatted = "".join("T" if b else "F" for b in value) + else: + formatted = str(value) + return f"{label}={formatted}" + + +def id_formatter(label: str): + """ + Return a function that formats the value given to it with the given label. 
+ """ + return lambda value: format_with_label(label, value) + + +DTYPE_NAMES = { + torch.bfloat16: "bf16", + torch.bool: "bool", + torch.float16: "fp16", + torch.float32: "fp32", + torch.float64: "fp64", + torch.int32: "int32", + torch.int64: "int64", + torch.int8: "int8", +} + + +def describe_dtype(dtype: torch.dtype) -> str: + return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2] + + +TRUE_FALSE = (True, False) +BOOLEAN_TRIPLES = list( + product(TRUE_FALSE, repeat=3) +) # all combinations of (bool, bool, bool) +BOOLEAN_TUPLES = list(product(TRUE_FALSE, repeat=2)) # all combinations of (bool, bool) diff --git a/tests/test_autograd.py b/tests/test_autograd.py index ed482b356..d01e5e9db 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -1,50 +1,35 @@ -from itertools import product +from typing import Tuple import pytest import torch import bitsandbytes as bnb - -n = 1 -k = 25 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() -funcs = [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)] -str_funcs = ["bmm", "matmul"] -req_grad = [(False, False), (True, False), (True, True), (False, True)] -req_grad_str = ["FF", "TF", "TT", "FT"] -transpose = [(False, False), (False, True), (True, True), (True, False)] -str_transpose = ["FF", "FT", "TT", "TF"] -dtype = [torch.float32, torch.float16] -values = list( - product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose) -) -str_values = list( - product( - dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose - ) -) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format( - *vals - ) - for vals in str_values -] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", - values, - ids=names, +from tests.helpers import ( + BOOLEAN_TRIPLES, + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_test_dims, + id_formatter, ) -def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): + +TRANSPOSE_VALS = [(False, True), (False, False)] + + +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 96, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("funcs", [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)], ids=["func=bmm", "func=matmul"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("req_grad", BOOLEAN_TUPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) +def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool], transpose: Tuple[bool, bool]): if dim2 > 0: dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) dim4 = dim4 - (dim4 % 16) - for i in range(k): + for i in range(25): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -228,71 +213,17 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): assert (idx == 0).sum().item() < n * 0.02 -n = 1 -k = 3 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, 
size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() - -dim2.append(0) - -decomp = [0.0, 6.0] -funcs = [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)] -str_funcs = ["matmullt", 'switchback_bnb'] -req_grad = [(False, False), (True, False), (True, True), (False, True)] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.bfloat16, torch.float32] -has_fp16_weights = [True, False] -has_bias = [True, False] -values = list( - product( - dim1, - dim2, - dim3, - dim4, - funcs, - dtype, - req_grad, - transpose, - decomp, - has_fp16_weights, - has_bias - ) -) -str_values = list( - product( - dim1, - dim2, - dim3, - dim4, - str_funcs, - dtype, - req_grad_str, - str_transpose, - decomp, - has_fp16_weights, - has_bias - ) -) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias", - values, - ids=names, -) +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("decomp", [0.0, 6.0], ids=id_formatter("decomp")) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], ids=["func=matmul", "func=switchback_bnb"]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) def test_matmullt( dim1, dim2, @@ -313,7 +244,7 @@ def test_matmullt( req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -429,45 +360,25 @@ def test_matmullt( torch.testing.assert_close(gradBias1, gradBias2) -n = 1 -k = 3 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() - -dim2.append(0) - -funcs = [(torch.matmul, bnb.matmul_4bit)] -str_funcs = ["matmul"] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.float32] -compress_statistics = [False, True] -has_fp16_weights = [True, False] -has_bias = [True, False] -quant_type = ['fp4', 'nf4'] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, 
quant_type)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics, quant_type)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics_{}_quant_type_{}".format(*vals) for vals in str_values] -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type", values, ids=names) -def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul_4bit)], ids=["func=matmul"]) +@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) +@pytest.mark.parametrize("quant_type", ['fp4', 'nf4'], ids=id_formatter("quant_type")) +def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) if has_bias == False: req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) @@ -530,32 +441,21 @@ def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, torch.testing.assert_close(gradBias1, gradBias2) -funcs = [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)] -str_funcs = ["matmul_fp8_mixed", 'matmul_fp8_global'] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.float32] -has_fp16_weights = [True, False] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values] -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("req_grad", 
BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], ids=["matmul_fp8_mixed", 'matmul_fp8_global']) def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) @@ -619,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): torch.testing.assert_close( gradB1, gradB2, atol=0.18, rtol=0.3 ) - diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index 5e1a548e5..189aa75b5 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -19,11 +19,3 @@ def test_manual_override(requires_cuda): import bitsandbytes as bnb loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name #assert loaded_lib == 'libbitsandbytes_cuda122.so' - - - - - - - - diff --git a/tests/test_functional.py b/tests/test_functional.py index 5b7f83bc3..2d4e959ad 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -11,6 +11,13 @@ import bitsandbytes as bnb from bitsandbytes import functional as F +from tests.helpers import ( + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_test_dims, + id_formatter, +) torch.set_printoptions( precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 @@ -155,10 +162,10 @@ def test_dynamic_quantization(): -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) -@pytest.mark.parametrize("nested", [False, True], ids=["False", "True"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) +@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested")) @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) -@pytest.mark.parametrize("signed", [True, False], ids=['signed_True', 'signed_False']) +@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): #print('') diffs = [] @@ -281,34 +288,22 @@ def mean(xx): return sum(xx) / float(len(xx)) -# dim1 = torch.randint(1,1024*4, size=(4,)).tolist() -# dim2 = torch.randint(1,1024*4, size=(4,)).tolist() -dim1 = [1024 * 2] -dim2 = [1024 * 16] -methods = [ - ( +methods = { + "linear": ( lambda x, dim: quant(x), lambda x, dim: quant(x), dequant, dequant, mm_dequant, - ) -] -methods.append((quant_multi, quant_multi, dequant, dequant, mm_dequant)) -# methods.append((lambda x: quant_multi_chunk(x, dim=-1), lambda x: quant_multi_chunk(x, dim=0), dequant, dequant, mm_dequant)) -method_names = ["linear", "vectorwise"] -batched = [False, True] -values = list(product(dim1, dim2, methods, batched)) -values_names = list(product(dim1, dim2, method_names, batched)) -names = [ - "dim1_{}_dim2_{}_quant_{}_batched_{}".format(*vals) - for vals in values_names -] + ), + "vectorwise": (quant_multi, quant_multi, dequant, dequant, mm_dequant), +} 
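(Throughout these test diffs the same conversion recurs: hand-built `values`/`names` lists assembled with `itertools.product` become stacked per-argument `@pytest.mark.parametrize` decorators whose readable ids come from the new `tests/helpers.py`. A hedged, self-contained sketch of the pattern follows; the trivial test body is a stand-in:)

```python
import pytest
import torch

from tests.helpers import describe_dtype, get_test_dims, id_formatter

# Stacked parametrize decorators expand to the same cross product the old
# product() lists produced, but pytest now derives the ids per argument
# (for example "dim1=42" and "fp16" joined automatically) instead of one
# long hand-formatted name string per combination.
@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype)
def test_example(dim1, dtype):
    assert torch.zeros(dim1, dtype=dtype).numel() == dim1
```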
-@pytest.mark.parametrize( - "dim1, dim2, quant_methods, batched", values, ids=names -) +@pytest.mark.parametrize("dim1", [1024 * 2], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [1024 * 16], ids=id_formatter("dim2")) +@pytest.mark.parametrize("quant_methods", methods.values(), ids=methods.keys()) +@pytest.mark.parametrize("batched", TRUE_FALSE, ids=id_formatter("batched")) def test_approx_igemm(dim1, dim2, quant_methods, batched): dim1 = dim1 - (dim1 % 32) dim2 = dim2 - (dim2 % 32) @@ -352,21 +347,10 @@ def test_stable_embedding(): layer.reset_parameters() -n = 2 -hidden_dim = torch.randint(32, 256, size=(n,)).tolist() -batch_dim = torch.randint(16, 256, size=(n,)).tolist() -seq_dim = torch.randint(16, 256, size=(n,)).tolist() -transpose = [(False, False), (False, True), (True, False), (True, True)] -values = list(product(hidden_dim, batch_dim, transpose, seq_dim)) -names = [ - "hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize( - "hidden_dim, batch_dim, transpose, seq_dim", values, ids=names -) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 256, n=2), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(16, 256, n=2), ids=id_formatter("batch_dim")) +@pytest.mark.parametrize("seq_dim", get_test_dims(16, 256, n=2), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): hidden_dim = hidden_dim - (hidden_dim % 32) batch_dim = batch_dim - (batch_dim % 16) @@ -418,17 +402,9 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): torch.testing.assert_close(out.float(), out2) -n = 3 -seq_dim = torch.randint(32, 512, size=(n,)).tolist() -hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() -batch_dim = torch.randint(2, 16, size=(n,)).tolist() -values = list(product(seq_dim, hidden_dim, batch_dim)) -names = [ - "seq_dim{}_hidden_dim{}_batch_dim{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("seq_dim, hidden_dim, batch_dim", values, ids=names) +@pytest.mark.parametrize("seq_dim", get_test_dims(32, 512, n=3), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 1024 * 4, n=3), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(2, 16, n=3), ids=id_formatter("batch_dim")) def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): seq_dim = seq_dim - (seq_dim % 32) hidden_dim = hidden_dim - (hidden_dim % 32) @@ -449,21 +425,10 @@ def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): torch.testing.assert_close(out.float(), out2) -n = 2 -seq_dim = torch.randint(32, 512, size=(n,)).tolist() -hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() -batch_dim = torch.randint(2, 16, size=(n,)).tolist() -transpose = [False, True] -values = list(product(seq_dim, hidden_dim, batch_dim, transpose)) -names = [ - "seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize( - "seq_dim, hidden_dim, batch_dim, transpose", values, ids=names -) +@pytest.mark.parametrize("seq_dim", get_test_dims(32, 512, n=2), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 1024 * 4, n=2), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(2, 16, n=2), ids=id_formatter("batch_dim")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) 
def test_minmax_igemm(seq_dim, hidden_dim, batch_dim, transpose): def min_max(x): maxA = torch.amax(x, dim=2, keepdim=True) @@ -533,20 +498,11 @@ def min_max(x): assert mean(relerrs) < 0.3 -n = 2 -dim1 = torch.randint(1, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 128, size=(n,)).tolist() -dim3 = torch.randint(32, 256, size=(n,)).tolist() -dim4 = torch.randint(32, 256, size=(n,)).tolist() -transpose = [(False, False), (True, False), (False, True), (True, True)] -values = list(product(dim1, dim2, dim3, dim4, transpose)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, transpose", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 64, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 128, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 256, n=2), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 256, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) def test_ibmm(dim1, dim2, dim3, dim4, transpose): dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -574,15 +530,9 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose): torch.testing.assert_close(out.float(), out2.float()) -n = 1 -dim1 = torch.randint(1, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 128, size=(n,)).tolist() -dim3 = torch.randint(32, 256, size=(n,)).tolist() -values = list(product(dim1, dim2, dim3)) -names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 128, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 256, n=1), ids=id_formatter("dim3")) def test_vector_quant(dim1, dim2, dim3): dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -594,28 +544,18 @@ def test_vector_quant(dim1, dim2, dim3): assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n*0.002)) - - -n = 2 -dim1 = torch.randint(2, 256, size=(n,)).tolist() -dim2 = torch.randint(2, 256, size=(n,)).tolist() -dim3 = torch.randint(2, 256, size=(n,)).tolist() -# dim1, dim2 = (256,), (256,) -dtype = [torch.int8, torch.int32] -a_order = ["row"] -out_order = ["col", "row", "col32"] -transpose = [False] -dims = [2, 3] -values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)) - -names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose")) +@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims")) def test_nvidia_transform(dim1, dim2, dim3, 
dims, dtype, orderA, orderOut, transpose): - if dims == 3 and out_order != "col32": + if dims == 3 and orderOut != "col32": return - if dtype == torch.int32 and out_order != "col32": + if dtype == torch.int32 and orderOut != "col32": return try: func = F.get_transform_func(dtype, orderA, orderOut, transpose) @@ -677,28 +617,12 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans torch.testing.assert_close(A, out2) -n = 1 -dim1 = torch.randint(1, 256, size=(n,)).tolist() -dim2 = torch.randint(32, 512, size=(n,)).tolist() -dim3 = torch.randint(32, 1024, size=(n,)).tolist() -dim4 = torch.randint(32, 1024, size=(n,)).tolist() - -# dim1 = [2] -# dim2 = [2] -# dim3 = [2] -# dim4 = [2] - -dims = (2, 3) -ldb = [0] -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dim3, dim4, dims, ldb)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 256, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 512, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 1024, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 1024, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2, 3), ids=id_formatter("dims")) +@pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): for i in range(k): if dims == 2: @@ -732,21 +656,11 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): torch.testing.assert_close(C1, C3.float()) -dim1 = [32] -dim2 = [32] -dim3 = [32] -dim4 = [32] - -dims = (2,) -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dim3, dim4, dims)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names) +@pytest.mark.parametrize("dim1", [32], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [32], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", [32], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", [32], ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): formatB = F.get_special_format_str() for i in range(k): @@ -786,24 +700,15 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): # C3, S = F.transform(C2, 'row', state=SC) # torch.testing.assert_close(C1, C3.float()) - -batch_size = 2 -seqdim = 512 -# values = [(batch_size, seqdim, 4*1024, 16*1024),(batch_size, seqdim, 5120, 4*5120),(batch_size, seqdim, 12*1024, 4*12*1024)] -values = [ - (batch_size, seqdim, 4 * 1024, 3 * 4 * 1024), - (batch_size, seqdim, 5120, 3 * 5120), - (batch_size, seqdim, 12 * 1024, 4 * 12 * 1024), -] - - -# values = list(product(batch, seq, model, hidden)) -names = [ - "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) +@pytest.mark.parametrize( + ("batch", "seq", "model", "hidden"), + [ + pytest.param(2, 512, 4 * 1024, 3 * 4 * 1024, id="batch=2, seq=512, model=4k, hidden=12k"), + pytest.param(2, 512, 5120, 3 * 5120, id="batch=2, seq=512, model=5k, hidden=15k"), + pytest.param(2, 512, 12 * 1024, 4 * 12 * 1024, id="batch=2, seq=512, model=12k, hidden=48k"), + ], +) 
+@pytest.mark.benchmark def test_bench_8bit_training(batch, seq, model, hidden): formatB = F.get_special_format_str() A = torch.randn(batch, seq, model, device="cuda").half() @@ -953,24 +858,11 @@ def test_bench_8bit_training(batch, seq, model, hidden): # print(t8) -n = 2 -dim1 = torch.randint(64, 256, size=(n,)).tolist() -dim4 = torch.randint(64, 1024, size=(n,)).tolist() - -#dim1 = [2*1024] -#dim4 = [2*1024] - -#dim1 = [4] -#dim4 = [4] - -dims = (2,) -formatB = ["col_turing", "col_ampere"] -has_bias = [True, False] -values = list(product(dim1, dim4, dims, formatB, has_bias)) -names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(64, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim4", get_test_dims(64, 1024, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) +@pytest.mark.parametrize("formatB", ["col_turing", "col_ampere"], ids=id_formatter("formatB")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): inner = torch.randint(1, 128, size=(1,)).item() bias = None @@ -994,33 +886,23 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): if has_bias: C4 += bias # TODO: is something wrong here? If so, the problem goes deeper - #n = C1.numel() - #p = 0.06 + # n = C1.numel() + # p = 0.06 std = C1.std(0).view(1, -1) C1 /= std C4 /= std - #assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06)) - #assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}" + # assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06)) + # assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}" C5 = F.mm_dequant(C2, SC, maxA.flatten(), maxB.flatten(), bias=bias) - #torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1) + # torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1) n = C5.numel() - assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01*n)) - - -n = 2 -dim1 = [1 * 1024] -dim2 = [1 * 1024] -# dim1 = torch.randint(1,4*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() + assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n)) -dims = (2,) -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dims)) -names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values] - -@pytest.mark.parametrize("dim1, dim2, dims", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 1024], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [1 * 1024], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) def test_colrow_absmax(dim1, dim2, dims): for i in range(k): threshold = 3.0 @@ -1066,17 +948,8 @@ def test_colrow_absmax(dim1, dim2, dims): assert nnz_block_ptr2 is None -n = 2 -# dim1 = [8*1024] -# dim2 = [4*1024] -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) def 
test_double_quant(dim1, dim2): for i in range(k): A = torch.randn(dim1, dim2, device="cuda").half() @@ -1114,16 +987,18 @@ def test_double_quant(dim1, dim2): torch.testing.assert_close(Scol.flatten().float(), statsAt) -n = 4 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + ( + pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") + for (dim1, dim4, inner) + in zip( + get_test_dims(1, 4 * 1024, n=4), + get_test_dims(1, 4 * 1024, n=4), + get_test_dims(1, 4 * 1024, n=4), + ) + ) +) def test_integrated_igemmlt(dim1, dim4, inner): for i in range(k): A = torch.randn(dim1, inner, device="cuda").half() @@ -1158,16 +1033,18 @@ def test_integrated_igemmlt(dim1, dim4, inner): assert err2 <= err1 * 1.025 -n = 6 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + ( + pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") + for (dim1, dim4, inner) + in zip( + get_test_dims(1, 4 * 1024, n=6), + get_test_dims(1, 4 * 1024, n=6), + get_test_dims(1, 4 * 1024, n=6), + ) + ) +) @pytest.mark.skip("Row scale has some bugs for ampere") def test_igemmlt_row_scale(dim1, dim4, inner): formatB = F.get_special_format_str() @@ -1234,17 +1111,17 @@ def test_igemmlt_row_scale(dim1, dim4, inner): print(sum(err3) / len(err3)) -dim1 = [1024, 2048] -inner = [12288 * 4, 4096 * 4] -dim4 = [12288, 4096] - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + [ + pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"), + pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"), + ], +) @pytest.mark.skip("Row scale has some bugs for ampere") +@pytest.mark.benchmark def test_row_scale_bench(dim1, dim4, inner): + formatB = F.get_special_format_str() err1, err2, err3 = [], [], [] relerr1, relerr2 = [], [] scale = 1 @@ -1289,34 +1166,14 @@ def test_row_scale_bench(dim1, dim4, inner): print("vector-wise", time.time() - t0) -n = 2 -dim1 = torch.randint(2, 1024, size=(n,)).tolist() -dim2 = torch.randint(2, 1024, size=(n,)).tolist() -# dim1 = [8*1024] -# dim2 = [4*1024] - -dim3 = [0] -dtype = [torch.int8] -a_order = ["row"] -out_order = ["col32", "col_turing", "col_ampere"] -transpose = [False, True] -dims = [2] -values = list( - product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose) -) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}".format( - *vals - ) - for vals in values -] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose", - values, - ids=names, -) +@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2")) 
+@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims")) +@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): for i in range(k): if dims == 2: @@ -1344,23 +1201,6 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): torch.testing.assert_close(out1, out2) -n = 2 -# dim1 = torch.randint(2,1024, size=(n,)).tolist() -# dim2 = torch.randint(2,1024, size=(n,)).tolist() -dim1 = [1] -dim2 = [33] - -dtype = [torch.int8] -# a_order = ['col_turing', 'col_ampere'] -a_order = ["col_turing"] -out_order = ["row"] -values = list(product(dim1, dim2, dtype, a_order, out_order)) -names = [ - "dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}".format(*vals) - for vals in values -] - - def test_overflow(): formatB = F.get_special_format_str() print(formatB) @@ -1375,17 +1215,8 @@ def test_overflow(): c2 = torch.matmul(a.float(), b.float().t()) -n = 2 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -# dim1 = [4] -# dim2 = [5] - -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) def test_coo_double_quant(dim1, dim2): threshold = 3.00 for i in range(k): @@ -1412,17 +1243,9 @@ def test_coo_double_quant(dim1, dim2): ) -n = 2 -dim1 = torch.randint(1, 1 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist() -# dim1 = [7] -# dim2 = [11] -transposed_B = [False, True] -values = list(product(dim1, dim2, transposed_B)) -names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("transposed_B", TRUE_FALSE, ids=id_formatter("transposed_B")) def test_spmm_coo(dim1, dim2, transposed_B): threshold = 1.5 dim3 = torch.randint(32, 128, size=(1,)).item() @@ -1453,6 +1276,7 @@ def test_spmm_coo(dim1, dim2, transposed_B): assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30) +@pytest.mark.benchmark def test_spmm_bench(): batch = 2 model = 1024 * 1 @@ -1496,14 +1320,8 @@ def test_spmm_bench(): print(tsp / t8) -n = 2 -dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist() -dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist() -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(256, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(256, 1024, n=2), ids=id_formatter("dim2")) def test_integrated_sparse_decomp(dim1, dim2): threshold = 3.0 formatB = "col_turing" @@ -1553,23 +1371,10 @@ def 
test_matmuls(): print(err1, err2) -n = 2 -# dim1 = torch.randint(1,1*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() -dim1 = [1 * 2048] -dim2 = [12288] -# dim1 = [32] -# dim2 = [32] -# dtype = [torch.float16, torch.int8] -dtype = [torch.float16] -out_function = ["zeros", "ones"] -values = list(product(dim1, dim2, dtype, out_function)) -names = [ - "dim1_{}_dim2_{}_dtype_{}_out_func_{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dtype, out_func", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 2048], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [12288], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("out_func", ["zeros", "ones"], ids=id_formatter("out_func")) def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): out_func = getattr(torch, out_func) @@ -1672,20 +1477,9 @@ def test_coo2csc(): torch.testing.assert_close(A2.t()[idx], cscA.values) -n = 2 -# dim1 = torch.randint(1,1*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() -dim1 = [1 * 2048] -# dim2 = [12288] -dim2 = [2048] -# dim1 = [2] -# dim2 = [2] -dtype = [torch.int8] -values = list(product(dim1, dim2, dtype)) -names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 2048]) +@pytest.mark.parametrize("dim2", [2048]) +@pytest.mark.parametrize("dtype", [torch.int8]) def test_spmm_coo_dequant(dim1, dim2, dtype): threshold = 6.0 # threshold = 2.8 @@ -1786,22 +1580,11 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): print("partial matmul", time.time() - t0) -batch_size = 1 -seqdim = 1 -values = [] -#values.append((batch_size, seqdim, 768, 4 * 768)) -#values.append((batch_size, seqdim, 1024, 4*1024)) -#values.append((batch_size, seqdim, 1536, 4*1536)) -#values.append((batch_size, seqdim, 2048, 4*2048)) -#values.append((batch_size, seqdim, 2560, 4*2560)) -#values.append((batch_size, seqdim, 4096, 4*4096)) -#values.append((batch_size, seqdim, 5120, 4*5120)) -values.append((batch_size, seqdim, 6656, 4*6656)) -#values.append((batch_size, seqdim, 8192, 4*8192)) -#values.append((batch_size, seqdim, 5140, 4*5140)) -#values.append((batch_size, seqdim, 12288, 4*12288)) -names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] -@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) +@pytest.mark.parametrize( + ("batch", "seq", "model", "hidden"), + [pytest.param(1, 1, 6656, 4*6656, id="batch=1, seq=1, model=6656, hidden=26k")], +) +@pytest.mark.benchmark def test_bench_matmul(batch, seq, model, hidden): iters = 1000 formatB = F.get_special_format_str() @@ -2226,6 +2009,7 @@ def test_kbit_quantile_estimation(): assert err < 0.035 +@pytest.mark.benchmark def test_bench_dequantization(): a = torch.rand(1024, 1024, device='cuda').half() code =F.create_fp8_map(True, 3, 0, 4).cuda() @@ -2244,7 +2028,7 @@ def test_bench_dequantization(): -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) def test_fp4_quant(dtype): vals = list(product([0, 1], repeat=4)) @@ -2321,6 +2105,7 @@ def test_4bit_compressed_stats(quant_type): #@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) @pytest.mark.parametrize("quant_type", ['nf4']) 
+@pytest.mark.benchmark def test_bench_4bit_dequant(quant_type): blocksize = 256 a = torch.rand(1024*12*4, 1024*12, device='cuda').half() @@ -2367,11 +2152,11 @@ def test_normal_map_tree(): #print(pivots) -@pytest.mark.parametrize("double_quant", [True, False], ids=['DQ_True', 'DQ_False']) -@pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) -@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'], ids=['fc1', 'fc2', 'attn', 'attn_packed']) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) -@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=['uint8', 'fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") +@pytest.mark.parametrize("storage_type", ['nf4', 'fp4']) +@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed']) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): for dim in [128, 256, 512, 1024]: #for dim in [4*1024]: @@ -2537,12 +2322,12 @@ def test_managed(): @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True']) def test_gemv_eye_4bit(storage_type, dtype, double_quant): dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242)) - dims = torch.randint(0, 8192, size=(dims,)).tolist() + dims = get_test_dims(0, 8192, n=dims) dims = [dim + (64-(dim % 64)) for dim in dims] #for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]: for dim in dims: @@ -2560,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant): torch.testing.assert_close(A, C2) #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001) #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080) - - diff --git a/tests/test_generation.py b/tests/test_generation.py index 753623b27..b05749bf8 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -9,6 +9,8 @@ BitsAndBytesConfig, ) +from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter + def get_4bit_config(): return BitsAndBytesConfig( @@ -59,23 +61,19 @@ def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_f models = ['huggyllama/llama-7b', 'bigscience/bloom-1b7'] dtypes = ['nf4', 'fp4'] -load_in_4bit = [True, False] -values = list(product(models, dtypes)) -strfunc = lambda lst: [str(x) for x in lst] -ids = ['_'.join(strfunc(x)) for x in values] -@pytest.fixture(scope='session', params=values, ids=ids) + +@pytest.fixture(scope='session', params=product(models, dtypes)) def model_and_tokenizer(request): model, tokenizer = get_model_and_tokenizer(request.param) yield request.param, model, tokenizer del model -@pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False']) -@pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False']) -#@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 
'fp32']) -def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ): - print('') - dtype = torch.float16 +@pytest.mark.parametrize("DQ", TRUE_FALSE, ids=id_formatter("dq")) +@pytest.mark.parametrize("inference_kernel", TRUE_FALSE, ids=id_formatter("inference_kernel")) +@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype) +@pytest.mark.slow +def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype): fixture_config, model, tokenizer = model_and_tokenizer generation_config = transformers.GenerationConfig( @@ -122,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ): for out in outputs: print(out) raise ValueError(f'Failure count: {failure_count}/{n_cases}') - - - diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index d396a910b..13db28ed4 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,4 +1,3 @@ -from itertools import product import os from tempfile import TemporaryDirectory @@ -6,6 +5,7 @@ import torch import bitsandbytes as bnb +from tests.helpers import TRUE_FALSE storage = { 'uint8': torch.uint8, @@ -14,10 +14,10 @@ 'float32': torch.float32 } -@pytest.mark.parametrize( - "quant_type, compress_statistics, bias, quant_storage", - list(product(["nf4", "fp4"], [False, True], [False, True], ['uint8', 'float16', 'bfloat16', 'float32'])), -) +@pytest.mark.parametrize("quant_storage", ['uint8', 'float16', 'bfloat16', 'float32']) +@pytest.mark.parametrize("bias", TRUE_FALSE) +@pytest.mark.parametrize("compress_statistics", TRUE_FALSE) +@pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) def test_linear_serialization(quant_type, compress_statistics, bias, quant_storage): original_dtype = torch.float16 compute_dtype = None diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index d4967969c..6fa7efb8d 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,5 +1,4 @@ from contextlib import nullcontext -from itertools import product import os from tempfile import TemporaryDirectory @@ -10,6 +9,7 @@ from bitsandbytes import functional as F from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt +from tests.helpers import TRUE_FALSE, id_formatter # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py @@ -66,8 +66,10 @@ def test_linear_no_igemmlt(): assert linear_custom.state.CxB is None -@pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt", - list(product([False, True], [False, True], [False, True], [False, True]))) +@pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) +@pytest.mark.parametrize("serialize_before_forward", TRUE_FALSE, ids=id_formatter("serialize_before_forward")) +@pytest.mark.parametrize("deserialize_before_cuda", TRUE_FALSE, ids=id_formatter("deserialize_before_cuda")) +@pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt")) def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt): linear = torch.nn.Linear(32, 96) x = torch.randn(3, 32, dtype=torch.half) diff --git a/tests/test_modules.py b/tests/test_modules.py index c98f7a6d4..32d90938d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -6,6 +6,7 @@ from torch import nn import bitsandbytes as bnb +from tests.helpers import 
id_formatter


 class MockArgs:
@@ -311,12 +312,7 @@ def forward(self, x):
         return LinearFunction.apply(x, self.weight, self.bias, self.args)


-threshold = [0.0, 3.0]
-values = threshold
-names = [f"threshold_{vals}" for vals in values]
-
-
-@pytest.mark.parametrize("threshold", values, ids=names)
+@pytest.mark.parametrize("threshold", [0.0, 3.0], ids=id_formatter("threshold"))
 def test_linear8bitlt_inference(threshold):
     l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half()
     assert l1.weight.device.type == "cuda"
@@ -510,18 +506,21 @@ def test_linear_kbit_fp32_bias(module):
         o1 = l1(b1)
         assert l1.bias is None

-modules = []
-modules.append(bnb.nn.Linear8bitLt)
-modules.append(bnb.nn.Linear4bit)
-modules.append(bnb.nn.LinearFP4)
-modules.append(bnb.nn.LinearNF4)
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True))
-modules.append(lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True))
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32))
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16))
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16))
-names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C', 'NF4+fp32', 'NF4+fp16', 'NF4+bf16']
-@pytest.mark.parametrize("module", modules, ids=names)
+
+module_dict = {
+    "Int8Lt": bnb.nn.Linear8bitLt,
+    "4bit": bnb.nn.Linear4bit,
+    "FP4": bnb.nn.LinearFP4,
+    "NF4": bnb.nn.LinearNF4,
+    "FP4+C": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True),
+    "NF4+C": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True),
+    "NF4+fp32": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32),
+    "NF4+fp16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16),
+    "NF4+bf16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16),
+}
+
+
+@pytest.mark.parametrize("module", module_dict.values(), ids=module_dict.keys())
 def test_kbit_backprop(module):
     b = 17
     dim1 = 37
@@ -638,6 +637,3 @@ def test_4bit_warnings():
             net(inp)

     assert len(record) == 2
-
-
-
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 993ac8b60..e379c424a 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -1,4 +1,3 @@
-from itertools import product
 import os
 from os.path import join
 import shutil
@@ -11,6 +10,7 @@
 import bitsandbytes as bnb
 import bitsandbytes.functional as F
+from tests.helpers import describe_dtype, id_formatter

 # import apex
@@ -101,15 +101,16 @@ def rm_path(path):
 str2statenames["lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")]
 str2statenames["paged_lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")]

-dim1 = [1024]
-dim2 = [32, 1024, 4097, 1]
-gtype = [torch.float32, torch.float16, torch.bfloat16]
-optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
-values = list(product(dim1, dim2, gtype, optimizer_names))
-names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values]
-@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
+optimizer_names_32bit = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
+
+
+@pytest.mark.parametrize("optim_name", optimizer_names_32bit, ids=id_formatter("opt"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [32, 1024, 4097, 1], ids=id_formatter("dim2"))
 def test_optimizer32bit(dim1, dim2, gtype, optim_name):
-    if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip()
+    if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']:
+        pytest.skip()
     if dim1 == 1 and dim2 == 1:
         return
     p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1
@@ -134,7 +135,6 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name):
         bnb_optimizer.step()
         torch_optimizer.step()

-
         for name1, name2 in str2statenames[optim_name]:
             torch.testing.assert_close(
                 torch_optimizer.state[p1][name1],
@@ -177,14 +177,9 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name):
             assert bnb_optimizer.state[p2]["unorm_vec"] > 0.0


-dim1 = [1024]
-dim2 = [32, 1024, 4097]
-gtype = [torch.float32, torch.float16]
-values = list(product(dim1, dim2, gtype))
-names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values]
-
-
-@pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names)
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=describe_dtype)
 def test_global_config(dim1, dim2, gtype):
     if dim1 == 1 and dim2 == 1:
         return
@@ -230,10 +225,7 @@ def test_global_config(dim1, dim2, gtype):
     assert adam2.state[p3]["state2"].dtype == torch.uint8


-dim1 = [1024]
-dim2 = [32, 1024, 4097]
-gtype = [torch.float32, torch.float16, torch.bfloat16]
-optimizer_names = [
+optimizer_names_8bit = [
     "adam8bit",
     "lion8bit",
     "momentum8bit",
@@ -243,13 +235,12 @@ def test_global_config(dim1, dim2, gtype):
     "momentum8bit_blockwise",
     "rmsprop8bit_blockwise",
 ]
-values = list(product(dim1, dim2, gtype, optimizer_names))
-names = [
-    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
-]


-@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
+@pytest.mark.parametrize("optim_name", optimizer_names_8bit, ids=id_formatter("opt"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
 def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip()
     if dim1 == 1 and dim2 == 1:
@@ -375,18 +366,10 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     # print(sum(relerrors)/len(relerrors))


-dim1 = [1024]
-dim2 = [32, 1024, 4097]
-gtype = [torch.float32]
-optim_bits = [32, 8]
-values = list(product(dim1, dim2, gtype, optim_bits))
-names = [
-    "dim1_{}_dim2_{}_gtype_{}_optim_bits_{}".format(*vals)
-    for vals in values
-]
-
-
-@pytest.mark.parametrize("dim1, dim2, gtype, optim_bits", values, ids=names)
+@pytest.mark.parametrize("optim_bits", [32, 8], ids=id_formatter("optim_bits"))
+@pytest.mark.parametrize("gtype", [torch.float32], ids=describe_dtype)
+@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
 def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits):
     if dim1 == 1 and dim2 == 1:
         return
@@ -474,22 +457,19 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits):
     adam2.load_state_dict(torch.load(join(path, "opt.pt")))


-dim1 = [4096]
-dim2 = [4096]
-gtype = [torch.float32, torch.float16]
-# optimizer_names = ['adam8bit_blockwise', 'adam8bit', 'lamb8bit']
-# optimizer_names = ['adam8bit_blockwise', 'adam_apex', 'adam8bit', 'adam', 'adam_pytorch']
-# optimizer_names = ['momentum_apex', 'momentum8bit', 'momentum_pytorch']
-# optimizer_names = ['lamb_apex', 'lamb8bit']
-# optimizer_names = ['lars_apex', 'lars8bit']
-optimizer_names = ["adam8bit_blockwise", 'paged_adam8bit_blockwise', 'paged_adamw8bit_blockwise', 'paged_lion8bit_blockwise']
-values = list(product(dim1, dim2, gtype, optimizer_names))
-names = [
-    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
+optimizer_names_benchmark = [
+    "adam8bit_blockwise",
+    "paged_adam8bit_blockwise",
+    "paged_adamw8bit_blockwise",
+    "paged_lion8bit_blockwise",
 ]


-@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
+@pytest.mark.parametrize("dim1", [4096], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [4096], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=describe_dtype)
+@pytest.mark.parametrize("optim_name", optimizer_names_benchmark, ids=id_formatter("opt"))
+@pytest.mark.benchmark
 def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
     if dim1 == 1 and dim2 == 1:
         return
@@ -514,15 +494,12 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
         print(optim_name, gtype, s / params)
         # assert s < 3.9

-dim1 = [2*1024]
-gtype = [torch.float16]
-#mode = ['torch', 'bnb']
-mode = ['bnb']
-optimizer_names = ['paged_adamw']
-#optimizer_names = ['paged_adamw8bit_blockwise']
-values = list(product(dim1,gtype, optimizer_names, mode))
-names = ['dim1_{0}_gtype_{1}_optim_{2}_mode_{3}'.format(*vals) for vals in values]
-@pytest.mark.parametrize("dim1, gtype, optim_name, mode", values, ids=names)
+
+@pytest.mark.parametrize("dim1", [2 * 1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("gtype", [torch.float16], ids=describe_dtype)
+@pytest.mark.parametrize("optim_name", ['paged_adamw'], ids=id_formatter("optim_name"))
+@pytest.mark.parametrize("mode", ['bnb'], ids=id_formatter("mode"))
+@pytest.mark.benchmark
 def test_stream_optimizer_bench(dim1, gtype, optim_name, mode):
     layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)]))
     layers1 = layers1.to(gtype)
diff --git a/tests/test_triton.py b/tests/test_triton.py
index d0397ee4a..218a533d5 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -4,11 +4,12 @@
 from bitsandbytes.nn import Linear8bitLt
 from bitsandbytes.nn.triton_based_modules import SwitchBackLinear
 from bitsandbytes.triton.triton_utils import is_triton_available
+from tests.helpers import TRUE_FALSE


 @pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
                     reason="This test requires triton and a GPU with compute capability 8.0 or higher.")
-@pytest.mark.parametrize("vector_wise_quantization", [False, True])
+@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
 def test_switchback(vector_wise_quantization):
     for dim in [83]:
         for batch in [13]:
@@ -57,4 +58,3 @@ def test_switchback(vector_wise_quantization):
             print('GX1', err_sb, err_baseline)

             assert err_sb < 2 * err_baseline
-
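Note on the test refactor above: the hand-rolled `itertools.product` grids and `names` lists are replaced by stacked `@pytest.mark.parametrize` decorators whose ids come from shared helpers. `tests/helpers.py` itself is not part of this diff, so the following is only a minimal sketch of what the imported names plausibly look like, inferred from how they are called:

```python
# Hypothetical sketch of tests/helpers.py (not included in this patch series).
import torch

TRUE_FALSE = (True, False)

def id_formatter(label: str):
    # pytest's `ids` argument accepts a callable that is applied to each
    # parametrized value; this turns values into readable ids like "dim1_1024".
    return lambda value: f"{label}_{value}"

def describe_dtype(dtype: torch.dtype) -> str:
    # Short, stable names for dtypes in test ids, e.g. torch.bfloat16 -> "bf16".
    return {torch.float32: "fp32", torch.float16: "fp16", torch.bfloat16: "bf16"}[dtype]
```

Because each parameter gets its own decorator, pytest composes the ids automatically (e.g. `test_optimizer32bit[dim2_32-dim1_1024-fp16-opt_adam]`), which is what makes the old `"dim1_{}_dim2_{}..."` format strings redundant.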
From b773dfb6295e16e445f583d1bda15bdb849639c2 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 09:08:04 +0100
Subject: [PATCH 32/52] Pipeline

---
 .github/workflows/cmake.yml | 159 ------------------------------------
 1 file changed, 159 deletions(-)
 delete mode 100644 .github/workflows/cmake.yml

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
deleted file mode 100644
index 728dd09fb..000000000
--- a/.github/workflows/cmake.yml
+++ /dev/null
@@ -1,159 +0,0 @@
-name: CMake on multiple platforms
-
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
-
-concurrency:
-  group: cmake-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
-      fail-fast: false
-
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        cuda-version: ['11.8', '12.1']
-        build_type: [Release]
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Set up MSVC
-      if: matrix.os == 'windows-latest'
-      uses: ilammy/msvc-dev-cmd@v1.12.1
-      with:
-        arch: amd64
-
-    - name: Setup Mambaforge
-      uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        miniforge-variant: Mambaforge
-        miniforge-version: latest
-        activate-environment: bnb-env
-        use-mamba: true
-
-    - uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        auto-update-conda: true
-        activate-environment: bnb-env
-        environment-file: environment-bnb.yml
-        use-only-tar-bz2: false
-        auto-activate-base: true
-        python-version: "3.10"
-        mamba-version: "*"
-
-    - name: Set reusable strings
-      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
-      id: strings
-      shell: bash
-      run: |
-        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-    - name: CUDA Toolkit
-      shell: bash -el {0}
-      run: |
-        if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
-          # to prepare space
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-        fi
-        addon=""
-        cuda_version=${{ matrix.cuda-version }}
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
-        [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
-        [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
-
-        conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch  # it's dependency not correctly resolved sometime
-        conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
-
-        [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
-
-        CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
-        echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
-        echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
-
-        if [ "${{ matrix.os }}" = "windows-latest" ]; then
-          echo CXX_COMPILER=cl >> "$GITHUB_ENV"
-          echo C_COMPILER=cl >> "$GITHUB_ENV"
-          # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
-          echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
-        else
-          echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
-          echo C_COMPILER=gcc >> "$GITHUB_ENV"
-        fi
-
-        nvcc --version
-
-    - name: Update environment
-      run: mamba env update -n bnb-env -f environment-bnb.yml
-
-    - name: Prep build
-      run: python -m pip install cmake==3.27.9 ninja setuptools wheel
-
-    # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel
-
-    - name: Configure CUDA
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -S ${{ github.workspace }}
-
-    - name: Build CUDA
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure NOBLASLT
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -DNO_CUBLASLT=ON
-        -S ${{ github.workspace }}
-
-    - name: Build NOBLASLT
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure CPU
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DNO_CUBLASLT=ON
-        -DBUILD_CUDA=OFF
-        -S ${{ github.workspace }}
-
-    - name: Build CPU
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Build dist
-      shell: bash -el {0}
-      run: |
-        python -m pip install build
-        python -m build --wheel
-        mkdir dist/cu${{ matrix.cuda-version }}
-        mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
-
-    - name: Upload Build Artifacts
-      uses: actions/upload-artifact@v4.3.0
-      with:
-        name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }}
-        path: |
-          ${{ github.workspace }}/dist/

From 45ad3948978a456894f8a1952c39c498404cdeca Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 09:11:00 +0100
Subject: [PATCH 33/52] Fixed conflict

---
 include/SIMD.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/include/SIMD.h b/include/SIMD.h
index 0cd8d13d6..7c6a4094b 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -28,16 +28,6 @@ struct IVec;
 template
 struct FVec1;

-template <> struct InstrFloatTraits
-{
-    typedef __m128 vec_t;
-};
-
-template <> struct InstrFloatTraits
-{
-    typedef __m128d vec_t;
-};
-
 }
 }

From e2e4874b31e896cc42119c62503785c7e1518e69 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 09:12:30 +0100
Subject: [PATCH 34/52] Fixed conflict

---
 .github/workflows/python-package.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 56fefe7da..815602386 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -51,9 +51,9 @@ jobs:
             sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
           fi
           if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
-            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF .
+            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
           else
-            cmake -DENABLE_CUDA=OFF .
+            cmake -DCOMPUTE_BACKEND=cpu .
           fi
           if [ ${build_os:0:7} == windows ]; then
             pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
@@ -125,10 +125,10 @@ jobs:
             docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
               "apt-get update \
                 && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-                && cmake -DENABLE_CUDA=ON . \
+                && cmake -DCOMPUTE_BACKEND=cuda . \
                 && make"
           else
-            cmake -DENABLE_CUDA=ON .
+            cmake -DCOMPUTE_BACKEND=cuda .
             pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
           fi
           mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}

From 59a1000178b35c6e52f6794ff0d98c8e9a73e755 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 23:17:46 +0100
Subject: [PATCH 35/52] Update CMakeLists.txt

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9a55ae5e..ff40a8089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,11 +161,11 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
+        target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()
-        target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
+        target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::cublasLt)
     endif()

     set_target_properties(bitsandbytes

From 41ddd25ea1d386443fd4ae84d6e99f931d90d0eb Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 09:14:41 +0100
Subject: [PATCH 36/52] Fixed merge error

---
 include/SIMD.h | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/include/SIMD.h b/include/SIMD.h
index 7c6a4094b..19e8bffb3 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -28,6 +28,16 @@ struct IVec;
 template
 struct FVec1;

+template <> struct InstrFloatTraits
+{
+    typedef __m128 vec_t;
+};
+
+template <> struct InstrFloatTraits
+{
+    typedef __m128d vec_t;
+};
+
 }
 }

@@ -76,26 +86,10 @@ template <> struct InstrFloatTraits
     typedef __m128d vec_t;
 };

-template <> struct InstrFloatTraits
-{
-    typedef float vec_t;
-};
-
-template <> struct InstrFloatTraits
-{
-    typedef double vec_t;
-};
-
-template
-struct FTOITraits
-{
-    typedef IVec vec_t;
-};
-
 template <>
-struct FTOITraits
+struct FTOITraits
 {
-    typedef IVec vec_t;
+    typedef IVec vec_t;
 };

 #ifdef USE_AVX
@@ -592,4 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec
Date: Sat, 3 Feb 2024 09:17:01 +0100
Subject: [PATCH 37/52] cleanup

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff40a8089..b9a55ae5e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,11 +161,11 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
-        target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()
-        target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::cublasLt)
+        target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
     endif()

     set_target_properties(bitsandbytes

From ca5f14aff135aa583b5ed7956f8a0eb82cd3c359 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 09:24:55 +0100
Subject: [PATCH 38/52] cleanup

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9a55ae5e..067e1ea26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,7 +161,7 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
+    target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()

From b460125bdaa4f5cdb4db5c8748dccc23a6d53f71 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:18:28 +0100
Subject: [PATCH 39/52] Find CUDA

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 067e1ea26..a0e24528b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,7 @@ endif()

 if(BUILD_CUDA)
     enable_language(CUDA) # This will fail if CUDA is not found
+    find_package(CUDAToolkit REQUIRED)

     # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
     string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")

From 7a605e1c2afecc776a48cf7642ee0b59ac956588 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:28:22 +0100
Subject: [PATCH 40/52] Fix

---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0e24528b..21e2fde31 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,8 +30,6 @@ option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 if(APPLE)
     set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
 endif()
-set(CMAKE_CXX_STANDARD 14)
-set(CXX_STANDARD_REQUIRED C++14)

 set(BNB_OUTPUT_NAME "bitsandbytes")

@@ -157,6 +155,7 @@ endif()
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
+target_compile_features(bitsandbytes PUBLIC cxx_std_14)

 target_include_directories(bitsandbytes PUBLIC csrc include)

From 28188a55e0734dbad6aa977b0aa713593a35f0bf Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:32:01 +0100
Subject: [PATCH 41/52] Fixing merge error from latest merge from main

---
 include/Algo-Direct-Common.h |  2 +-
 include/Portable.h           |  2 +-
 include/SIMD.h               | 14 ++------------
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h
index c97084904..7b40edea9 100644
--- a/include/Algo-Direct-Common.h
+++ b/include/Algo-Direct-Common.h
@@ -190,7 +190,7 @@ struct DirectInfo
             xi = xws;
         }
         else {
-            myassert(Gap==1, "if Gap>1 then X workspace must be provided");
+            myassert((Gap==1), "if Gap>1 then X workspace must be provided");
             xi = x;
         }

diff --git a/include/Portable.h b/include/Portable.h
index 78599944e..178f8baa9 100644
--- a/include/Portable.h
+++ b/include/Portable.h
@@ -81,7 +81,7 @@ typedef unsigned __int64 uint64;
 namespace Details {

-#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }
+#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }

 // log2 is not defined in VS2008
 #if defined(_MSC_VER)
diff --git a/include/SIMD.h b/include/SIMD.h
index b16104f9b..19e8bffb3 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -86,18 +86,8 @@ template <> struct InstrFloatTraits
     typedef __m128d vec_t;
 };

-template <> struct InstrFloatTraits
-{
-    typedef float vec_t;
-};
-
-template <> struct InstrFloatTraits
-{
-    typedef double vec_t;
-};
-
-template
-struct FTOITraits
+template <>
+struct FTOITraits
 {
     typedef IVec vec_t;
 };

From 86b2bd689d036b5d22370c2253405075eabe9760 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:59:48 +0100
Subject: [PATCH 42/52] Fix setup.py

---
 setup.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/setup.py b/setup.py
index 7b50386ac..0ee090ba5 100644
--- a/setup.py
+++ b/setup.py
@@ -40,9 +40,6 @@ def has_ext_modules(self):
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
-    # HACK: pretend we have a native extension module so the wheel is tagged
-    # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
-    ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
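The `ext_modules` hack removed above existed only to force a platform-tagged wheel; it is redundant because the project's `BinaryDistribution` class (kept in setup.py, see PATCH 46 below) already achieves this: `wheel` consults `Distribution.has_ext_modules()` when deciding between a `py3-none-any` tag and a platform tag. A minimal illustration of the mechanism, not taken from the diff itself:

```python
# Sketch: forcing a platform wheel without declaring fake extension modules.
from setuptools import setup
from setuptools.dist import Distribution

class BinaryDistribution(Distribution):
    def has_ext_modules(self):
        # Returning True makes `wheel` emit e.g. bitsandbytes-...-linux_x86_64.whl
        # instead of a pure-Python bitsandbytes-...-py3-none-any.whl.
        return True

# setup(..., distclass=BinaryDistribution)
```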
From 01c3f598adce0759d8ac63e3eee195064d9116c6 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 11:36:36 +0100
Subject: [PATCH 43/52] Fixed typo in artifact name

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 815602386..54726e71e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -190,7 +190,7 @@ jobs:
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:
-          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}}_${{ matrix.python-version }}
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.python-version }}
           path: dist/bitsandbytes-*.whl
           retention-days: 7
   publish:

From e4344b0404246cd6d02ecd488d91ce9ee4ef4eb8 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 11:39:00 +0100
Subject: [PATCH 44/52] Remove linker flags

---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21e2fde31..80894a033 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -148,7 +148,6 @@ endif()

 # Weird MSVC hacks
 if(MSVC)
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()

From 2ba8be3ad069cc6da066fa64323f9d3fc9ba53a3 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 15:01:14 +0100
Subject: [PATCH 45/52] Build nocublaslt versions

---
 .github/workflows/cmake.yml          | 159 ---------------------------
 .github/workflows/python-package.yml |  26 +++--
 2 files changed, 14 insertions(+), 171 deletions(-)
 delete mode 100644 .github/workflows/cmake.yml

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
deleted file mode 100644
index 728dd09fb..000000000
--- a/.github/workflows/cmake.yml
+++ /dev/null
@@ -1,159 +0,0 @@
-name: CMake on multiple platforms
-
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
-
-concurrency:
-  group: cmake-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
-      fail-fast: false
-
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        cuda-version: ['11.8', '12.1']
-        build_type: [Release]
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Set up MSVC
-      if: matrix.os == 'windows-latest'
-      uses: ilammy/msvc-dev-cmd@v1.12.1
-      with:
-        arch: amd64
-
-    - name: Setup Mambaforge
-      uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        miniforge-variant: Mambaforge
-        miniforge-version: latest
-        activate-environment: bnb-env
-        use-mamba: true
-
-    - uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        auto-update-conda: true
-        activate-environment: bnb-env
-        environment-file: environment-bnb.yml
-        use-only-tar-bz2: false
-        auto-activate-base: true
-        python-version: "3.10"
-        mamba-version: "*"
-
-    - name: Set reusable strings
-      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
-      id: strings
-      shell: bash
-      run: |
-        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-    - name: CUDA Toolkit
-      shell: bash -el {0}
-      run: |
-        if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
-          # to prepare space
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-        fi
-        addon=""
-        cuda_version=${{ matrix.cuda-version }}
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
-        [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
-        [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
-
-        conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch  # it's dependency not correctly resolved sometime
-        conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
-
-        [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
-
-        CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
-        echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
-        echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
-
-        if [ "${{ matrix.os }}" = "windows-latest" ]; then
-          echo CXX_COMPILER=cl >> "$GITHUB_ENV"
-          echo C_COMPILER=cl >> "$GITHUB_ENV"
-          # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
-          echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
-        else
-          echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
-          echo C_COMPILER=gcc >> "$GITHUB_ENV"
-        fi
-
-        nvcc --version
-
-    - name: Update environment
-      run: mamba env update -n bnb-env -f environment-bnb.yml
-
-    - name: Prep build
-      run: python -m pip install cmake==3.27.9 ninja setuptools wheel
-
-    # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel
-
-    - name: Configure CUDA
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -S ${{ github.workspace }}
-
-    - name: Build CUDA
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure NOBLASLT
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -DNO_CUBLASLT=ON
-        -S ${{ github.workspace }}
-
-    - name: Build NOBLASLT
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure CPU
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DNO_CUBLASLT=ON
-        -DBUILD_CUDA=OFF
-        -S ${{ github.workspace }}
-
-    - name: Build CPU
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Build dist
-      shell: bash -el {0}
-      run: |
-        python -m pip install build
-        python -m build --wheel
-        mkdir dist/cu${{ matrix.cuda-version }}
-        mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
-
-    - name: Upload Build Artifacts
-      uses: actions/upload-artifact@v4.3.0
-      with:
-        name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }}
-        path: |
-          ${{ github.workspace }}/dist/
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 54726e71e..223f29ec2 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -119,18 +119,20 @@ jobs:
           set -ex
           build_os=${{ matrix.os }}
           build_arch=${{ matrix.arch }}
-          if [ ${build_os:0:6} == ubuntu ]; then
-            image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
-            echo "Using image $image"
-            docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
-              "apt-get update \
-                && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-                && cmake -DCOMPUTE_BACKEND=cuda . \
-                && make"
-          else
-            cmake -DCOMPUTE_BACKEND=cuda .
-            pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
-          fi
+          for NO_CUBLASLT in ON OFF; do
+            if [ ${build_os:0:6} == ubuntu ]; then
+              image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
+              echo "Using image $image"
+              docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
+                "apt-get update \
+                  && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+                  && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
+                  && make"
+            else
+              cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} .
+              pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+            fi
+          done
           mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
           ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
       - name: Upload build artifact
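The ON/OFF loop above produces two CUDA binaries per platform: the default build and a variant without cuBLASLt, whose int8 kernels need relatively recent GPUs. A sketch of how the Python side might pick between the two at import time — the `nocublaslt` suffix and the compute-capability 7.5 cutoff are assumptions for illustration, not code from this diff:

```python
# Hypothetical loader logic for choosing between the two artifacts built above.
import torch

def native_library_name(cuda_version: str = "121") -> str:
    major, minor = torch.cuda.get_device_capability()
    # Fall back to the NO_CUBLASLT build on GPUs too old for cuBLASLt int8 matmul.
    suffix = "" if (major, minor) >= (7, 5) else "_nocublaslt"
    return f"libbitsandbytes_cuda{cuda_version}{suffix}.so"
```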
From 3288a0fb5b11d23a6b446e42fd75f11d57607266 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 10:42:42 +0100
Subject: [PATCH 46/52] Fixed formatting

---
 setup.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 0ee090ba5..13af2a39b 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,6 @@
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution

-
 libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*"))
 libs = [os.path.basename(p) for p in libs]
 print("libs:", libs)
@@ -17,11 +16,13 @@ def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()


+
 # Tested with wheel v0.29.0
 class BinaryDistribution(Distribution):
     def has_ext_modules(self):
         return True


+
 setup(
     name="bitsandbytes",
     version="0.43.0.dev0",
@@ -33,10 +34,10 @@ def has_ext_modules(self):
     url="https://github.com/TimDettmers/bitsandbytes",
     packages=find_packages(),
     package_data={"": libs},
-    install_requires=['torch', 'numpy'],
+    install_requires=["torch", "numpy"],
     extras_require={
-        'benchmark': ['pandas', 'matplotlib'],
-        'test': ['scipy'],
+        "benchmark": ["pandas", "matplotlib"],
+        "test": ["scipy"],
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
@@ -44,5 +45,5 @@ def has_ext_modules(self):
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    distclass=BinaryDistribution
+    distclass=BinaryDistribution,
 )

From fdddb111e8d5733dce54a368ead17dd42556a452 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:23:26 +0100
Subject: [PATCH 47/52] Fixed VS Code format on save

---
 .editorconfig           | 3 +++
 .gitignore              | 1 -
 .vscode/extensions.json | 7 +++++++
 .vscode/settings.json   | 7 +++++++
 4 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 .editorconfig
 create mode 100644 .vscode/extensions.json
 create mode 100644 .vscode/settings.json

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..03490db50
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,3 @@
+[*]
+trim_trailing_whitespace = true
+insert_final_newline = true
diff --git a/.gitignore b/.gitignore
index 46316e4b3..22f5a6cd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,4 +154,3 @@ dmypy.json
 dependencies
 cuda_build
 output/
-.vscode/*
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 000000000..fcae843c2
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "ms-python.python",
+        "charliermarsh.ruff",
+        "twxs.cmake"
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000..893ebb174
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "ruff.fixAll": true,
+    "ruff.lint.run": "onType",
+    "editor.codeActionsOnSave": {
+        "source.fixAll": "always"
+    }
+}
\ No newline at end of file

From b7503c92386854f117411d3430e5c0c8cffd681b Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:29:52 +0100
Subject: [PATCH 48/52] Ran format on save from VScode

---
 .github/dependabot.yml               |  2 +-
 .github/workflows/python-package.yml |  4 +-
 csrc/mps_kernels.metal               | 60 ++++++++++++++--------------
 csrc/mps_ops.mm                      | 12 +++---
 4 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 616a1f98e..8a36c3689 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -8,4 +8,4 @@ updates:
       major:
         update-types: [major]
       minor-patch:
-        update-types: [minor, patch]
\ No newline at end of file
+        update-types: [minor, patch]
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 223f29ec2..4a34389af 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -8,7 +8,7 @@ on:
     types: [ published ]

 jobs:
-
+
   ##
   # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
   ##
@@ -120,7 +120,7 @@ jobs:
           build_os=${{ matrix.os }}
           build_arch=${{ matrix.arch }}
           for NO_CUBLASLT in ON OFF; do
-            if [ ${build_os:0:6} == ubuntu ]; then
+            if [ ${build_os:0:6} == ubuntu ]; then
               image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
               echo "Using image $image"
               docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal
index a5c8e35b2..63b3bf78c 100644
--- a/csrc/mps_kernels.metal
+++ b/csrc/mps_kernels.metal
@@ -83,35 +83,35 @@ static unsigned char quantize_scalar(
     }
 }

-kernel void quantize(device float* code [[buffer(0)]],
-                   device float* A [[buffer(1)]],
-                   device uchar* out [[buffer(2)]],
-                   constant uint& n [[buffer(3)]],
-                   uint id [[thread_position_in_grid]]) {
-  const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
-  uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;
-  const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);
-
-  float vals[NUM];
-  uchar qvals[NUM];
-
-  for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {
-    valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    for (uint j = 0; j < valid_items; j++) {
-      vals[j] = A[i + j];
-    }
-
-    for (uint j = 0; j < valid_items; j++) {
+kernel void quantize(device float* code [[buffer(0)]],
+                   device float* A [[buffer(1)]],
+                   device uchar* out [[buffer(2)]],
+                   constant uint& n [[buffer(3)]],
+                   uint id [[thread_position_in_grid]]) {
+  const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
+  uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;
+  const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);
+
+  float vals[NUM];
+  uchar qvals[NUM];
+
+  for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {
+    valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint j = 0; j < valid_items; j++) {
+      vals[j] = A[i + j];
+    }
+
+    for (uint j = 0; j < valid_items; j++) {
       qvals[j] = quantize_scalar(0.0f, code, vals[j]);
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    for (uint j = 0; j < valid_items; j++) {
-      out[i + j] = qvals[j];
-    }
-  }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint j = 0; j < valid_items; j++) {
+      out[i + j] = qvals[j];
+    }
+  }
 }
diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm
index 5e3adeebe..d198b3552 100644
--- a/csrc/mps_ops.mm
+++ b/csrc/mps_ops.mm
@@ -16,10 +16,10 @@
 static inline id get_device()
 {
-  NSError *error = nil;
+  NSError *error = nil;
   static id device = nil;
   if(!device) {
-    device = MTLCreateSystemDefaultDevice();
+    device = MTLCreateSystemDefaultDevice();
   }
   if(!device) {
     NSLog(@"Failed to get MPS device");
@@ -30,7 +30,7 @@
 static inline id get_library()
 {
-  NSError *error = nil;
+  NSError *error = nil;
   static id library = nil;
   if(!library) {
     library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error];
@@ -40,7 +40,7 @@
     abort();
   }
   return library;
-}
+}

 /*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n)
 {
@@ -49,7 +49,7 @@
 }*/


-// MPSGraph function for quantize
+// MPSGraph function for quantize
 extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n)
 {
   id device = get_device();
@@ -64,4 +64,4 @@
   }
   NSLog(@"Not implemented");
   return nil;
-}
\ No newline at end of file
+}
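For reference, the reindented `quantize` kernel above maps each input float to the index of the nearest entry of a codebook via `quantize_scalar`. A rough CPU equivalent in Python, assuming `code` is a sorted 256-entry codebook (as bitsandbytes' 8-bit quantization maps are); it mirrors the kernel's per-element behavior, not its threadgroup layout:

```python
import numpy as np

def quantize_reference(code: np.ndarray, A: np.ndarray) -> np.ndarray:
    # For each value, find the insertion point in the sorted codebook...
    idx = np.clip(np.searchsorted(code, A), 1, len(code) - 1)
    left, right = code[idx - 1], code[idx]
    # ...then pick whichever neighboring codebook entry is closer.
    take_left = (A - left) < (right - A)
    return (idx - take_left).astype(np.uint8)
```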
From fb642a5c9422bb422f92a571dd3ae10daac10339 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:35:23 +0100
Subject: [PATCH 49/52] Re-saved the json files using the new settings

---
 .vscode/extensions.json | 2 +-
 .vscode/settings.json   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.vscode/extensions.json b/.vscode/extensions.json
index fcae843c2..939843f43 100644
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -4,4 +4,4 @@
         "charliermarsh.ruff",
         "twxs.cmake"
     ]
-}
\ No newline at end of file
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 893ebb174..906f28588 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,4 +4,4 @@
     "editor.codeActionsOnSave": {
         "source.fixAll": "always"
     }
-}
\ No newline at end of file
+}

From 2730dd94f7eb6f15a9c5e56df964fdf770a69228 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:36:44 +0100
Subject: [PATCH 50/52] Re-saved CMakeLists.txt to get formatting right

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80894a033..4a4090bb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,11 +128,11 @@ elseif(BUILD_MPS)
     string(APPEND BNB_OUTPUT_NAME "_mps")
     add_compile_definitions(BUILD_MPS)
     file(MAKE_DIRECTORY "build")
-    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
+    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
                 COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES}
                 COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib"
-                DEPENDS "${METAL_FILES}"
-                COMMENT "Compiling Metal kernels"
+                DEPENDS "${METAL_FILES}"
+                COMMENT "Compiling Metal kernels"
                 VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
 else()

From 2e3a1d81931d17756a23a11f0ee9b3010d2b35ba Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 21:11:34 +0100
Subject: [PATCH 51/52] Add path filter

---
 .github/workflows/python-package.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4a34389af..8dfa5d7bd 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -4,6 +4,18 @@ on:
   push: {}
   pull_request:
     branches: [ main ]
+    paths:
+      - '.github/workflows/python-package.yml'
+      - 'bitsandbytes/**'
+      - 'csrc/**'
+      - 'include/**'
+      - 'tests/**'
+      - 'CMakeLists.txt'
+      - 'requirements*.txt'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'pytest.ini'
+      - '**/*.md'
   release:
     types: [ published ]

From 927f7167e3395ec26f859f294c1d4979a70a718a Mon Sep 17 00:00:00 2001
From: Rickard
Date: Mon, 5 Feb 2024 17:30:50 +0100
Subject: [PATCH 52/52] Formatting

---
 .github/workflows/python-package.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8dfa5d7bd..265128637 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -30,7 +30,7 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
         arch: [x86_64, aarch64]
         exclude:
-          - os: windows-latest # This probably requres arm64 Windows agents
+          - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
     runs-on: ${{ matrix.os }}  # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
@@ -90,7 +90,7 @@ jobs:
         arch: [x86_64, aarch64]
         cuda_version: ['12.1.0']
         exclude:
-          - os: windows-latest # This probably requres arm64 Windows agents
+          - os: windows-latest # This probably requires arm64 Windows agents
            arch: aarch64
     runs-on: ${{ matrix.os }}  # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
@@ -163,7 +163,7 @@ jobs:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
         arch: [x86_64, aarch64]
         exclude:
-          - os: windows-latest # This probably requres arm64 Windows agents
+          - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
     runs-on: ${{ matrix.os }}
     steps: