From b263fc6d2e16b150930b67491bb0602a88412ffc Mon Sep 17 00:00:00 2001 From: Rickard Date: Tue, 2 Jan 2024 14:48:54 +0100 Subject: [PATCH 01/52] Make native code portable and add GitHub workflow for building --- .github/workflows/python-package.yml | 201 ++++++++++++++++++ .gitignore | 22 +- CMakeLists.txt | 121 +++++++++++ Makefile | 141 ------------ csrc/common.cpp | 24 +-- csrc/common.h | 2 +- csrc/cpu_ops.cpp | 42 ++-- csrc/kernels.cu | 12 +- csrc/mps_kernels.metal | 117 ++++++++++ csrc/ops.cuh | 2 + ...{pythonInterface.c => pythonInterface.cpp} | 4 + include/Algo-Direct2.h | 2 + include/Portable.h | 35 ++- include/SIMD.h | 67 ++++-- include/Type.h | 2 +- requirements.txt | 8 +- setup.py | 8 +- 17 files changed, 593 insertions(+), 217 deletions(-) create mode 100644 .github/workflows/python-package.yml create mode 100644 CMakeLists.txt delete mode 100644 Makefile create mode 100644 csrc/mps_kernels.metal rename csrc/{pythonInterface.c => pythonInterface.cpp} (99%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 000000000..f50091980 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,201 @@ +name: Python package + +on: + push: + branches: [ "*" ] + pull_request: + branches: [ master ] + release: + types: [ published ] + +jobs: + + ## + # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. + ## + build-shared-libs: + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v3 + # On Linux we use CMake within Docker + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.13 + with: + cmake-version: '3.26.x' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) + if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then + # Allow cross-compile om aarch64 + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu + fi + if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF . + else + cmake -DENABLE_CUDA=OFF . 
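+            # Hedged note: ENABLE_CUDA=OFF selects the CPU-only path in the new
+            # CMakeLists.txt, so the target built here is bitsandbytes_cpu. A
+            # local sketch of this step (assuming cmake >= 3.22 and a C++14
+            # compiler on PATH) would be:
+            #   cmake -DENABLE_CUDA=OFF . && make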
+            fi
+            if [ ${build_os:0:7} == windows ]; then
+              pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+            else
+              make
+            fi
+            mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+            ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+        - name: Upload build artifact
+          uses: actions/upload-artifact@v3
+          with:
+            name: shared_library
+            path: output/*
+            retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        cuda_version: ['12.1.0']
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+      - uses: actions/checkout@v3
+      # Linux: we use Docker to build cross-platform CUDA (aarch64 is built in emulation)
+      - name: Set up Docker multiarch
+        if: startsWith(matrix.os, 'ubuntu')
+        uses: docker/setup-qemu-action@v2
+      # On Linux we use CMake within Docker
+      - name: Setup cmake
+        if: ${{ !startsWith(matrix.os, 'linux') }}
+        uses: jwlawson/actions-setup-cmake@v1.13
+        with:
+          cmake-version: '3.26.x'
+      # Windows: we install CUDA on the agent (slow)
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        if: startsWith(matrix.os, 'windows')
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda_version }}
+          method: 'local'
+          #sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.1
+        if: ${{ startsWith(matrix.os, 'windows') }}
+      # Compile C++ code
+      - name: Build C++
+        shell: bash
+        run: |
+          set -ex
+          build_os=${{ matrix.os }}
+          build_arch=${{ matrix.arch }}
+          ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 )
+          if [ ${build_os:0:6} == ubuntu ]; then
+            image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
+            echo "Using image $image"
+            docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
+              "apt-get update \
+              && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+              && cmake -DENABLE_CUDA=ON . \
+              && make"
+          else
+            cmake -DENABLE_CUDA=ON . 
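+            # Hedged note: invoking msbuild on the next line works because CMake
+            # defaults to a Visual Studio generator on Windows runners. A
+            # generator-agnostic equivalent, using the standard CMake CLI, is:
+            #   cmake --build . --config Release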
+ pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: shared_library + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} + steps: + # Check out code + - uses: actions/checkout@v3 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: shared_library + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Compile C++ code + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + # + - name: Install Python dependencies + shell: bash + run: | + pip install -r requirements.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: | + python setup.py bdist_wheel + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: bdist_wheel + path: dist/bitsandbytes-*.whl + retention-days: 7 + publish: + needs: build-wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build dist + run: | + python setup.py sdist + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: bdist_wheel + path: dist/ + - run: | + ls -lR dist/ + - name: Publish to PyPi + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.pypi }} diff --git a/.gitignore b/.gitignore index 2f929968b..202dcb13d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,26 @@ __pycache__/ *.py[cod] *$py.class - -# C extensions *.so +*.dll +*.dylib +*.o +*.obj +*.air +*.metallib + +# CMake generated files +CMakeCache.txt +CMakeScripts/ +cmake_install.cmake +Makefile +CMakeFiles/ +*.sln +*.vcxproj* +*.xcodeproj/ +bitsandbytes.dir/ +Debug/ +Release/ # Distribution / packaging .Python @@ -133,4 +150,5 @@ dmypy.json dependencies cuda_build +output/ .vscode/* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..d6e269d15 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,121 @@ +cmake_minimum_required(VERSION 3.22.1) + +option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) +option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) + +if(ENABLE_CUDA) + if(APPLE) + message(FATAL_ERROR "CUDA is not supported on macOS" ) + endif() + option(NO_CUBLASLT "Don't use CUBLAST" OFF) + if(NO_CUBLASLT) + set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) + else() + set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + endif() +endif() + +if(ENABLE_CUDA) + message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") + # Find CUDA tools if we are compiling with CUDA + find_package(CUDAToolkit REQUIRED) + if(NO_CUBLASLT) + set(LIBSUFFIX 
"cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") + else() + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + endif() + + project(bitsandbytes LANGUAGES CXX CUDA) + add_compile_definitions(BUILD_CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) +elseif(ENABLE_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) + endif() + message("Building MPS support") + set(LIBSUFFIX "mps") + project(bitsandbytes LANGUAGES CXX OBJCXX) + add_compile_definitions(BUILD_MPS) + set(METAL_SOURCES csrc/mps_kernels.metal) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_SOURCES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") + set(GPU_SOURCES csrc/mps_ops.mm) +else() + message("Building with CPU only") + set(LIBSUFFIX "cpu") + + project(bitsandbytes LANGUAGES CXX) + set(GPU_SOURCES) +endif() + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +if(WIN32) + # Mute warnings + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") + + # Enable fast math on VC++ + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + + # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") +endif() + +# Add csrc files +add_library(bitsandbytes SHARED + ${GPU_SOURCES} + csrc/common.cpp + csrc/cpu_ops.cpp + csrc/pythonInterface.cpp) + +target_include_directories(bitsandbytes PUBLIC + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/csrc + ${CMAKE_CURRENT_SOURCE_DIR}/include) + +if(ENABLE_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + + set_target_properties( + bitsandbytes + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) +endif() +if(ENABLE_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() + +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") +# Set the output name of the CUDA library +if(MSVC) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() + +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile deleted file mode 100644 index 5f997a122..000000000 --- a/Makefile +++ /dev/null @@ -1,141 +0,0 @@ 
-MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) - -GPP:= /usr/bin/g++ -#GPP:= /sw/gcc/11.2.0/bin/g++ -ifeq ($(CUDA_HOME),) - CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) -endif - -ifndef CUDA_VERSION -ifneq ($(MAKECMDGOALS),clean) -$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) -CUDA_VERSION:= -endif -endif - - - -NVCC := $(CUDA_HOME)/bin/nvcc - -########################################### - -CSRC := $(ROOT_DIR)/csrc -BUILD_DIR:= $(ROOT_DIR)/build - -FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c - -INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib - -# NVIDIA NVCC compilation flags -COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta - -CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler -CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler - -# Later versions of CUDA support the new architectures -CC_CUDA11x := -gencode arch=compute_75,code=sm_75 -CC_CUDA11x += -gencode arch=compute_80,code=sm_80 -CC_CUDA11x += -gencode arch=compute_86,code=sm_86 - - -CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 - -CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 - -CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 -CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 - - -all: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda110_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA 
-shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - - -cuda110_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda118_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda12x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda110: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda11x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda118: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' 
--use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda12x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so - -env: - @echo "ENVIRONMENT" - @echo "============================" - @echo "CUDA_VERSION: $(CUDA_VERSION)" - @echo "============================" - @echo "NVCC path: $(NVCC)" - @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" - @echo "CUDA_HOME: $(CUDA_HOME)" - @echo "CONDA_PREFIX: $(CONDA_PREFIX)" - @echo "PATH: $(PATH)" - @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" - @echo "============================" - -$(BUILD_DIR): - mkdir -p build - mkdir -p dependencies - -$(ROOT_DIR)/dependencies/cub: - git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub - cd dependencies/cub; git checkout 1.11.0 - -clean: - rm -rf build/* *.egg* - rm -f bitsandbytes/libbitsandbytes*.so diff --git a/csrc/common.cpp b/csrc/common.cpp index 52f029917..0a9601689 100644 --- a/csrc/common.cpp +++ b/csrc/common.cpp @@ -1,39 +1,35 @@ #include #include -void *quantize_block(void *arguments) { +void quantize_block(const quantize_block_args& args) { // 1. find absmax in block // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value // 4. check minimal distance // 5. store index - struct quantize_block_args *args = (quantize_block_args *) arguments; - // 1. find absmax in block float absmax_block = -FLT_MAX; - for (long long i = args->block_idx; i < args->block_end; i++) - absmax_block = fmax(absmax_block, fabs(args->A[i])); + for (long long i = args.block_idx; i < args.block_end; i++) + absmax_block = fmax(absmax_block, fabs(args.A[i])); - args->absmax[args->block_idx / args->blocksize] = absmax_block; + args.absmax[args.block_idx / args.blocksize] = absmax_block; - for (long long i = args->block_idx; i < args->block_end; i++) { + for (long long i = args.block_idx; i < args.block_end; i++) { // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value - float normed_value = args->A[i] / absmax_block; - long long idx = args->bin_searcher->scalar(normed_value); + float normed_value = args.A[i] / absmax_block; + long long idx = args.bin_searcher->scalar(normed_value); // 4. 
check minimal distance // The binary search returns always the value to the left, which might not be the closest value if (idx < 255) { - float dist_left = fabs(normed_value - (args->code[idx])); - float dist_right = fabs(normed_value - (args->code[idx + 1])); + float dist_left = fabs(normed_value - (args.code[idx])); + float dist_right = fabs(normed_value - (args.code[idx + 1])); if (dist_right < dist_left) { idx += 1; } } // 5. store index - args->out[i] = (unsigned char) idx; + args.out[i] = (unsigned char) idx; } - - return NULL; } diff --git a/csrc/common.h b/csrc/common.h index c99034e78..e513f2875 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -20,6 +20,6 @@ struct quantize_block_args { }; -void *quantize_block(void *arguments); +void quantize_block(const quantize_block_args& args); #endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e28e7b2c2..478c1f4ff 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include using namespace BinSearch; @@ -31,12 +31,8 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset; - pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); - - struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); - - for(long long i = 0; i < valid_chunks; i++) - args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args)); + std::vector threads(valid_chunks); + std::vector args(valid_chunks); int chunks_processed = 0; for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize) @@ -44,30 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long long valid_items = n - block_idx >= blocksize ? 
blocksize : n - block_idx; long long block_end = block_idx + valid_items; - struct quantize_block_args *arg = args[chunks_processed]; - arg->bin_searcher = &bin_searcher; - arg->code = code; - arg->A = A; - arg->absmax = absmax; - arg->out = out; - arg->block_end = block_end; - arg->block_idx = block_idx; - arg->threadidx = block_idx / blocksize; - arg->blocksize = blocksize; - - pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); + struct quantize_block_args& arg = args[chunks_processed]; + arg.bin_searcher = &bin_searcher; + arg.code = code; + arg.A = A; + arg.absmax = absmax; + arg.out = out; + arg.block_end = block_end; + arg.block_idx = block_idx; + arg.threadidx = block_idx / blocksize; + arg.blocksize = blocksize; + + threads[chunks_processed] = std::thread([arg] { quantize_block(arg); }); chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) - int err = pthread_join(threads[i], NULL); - - free(threads); - for (int i = 0; i < valid_chunks; i++) - free(args[i]); - free(args); - + threads[i].join(); } } diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 1ab8aa242..c2e2d7da7 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive(int M, int N template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void 
kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal new file mode 100644 index 000000000..a5c8e35b2 --- /dev/null +++ b/csrc/mps_kernels.metal @@ -0,0 +1,117 @@ +#include +using namespace metal; + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +template +static unsigned char quantize_scalar( + float rand, + device float* code, + float x) +{ + int pivot = 127; + int upper_pivot = 255; + int lower_pivot = 0; + + float lower = -1.0f; + float upper = 1.0f; + + float val = code[pivot]; + // i>>=1 = {32, 16, 8, 4, 2, 1} + for(int i = 64; i > 0; i>>=1) + { + if(x > val) + { + lower_pivot = pivot; + lower = val; + pivot+=i; + } + else + { + upper_pivot = pivot; + upper = val; + pivot-=i; + } + val = code[pivot]; + } + + if(upper_pivot == 255) + upper = code[upper_pivot]; + if(lower_pivot == 0) + lower = code[lower_pivot]; + + if(!STOCHASTIC) + { + if(x > val) + { + float midpoint = (upper+val)*0.5f; + if(x > midpoint) + { + return upper_pivot; + } + else + return pivot; + } + else + { + float midpoint = (lower+val)*0.5f; + if(x < midpoint) + return lower_pivot; + else + return pivot; + } + } + else + { + if(x > val) + { + float dist_to_upper = fabs(upper-x); + float dist_full = upper-val; + if(rand >= dist_to_upper/dist_full) return upper_pivot; + else return pivot; + } + else + { + float dist_to_lower = fabs(lower-x); + float dist_full = val-lower; + if(rand >= dist_to_lower/dist_full) return lower_pivot; + else return pivot; + } + } +} + +kernel void quantize(device float* code [[buffer(0)]], + device float* A [[buffer(1)]], + device uchar* out [[buffer(2)]], + constant uint& n [[buffer(3)]], + uint id [[thread_position_in_grid]]) { + const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK); + uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK; + const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK); + + float vals[NUM]; + uchar qvals[NUM]; + + for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) { + valid_items = n - i > NUM_BLOCK ? 
NUM_BLOCK : n - i; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + vals[j] = A[i + j]; + } + + for (uint j = 0; j < valid_items; j++) { + qvals[j] = quantize_scalar(0.0f, code, vals[j]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + out[i + j] = qvals[j]; + } + } +} diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f37b3b3af..cc7b59505 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,7 +9,9 @@ #include #include +#ifndef _MSC_VER #include +#endif #include #include diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp similarity index 99% rename from csrc/pythonInterface.c rename to csrc/pythonInterface.cpp index 865e4b6d5..a6b348ca6 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.cpp @@ -6,6 +6,9 @@ #if BUILD_CUDA #include #endif +#if BUILD_MPS +// #include +#endif #include // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. @@ -412,6 +415,7 @@ extern "C" { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); } #endif + void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } } diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index d5fa58d12..7f52fce14 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -52,6 +52,7 @@ struct AlgoVecBase::val private: typedef AlgoScalarBase base_t; +#ifdef USE_SSE2 FORCE_INLINE //NO_INLINE void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const @@ -135,6 +136,7 @@ struct AlgoVecBase::val pr[0] = u.ui32[0]; pr[1] = u.ui32[2]; } +#endif // USE_SSE2 #ifdef USE_AVX diff --git a/include/Portable.h b/include/Portable.h index 1710b0502..78599944e 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -4,10 +4,40 @@ #include #include +#if defined(__aarch64__) +#ifdef __CUDACC__ +#undef USE_NEON // Doesn't work with nvcc, undefined symbols +#else +#include +#undef USE_NEON // Not yet implemented +#endif +#undef USE_AVX // x86_64 only +#undef USE_AVX2 // x86_64 only +#undef USE_SSE2 // x86_64 only +#undef USE_SSE41 // x86_64 only +#undef USE_SSE42 // x86_64 only +#undef USE_FMA // x86_64 only +#ifdef USE_NEON +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef float64x2_t __m128d; +#else +typedef struct {float a; float b; float c; float d;} __m128; +typedef struct {int a; int b; int c; int d;} __m128i; +typedef struct {double a; double b;} __m128d; +#endif +#else +#undef USE_NEON // ARM64 only #ifdef __FMA__ #define USE_FMA #endif +#if !defined(__SSE2__) && !defined(_MSC_VER) +#error Compiler must support SSE2 +#endif +#define USE_SSE2 +#if defined(__aarch64__) +#else #ifdef __AVX2__ #define USE_AVX2 #endif @@ -24,7 +54,8 @@ #ifdef __SSE4_2__ #define USE_SSE42 #endif - +#endif +#endif #ifndef _MSC_VER #include @@ -50,7 +81,7 @@ typedef unsigned __int64 uint64; namespace Details { -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } +#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw 
std::invalid_argument(os.str()); } // log2 is not defined in VS2008 #if defined(_MSC_VER) diff --git a/include/SIMD.h b/include/SIMD.h index a2ac1a9ae..18a38dbfd 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -2,6 +2,46 @@ #include "Portable.h" +#ifdef USE_SSE2 +#include +#if defined(USE_AVX) || defined(USE_AVX2) +#include +#else +#ifdef USE_SSE41 +#include +#endif +#endif +#endif + +namespace BinSearch { +namespace Details { + +template +struct FTOITraits{}; + +template +struct FVec; + +template +struct IVec; + +template +struct FVec1; + +template <> struct InstrFloatTraits +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef __m128d vec_t; +}; + +} +} + +#if !defined(__aarch64__) #ifdef USE_SSE42 #ifndef _MSC_VER #include @@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32) } // namespace #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#include -#ifdef USE_SSE41 -#include -#endif -#endif - #include "Type.h" namespace BinSearch { namespace Details { -template -struct FVec; - -template -struct IVec; - -template -struct FVec1; - template <> struct InstrIntTraits { typedef __m128i vec_t; @@ -64,8 +86,8 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; -template -struct FTOITraits +template <> +struct FTOITraits { typedef IVec vec_t; }; @@ -285,9 +307,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } #endif @@ -339,9 +363,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c ) { return _mm_fmsub_pd(a, b, c); } #endif @@ -560,3 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec Date: Tue, 2 Jan 2024 14:56:05 +0100 Subject: [PATCH 02/52] Removed deprecated Python versions --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f50091980..004d0b7e7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -135,7 +135,7 @@ jobs: strategy: matrix: os: 
[ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] arch: [x86_64, aarch64] exclude: - os: windows-latest # This probably requres arm64 Windows agents From aae5ff7055746082f7e1dc28adfeddc0139af717 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 10:09:40 +0100 Subject: [PATCH 03/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 004d0b7e7..6408ad971 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -186,7 +186,8 @@ jobs: - uses: actions/checkout@v2 - name: Build dist run: | - python setup.py sdist + pip install build + python -m build -s . - name: Download build artifact uses: actions/download-artifact@v3 with: From b06590d7ea7156f02ba2d80f2bb2e27254e763d6 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 10:18:56 +0100 Subject: [PATCH 04/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 6408ad971..73d8e95e0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -172,7 +172,8 @@ jobs: - name: Build wheel shell: bash run: | - python setup.py bdist_wheel + pip install build + python -m build . - name: Upload build artifact uses: actions/upload-artifact@v3 with: From 03744cba3e21a688aa8de8f25654c68b5000d038 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 10:57:03 +0100 Subject: [PATCH 05/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 73d8e95e0..bfffd0c3d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,7 +24,7 @@ jobs: runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # On Linux we use CMake within Docker - name: Setup cmake uses: jwlawson/actions-setup-cmake@v1.13 From 648e2f5d6c4aae007aa838d306d968aa0c3bee0b Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 11:03:12 +0100 Subject: [PATCH 06/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bfffd0c3d..501d9bfef 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -78,7 +78,7 @@ jobs: runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) - name: Set up Docker multiarch if: startsWith(matrix.os, 'ubuntu') From cba2b1a0767b25de874877812269b72a7bb3106b Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 11:05:34 +0100 Subject: [PATCH 07/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 501d9bfef..3ce5078d4 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -143,7 +143,7 @@ jobs: runs-on: ${{ matrix.os }} steps: # Check out code - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Download shared libraries - name: Download build artifact uses: actions/download-artifact@v3 From 6f70a5e9a516416859ce3f1261a101a57b002696 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 15:28:01 +0100 Subject: [PATCH 08/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3ce5078d4..e2d11ab9d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -156,7 +156,7 @@ jobs: cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ # Compile C++ code - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} # From 90fa8b1d680e2f07f435b306de3d8cc43e85414a Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 15:28:56 +0100 Subject: [PATCH 09/52] Update python-package.yml Co-authored-by: Aarni Koskela --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e2d11ab9d..fe1071a09 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -184,7 +184,7 @@ jobs: needs: build-wheels runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build dist run: | pip install build From c815ca09e27049eb6bbef912067b6cdab2c4144e Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 17:58:28 +0100 Subject: [PATCH 10/52] Update python-package.yml --- .github/workflows/python-package.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index fe1071a09..b560cd5db 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,10 +1,9 @@ name: Python package on: - push: - branches: [ "*" ] + push: {} pull_request: - branches: [ master ] + branches: [ main ] release: types: [ published ] From 36b1ef203722d53ac578f095aeed14336471db05 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 18:08:11 +0100 Subject: [PATCH 11/52] Do not test on Python 3.13 until released --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 
b560cd5db..1200a5ca5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -134,7 +134,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12"] arch: [x86_64, aarch64] exclude: - os: windows-latest # This probably requres arm64 Windows agents From 44e3f174d241150f3d74b372dd6944f46c59ede1 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 25 Jan 2024 18:16:39 +0100 Subject: [PATCH 12/52] Update python-package.yml --- .github/workflows/python-package.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1200a5ca5..c36e0c618 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,6 +32,13 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} + # Check out code + - uses: actions/checkout@v4 + name: Check out NVidia cub + with: + repository: nvidia/cub + ref: 1.11.0 + path: dependencies/cub # Compile C++ code - name: Build C++ shell: bash From 6fe8d0cf69a5f6d204f346d8149455dda2f3f6e9 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 26 Jan 2024 13:39:43 +0100 Subject: [PATCH 13/52] Update python-package.yml --- .github/workflows/python-package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c36e0c618..aa73eea18 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -46,7 +46,6 @@ jobs: set -ex build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} - ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then # Allow cross-compile om aarch64 sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu From 572225e8e9486fc2f9ab92a1c995043907ff6a87 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 26 Jan 2024 13:51:39 +0100 Subject: [PATCH 14/52] Update python-package.yml --- .github/workflows/python-package.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index aa73eea18..3c25cf4af 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,7 +32,7 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} - # Check out code + # Check out dependencies code - uses: actions/checkout@v4 name: Check out NVidia cub with: @@ -105,6 +105,13 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} + # Check out dependencies code + - uses: actions/checkout@v4 + name: Check out NVidia cub + with: + repository: nvidia/cub + ref: 1.11.0 + path: dependencies/cub # Compile C++ code - name: Build C++ shell: bash @@ -112,7 +119,6 @@ jobs: set -ex build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} - ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) if [ ${build_os:0:6} == ubuntu ]; then image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 echo "Using image $image" From 7a8676e18f814dd92eacbacb697e35b04bede8b9 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 26 Jan 2024 14:00:52 
+0100 Subject: [PATCH 15/52] Update python-package.yml --- .github/workflows/python-package.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3c25cf4af..4e9fa0f73 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/checkout@v4 # On Linux we use CMake within Docker - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.13 + uses: jwlawson/actions-setup-cmake@v1.14 with: cmake-version: '3.26.x' - name: Add msbuild to PATH @@ -91,7 +91,7 @@ jobs: # On Linux we use CMake within Docker - name: Setup cmake if: ${{ !startsWith(matrix.os, 'linux') }} - uses: jwlawson/actions-setup-cmake@v1.13 + uses: jwlawson/actions-setup-cmake@v1.14 with: cmake-version: '3.26.x' # Windows: We install Cuda on the agent (slow) @@ -134,7 +134,7 @@ jobs: mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - name: Upload build artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: shared_library path: output/* @@ -186,7 +186,7 @@ jobs: pip install build python -m build . - name: Upload build artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: bdist_wheel path: dist/bitsandbytes-*.whl @@ -201,7 +201,7 @@ jobs: pip install build python -m build -s . - name: Download build artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: bdist_wheel path: dist/ From 8dd8d6333d5b7c232de2b952f61fb09a2e9ae354 Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 15:31:18 +0100 Subject: [PATCH 16/52] Refactor build stage --- .github/workflows/python-package.yml | 25 ++++++++++--------------- pyproject.toml | 8 ++++++-- requirements.txt => requirements.ci.txt | 1 + 3 files changed, 17 insertions(+), 17 deletions(-) rename requirements.txt => requirements.ci.txt (66%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4e9fa0f73..8b285218d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library + name: shared_library_cuda path: output/* retention-days: 7 build-wheels: @@ -157,24 +157,25 @@ jobs: - uses: actions/checkout@v4 # Download shared libraries - name: Download build artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: shared_library path: output/ - name: Copy correct platform shared library shell: bash run: | cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - # Compile C++ code + # Set up the Python version needed - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - # - - name: Install Python dependencies + cache: pip + - name: Install build package shell: bash - run: | - pip install -r requirements.txt + run: pip install build + - name: Install Python test dependencies + shell: bash + run: pip install -r requirements.ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents @@ -182,9 +183,7 @@ jobs: # PYTHONPATH=. 
pytest --log-cli-level=DEBUG tests - name: Build wheel shell: bash - run: | - pip install build - python -m build . + run: python -m build . - name: Upload build artifact uses: actions/upload-artifact@v4 with: @@ -196,10 +195,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Build dist - run: | - pip install build - python -m build -s . - name: Download build artifact uses: actions/download-artifact@v4 with: diff --git a/pyproject.toml b/pyproject.toml index 74d17dd90..c028ca265 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,11 @@ [build-system] requires = [ - "setuptools>=42", - "wheel" + "setuptools>=63", + "pytest~=7.2.2", + "einops~=0.6.0", + "wheel~=0.40.0", + "lion-pytorch~=0.0.6", + "scipy~=1.11.4" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.ci.txt similarity index 66% rename from requirements.txt rename to requirements.ci.txt index ad64af67a..e9d86293d 100644 --- a/requirements.txt +++ b/requirements.ci.txt @@ -1,3 +1,4 @@ +# Requirements used for GitHub actions pytest==7.2.2 einops==0.6.0 wheel==0.40.0 From 8b1ceb7aecacd92c7f492aa60c9c28c42c43072b Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:01:16 +0100 Subject: [PATCH 17/52] Fixed breaking actions change --- .github/workflows/python-package.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8b285218d..ded1b5484 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -65,7 +65,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v3 with: - name: shared_library + name: shared_library/${{ matrix.os }}/${{ matrix.arch }} path: output/* retention-days: 7 ## @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_cuda + name: shared_library_cuda/${{ matrix.os }}/${{ matrix.arch }} path: output/* retention-days: 7 build-wheels: @@ -187,7 +187,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel + name: bdist_wheel/${{ matrix.os }}/${{ matrix.arch }} path: dist/bitsandbytes-*.whl retention-days: 7 publish: @@ -198,7 +198,6 @@ jobs: - name: Download build artifact uses: actions/download-artifact@v4 with: - name: bdist_wheel path: dist/ - run: | ls -lR dist/ From e11867bcacdb7f9ab1844e843b40e3e1dc33ccab Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:04:10 +0100 Subject: [PATCH 18/52] Slim down Windows cuda --- .github/workflows/python-package.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ded1b5484..07b74c851 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -65,7 +65,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v3 with: - name: shared_library/${{ matrix.os }}/${{ matrix.arch }} + name: shared_library_${{ matrix.os }}_${{ matrix.arch }} path: output/* retention-days: 7 ## @@ -95,13 +95,13 @@ jobs: with: cmake-version: '3.26.x' # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.10 + - uses: Jimver/cuda-toolkit@v0.2.14 if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: cuda: ${{ matrix.cuda_version }} method: 'local' - #sub-packages: 
'["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev"]' - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_cuda/${{ matrix.os }}/${{ matrix.arch }} + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }} path: output/* retention-days: 7 build-wheels: @@ -187,7 +187,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel/${{ matrix.os }}/${{ matrix.arch }} + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} path: dist/bitsandbytes-*.whl retention-days: 7 publish: From 57625dba9161fec458384046d69a5b7374f69970 Mon Sep 17 00:00:00 2001 From: Rickard Date: Sun, 28 Jan 2024 16:27:10 +0100 Subject: [PATCH 19/52] Create dependabot.yml --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..8fb637562 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: "/" + schedule: + interval: "weekly" From ada2e9aa1e06ade396a013ffd1ffd7fd25cf5d7c Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:27:54 +0100 Subject: [PATCH 20/52] Bespoke local dev requirements.txt --- .github/workflows/python-package.yml | 2 +- .github/workflows/stale.yml.disabled | 27 ---------------------- requirements.ci.txt => requirements-ci.txt | 1 + requirements-dev.txt | 9 ++++++++ 4 files changed, 11 insertions(+), 28 deletions(-) delete mode 100644 .github/workflows/stale.yml.disabled rename requirements.ci.txt => requirements-ci.txt (89%) create mode 100644 requirements-dev.txt diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 07b74c851..92c49a196 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -175,7 +175,7 @@ jobs: run: pip install build - name: Install Python test dependencies shell: bash - run: pip install -r requirements.ci.txt + run: pip install -r requirements-ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. 
Wait for real aarch64 agents diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled deleted file mode 100644 index ec011c7fb..000000000 --- a/.github/workflows/stale.yml.disabled +++ /dev/null @@ -1,27 +0,0 @@ -name: Stale Bot - -on: - schedule: - - cron: "0 15 * * *" - -jobs: - close_stale_issues: - name: Close Stale Issues - if: github.repository == 'TimDettmers/bitsandbytes' - runs-on: ubuntu-latest - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v3 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - - name: Install requirements - run: | - pip install PyGithub - - name: Close stale issues - run: | - python scripts/stale.py \ No newline at end of file diff --git a/requirements.ci.txt b/requirements-ci.txt similarity index 89% rename from requirements.ci.txt rename to requirements-ci.txt index e9d86293d..46bd5b9cd 100644 --- a/requirements.ci.txt +++ b/requirements-ci.txt @@ -4,3 +4,4 @@ einops==0.6.0 wheel==0.40.0 lion-pytorch==0.0.6 scipy==1.11.4 +pandas==2.2.0 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..7ede5b061 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +# Requirements used for local development +setuptools>=63 +pytest~=7.2.2 +einops~=0.6.0 +wheel~=0.40.0 +lion-pytorch~=0.0.6 +scipy~=1.11.4 +pandas~=2.2.0 +matplotlib~=3.8.2 From e0093e9cc473749b8861a7723018c6f537c6eaaa Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:33:04 +0100 Subject: [PATCH 21/52] Enable VS integration --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 92c49a196..c85224292 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -101,7 +101,7 @@ jobs: with: cuda: ${{ matrix.cuda_version }} method: 'local' - sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev"]' + sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} @@ -175,7 +175,7 @@ jobs: run: pip install build - name: Install Python test dependencies shell: bash - run: pip install -r requirements-ci.txt + run: pip install -r requirements.ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. 
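      # (Context for this "Enable VS integration" patch: the CUDA
      # `visual_studio_integration` sub-package ships the MSBuild .props/.targets
      # files that let `msbuild bitsandbytes.vcxproj` drive nvcc, so the Windows
      # CUDA build breaks without it.)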
Wait for real aarch64 agents From 23bdf05b69a57fa26238fd8a37428c5c49bc1b1a Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:35:20 +0100 Subject: [PATCH 22/52] Group Dependabot updates --- .github/dependabot.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 8fb637562..616a1f98e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -4,3 +4,8 @@ updates: directory: "/" schedule: interval: "weekly" + groups: + major: + update-types: [major] + minor-patch: + update-types: [minor, patch] \ No newline at end of file From 87414c3148600f44eabd15f5003a01de6dafa712 Mon Sep 17 00:00:00 2001 From: Rickard Lyrenius Date: Sun, 28 Jan 2024 16:42:29 +0100 Subject: [PATCH 23/52] Cleanup --- pyproject.toml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c028ca265..0ddfaa0e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,5 @@ [build-system] -requires = [ - "setuptools>=63", - "pytest~=7.2.2", - "einops~=0.6.0", - "wheel~=0.40.0", - "lion-pytorch~=0.0.6", - "scipy~=1.11.4" -] +requires = [ "setuptools", "wheel" ] build-backend = "setuptools.build_meta" [tool.ruff] From 0ee8f7f8790c103bfea22fc43f567c68d69d5a9d Mon Sep 17 00:00:00 2001 From: Rickard Date: Sun, 28 Jan 2024 23:27:26 +0100 Subject: [PATCH 24/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c85224292..2c2ea64a2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -101,7 +101,7 @@ jobs: with: cuda: ${{ matrix.cuda_version }} method: 'local' - sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + # sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} From 816eee0a7bfe6f1d47f70741e2c4603065fb5f13 Mon Sep 17 00:00:00 2001 From: Rickard Date: Wed, 31 Jan 2024 23:10:17 +0100 Subject: [PATCH 25/52] Reinstate file that was wrongly merged --- .github/workflows/stale.yml.disabled | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/stale.yml.disabled diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled new file mode 100644 index 000000000..ec011c7fb --- /dev/null +++ b/.github/workflows/stale.yml.disabled @@ -0,0 +1,27 @@ +name: Stale Bot + +on: + schedule: + - cron: "0 15 * * *" + +jobs: + close_stale_issues: + name: Close Stale Issues + if: github.repository == 'TimDettmers/bitsandbytes' + runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.8 + + - name: Install requirements + run: | + pip install PyGithub + - name: Close stale issues + run: | + python scripts/stale.py \ No newline at end of file From 05283245675b5d4fb85f51ae61dcc157b68b0d1b Mon Sep 17 00:00:00 2001 From: Rickard Date: Wed, 31 Jan 2024 23:18:06 +0100 Subject: [PATCH 26/52] Fixed regression caused by new version of download-artifact --- .github/workflows/python-package.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2c2ea64a2..3891a4b88 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -63,7 +63,7 @@ jobs: mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - name: Upload build artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: shared_library_${{ matrix.os }}_${{ matrix.arch }} path: output/* @@ -159,10 +159,13 @@ jobs: - name: Download build artifact uses: actions/download-artifact@v4 with: + merge-multiple: true + pattern: "shared_library_*_${{ matrix.os }}_${{ matrix.arch }}" path: output/ - name: Copy correct platform shared library shell: bash run: | + ls -lR output/ cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ # Set up the Python version needed - name: Set up Python ${{ matrix.python-version }} @@ -199,6 +202,8 @@ jobs: uses: actions/download-artifact@v4 with: path: dist/ + merge-multiple: true + pattern: "bdist_wheel_*" - run: | ls -lR dist/ - name: Publish to PyPi From 8152e213c5b4b6cd7ded6be329a436922aeec469 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 1 Feb 2024 01:40:50 +0100 Subject: [PATCH 27/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3891a4b88..db71b357e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -160,7 +160,7 @@ jobs: uses: actions/download-artifact@v4 with: merge-multiple: true - pattern: "shared_library_*_${{ matrix.os }}_${{ matrix.arch }}" + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}" path: output/ - name: Copy correct platform shared library shell: bash From 9aad25a70bd1cb7ab93c841ed9c5216ab6643583 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 1 Feb 2024 08:25:35 +0100 Subject: [PATCH 28/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index db71b357e..243b0008c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -178,7 +178,7 @@ jobs: run: pip install build - name: Install Python test dependencies shell: bash - run: pip install -r requirements.ci.txt + run: pip install -r requirements-ci.txt # TODO: How to run CUDA tests on GitHub actions? #- name: Run unit tests # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. 
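      # (Context for the patch-26 regression fix: in actions/*-artifact@v4,
      # artifacts are immutable and each upload needs a unique name, so jobs can
      # no longer share one `shared_library` artifact across the matrix. The
      # wheel and publish jobs now select the related uploads with a `pattern:`
      # glob and flatten them into one directory via `merge-multiple: true`.)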
Wait for real aarch64 agents From bcc67808070fc798afc046facf211a5236b8d399 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 2 Feb 2024 00:35:52 +0100 Subject: [PATCH 29/52] Fix matrix --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 243b0008c..3fcb1b55f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -136,7 +136,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }} + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} path: output/* retention-days: 7 build-wheels: @@ -190,7 +190,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}}_${{ matrix.python-version }} path: dist/bitsandbytes-*.whl retention-days: 7 publish: From 2951e2c7f5c33826184f435e7bb8a646935ac7f4 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 2 Feb 2024 01:40:45 +0100 Subject: [PATCH 30/52] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3fcb1b55f..56fefe7da 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -160,7 +160,7 @@ jobs: uses: actions/download-artifact@v4 with: merge-multiple: true - pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}" + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" path: output/ - name: Copy correct platform shared library shell: bash From 9867392b16714e143475fb6bb36ee91345c44328 Mon Sep 17 00:00:00 2001 From: Rickard Date: Fri, 2 Feb 2024 09:06:07 +0100 Subject: [PATCH 31/52] Merge --- .github/ISSUE_TEMPLATE/bug-report.yml | 8 +- .github/ISSUE_TEMPLATE/feature-request.yml | 4 +- .github/workflows/build_pr_documentation.yml | 3 +- .github/workflows/cmake.yml | 159 ++++++ .github/workflows/stale.yml.disabled | 2 +- .github/workflows/upload_pr_documentation.yml | 16 + .pre-commit-config.yaml | 11 + .style.yapf | 2 +- CMakeLists.txt | 217 +++++--- README.md | 4 +- benchmarking/switchback/README.md | 2 +- .../switchback/make_plot_with_jsonl.py | 9 +- benchmarking/switchback/speed_benchmark.py | 4 +- bitsandbytes/__init__.py | 2 +- bitsandbytes/cuda_setup/main.py | 2 +- bitsandbytes/optim/adamw.py | 1 - bitsandbytes/research/autograd/_functions.py | 4 +- bitsandbytes/triton/dequantize_rowwise.py | 2 +- .../triton/int8_matmul_mixed_dequantize.py | 2 +- .../triton/int8_matmul_rowwise_dequantize.py | 2 +- .../quantize_columnwise_and_transpose.py | 3 +- bitsandbytes/triton/quantize_global.py | 17 +- bitsandbytes/triton/quantize_rowwise.py | 3 +- compile_from_source.md | 5 +- csrc/cpu_ops.cpp | 2 +- csrc/kernels.cu | 70 +-- csrc/mps_ops.h | 0 csrc/mps_ops.mm | 67 +++ csrc/ops.cuh | 3 - csrc/pythonInterface.cpp | 2 +- docs/source/_toctree.yml | 4 +- docs/source/index.mdx | 6 +- docs/source/installation.mdx | 42 +- docs/source/quickstart.mdx | 4 +- environment-bnb.yml | 21 + environment.yml | 2 +- examples/int8_inference_huggingface.py | 3 - how_to_use_nonpytorch_cuda.md | 2 +- include/SIMD.h | 20 +- install_cuda.py | 8 +- pytest.ini | 5 +- scripts/stale.py | 2 +- setup.py | 4 +- tests/conftest.py | 4 + tests/helpers.py 
| 51 ++ tests/test_autograd.py | 213 ++----- tests/test_cuda_setup_evaluator.py | 8 - tests/test_functional.py | 527 ++++++------------ tests/test_generation.py | 23 +- tests/test_linear4bit.py | 10 +- tests/test_linear8bitlt.py | 8 +- tests/test_modules.py | 38 +- tests/test_optim.py | 99 ++-- tests/test_triton.py | 4 +- 54 files changed, 914 insertions(+), 822 deletions(-) create mode 100644 .github/workflows/cmake.yml create mode 100644 .github/workflows/upload_pr_documentation.yml create mode 100644 csrc/mps_ops.h create mode 100644 csrc/mps_ops.mm create mode 100644 environment-bnb.yml create mode 100644 tests/helpers.py diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index ac8e9de00..6ae3c7c0a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -18,15 +18,15 @@ body: label: Reproduction description: | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. - Please provide the simplest reproducer as possible so that we can quickly fix the issue. + Please provide the simplest reproducer as possible so that we can quickly fix the issue. placeholder: | - Reproducer: - + Reproducer: + - type: textarea id: expected-behavior validations: required: true attributes: label: Expected behavior - description: "A clear and concise description of what you would expect to happen." \ No newline at end of file + description: "A clear and concise description of what you would expect to happen." diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 4e75c2a64..c39f346b9 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -18,7 +18,7 @@ body: attributes: label: Motivation description: | - Please outline the motivation for the proposal. Is your feature request related to a problem? + Please outline the motivation for the proposal. Is your feature request related to a problem? - type: textarea id: contribution @@ -27,4 +27,4 @@ body: attributes: label: Your contribution description: | - Is there any way that you could help, e.g. by submitting a PR? \ No newline at end of file + Is there any way that you could help, e.g. by submitting a PR? diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index dace206b1..d6455fd11 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,9 +9,10 @@ concurrency: jobs: build: + if: github.repository == 'TimDettmers/bitsandbytes' uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: bitsandbytes - repo_owner: TimDettmers \ No newline at end of file + repo_owner: TimDettmers diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 000000000..728dd09fb --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,159 @@ +name: CMake on multiple platforms + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +concurrency: + group: cmake-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. 
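+      # (For example: with the default fail-fast=true, a compile error in the
+      # (windows-latest, cuda 11.8) job would cancel the still-running
+      # (ubuntu-latest, cuda 12.1) job before it could upload artifacts.)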
+ fail-fast: false + + matrix: + os: [ubuntu-latest, windows-latest] + cuda-version: ['11.8', '12.1'] + build_type: [Release] + + steps: + - uses: actions/checkout@v4 + + - name: Set up MSVC + if: matrix.os == 'windows-latest' + uses: ilammy/msvc-dev-cmd@v1.12.1 + with: + arch: amd64 + + - name: Setup Mambaforge + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: bnb-env + use-mamba: true + + - uses: conda-incubator/setup-miniconda@v3.0.1 + with: + auto-update-conda: true + activate-environment: bnb-env + environment-file: environment-bnb.yml + use-only-tar-bz2: false + auto-activate-base: true + python-version: "3.10" + mamba-version: "*" + + - name: Set reusable strings + # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. + id: strings + shell: bash + run: | + echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" + + - name: CUDA Toolkit + shell: bash -el {0} + run: | + if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then + # to prepare space + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + fi + addon="" + cuda_version=${{ matrix.cuda-version }} + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc" + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc" + [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0" + [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1" + + conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime + conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version" + + [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge + + CUDA_HOME="${{ env.CONDA }}/envs/bnb-env" + echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV" + echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV" + + if [ "${{ matrix.os }}" = "windows-latest" ]; then + echo CXX_COMPILER=cl >> "$GITHUB_ENV" + echo C_COMPILER=cl >> "$GITHUB_ENV" + # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8 + echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV" + else + echo CXX_COMPILER=g++ >> "$GITHUB_ENV" + echo C_COMPILER=gcc >> "$GITHUB_ENV" + fi + + nvcc --version + + - name: Update environment + run: mamba env update -n bnb-env -f environment-bnb.yml + + - name: Prep build + run: python -m pip install cmake==3.27.9 ninja setuptools wheel + + # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel + + - name: Configure CUDA + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -S ${{ github.workspace }} + + - name: Build CUDA + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure NOBLASLT + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ 
env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -DNO_CUBLASLT=ON + -S ${{ github.workspace }} + + - name: Build NOBLASLT + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure CPU + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DNO_CUBLASLT=ON + -DBUILD_CUDA=OFF + -S ${{ github.workspace }} + + - name: Build CPU + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Build dist + shell: bash -el {0} + run: | + python -m pip install build + python -m build --wheel + mkdir dist/cu${{ matrix.cuda-version }} + mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/ + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v4.3.0 + with: + name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }} + path: | + ${{ github.workspace }}/dist/ diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled index ec011c7fb..0b4f789ea 100644 --- a/.github/workflows/stale.yml.disabled +++ b/.github/workflows/stale.yml.disabled @@ -24,4 +24,4 @@ jobs: pip install PyGithub - name: Close stale issues run: | - python scripts/stale.py \ No newline at end of file + python scripts/stale.py diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 000000000..6497caf2d --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: bitsandbytes + secrets: + hf_token: ${{ secrets.HUGGINGFACE_PUSH }} + comment_bot_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d568a849f..039139b95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,3 +6,14 @@ repos: args: - --fix # - id: ruff-format # TODO: enable when the time is right + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-merge-conflict + - id: check-yaml + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: trailing-whitespace + - id: mixed-line-ending + args: + - --fix=lf diff --git a/.style.yapf b/.style.yapf index a185235cf..e60ac16e5 100644 --- a/.style.yapf +++ b/.style.yapf @@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True SPLIT_BEFORE_FIRST_ARGUMENT = True SPLIT_BEFORE_LOGICAL_OPERATOR = True SPLIT_BEFORE_NAMED_ASSIGNS = True -SPLIT_COMPLEX_COMPREHENSION = True \ No newline at end of file +SPLIT_COMPLEX_COMPREHENSION = True diff --git a/CMakeLists.txt b/CMakeLists.txt index d6e269d15..b9a55ae5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,74 +1,148 @@ +# This CMake config hopefully makes it easier to compile. +# Ensure the CUDA Toolkit is available on your path. Then run: +# For GCC: `cmake -B build . && cmake --build build` +# For MSVC: `cmake -B build . 
&& cmake --build build --config Release` +# You can also use the following options and variables +# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend +# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support +# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version +# is whatever CMake finds on your path. +# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC. +# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90` +# Check your compute capability here: https://developer.nvidia.com/cuda-gpus +# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler cmake_minimum_required(VERSION 3.22.1) -option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) -option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) +project(bitsandbytes LANGUAGES CXX) -if(ENABLE_CUDA) +# Define included source files +set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) +set(CUDA_FILES csrc/ops.cu csrc/kernels.cu) +set(MPS_FILES csrc/mps_ops.mm) +set(METAL_FILES csrc/mps_kernels.metal) +# C++ sources are always included +list(APPEND SRC_FILES ${CPP_FILES}) + +set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)") +set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps) +option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF) + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +set(BNB_OUTPUT_NAME "bitsandbytes") + +message(STATUS "Building with backend ${COMPUTE_BACKEND}") + +if(${COMPUTE_BACKEND} STREQUAL "cuda") if(APPLE) message(FATAL_ERROR "CUDA is not supported on macOS" ) endif() - option(NO_CUBLASLT "Don't use CUBLAST" OFF) - if(NO_CUBLASLT) - set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) - else() - set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + option(NO_CUBLASLT "Disable CUBLAS" OFF) + set(BUILD_CUDA ON) + set(BUILD_MPS OFF) + message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}") +elseif(${COMPUTE_BACKEND} STREQUAL "mps") + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) endif() + set(BUILD_CUDA OFF) + set(BUILD_MPS ON) +else() + set(BUILD_CUDA OFF) + set(BUILD_MPS OFF) endif() -if(ENABLE_CUDA) - message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") - # Find CUDA tools if we are compiling with CUDA - find_package(CUDAToolkit REQUIRED) - if(NO_CUBLASLT) - set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") - else() - set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + +if(BUILD_CUDA) + enable_language(CUDA) # This will fail if CUDA is not found + + # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this + string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}") + string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}") + + # Expose a cache variable that the user can set to ensure the correct version of CUDA is found + set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode") + + message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})") + message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}") + + # It should match the discovered version + if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}") + message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}." 
+ " Ensure the desired CUDA compiler is the first one available on your PATH." + ) + endif() + + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0") + message(FATAL_ERROR "CUDA Version < 11 is not supported") + elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") + message(FATAL_ERROR "CUDA Version > 12 is not supported") + endif() + + string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") + if(PTXAS_VERBOSE) + # Verbose? Outputs register usage information, and other things... + string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") endif() - project(bitsandbytes LANGUAGES CXX CUDA) + foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL}) + # Most of the items here are like: `xx-real`, so we just extract the `xx` portion + string(REGEX MATCH "[0-9]+" capability_id "${capability}") + if(capability_id GREATER 0) + list(APPEND POSSIBLE_CAPABILITIES ${capability_id}) + endif() + endforeach() + + # This can be changed via -D argument to CMake + # By default all possible capabilities are compiled + set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted") + + message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}") + message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}") + + foreach(capability ${COMPUTE_CAPABILITY}) + string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}") + endforeach() + + message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}") + + list(APPEND SRC_FILES ${CUDA_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}") + if(NO_CUBLASLT) + string(APPEND BNB_OUTPUT_NAME "_nocublaslt") + endif() add_compile_definitions(BUILD_CUDA) - set(CMAKE_CUDA_STANDARD 14) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) - set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) -elseif(ENABLE_MPS) +elseif(BUILD_MPS) if(NOT APPLE) message(FATAL_ERROR "MPS is only supported on macOS" ) endif() - message("Building MPS support") - set(LIBSUFFIX "mps") - project(bitsandbytes LANGUAGES CXX OBJCXX) + + enable_language(OBJCXX) + + list(APPEND SRC_FILES ${MPS_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_mps") add_compile_definitions(BUILD_MPS) - set(METAL_SOURCES csrc/mps_kernels.metal) file(MAKE_DIRECTORY "build") add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" - COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} - COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" - DEPENDS "${METAL_SOURCES}" - COMMENT "Compiling Metal kernels" - VERBATIM) + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_FILES}" + COMMENT "Compiling Metal kernels" + VERBATIM) add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") - set(GPU_SOURCES csrc/mps_ops.mm) else() - message("Building with CPU only") set(LIBSUFFIX "cpu") - - project(bitsandbytes LANGUAGES CXX) set(GPU_SOURCES) endif() -if(APPLE) - set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) -endif() -set(CMAKE_CXX_STANDARD 14) -set(CXX_STANDARD_REQUIRED C++14) if(WIN32) - # Mute warnings - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") - - # Enable fast math on VC++ - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") - # Export all symbols set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() @@ -76,46 +150,43 @@ endif() # Weird MSVC hacks if(MSVC) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast") endif() -# Add csrc files -add_library(bitsandbytes SHARED - ${GPU_SOURCES} - csrc/common.cpp - csrc/cpu_ops.cpp - csrc/pythonInterface.cpp) - -target_include_directories(bitsandbytes PUBLIC - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} - ${CMAKE_CURRENT_SOURCE_DIR}/csrc - ${CMAKE_CURRENT_SOURCE_DIR}/include) +set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) +add_library(bitsandbytes SHARED ${SRC_FILES}) +target_include_directories(bitsandbytes PUBLIC csrc include) -if(ENABLE_CUDA) - target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") +if(BUILD_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse) + if(NO_CUBLASLT) + target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT) + else() + target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt) + endif() - set_target_properties( - bitsandbytes + set_target_properties(bitsandbytes PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) - - target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) + CUDA_SEPARABLE_COMPILATION ON + ) endif() -if(ENABLE_MPS) +if(BUILD_MPS) add_dependencies(bitsandbytes metallib) target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") endif() -set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") -# Set the output name of the CUDA library +if(WIN32) + set_target_properties(bitsandbytes PROPERTIES PREFIX "lib") +endif() +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME}) if(MSVC) -set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) -set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) -set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) -set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) endif() set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/README.md b/README.md index a4586d6ca..61dede8c1 100644 --- a/README.md +++ b/README.md @@ -153,10 +153,10 @@ To compile from source, you need an installation of CUDA. 
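With this CMake configuration, a typical configure-and-build is `cmake -B build -DCOMPUTE_BACKEND=cuda .` followed by `cmake --build build`; optionally pass `-DCOMPUTE_CAPABILITY="75;86"` to narrow the targeted GPU architectures, or `-DNO_CUBLASLT=ON` to skip cuBLASLt, per the options documented in the CMakeLists.txt header.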
If `nvcc` is not insta wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md index bb33b5bbd..b73569030 100644 --- a/benchmarking/switchback/README.md +++ b/benchmarking/switchback/README.md @@ -1,4 +1,4 @@ Steps: 1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling). -2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. \ No newline at end of file +2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py index 3ef87d6b2..177270346 100644 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ b/benchmarking/switchback/make_plot_with_jsonl.py @@ -33,7 +33,7 @@ ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'), ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'), - + ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'), @@ -55,7 +55,7 @@ y_ += df_[k_].values[0] ys.append(y_ * 0.5) - + ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.) 
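As a sketch of how the two benchmarking scripts connect (the record fields here are assumptions inferred from the plotting code, not taken verbatim from the patch):

```python
import pandas as pd

# Each line of the jsonl written by speed_benchmark.py is assumed to be one
# record of per-op timings keyed by names like 'standard_fwd'/'standard_gw'.
rdf = pd.read_json("speed_benchmark/info_a100_py2.jsonl", lines=True)
df = rdf[rdf.batch_size == 256 * 32]
# Sum the per-op times for one line spec, mirroring the plot script's inner loop.
total = sum(df[k].values[0] for k in ("standard_fwd", "standard_gw", "standard_gx"))
print(f"standard fp16 total: {total:.5f}")
```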
@@ -67,7 +67,7 @@ ax.set_xscale('log') if logscale_plot1: ax.set_yscale('log') - + ax.tick_params(axis='x', labelsize=11) ax.tick_params(axis='y', labelsize=11) @@ -91,7 +91,7 @@ ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'), ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), ]: - + xs, ys = [], [] df = rdf[rdf.batch_size == batch_size] for embed_dim in dims_to_consider: @@ -133,4 +133,3 @@ plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight') - diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py index d70df0386..c4f3cd4c6 100644 --- a/benchmarking/switchback/speed_benchmark.py +++ b/benchmarking/switchback/speed_benchmark.py @@ -42,7 +42,7 @@ def get_time(k, fn, info_dict): for dim in [1024, 1280, 1408, 1664, 2048, 4096]: # note "batch_size" is actually "batch_size * embed_dim", which is why it's large for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]: - + # switch switches dim_in and dim_out for switch in [False, True]: @@ -62,7 +62,7 @@ def get_time(k, fn, info_dict): x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda() - + x_int8 = x.clone().to(torch.int8) g_int8 = g.clone().to(torch.int8) w_int8 = w.clone().to(torch.int8) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 87307a9d2..e54e933d9 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -24,6 +24,6 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.42.0" +__version__ = "0.43.0.dev" PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a34385b1f..0db9df343 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -210,7 +210,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: if path.exists(): existent_directories.add(path) except PermissionError: - # Handle the PermissionError first as it is a subtype of OSError + # Handle the PermissionError first as it is a subtype of OSError # https://docs.python.org/3/library/exceptions.html#exception-hierarchy pass except OSError as exc: diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 9ea5812ea..17383eed5 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -35,4 +35,3 @@ class PagedAdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) - diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index e515bfeff..7d869e39a 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -83,7 +83,7 @@ def backward(ctx, grad_output): # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. 
TODO: create work-around - if req_gradA: + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: @@ -167,7 +167,7 @@ def backward(ctx, grad_output): # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. TODO: create work-around - if req_gradA: + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py index daa59da9c..3d7529852 100644 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -50,7 +50,7 @@ def _dequantize_rowwise( max_val = tl.load(state_x + pid) output = max_val * x * inv_127 tl.store(output_ptr + offsets, output, mask=row_mask) - + def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py index 1b80ab1a0..dc3047d7e 100644 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py @@ -120,7 +120,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - + acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py index 1f28b0d10..4881e1468 100644 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -119,7 +119,7 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - + acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py index fcadaba3e..e7961cf53 100644 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -54,7 +54,7 @@ def _quantize_columnwise_and_transpose( max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) output = tl.libdevice.llrint(127. 
* (x / max_val)) - new_start = pid * M + new_start = pid * M new_offsets = new_start + p2_arange tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) tl.store(output_maxs + pid, max_val) @@ -71,4 +71,3 @@ def quantize_columnwise_and_transpose(x: torch.Tensor): grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) return output, output_maxs - diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py index a73a5bbaa..5cf194744 100644 --- a/bitsandbytes/triton/quantize_global.py +++ b/bitsandbytes/triton/quantize_global.py @@ -59,27 +59,27 @@ def quantize_global(x: torch.Tensor): key=['M', 'N'] ) @triton.jit - def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, - BLOCK_M : tl.constexpr, - BLOCK_N : tl.constexpr, + def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, + BLOCK_M : tl.constexpr, + BLOCK_N : tl.constexpr, GROUP_M : tl.constexpr): pid = tl.program_id(0) grid_m = (M + BLOCK_M - 1) // BLOCK_M grid_n = (N + BLOCK_N - 1) // BLOCK_N - + width = GROUP_M * grid_n group_id = pid // width group_size = min(grid_m - group_id * GROUP_M, GROUP_M) pid_m = group_id * GROUP_M + (pid % group_size) pid_n = (pid % width) // group_size - + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) mask = (rm < M)[:, None] & (rn < N)[None, :] a = tl.load(A, mask=mask) absmax_inv = tl.load(absmax_inv_ptr) - + # rematerialize to save registers rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) @@ -95,12 +95,11 @@ def quantize_global_transpose(input): absmax_inv = 1./ absmax M, N = input.shape out = torch.empty(N, M, device='cuda', dtype=torch.int8) - + assert out.size(0) == N and out.size(1) == M assert input.stride(0) == 1 or input.stride(1) == 1 assert out.stride(0) == 1 or out.stride(1) == 1 - + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) return out, absmax - diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py index fce464b19..078f4aa2d 100644 --- a/bitsandbytes/triton/quantize_rowwise.py +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -46,7 +46,7 @@ def _quantize_rowwise( offsets = block_start + arange row_mask = arange < BLOCK_SIZE x = tl.load(x_ptr + offsets, mask=row_mask) - + abs_x = tl.abs(x) max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) output = tl.libdevice.llrint(127. 
* (x / max_val)) @@ -64,4 +64,3 @@ def quantize_rowwise(x: torch.Tensor): grid = lambda meta: (x.shape[0],) _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) return output, output_maxs - diff --git a/compile_from_source.md b/compile_from_source.md index 23afe1591..6310fd6c6 100644 --- a/compile_from_source.md +++ b/compile_from_source.md @@ -12,10 +12,10 @@ You can install CUDA locally without sudo by following the following steps: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler. @@ -37,4 +37,3 @@ If you have problems compiling the library with these instructions from source, ## Compilation with Kepler Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler` - diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 478c1f4ff..e67135360 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -26,7 +26,7 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long BinAlgo bin_searcher(code, elements_code); int thread_wave_size = 256; - // we chunk the thresds into waves of 256 since the max limit is + // we chunk the threads into waves of 256 since the max limit is // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size) for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 0fff83665..f117547ed 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax) return 1.00000000f*absmax*sign; // 1011 else return 0.66666667f*absmax*sign; // 1010 - else + else if((val & 0b0001) == 1) // 100 return 5.208333333e-03f*absmax*sign; // 1001 else @@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return 
-0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps); //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val; g_val *= gnorm_scale; - + s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val)); s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE]; @@ -3069,7 +3069,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// use k warps per thread block //// 1. threadblock use read-only cache to read in register tile for A into shared memory //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments -//// 3. each warp reads a segment of values 16x32 from B +//// 3. each warp reads a segment of values 16x32 from B //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. 
matmul aggregate into fragment C @@ -3531,7 +3531,7 @@ template __global__ void kgemm_4bit_inference(int M, i template __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize) { - // per threadblock: + // per threadblock: // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps] // 4 warps -> 4 loads per iter // 1x32 * 32x4 -> 1x4 outputs per thread block @@ -3764,7 +3764,7 @@ template __global__ void kfunc(T *A, T *B, T value, long { switch(FUNC) { - case FILL: + case FILL: A[i] = (T)value; break; case ARANGE: diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h new file mode 100644 index 000000000..e69de29bb diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm new file mode 100644 index 000000000..5e3adeebe --- /dev/null +++ b/csrc/mps_ops.mm @@ -0,0 +1,67 @@ +#import + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +static inline MPSGraph* get_graph() +{ + static MPSGraph* cur = nil; + if(!cur) { + cur = [[MPSGraph alloc] init]; + } + return cur; +} + +static inline id get_device() +{ + NSError *error = nil; + static id device = nil; + if(!device) { + device = MTLCreateSystemDefaultDevice(); + } + if(!device) { + NSLog(@"Failed to get MPS device"); + abort(); + } + return device; +} + +static inline id get_library() +{ + NSError *error = nil; + static id library = nil; + if(!library) { + library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error]; + } + if(!library) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + return library; +} + +/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"]; + return out; +}*/ + + +// MPSGraph function for quantize +extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id device = get_device(); + id library = get_library(); + static id kernel = nil; + if(!kernel) { + kernel = [library newFunctionWithName:@"quantize"]; + if(!kernel) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + } + NSLog(@"Not implemented"); + return nil; +} \ No newline at end of file diff --git a/csrc/ops.cuh b/csrc/ops.cuh index cc7b59505..da9df6af0 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,9 +9,6 @@ #include #include -#ifndef _MSC_VER -#include -#endif #include #include diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp index a6b348ca6..ea2283504 100644 --- a/csrc/pythonInterface.cpp +++ b/csrc/pythonInterface.cpp @@ -392,7 +392,7 @@ extern "C" int hasPrefetch = 0; CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead if (hasPrefetch == 0) return; - + CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0)); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 28da69eb0..043597177 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -1,8 +1,8 @@ -- sections: +- sections: - local: index title: Bits & Bytes - local: quickstart title: Quickstart - local: installation title: Installation - title: Get started \ No newline at end of file + title: Get started diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 
68ad433e6..67c928309 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: @@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow journal={9th International Conference on Learning Representations, ICLR}, year={2022} } -``` \ No newline at end of file +``` diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 035e3e70d..50031acf7 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,3 +1,43 @@ # Installation -... work in progress ... \ No newline at end of file +Note currently `bitsandbytes` is only supported on CUDA GPU hardwares, support for AMD GPUs and M1 chips (MacOS) is coming soon. + + + + +## Linux + +### From Pypi + +```bash +pip install bitsandbytes +``` + +### From source + +```bash +git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +CUDA_VERSION=XXX make cuda12x +python setup.py install +``` + +with `XXX` being your CUDA version, for <12.0 call `make cuda 11x` + + + + +## Windows + +Currently for Windows users, you need to build bitsandbytes from source + +```bash +git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +cmake -B build -DBUILD_CUDA=ON -S . +cmake --build build --config Release +python -m build --wheel +``` + +Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. + + + diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 4dff2ba46..d1028c655 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -6,7 +6,7 @@ ## Minimal example -The following code illustrates the steps above. +The following code illustrates the steps above. 
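One natural minimal example, shown here only as a sketch (it assumes a CUDA-enabled build of bitsandbytes and swaps a stock torch optimizer for its 8-bit counterpart):

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
# Drop-in replacement for torch.optim.Adam, with 8-bit optimizer state.
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)

x = torch.randn(16, 1024, device="cuda")
loss = model(x).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```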
 ```python
-```
\ No newline at end of file
+```
diff --git a/environment-bnb.yml b/environment-bnb.yml
new file mode 100644
index 000000000..92c7761bb
--- /dev/null
+++ b/environment-bnb.yml
@@ -0,0 +1,21 @@
+# for cmake build
+name: bnb
+channels:
+  - pytorch
+  - nvidia
+  - conda-forge
+
+dependencies:
+  - python
+  - accelerate
+  - einops
+  - scipy
+  - transformers
+  - pytest
+  - pytest-cases
+  - ipython
+  - debugpy
+  - yapf
+  - monkeytype
+  - rich
+  - pytest-sugar
diff --git a/environment.yml b/environment.yml
index c0e07f153..9ab48dedc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -42,4 +42,4 @@ dependencies:
 ## ENV UPDATE:
 #
 # add new packages to environment.yml, then:
-# mamba env update -n bnb -f environment.yml
\ No newline at end of file
+# mamba env update -n bnb -f environment.yml
diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
index dc80a44db..2cee48e8e 100644
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
@@ -22,6 +22,3 @@
 )
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-
-
-
diff --git a/how_to_use_nonpytorch_cuda.md b/how_to_use_nonpytorch_cuda.md
index b5f01fbe5..566b0170e 100644
--- a/how_to_use_nonpytorch_cuda.md
+++ b/how_to_use_nonpytorch_cuda.md
@@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
 
diff --git a/include/SIMD.h b/include/SIMD.h
index 18a38dbfd..0cd8d13d6 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -86,12 +86,28 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
-template <>
-struct FTOITraits<SSE, float>
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef float vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef double vec_t;
+};
+
+template <InstrSet I>
+struct FTOITraits
 {
     typedef IVec<SSE, float> vec_t;
 };
 
+template <>
+struct FTOITraits<Scalar>
+{
+    typedef IVec<Scalar, float> vec_t;
+};
+
 #ifdef USE_AVX
 
 template <>
diff --git a/install_cuda.py b/install_cuda.py
index 77e258609..4b041b8d0 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path):
     # Install CUDA
     print(f"Installing CUDA version {version}...")
     install_command = [
-        "bash", filepath, 
-        "--no-drm", "--no-man-page", "--override", 
+        "bash", filepath,
+        "--no-drm", "--no-man-page", "--override",
         "--toolkitpath=" + install_path,
         "--toolkit", "--silent"
     ]
 
     print(f"Running command: {' '.join(install_command)}")
-    
+
     try:
         subprocess.run(install_command, check=True)
     except subprocess.CalledProcessError as e:
@@ -99,4 +99,4 @@ def main():
         sys.exit(1)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/pytest.ini b/pytest.ini
index 9902b98fa..ac6d72e63 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -7,4 +7,7 @@ addopts = -rP
 log_cli = True
 log_cli_level = INFO
-log_file = logs/pytest.log
\ No newline at end of file
+log_file = logs/pytest.log
+markers =
+    benchmark: mark test as benchmark
+    slow: mark test as slow
diff --git a/scripts/stale.py b/scripts/stale.py index
c299643ae..613f5b7cb 100644 --- a/scripts/stale.py +++ b/scripts/stale.py @@ -55,4 +55,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/setup.py b/setup.py index 68c6058cd..0ee090ba5 100644 --- a/setup.py +++ b/setup.py @@ -19,12 +19,12 @@ def read(fname): # Tested with wheel v0.29.0 class BinaryDistribution(Distribution): - def has_ext_modules(foo): + def has_ext_modules(self): return True setup( name="bitsandbytes", - version="0.42.0", + version="0.43.0.dev0", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", diff --git a/tests/conftest.py b/tests/conftest.py index 0b4b91225..7aee8c922 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,10 @@ def pytest_runtest_call(item): try: item.runtest() + except NotImplementedError as nie: + if "NO_CUBLASLT" in str(nie): + pytest.skip("CUBLASLT not available") + raise except AssertionError as ae: if str(ae) == "Torch not compiled with CUDA enabled": pytest.skip("Torch not compiled with CUDA enabled") diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 000000000..46c6ef93d --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,51 @@ +from itertools import product +import random +from typing import Any + +import torch + +test_dims_rng = random.Random(42) + + +def get_test_dims(min: int, max: int, *, n: int) -> list[int]: + return [test_dims_rng.randint(min, max) for _ in range(n)] + + +def format_with_label(label: str, value: Any) -> str: + if isinstance(value, bool): + formatted = "T" if value else "F" + elif isinstance(value, (list, tuple)) and all(isinstance(v, bool) for v in value): + formatted = "".join("T" if b else "F" for b in value) + else: + formatted = str(value) + return f"{label}={formatted}" + + +def id_formatter(label: str): + """ + Return a function that formats the value given to it with the given label. 
+ """ + return lambda value: format_with_label(label, value) + + +DTYPE_NAMES = { + torch.bfloat16: "bf16", + torch.bool: "bool", + torch.float16: "fp16", + torch.float32: "fp32", + torch.float64: "fp64", + torch.int32: "int32", + torch.int64: "int64", + torch.int8: "int8", +} + + +def describe_dtype(dtype: torch.dtype) -> str: + return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2] + + +TRUE_FALSE = (True, False) +BOOLEAN_TRIPLES = list( + product(TRUE_FALSE, repeat=3) +) # all combinations of (bool, bool, bool) +BOOLEAN_TUPLES = list(product(TRUE_FALSE, repeat=2)) # all combinations of (bool, bool) diff --git a/tests/test_autograd.py b/tests/test_autograd.py index ed482b356..d01e5e9db 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -1,50 +1,35 @@ -from itertools import product +from typing import Tuple import pytest import torch import bitsandbytes as bnb - -n = 1 -k = 25 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() -funcs = [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)] -str_funcs = ["bmm", "matmul"] -req_grad = [(False, False), (True, False), (True, True), (False, True)] -req_grad_str = ["FF", "TF", "TT", "FT"] -transpose = [(False, False), (False, True), (True, True), (True, False)] -str_transpose = ["FF", "FT", "TT", "TF"] -dtype = [torch.float32, torch.float16] -values = list( - product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose) -) -str_values = list( - product( - dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose - ) -) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format( - *vals - ) - for vals in str_values -] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", - values, - ids=names, +from tests.helpers import ( + BOOLEAN_TRIPLES, + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_test_dims, + id_formatter, ) -def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): + +TRANSPOSE_VALS = [(False, True), (False, False)] + + +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 96, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("funcs", [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)], ids=["func=bmm", "func=matmul"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("req_grad", BOOLEAN_TUPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) +def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool], transpose: Tuple[bool, bool]): if dim2 > 0: dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) dim4 = dim4 - (dim4 % 16) - for i in range(k): + for i in range(25): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -228,71 +213,17 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): assert (idx == 0).sum().item() < n * 0.02 -n = 1 -k = 3 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, 
size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() - -dim2.append(0) - -decomp = [0.0, 6.0] -funcs = [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)] -str_funcs = ["matmullt", 'switchback_bnb'] -req_grad = [(False, False), (True, False), (True, True), (False, True)] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.bfloat16, torch.float32] -has_fp16_weights = [True, False] -has_bias = [True, False] -values = list( - product( - dim1, - dim2, - dim3, - dim4, - funcs, - dtype, - req_grad, - transpose, - decomp, - has_fp16_weights, - has_bias - ) -) -str_values = list( - product( - dim1, - dim2, - dim3, - dim4, - str_funcs, - dtype, - req_grad_str, - str_transpose, - decomp, - has_fp16_weights, - has_bias - ) -) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias", - values, - ids=names, -) +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("decomp", [0.0, 6.0], ids=id_formatter("decomp")) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], ids=["func=matmul", "func=switchback_bnb"]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) def test_matmullt( dim1, dim2, @@ -313,7 +244,7 @@ def test_matmullt( req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -429,45 +360,25 @@ def test_matmullt( torch.testing.assert_close(gradBias1, gradBias2) -n = 1 -k = 3 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() - -dim2.append(0) - -funcs = [(torch.matmul, bnb.matmul_4bit)] -str_funcs = ["matmul"] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.float32] -compress_statistics = [False, True] -has_fp16_weights = [True, False] -has_bias = [True, False] -quant_type = ['fp4', 'nf4'] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, 
quant_type)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics, quant_type)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics_{}_quant_type_{}".format(*vals) for vals in str_values] -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type", values, ids=names) -def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul_4bit)], ids=["func=matmul"]) +@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) +@pytest.mark.parametrize("quant_type", ['fp4', 'nf4'], ids=id_formatter("quant_type")) +def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) if has_bias == False: req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) @@ -530,32 +441,21 @@ def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, torch.testing.assert_close(gradBias1, gradBias2) -funcs = [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)] -str_funcs = ["matmul_fp8_mixed", 'matmul_fp8_global'] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.float32] -has_fp16_weights = [True, False] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values] -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("req_grad", 
BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], ids=["matmul_fp8_mixed", 'matmul_fp8_global']) def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) @@ -619,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): torch.testing.assert_close( gradB1, gradB2, atol=0.18, rtol=0.3 ) - diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index 5e1a548e5..189aa75b5 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -19,11 +19,3 @@ def test_manual_override(requires_cuda): import bitsandbytes as bnb loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name #assert loaded_lib == 'libbitsandbytes_cuda122.so' - - - - - - - - diff --git a/tests/test_functional.py b/tests/test_functional.py index 5b7f83bc3..2d4e959ad 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -11,6 +11,13 @@ import bitsandbytes as bnb from bitsandbytes import functional as F +from tests.helpers import ( + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_test_dims, + id_formatter, +) torch.set_printoptions( precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 @@ -155,10 +162,10 @@ def test_dynamic_quantization(): -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) -@pytest.mark.parametrize("nested", [False, True], ids=["False", "True"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) +@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested")) @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) -@pytest.mark.parametrize("signed", [True, False], ids=['signed_True', 'signed_False']) +@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): #print('') diffs = [] @@ -281,34 +288,22 @@ def mean(xx): return sum(xx) / float(len(xx)) -# dim1 = torch.randint(1,1024*4, size=(4,)).tolist() -# dim2 = torch.randint(1,1024*4, size=(4,)).tolist() -dim1 = [1024 * 2] -dim2 = [1024 * 16] -methods = [ - ( +methods = { + "linear": ( lambda x, dim: quant(x), lambda x, dim: quant(x), dequant, dequant, mm_dequant, - ) -] -methods.append((quant_multi, quant_multi, dequant, dequant, mm_dequant)) -# methods.append((lambda x: quant_multi_chunk(x, dim=-1), lambda x: quant_multi_chunk(x, dim=0), dequant, dequant, mm_dequant)) -method_names = ["linear", "vectorwise"] -batched = [False, True] -values = list(product(dim1, dim2, methods, batched)) -values_names = list(product(dim1, dim2, method_names, batched)) -names = [ - "dim1_{}_dim2_{}_quant_{}_batched_{}".format(*vals) - for vals in values_names -] + ), + "vectorwise": (quant_multi, quant_multi, dequant, dequant, mm_dequant), +} 
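(Throughout these test diffs the same conversion recurs: hand-built `values`/`names` lists assembled with `itertools.product` become stacked per-argument `@pytest.mark.parametrize` decorators whose readable ids come from the new `tests/helpers.py`. A hedged, self-contained sketch of the pattern follows; the trivial test body is a stand-in:)

```python
import pytest
import torch

from tests.helpers import describe_dtype, get_test_dims, id_formatter

# Stacked parametrize decorators expand to the same cross product the old
# product() lists produced, but pytest now derives the ids per argument
# (for example "dim1=42" and "fp16" joined automatically) instead of one
# long hand-formatted name string per combination.
@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype)
def test_example(dim1, dtype):
    assert torch.zeros(dim1, dtype=dtype).numel() == dim1
```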
-@pytest.mark.parametrize( - "dim1, dim2, quant_methods, batched", values, ids=names -) +@pytest.mark.parametrize("dim1", [1024 * 2], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [1024 * 16], ids=id_formatter("dim2")) +@pytest.mark.parametrize("quant_methods", methods.values(), ids=methods.keys()) +@pytest.mark.parametrize("batched", TRUE_FALSE, ids=id_formatter("batched")) def test_approx_igemm(dim1, dim2, quant_methods, batched): dim1 = dim1 - (dim1 % 32) dim2 = dim2 - (dim2 % 32) @@ -352,21 +347,10 @@ def test_stable_embedding(): layer.reset_parameters() -n = 2 -hidden_dim = torch.randint(32, 256, size=(n,)).tolist() -batch_dim = torch.randint(16, 256, size=(n,)).tolist() -seq_dim = torch.randint(16, 256, size=(n,)).tolist() -transpose = [(False, False), (False, True), (True, False), (True, True)] -values = list(product(hidden_dim, batch_dim, transpose, seq_dim)) -names = [ - "hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize( - "hidden_dim, batch_dim, transpose, seq_dim", values, ids=names -) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 256, n=2), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(16, 256, n=2), ids=id_formatter("batch_dim")) +@pytest.mark.parametrize("seq_dim", get_test_dims(16, 256, n=2), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): hidden_dim = hidden_dim - (hidden_dim % 32) batch_dim = batch_dim - (batch_dim % 16) @@ -418,17 +402,9 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): torch.testing.assert_close(out.float(), out2) -n = 3 -seq_dim = torch.randint(32, 512, size=(n,)).tolist() -hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() -batch_dim = torch.randint(2, 16, size=(n,)).tolist() -values = list(product(seq_dim, hidden_dim, batch_dim)) -names = [ - "seq_dim{}_hidden_dim{}_batch_dim{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("seq_dim, hidden_dim, batch_dim", values, ids=names) +@pytest.mark.parametrize("seq_dim", get_test_dims(32, 512, n=3), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 1024 * 4, n=3), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(2, 16, n=3), ids=id_formatter("batch_dim")) def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): seq_dim = seq_dim - (seq_dim % 32) hidden_dim = hidden_dim - (hidden_dim % 32) @@ -449,21 +425,10 @@ def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): torch.testing.assert_close(out.float(), out2) -n = 2 -seq_dim = torch.randint(32, 512, size=(n,)).tolist() -hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() -batch_dim = torch.randint(2, 16, size=(n,)).tolist() -transpose = [False, True] -values = list(product(seq_dim, hidden_dim, batch_dim, transpose)) -names = [ - "seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize( - "seq_dim, hidden_dim, batch_dim, transpose", values, ids=names -) +@pytest.mark.parametrize("seq_dim", get_test_dims(32, 512, n=2), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 1024 * 4, n=2), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(2, 16, n=2), ids=id_formatter("batch_dim")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) 
def test_minmax_igemm(seq_dim, hidden_dim, batch_dim, transpose): def min_max(x): maxA = torch.amax(x, dim=2, keepdim=True) @@ -533,20 +498,11 @@ def min_max(x): assert mean(relerrs) < 0.3 -n = 2 -dim1 = torch.randint(1, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 128, size=(n,)).tolist() -dim3 = torch.randint(32, 256, size=(n,)).tolist() -dim4 = torch.randint(32, 256, size=(n,)).tolist() -transpose = [(False, False), (True, False), (False, True), (True, True)] -values = list(product(dim1, dim2, dim3, dim4, transpose)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, transpose", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 64, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 128, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 256, n=2), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 256, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) def test_ibmm(dim1, dim2, dim3, dim4, transpose): dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -574,15 +530,9 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose): torch.testing.assert_close(out.float(), out2.float()) -n = 1 -dim1 = torch.randint(1, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 128, size=(n,)).tolist() -dim3 = torch.randint(32, 256, size=(n,)).tolist() -values = list(product(dim1, dim2, dim3)) -names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 128, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 256, n=1), ids=id_formatter("dim3")) def test_vector_quant(dim1, dim2, dim3): dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -594,28 +544,18 @@ def test_vector_quant(dim1, dim2, dim3): assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n*0.002)) - - -n = 2 -dim1 = torch.randint(2, 256, size=(n,)).tolist() -dim2 = torch.randint(2, 256, size=(n,)).tolist() -dim3 = torch.randint(2, 256, size=(n,)).tolist() -# dim1, dim2 = (256,), (256,) -dtype = [torch.int8, torch.int32] -a_order = ["row"] -out_order = ["col", "row", "col32"] -transpose = [False] -dims = [2, 3] -values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)) - -names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose")) +@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims")) def test_nvidia_transform(dim1, dim2, dim3, 
dims, dtype, orderA, orderOut, transpose): - if dims == 3 and out_order != "col32": + if dims == 3 and orderOut != "col32": return - if dtype == torch.int32 and out_order != "col32": + if dtype == torch.int32 and orderOut != "col32": return try: func = F.get_transform_func(dtype, orderA, orderOut, transpose) @@ -677,28 +617,12 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans torch.testing.assert_close(A, out2) -n = 1 -dim1 = torch.randint(1, 256, size=(n,)).tolist() -dim2 = torch.randint(32, 512, size=(n,)).tolist() -dim3 = torch.randint(32, 1024, size=(n,)).tolist() -dim4 = torch.randint(32, 1024, size=(n,)).tolist() - -# dim1 = [2] -# dim2 = [2] -# dim3 = [2] -# dim4 = [2] - -dims = (2, 3) -ldb = [0] -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dim3, dim4, dims, ldb)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 256, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 512, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 1024, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 1024, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2, 3), ids=id_formatter("dims")) +@pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): for i in range(k): if dims == 2: @@ -732,21 +656,11 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): torch.testing.assert_close(C1, C3.float()) -dim1 = [32] -dim2 = [32] -dim3 = [32] -dim4 = [32] - -dims = (2,) -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dim3, dim4, dims)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names) +@pytest.mark.parametrize("dim1", [32], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [32], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", [32], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", [32], ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): formatB = F.get_special_format_str() for i in range(k): @@ -786,24 +700,15 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): # C3, S = F.transform(C2, 'row', state=SC) # torch.testing.assert_close(C1, C3.float()) - -batch_size = 2 -seqdim = 512 -# values = [(batch_size, seqdim, 4*1024, 16*1024),(batch_size, seqdim, 5120, 4*5120),(batch_size, seqdim, 12*1024, 4*12*1024)] -values = [ - (batch_size, seqdim, 4 * 1024, 3 * 4 * 1024), - (batch_size, seqdim, 5120, 3 * 5120), - (batch_size, seqdim, 12 * 1024, 4 * 12 * 1024), -] - - -# values = list(product(batch, seq, model, hidden)) -names = [ - "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) +@pytest.mark.parametrize( + ("batch", "seq", "model", "hidden"), + [ + pytest.param(2, 512, 4 * 1024, 3 * 4 * 1024, id="batch=2, seq=512, model=4k, hidden=12k"), + pytest.param(2, 512, 5120, 3 * 5120, id="batch=2, seq=512, model=5k, hidden=15k"), + pytest.param(2, 512, 12 * 1024, 4 * 12 * 1024, id="batch=2, seq=512, model=12k, hidden=48k"), + ], +) 
+@pytest.mark.benchmark def test_bench_8bit_training(batch, seq, model, hidden): formatB = F.get_special_format_str() A = torch.randn(batch, seq, model, device="cuda").half() @@ -953,24 +858,11 @@ def test_bench_8bit_training(batch, seq, model, hidden): # print(t8) -n = 2 -dim1 = torch.randint(64, 256, size=(n,)).tolist() -dim4 = torch.randint(64, 1024, size=(n,)).tolist() - -#dim1 = [2*1024] -#dim4 = [2*1024] - -#dim1 = [4] -#dim4 = [4] - -dims = (2,) -formatB = ["col_turing", "col_ampere"] -has_bias = [True, False] -values = list(product(dim1, dim4, dims, formatB, has_bias)) -names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(64, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim4", get_test_dims(64, 1024, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) +@pytest.mark.parametrize("formatB", ["col_turing", "col_ampere"], ids=id_formatter("formatB")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): inner = torch.randint(1, 128, size=(1,)).item() bias = None @@ -994,33 +886,23 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): if has_bias: C4 += bias # TODO: is something wrong here? If so, the problem goes deeper - #n = C1.numel() - #p = 0.06 + # n = C1.numel() + # p = 0.06 std = C1.std(0).view(1, -1) C1 /= std C4 /= std - #assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06)) - #assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}" + # assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06)) + # assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}" C5 = F.mm_dequant(C2, SC, maxA.flatten(), maxB.flatten(), bias=bias) - #torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1) + # torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1) n = C5.numel() - assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01*n)) - - -n = 2 -dim1 = [1 * 1024] -dim2 = [1 * 1024] -# dim1 = torch.randint(1,4*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() + assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n)) -dims = (2,) -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dims)) -names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values] - -@pytest.mark.parametrize("dim1, dim2, dims", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 1024], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [1 * 1024], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) def test_colrow_absmax(dim1, dim2, dims): for i in range(k): threshold = 3.0 @@ -1066,17 +948,8 @@ def test_colrow_absmax(dim1, dim2, dims): assert nnz_block_ptr2 is None -n = 2 -# dim1 = [8*1024] -# dim2 = [4*1024] -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) def 
test_double_quant(dim1, dim2): for i in range(k): A = torch.randn(dim1, dim2, device="cuda").half() @@ -1114,16 +987,18 @@ def test_double_quant(dim1, dim2): torch.testing.assert_close(Scol.flatten().float(), statsAt) -n = 4 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + ( + pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") + for (dim1, dim4, inner) + in zip( + get_test_dims(1, 4 * 1024, n=4), + get_test_dims(1, 4 * 1024, n=4), + get_test_dims(1, 4 * 1024, n=4), + ) + ) +) def test_integrated_igemmlt(dim1, dim4, inner): for i in range(k): A = torch.randn(dim1, inner, device="cuda").half() @@ -1158,16 +1033,18 @@ def test_integrated_igemmlt(dim1, dim4, inner): assert err2 <= err1 * 1.025 -n = 6 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + ( + pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") + for (dim1, dim4, inner) + in zip( + get_test_dims(1, 4 * 1024, n=6), + get_test_dims(1, 4 * 1024, n=6), + get_test_dims(1, 4 * 1024, n=6), + ) + ) +) @pytest.mark.skip("Row scale has some bugs for ampere") def test_igemmlt_row_scale(dim1, dim4, inner): formatB = F.get_special_format_str() @@ -1234,17 +1111,17 @@ def test_igemmlt_row_scale(dim1, dim4, inner): print(sum(err3) / len(err3)) -dim1 = [1024, 2048] -inner = [12288 * 4, 4096 * 4] -dim4 = [12288, 4096] - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + [ + pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"), + pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"), + ], +) @pytest.mark.skip("Row scale has some bugs for ampere") +@pytest.mark.benchmark def test_row_scale_bench(dim1, dim4, inner): + formatB = F.get_special_format_str() err1, err2, err3 = [], [], [] relerr1, relerr2 = [], [] scale = 1 @@ -1289,34 +1166,14 @@ def test_row_scale_bench(dim1, dim4, inner): print("vector-wise", time.time() - t0) -n = 2 -dim1 = torch.randint(2, 1024, size=(n,)).tolist() -dim2 = torch.randint(2, 1024, size=(n,)).tolist() -# dim1 = [8*1024] -# dim2 = [4*1024] - -dim3 = [0] -dtype = [torch.int8] -a_order = ["row"] -out_order = ["col32", "col_turing", "col_ampere"] -transpose = [False, True] -dims = [2] -values = list( - product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose) -) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}".format( - *vals - ) - for vals in values -] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose", - values, - ids=names, -) +@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2")) 
+@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims")) +@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): for i in range(k): if dims == 2: @@ -1344,23 +1201,6 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): torch.testing.assert_close(out1, out2) -n = 2 -# dim1 = torch.randint(2,1024, size=(n,)).tolist() -# dim2 = torch.randint(2,1024, size=(n,)).tolist() -dim1 = [1] -dim2 = [33] - -dtype = [torch.int8] -# a_order = ['col_turing', 'col_ampere'] -a_order = ["col_turing"] -out_order = ["row"] -values = list(product(dim1, dim2, dtype, a_order, out_order)) -names = [ - "dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}".format(*vals) - for vals in values -] - - def test_overflow(): formatB = F.get_special_format_str() print(formatB) @@ -1375,17 +1215,8 @@ def test_overflow(): c2 = torch.matmul(a.float(), b.float().t()) -n = 2 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -# dim1 = [4] -# dim2 = [5] - -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) def test_coo_double_quant(dim1, dim2): threshold = 3.00 for i in range(k): @@ -1412,17 +1243,9 @@ def test_coo_double_quant(dim1, dim2): ) -n = 2 -dim1 = torch.randint(1, 1 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist() -# dim1 = [7] -# dim2 = [11] -transposed_B = [False, True] -values = list(product(dim1, dim2, transposed_B)) -names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("transposed_B", TRUE_FALSE, ids=id_formatter("transposed_B")) def test_spmm_coo(dim1, dim2, transposed_B): threshold = 1.5 dim3 = torch.randint(32, 128, size=(1,)).item() @@ -1453,6 +1276,7 @@ def test_spmm_coo(dim1, dim2, transposed_B): assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30) +@pytest.mark.benchmark def test_spmm_bench(): batch = 2 model = 1024 * 1 @@ -1496,14 +1320,8 @@ def test_spmm_bench(): print(tsp / t8) -n = 2 -dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist() -dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist() -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(256, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(256, 1024, n=2), ids=id_formatter("dim2")) def test_integrated_sparse_decomp(dim1, dim2): threshold = 3.0 formatB = "col_turing" @@ -1553,23 +1371,10 @@ def 
test_matmuls(): print(err1, err2) -n = 2 -# dim1 = torch.randint(1,1*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() -dim1 = [1 * 2048] -dim2 = [12288] -# dim1 = [32] -# dim2 = [32] -# dtype = [torch.float16, torch.int8] -dtype = [torch.float16] -out_function = ["zeros", "ones"] -values = list(product(dim1, dim2, dtype, out_function)) -names = [ - "dim1_{}_dim2_{}_dtype_{}_out_func_{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dtype, out_func", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 2048], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [12288], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("out_func", ["zeros", "ones"], ids=id_formatter("out_func")) def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): out_func = getattr(torch, out_func) @@ -1672,20 +1477,9 @@ def test_coo2csc(): torch.testing.assert_close(A2.t()[idx], cscA.values) -n = 2 -# dim1 = torch.randint(1,1*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() -dim1 = [1 * 2048] -# dim2 = [12288] -dim2 = [2048] -# dim1 = [2] -# dim2 = [2] -dtype = [torch.int8] -values = list(product(dim1, dim2, dtype)) -names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 2048]) +@pytest.mark.parametrize("dim2", [2048]) +@pytest.mark.parametrize("dtype", [torch.int8]) def test_spmm_coo_dequant(dim1, dim2, dtype): threshold = 6.0 # threshold = 2.8 @@ -1786,22 +1580,11 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): print("partial matmul", time.time() - t0) -batch_size = 1 -seqdim = 1 -values = [] -#values.append((batch_size, seqdim, 768, 4 * 768)) -#values.append((batch_size, seqdim, 1024, 4*1024)) -#values.append((batch_size, seqdim, 1536, 4*1536)) -#values.append((batch_size, seqdim, 2048, 4*2048)) -#values.append((batch_size, seqdim, 2560, 4*2560)) -#values.append((batch_size, seqdim, 4096, 4*4096)) -#values.append((batch_size, seqdim, 5120, 4*5120)) -values.append((batch_size, seqdim, 6656, 4*6656)) -#values.append((batch_size, seqdim, 8192, 4*8192)) -#values.append((batch_size, seqdim, 5140, 4*5140)) -#values.append((batch_size, seqdim, 12288, 4*12288)) -names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] -@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) +@pytest.mark.parametrize( + ("batch", "seq", "model", "hidden"), + [pytest.param(1, 1, 6656, 4*6656, id="batch=1, seq=1, model=6656, hidden=26k")], +) +@pytest.mark.benchmark def test_bench_matmul(batch, seq, model, hidden): iters = 1000 formatB = F.get_special_format_str() @@ -2226,6 +2009,7 @@ def test_kbit_quantile_estimation(): assert err < 0.035 +@pytest.mark.benchmark def test_bench_dequantization(): a = torch.rand(1024, 1024, device='cuda').half() code =F.create_fp8_map(True, 3, 0, 4).cuda() @@ -2244,7 +2028,7 @@ def test_bench_dequantization(): -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) def test_fp4_quant(dtype): vals = list(product([0, 1], repeat=4)) @@ -2321,6 +2105,7 @@ def test_4bit_compressed_stats(quant_type): #@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) @pytest.mark.parametrize("quant_type", ['nf4']) 
+@pytest.mark.benchmark def test_bench_4bit_dequant(quant_type): blocksize = 256 a = torch.rand(1024*12*4, 1024*12, device='cuda').half() @@ -2367,11 +2152,11 @@ def test_normal_map_tree(): #print(pivots) -@pytest.mark.parametrize("double_quant", [True, False], ids=['DQ_True', 'DQ_False']) -@pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) -@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'], ids=['fc1', 'fc2', 'attn', 'attn_packed']) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) -@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=['uint8', 'fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") +@pytest.mark.parametrize("storage_type", ['nf4', 'fp4']) +@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed']) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): for dim in [128, 256, 512, 1024]: #for dim in [4*1024]: @@ -2537,12 +2322,12 @@ def test_managed(): @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True']) def test_gemv_eye_4bit(storage_type, dtype, double_quant): dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242)) - dims = torch.randint(0, 8192, size=(dims,)).tolist() + dims = get_test_dims(0, 8192, n=dims) dims = [dim + (64-(dim % 64)) for dim in dims] #for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]: for dim in dims: @@ -2560,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant): torch.testing.assert_close(A, C2) #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001) #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080) - - diff --git a/tests/test_generation.py b/tests/test_generation.py index 753623b27..b05749bf8 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -9,6 +9,8 @@ BitsAndBytesConfig, ) +from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter + def get_4bit_config(): return BitsAndBytesConfig( @@ -59,23 +61,19 @@ def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_f models = ['huggyllama/llama-7b', 'bigscience/bloom-1b7'] dtypes = ['nf4', 'fp4'] -load_in_4bit = [True, False] -values = list(product(models, dtypes)) -strfunc = lambda lst: [str(x) for x in lst] -ids = ['_'.join(strfunc(x)) for x in values] -@pytest.fixture(scope='session', params=values, ids=ids) + +@pytest.fixture(scope='session', params=product(models, dtypes)) def model_and_tokenizer(request): model, tokenizer = get_model_and_tokenizer(request.param) yield request.param, model, tokenizer del model -@pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False']) -@pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False']) -#@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 
'fp32']) -def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ): - print('') - dtype = torch.float16 +@pytest.mark.parametrize("DQ", TRUE_FALSE, ids=id_formatter("dq")) +@pytest.mark.parametrize("inference_kernel", TRUE_FALSE, ids=id_formatter("inference_kernel")) +@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype) +@pytest.mark.slow +def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype): fixture_config, model, tokenizer = model_and_tokenizer generation_config = transformers.GenerationConfig( @@ -122,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ): for out in outputs: print(out) raise ValueError(f'Failure count: {failure_count}/{n_cases}') - - - diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index d396a910b..13db28ed4 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,4 +1,3 @@ -from itertools import product import os from tempfile import TemporaryDirectory @@ -6,6 +5,7 @@ import torch import bitsandbytes as bnb +from tests.helpers import TRUE_FALSE storage = { 'uint8': torch.uint8, @@ -14,10 +14,10 @@ 'float32': torch.float32 } -@pytest.mark.parametrize( - "quant_type, compress_statistics, bias, quant_storage", - list(product(["nf4", "fp4"], [False, True], [False, True], ['uint8', 'float16', 'bfloat16', 'float32'])), -) +@pytest.mark.parametrize("quant_storage", ['uint8', 'float16', 'bfloat16', 'float32']) +@pytest.mark.parametrize("bias", TRUE_FALSE) +@pytest.mark.parametrize("compress_statistics", TRUE_FALSE) +@pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) def test_linear_serialization(quant_type, compress_statistics, bias, quant_storage): original_dtype = torch.float16 compute_dtype = None diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index d4967969c..6fa7efb8d 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,5 +1,4 @@ from contextlib import nullcontext -from itertools import product import os from tempfile import TemporaryDirectory @@ -10,6 +9,7 @@ from bitsandbytes import functional as F from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt +from tests.helpers import TRUE_FALSE, id_formatter # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py @@ -66,8 +66,10 @@ def test_linear_no_igemmlt(): assert linear_custom.state.CxB is None -@pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt", - list(product([False, True], [False, True], [False, True], [False, True]))) +@pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) +@pytest.mark.parametrize("serialize_before_forward", TRUE_FALSE, ids=id_formatter("serialize_before_forward")) +@pytest.mark.parametrize("deserialize_before_cuda", TRUE_FALSE, ids=id_formatter("deserialize_before_cuda")) +@pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt")) def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt): linear = torch.nn.Linear(32, 96) x = torch.randn(3, 32, dtype=torch.half) diff --git a/tests/test_modules.py b/tests/test_modules.py index c98f7a6d4..32d90938d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -6,6 +6,7 @@ from torch import nn import bitsandbytes as bnb +from tests.helpers import 
id_formatter


 class MockArgs:
@@ -311,12 +312,7 @@ def forward(self, x):
         return LinearFunction.apply(x, self.weight, self.bias, self.args)


-threshold = [0.0, 3.0]
-values = threshold
-names = [f"threshold_{vals}" for vals in values]
-
-
-@pytest.mark.parametrize("threshold", values, ids=names)
+@pytest.mark.parametrize("threshold", [0.0, 3.0], ids=id_formatter("threshold"))
 def test_linear8bitlt_inference(threshold):
     l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half()
     assert l1.weight.device.type == "cuda"
@@ -510,18 +506,21 @@ def test_linear_kbit_fp32_bias(module):
         o1 = l1(b1)
         assert l1.bias is None

-modules = []
-modules.append(bnb.nn.Linear8bitLt)
-modules.append(bnb.nn.Linear4bit)
-modules.append(bnb.nn.LinearFP4)
-modules.append(bnb.nn.LinearNF4)
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True))
-modules.append(lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True))
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32))
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16))
-modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16))
-names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C', 'NF4+fp32', 'NF4+fp16', 'NF4+bf16']
-@pytest.mark.parametrize("module", modules, ids=names)
+
+module_dict = {
+    "Int8Lt": bnb.nn.Linear8bitLt,
+    "4bit": bnb.nn.Linear4bit,
+    "FP4": bnb.nn.LinearFP4,
+    "NF4": bnb.nn.LinearNF4,
+    "FP4+C": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True),
+    "NF4+C": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True),
+    "NF4+fp32": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32),
+    "NF4+fp16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16),
+    "NF4+bf16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16),
+}
+
+
+@pytest.mark.parametrize("module", module_dict.values(), ids=module_dict.keys())
 def test_kbit_backprop(module):
     b = 17
     dim1 = 37
@@ -638,6 +637,3 @@ def test_4bit_warnings():
             net(inp)

     assert len(record) == 2
-
-
-
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 993ac8b60..e379c424a 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -1,4 +1,3 @@
-from itertools import product
 import os
 from os.path import join
 import shutil
@@ -11,6 +10,7 @@
 import bitsandbytes as bnb
 import bitsandbytes.functional as F
+from tests.helpers import describe_dtype, id_formatter

 # import apex
@@ -101,15 +101,16 @@ def rm_path(path):
 str2statenames["lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")]
 str2statenames["paged_lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")]

-dim1 = [1024]
-dim2 = [32, 1024, 4097, 1]
-gtype = [torch.float32, torch.float16, torch.bfloat16]
-optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
-values = list(product(dim1, dim2, gtype, optimizer_names))
-names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values]
-@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
+optimizer_names_32bit = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
+
+
+@pytest.mark.parametrize("optim_name", optimizer_names_32bit, ids=id_formatter("opt"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [32, 1024, 4097, 1], ids=id_formatter("dim2"))
 def test_optimizer32bit(dim1, dim2, gtype, optim_name):
-    if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip()
+    if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']:
+        pytest.skip()
     if dim1 == 1 and dim2 == 1:
         return
     p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1
@@ -134,7 +135,6 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name):
         bnb_optimizer.step()
         torch_optimizer.step()

-
         for name1, name2 in str2statenames[optim_name]:
             torch.testing.assert_close(
                 torch_optimizer.state[p1][name1],
@@ -177,14 +177,9 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name):
             assert bnb_optimizer.state[p2]["unorm_vec"] > 0.0


-dim1 = [1024]
-dim2 = [32, 1024, 4097]
-gtype = [torch.float32, torch.float16]
-values = list(product(dim1, dim2, gtype))
-names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values]
-
-
-@pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names)
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=describe_dtype)
 def test_global_config(dim1, dim2, gtype):
     if dim1 == 1 and dim2 == 1:
         return
@@ -230,10 +225,7 @@ def test_global_config(dim1, dim2, gtype):
     assert adam2.state[p3]["state2"].dtype == torch.uint8


-dim1 = [1024]
-dim2 = [32, 1024, 4097]
-gtype = [torch.float32, torch.float16, torch.bfloat16]
-optimizer_names = [
+optimizer_names_8bit = [
     "adam8bit",
     "lion8bit",
     "momentum8bit",
@@ -243,13 +235,12 @@ def test_global_config(dim1, dim2, gtype):
     "momentum8bit_blockwise",
     "rmsprop8bit_blockwise",
 ]
-values = list(product(dim1, dim2, gtype, optimizer_names))
-names = [
-    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
-]


-@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
+@pytest.mark.parametrize("optim_name", optimizer_names_8bit, ids=id_formatter("opt"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
+@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
 def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip()
     if dim1 == 1 and dim2 == 1:
@@ -375,18 +366,10 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     # print(sum(relerrors)/len(relerrors))


-dim1 = [1024]
-dim2 = [32, 1024, 4097]
-gtype = [torch.float32]
-optim_bits = [32, 8]
-values = list(product(dim1, dim2, gtype, optim_bits))
-names = [
-    "dim1_{}_dim2_{}_gtype_{}_optim_bits_{}".format(*vals)
-    for vals in values
-]
-
-
-@pytest.mark.parametrize("dim1, dim2, gtype, optim_bits", values, ids=names)
+@pytest.mark.parametrize("optim_bits", [32, 8], ids=id_formatter("optim_bits"))
+@pytest.mark.parametrize("gtype", [torch.float32], ids=describe_dtype)
+@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1"))
 def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits):
     if dim1 == 1 and dim2 == 1:
         return
@@ -474,22 +457,19 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits):
     adam2.load_state_dict(torch.load(join(path, "opt.pt")))


-dim1 = [4096]
-dim2 = [4096]
-gtype = [torch.float32, torch.float16]
-# optimizer_names = ['adam8bit_blockwise', 'adam8bit', 'lamb8bit']
-# optimizer_names = ['adam8bit_blockwise', 'adam_apex', 'adam8bit', 'adam', 'adam_pytorch']
-# optimizer_names = ['momentum_apex', 'momentum8bit', 'momentum_pytorch']
-# optimizer_names = ['lamb_apex', 'lamb8bit']
-# optimizer_names = ['lars_apex', 'lars8bit']
-optimizer_names = ["adam8bit_blockwise", 'paged_adam8bit_blockwise', 'paged_adamw8bit_blockwise', 'paged_lion8bit_blockwise']
-values = list(product(dim1, dim2, gtype, optimizer_names))
-names = [
-    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
+optimizer_names_benchmark = [
+    "adam8bit_blockwise",
+    "paged_adam8bit_blockwise",
+    "paged_adamw8bit_blockwise",
+    "paged_lion8bit_blockwise",
 ]


-@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
+@pytest.mark.parametrize("dim1", [4096], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [4096], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=describe_dtype)
+@pytest.mark.parametrize("optim_name", optimizer_names_benchmark, ids=id_formatter("opt"))
+@pytest.mark.benchmark
 def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
     if dim1 == 1 and dim2 == 1:
         return
@@ -514,15 +494,12 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
         print(optim_name, gtype, s / params)
         # assert s < 3.9

-dim1 = [2*1024]
-gtype = [torch.float16]
-#mode = ['torch', 'bnb']
-mode = ['bnb']
-optimizer_names = ['paged_adamw']
-#optimizer_names = ['paged_adamw8bit_blockwise']
-values = list(product(dim1,gtype, optimizer_names, mode))
-names = ['dim1_{0}_gtype_{1}_optim_{2}_mode_{3}'.format(*vals) for vals in values]
-@pytest.mark.parametrize("dim1, gtype, optim_name, mode", values, ids=names)
+
+@pytest.mark.parametrize("dim1", [2 * 1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("gtype", [torch.float16], ids=describe_dtype)
+@pytest.mark.parametrize("optim_name", ['paged_adamw'], ids=id_formatter("optim_name"))
+@pytest.mark.parametrize("mode", ['bnb'], ids=id_formatter("mode"))
+@pytest.mark.benchmark
 def test_stream_optimizer_bench(dim1, gtype, optim_name, mode):
     layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)]))
     layers1 = layers1.to(gtype)
diff --git a/tests/test_triton.py b/tests/test_triton.py
index d0397ee4a..218a533d5 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -4,11 +4,12 @@
 from bitsandbytes.nn import Linear8bitLt
 from bitsandbytes.nn.triton_based_modules import SwitchBackLinear
 from bitsandbytes.triton.triton_utils import is_triton_available
+from tests.helpers import TRUE_FALSE


 @pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
                     reason="This test requires triton and a GPU with compute capability 8.0 or higher.")
-@pytest.mark.parametrize("vector_wise_quantization", [False, True])
+@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
 def test_switchback(vector_wise_quantization):
     for dim in [83]:
         for batch in [13]:
@@ -57,4 +58,3 @@ def test_switchback(vector_wise_quantization):
             print('GX1', err_sb, err_baseline)

             assert err_sb < 2 * err_baseline
-
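Note on the test refactor above: the hand-rolled `itertools.product` grids and `names` lists are replaced by stacked `@pytest.mark.parametrize` decorators whose ids come from shared helpers. `tests/helpers.py` itself is not part of this diff, so the following is only a minimal sketch of what the imported names plausibly look like, inferred from how they are called:

```python
# Hypothetical sketch of tests/helpers.py (not included in this patch series).
import torch

TRUE_FALSE = (True, False)

def id_formatter(label: str):
    # pytest's `ids` argument accepts a callable that is applied to each
    # parametrized value; this turns values into readable ids like "dim1_1024".
    return lambda value: f"{label}_{value}"

def describe_dtype(dtype: torch.dtype) -> str:
    # Short, stable names for dtypes in test ids, e.g. torch.bfloat16 -> "bf16".
    return {torch.float32: "fp32", torch.float16: "fp16", torch.bfloat16: "bf16"}[dtype]
```

Because each parameter gets its own decorator, pytest composes the ids automatically (e.g. `test_optimizer32bit[dim2_32-dim1_1024-fp16-opt_adam]`), which is what makes the old `"dim1_{}_dim2_{}..."` format strings redundant.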
From b773dfb6295e16e445f583d1bda15bdb849639c2 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 09:08:04 +0100
Subject: [PATCH 32/52] Pipeline

---
 .github/workflows/cmake.yml | 159 ------------------------------------
 1 file changed, 159 deletions(-)
 delete mode 100644 .github/workflows/cmake.yml

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
deleted file mode 100644
index 728dd09fb..000000000
--- a/.github/workflows/cmake.yml
+++ /dev/null
@@ -1,159 +0,0 @@
-name: CMake on multiple platforms
-
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
-
-concurrency:
-  group: cmake-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
-      fail-fast: false
-
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        cuda-version: ['11.8', '12.1']
-        build_type: [Release]
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Set up MSVC
-      if: matrix.os == 'windows-latest'
-      uses: ilammy/msvc-dev-cmd@v1.12.1
-      with:
-        arch: amd64
-
-    - name: Setup Mambaforge
-      uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        miniforge-variant: Mambaforge
-        miniforge-version: latest
-        activate-environment: bnb-env
-        use-mamba: true
-
-    - uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        auto-update-conda: true
-        activate-environment: bnb-env
-        environment-file: environment-bnb.yml
-        use-only-tar-bz2: false
-        auto-activate-base: true
-        python-version: "3.10"
-        mamba-version: "*"
-
-    - name: Set reusable strings
-      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
-      id: strings
-      shell: bash
-      run: |
-        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-    - name: CUDA Toolkit
-      shell: bash -el {0}
-      run: |
-        if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
-          # to prepare space
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-        fi
-        addon=""
-        cuda_version=${{ matrix.cuda-version }}
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
-        [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
-        [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
-
-        conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch  # it's dependency not correctly resolved sometime
-        conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
-
-        [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
-
-        CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
-        echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
-        echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
-
-        if [ "${{ matrix.os }}" = "windows-latest" ]; then
-          echo CXX_COMPILER=cl >> "$GITHUB_ENV"
-          echo C_COMPILER=cl >> "$GITHUB_ENV"
-          # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
-          echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
-        else
-          echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
-          echo C_COMPILER=gcc >> "$GITHUB_ENV"
-        fi
-
-        nvcc --version
-
-    - name: Update environment
-      run: mamba env update -n bnb-env -f environment-bnb.yml
-
-    - name: Prep build
-      run: python -m pip install cmake==3.27.9 ninja setuptools wheel
-
-    # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel
-
-    - name: Configure CUDA
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -S ${{ github.workspace }}
-
-    - name: Build CUDA
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure NOBLASLT
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -DNO_CUBLASLT=ON
-        -S ${{ github.workspace }}
-
-    - name: Build NOBLASLT
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure CPU
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DNO_CUBLASLT=ON
-        -DBUILD_CUDA=OFF
-        -S ${{ github.workspace }}
-
-    - name: Build CPU
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Build dist
-      shell: bash -el {0}
-      run: |
-        python -m pip install build
-        python -m build --wheel
-        mkdir dist/cu${{ matrix.cuda-version }}
-        mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
-
-    - name: Upload Build Artifacts
-      uses: actions/upload-artifact@v4.3.0
-      with:
-        name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }}
-        path: |
-          ${{ github.workspace }}/dist/

From 45ad3948978a456894f8a1952c39c498404cdeca Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 09:11:00 +0100
Subject: [PATCH 33/52] Fixed conflict

---
 include/SIMD.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/include/SIMD.h b/include/SIMD.h
index 0cd8d13d6..7c6a4094b 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -28,16 +28,6 @@ struct IVec;
 template
 struct FVec1;

-template <> struct InstrFloatTraits
-{
-    typedef __m128 vec_t;
-};
-
-template <> struct InstrFloatTraits
-{
-    typedef __m128d vec_t;
-};
-
 }
 }

From e2e4874b31e896cc42119c62503785c7e1518e69 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 09:12:30 +0100
Subject: [PATCH 34/52] Fixed conflict

---
 .github/workflows/python-package.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 56fefe7da..815602386 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -51,9 +51,9 @@ jobs:
             sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
           fi
           if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
-            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF .
+            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
           else
-            cmake -DENABLE_CUDA=OFF .
+            cmake -DCOMPUTE_BACKEND=cpu .
           fi
           if [ ${build_os:0:7} == windows ]; then
             pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
@@ -125,10 +125,10 @@ jobs:
             docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
               "apt-get update \
                 && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-                && cmake -DENABLE_CUDA=ON . \
+                && cmake -DCOMPUTE_BACKEND=cuda . \
                 && make"
           else
-            cmake -DENABLE_CUDA=ON .
+            cmake -DCOMPUTE_BACKEND=cuda .
             pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
           fi
           mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}

From 59a1000178b35c6e52f6794ff0d98c8e9a73e755 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Fri, 2 Feb 2024 23:17:46 +0100
Subject: [PATCH 35/52] Update CMakeLists.txt

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9a55ae5e..ff40a8089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,11 +161,11 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
+        target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()
-        target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
+        target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::cublasLt)
     endif()

     set_target_properties(bitsandbytes

From 41ddd25ea1d386443fd4ae84d6e99f931d90d0eb Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 09:14:41 +0100
Subject: [PATCH 36/52] Fixed merge error

---
 include/SIMD.h | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/include/SIMD.h b/include/SIMD.h
index 7c6a4094b..19e8bffb3 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -28,6 +28,16 @@ struct IVec;
 template
 struct FVec1;

+template <> struct InstrFloatTraits
+{
+    typedef __m128 vec_t;
+};
+
+template <> struct InstrFloatTraits
+{
+    typedef __m128d vec_t;
+};
+
 }
 }

@@ -76,26 +86,10 @@ template <> struct InstrFloatTraits
     typedef __m128d vec_t;
 };

-template <> struct InstrFloatTraits
-{
-    typedef float vec_t;
-};
-
-template <> struct InstrFloatTraits
-{
-    typedef double vec_t;
-};
-
-template
-struct FTOITraits
-{
-    typedef IVec vec_t;
-};
-
 template <>
-struct FTOITraits
+struct FTOITraits
 {
-    typedef IVec vec_t;
+    typedef IVec vec_t;
 };

 #ifdef USE_AVX
@@ -592,4 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec
Date: Sat, 3 Feb 2024 09:17:01 +0100
Subject: [PATCH 37/52] cleanup

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff40a8089..b9a55ae5e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,11 +161,11 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
-        target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()
-        target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::cublasLt)
+        target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
     endif()

     set_target_properties(bitsandbytes

From ca5f14aff135aa583b5ed7956f8a0eb82cd3c359 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 09:24:55 +0100
Subject: [PATCH 38/52] cleanup

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9a55ae5e..067e1ea26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,7 +161,7 @@ target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cusparse)
+    target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()

From b460125bdaa4f5cdb4db5c8748dccc23a6d53f71 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:18:28 +0100
Subject: [PATCH 39/52] Find CUDA

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 067e1ea26..a0e24528b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,7 @@ endif()

 if(BUILD_CUDA)
     enable_language(CUDA) # This will fail if CUDA is not found
+    find_package(CUDAToolkit REQUIRED)

     # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
     string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")

From 7a605e1c2afecc776a48cf7642ee0b59ac956588 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:28:22 +0100
Subject: [PATCH 40/52] Fix

---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0e24528b..21e2fde31 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,8 +30,6 @@ option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 if(APPLE)
     set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
 endif()
-set(CMAKE_CXX_STANDARD 14)
-set(CXX_STANDARD_REQUIRED C++14)

 set(BNB_OUTPUT_NAME "bitsandbytes")

@@ -157,6 +155,7 @@ endif()
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
+target_compile_features(bitsandbytes PUBLIC cxx_std_14)

 target_include_directories(bitsandbytes PUBLIC csrc include)

From 28188a55e0734dbad6aa977b0aa713593a35f0bf Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:32:01 +0100
Subject: [PATCH 41/52] Fixing merge error from latest merge from main

---
 include/Algo-Direct-Common.h |  2 +-
 include/Portable.h           |  2 +-
 include/SIMD.h               | 14 ++------------
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h
index c97084904..7b40edea9 100644
--- a/include/Algo-Direct-Common.h
+++ b/include/Algo-Direct-Common.h
@@ -190,7 +190,7 @@ struct DirectInfo
             xi = xws;
         }
         else {
-            myassert(Gap==1, "if Gap>1 then X workspace must be provided");
+            myassert((Gap==1), "if Gap>1 then X workspace must be provided");
             xi = x;
         }

diff --git a/include/Portable.h b/include/Portable.h
index 78599944e..178f8baa9 100644
--- a/include/Portable.h
+++ b/include/Portable.h
@@ -81,7 +81,7 @@ typedef unsigned __int64 uint64;
 namespace Details {

-#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }
+#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }

 // log2 is not defined in VS2008
 #if defined(_MSC_VER)
diff --git a/include/SIMD.h b/include/SIMD.h
index b16104f9b..19e8bffb3 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -86,18 +86,8 @@ template <> struct InstrFloatTraits
     typedef __m128d vec_t;
 };

-template <> struct InstrFloatTraits
-{
-    typedef float vec_t;
-};
-
-template <> struct InstrFloatTraits
-{
-    typedef double vec_t;
-};
-
-template
-struct FTOITraits
+template <>
+struct FTOITraits
 {
     typedef IVec vec_t;
 };

From 86b2bd689d036b5d22370c2253405075eabe9760 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 10:59:48 +0100
Subject: [PATCH 42/52] Fix setup.py

---
 setup.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/setup.py b/setup.py
index 7b50386ac..0ee090ba5 100644
--- a/setup.py
+++ b/setup.py
@@ -40,9 +40,6 @@ def has_ext_modules(self):
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
-    # HACK: pretend we have a native extension module so the wheel is tagged
-    # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
-    ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
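The `ext_modules` hack removed above existed only to force a platform-tagged wheel; it is redundant because the project's `BinaryDistribution` class (kept in setup.py, see PATCH 46 below) already achieves this: `wheel` consults `Distribution.has_ext_modules()` when deciding between a `py3-none-any` tag and a platform tag. A minimal illustration of the mechanism, not taken from the diff itself:

```python
# Sketch: forcing a platform wheel without declaring fake extension modules.
from setuptools import setup
from setuptools.dist import Distribution

class BinaryDistribution(Distribution):
    def has_ext_modules(self):
        # Returning True makes `wheel` emit e.g. bitsandbytes-...-linux_x86_64.whl
        # instead of a pure-Python bitsandbytes-...-py3-none-any.whl.
        return True

# setup(..., distclass=BinaryDistribution)
```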
From 01c3f598adce0759d8ac63e3eee195064d9116c6 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 11:36:36 +0100
Subject: [PATCH 43/52] Fixed typo in artifact name

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 815602386..54726e71e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -190,7 +190,7 @@ jobs:
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:
-          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}}_${{ matrix.python-version }}
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.python-version }}
           path: dist/bitsandbytes-*.whl
           retention-days: 7
   publish:

From e4344b0404246cd6d02ecd488d91ce9ee4ef4eb8 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 11:39:00 +0100
Subject: [PATCH 44/52] Remove linker flags

---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21e2fde31..80894a033 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -148,7 +148,6 @@ endif()

 # Weird MSVC hacks
 if(MSVC)
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()

From 2ba8be3ad069cc6da066fa64323f9d3fc9ba53a3 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sat, 3 Feb 2024 15:01:14 +0100
Subject: [PATCH 45/52] Build nocublaslt versions

---
 .github/workflows/cmake.yml          | 159 ---------------------------
 .github/workflows/python-package.yml |  26 +++--
 2 files changed, 14 insertions(+), 171 deletions(-)
 delete mode 100644 .github/workflows/cmake.yml

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
deleted file mode 100644
index 728dd09fb..000000000
--- a/.github/workflows/cmake.yml
+++ /dev/null
@@ -1,159 +0,0 @@
-name: CMake on multiple platforms
-
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
-
-concurrency:
-  group: cmake-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
-      fail-fast: false
-
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        cuda-version: ['11.8', '12.1']
-        build_type: [Release]
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Set up MSVC
-      if: matrix.os == 'windows-latest'
-      uses: ilammy/msvc-dev-cmd@v1.12.1
-      with:
-        arch: amd64
-
-    - name: Setup Mambaforge
-      uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        miniforge-variant: Mambaforge
-        miniforge-version: latest
-        activate-environment: bnb-env
-        use-mamba: true
-
-    - uses: conda-incubator/setup-miniconda@v3.0.1
-      with:
-        auto-update-conda: true
-        activate-environment: bnb-env
-        environment-file: environment-bnb.yml
-        use-only-tar-bz2: false
-        auto-activate-base: true
-        python-version: "3.10"
-        mamba-version: "*"
-
-    - name: Set reusable strings
-      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
-      id: strings
-      shell: bash
-      run: |
-        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-    - name: CUDA Toolkit
-      shell: bash -el {0}
-      run: |
-        if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
-          # to prepare space
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-        fi
-        addon=""
-        cuda_version=${{ matrix.cuda-version }}
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
-        [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
-        [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
-        [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
-
-        conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch  # it's dependency not correctly resolved sometime
-        conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
-
-        [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
-
-        CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
-        echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
-        echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
-
-        if [ "${{ matrix.os }}" = "windows-latest" ]; then
-          echo CXX_COMPILER=cl >> "$GITHUB_ENV"
-          echo C_COMPILER=cl >> "$GITHUB_ENV"
-          # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
-          echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
-        else
-          echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
-          echo C_COMPILER=gcc >> "$GITHUB_ENV"
-        fi
-
-        nvcc --version
-
-    - name: Update environment
-      run: mamba env update -n bnb-env -f environment-bnb.yml
-
-    - name: Prep build
-      run: python -m pip install cmake==3.27.9 ninja setuptools wheel
-
-    # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel
-
-    - name: Configure CUDA
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -S ${{ github.workspace }}
-
-    - name: Build CUDA
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure NOBLASLT
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-        -DNO_CUBLASLT=ON
-        -S ${{ github.workspace }}
-
-    - name: Build NOBLASLT
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Configure CPU
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-        -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-        -DNO_CUBLASLT=ON
-        -DBUILD_CUDA=OFF
-        -S ${{ github.workspace }}
-
-    - name: Build CPU
-      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-    - name: Build dist
-      shell: bash -el {0}
-      run: |
-        python -m pip install build
-        python -m build --wheel
-        mkdir dist/cu${{ matrix.cuda-version }}
-        mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
-
-    - name: Upload Build Artifacts
-      uses: actions/upload-artifact@v4.3.0
-      with:
-        name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }}
-        path: |
-          ${{ github.workspace }}/dist/
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 54726e71e..223f29ec2 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -119,18 +119,20 @@ jobs:
           set -ex
           build_os=${{ matrix.os }}
           build_arch=${{ matrix.arch }}
-          if [ ${build_os:0:6} == ubuntu ]; then
-            image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
-            echo "Using image $image"
-            docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
-              "apt-get update \
-                && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-                && cmake -DCOMPUTE_BACKEND=cuda . \
-                && make"
-          else
-            cmake -DCOMPUTE_BACKEND=cuda .
-            pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
-          fi
+          for NO_CUBLASLT in ON OFF; do
+            if [ ${build_os:0:6} == ubuntu ]; then
+              image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
+              echo "Using image $image"
+              docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
+                "apt-get update \
+                  && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+                  && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
+                  && make"
+            else
+              cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} .
+              pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+            fi
+          done
           mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
           ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
       - name: Upload build artifact
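The ON/OFF loop above produces two CUDA binaries per platform: the default build and a variant without cuBLASLt, whose int8 kernels need relatively recent GPUs. A sketch of how the Python side might pick between the two at import time — the `nocublaslt` suffix and the compute-capability 7.5 cutoff are assumptions for illustration, not code from this diff:

```python
# Hypothetical loader logic for choosing between the two artifacts built above.
import torch

def native_library_name(cuda_version: str = "121") -> str:
    major, minor = torch.cuda.get_device_capability()
    # Fall back to the NO_CUBLASLT build on GPUs too old for cuBLASLt int8 matmul.
    suffix = "" if (major, minor) >= (7, 5) else "_nocublaslt"
    return f"libbitsandbytes_cuda{cuda_version}{suffix}.so"
```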
From 3288a0fb5b11d23a6b446e42fd75f11d57607266 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 10:42:42 +0100
Subject: [PATCH 46/52] Fixed formatting

---
 setup.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 0ee090ba5..13af2a39b 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,6 @@
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution

-
 libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*"))
 libs = [os.path.basename(p) for p in libs]
 print("libs:", libs)
@@ -17,11 +16,13 @@ def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()


+
 # Tested with wheel v0.29.0
 class BinaryDistribution(Distribution):
     def has_ext_modules(self):
         return True


+
 setup(
     name="bitsandbytes",
     version="0.43.0.dev0",
@@ -33,10 +34,10 @@ def has_ext_modules(self):
     url="https://github.com/TimDettmers/bitsandbytes",
     packages=find_packages(),
     package_data={"": libs},
-    install_requires=['torch', 'numpy'],
+    install_requires=["torch", "numpy"],
     extras_require={
-        'benchmark': ['pandas', 'matplotlib'],
-        'test': ['scipy'],
+        "benchmark": ["pandas", "matplotlib"],
+        "test": ["scipy"],
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
@@ -44,5 +45,5 @@ def has_ext_modules(self):
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    distclass=BinaryDistribution
+    distclass=BinaryDistribution,
 )

From fdddb111e8d5733dce54a368ead17dd42556a452 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:23:26 +0100
Subject: [PATCH 47/52] Fixed VS Code format on save

---
 .editorconfig           | 3 +++
 .gitignore              | 1 -
 .vscode/extensions.json | 7 +++++++
 .vscode/settings.json   | 7 +++++++
 4 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 .editorconfig
 create mode 100644 .vscode/extensions.json
 create mode 100644 .vscode/settings.json

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..03490db50
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,3 @@
+[*]
+trim_trailing_whitespace = true
+insert_final_newline = true
diff --git a/.gitignore b/.gitignore
index 46316e4b3..22f5a6cd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,4 +154,3 @@ dmypy.json
 dependencies
 cuda_build
 output/
-.vscode/*
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 000000000..fcae843c2
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "ms-python.python",
+        "charliermarsh.ruff",
+        "twxs.cmake"
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000..893ebb174
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "ruff.fixAll": true,
+    "ruff.lint.run": "onType",
+    "editor.codeActionsOnSave": {
+        "source.fixAll": "always"
+    }
+}
\ No newline at end of file

From b7503c92386854f117411d3430e5c0c8cffd681b Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:29:52 +0100
Subject: [PATCH 48/52] Ran format on save from VScode

---
 .github/dependabot.yml               |  2 +-
 .github/workflows/python-package.yml |  4 +-
 csrc/mps_kernels.metal               | 60 ++++++++++++++--------------
 csrc/mps_ops.mm                      | 12 +++---
 4 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 616a1f98e..8a36c3689 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -8,4 +8,4 @@ updates:
       major:
         update-types: [major]
       minor-patch:
-        update-types: [minor, patch]
\ No newline at end of file
+        update-types: [minor, patch]
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 223f29ec2..4a34389af 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -8,7 +8,7 @@ on:
     types: [ published ]

 jobs:
-
+
   ##
   # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
   ##
@@ -120,7 +120,7 @@ jobs:
           build_os=${{ matrix.os }}
           build_arch=${{ matrix.arch }}
           for NO_CUBLASLT in ON OFF; do
-            if [ ${build_os:0:6} == ubuntu ]; then
+            if [ ${build_os:0:6} == ubuntu ]; then
               image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
               echo "Using image $image"
               docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal
index a5c8e35b2..63b3bf78c 100644
--- a/csrc/mps_kernels.metal
+++ b/csrc/mps_kernels.metal
@@ -83,35 +83,35 @@ static unsigned char quantize_scalar(
     }
 }

-kernel void quantize(device float* code [[buffer(0)]],
-                   device float* A [[buffer(1)]],
-                   device uchar* out [[buffer(2)]],
-                   constant uint& n [[buffer(3)]],
-                   uint id [[thread_position_in_grid]]) {
-  const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
-  uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;
-  const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);
-
-  float vals[NUM];
-  uchar qvals[NUM];
-
-  for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {
-    valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    for (uint j = 0; j < valid_items; j++) {
-      vals[j] = A[i + j];
-    }
-
-    for (uint j = 0; j < valid_items; j++) {
+kernel void quantize(device float* code [[buffer(0)]],
+                   device float* A [[buffer(1)]],
+                   device uchar* out [[buffer(2)]],
+                   constant uint& n [[buffer(3)]],
+                   uint id [[thread_position_in_grid]]) {
+  const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
+  uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;
+  const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);
+
+  float vals[NUM];
+  uchar qvals[NUM];
+
+  for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {
+    valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint j = 0; j < valid_items; j++) {
+      vals[j] = A[i + j];
+    }
+
+    for (uint j = 0; j < valid_items; j++) {
       qvals[j] = quantize_scalar(0.0f, code, vals[j]);
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    for (uint j = 0; j < valid_items; j++) {
-      out[i + j] = qvals[j];
-    }
-  }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint j = 0; j < valid_items; j++) {
+      out[i + j] = qvals[j];
+    }
+  }
 }
diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm
index 5e3adeebe..d198b3552 100644
--- a/csrc/mps_ops.mm
+++ b/csrc/mps_ops.mm
@@ -16,10 +16,10 @@
 static inline id get_device()
 {
-  NSError *error = nil;
+  NSError *error = nil;
   static id device = nil;
   if(!device) {
-    device = MTLCreateSystemDefaultDevice();
+    device = MTLCreateSystemDefaultDevice();
   }
   if(!device) {
     NSLog(@"Failed to get MPS device");
@@ -30,7 +30,7 @@
 static inline id get_library()
 {
-  NSError *error = nil;
+  NSError *error = nil;
   static id library = nil;
   if(!library) {
     library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error];
@@ -40,7 +40,7 @@
     abort();
   }
   return library;
-}
+}

 /*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n)
 {
@@ -49,7 +49,7 @@
 }*/


-// MPSGraph function for quantize
+// MPSGraph function for quantize
 extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n)
 {
   id device = get_device();
@@ -64,4 +64,4 @@
   }
   NSLog(@"Not implemented");
   return nil;
-}
\ No newline at end of file
+}
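For reference, the reindented `quantize` kernel above maps each input float to the index of the nearest entry of a codebook via `quantize_scalar`. A rough CPU equivalent in Python, assuming `code` is a sorted 256-entry codebook (as bitsandbytes' 8-bit quantization maps are); it mirrors the kernel's per-element behavior, not its threadgroup layout:

```python
import numpy as np

def quantize_reference(code: np.ndarray, A: np.ndarray) -> np.ndarray:
    # For each value, find the insertion point in the sorted codebook...
    idx = np.clip(np.searchsorted(code, A), 1, len(code) - 1)
    left, right = code[idx - 1], code[idx]
    # ...then pick whichever neighboring codebook entry is closer.
    take_left = (A - left) < (right - A)
    return (idx - take_left).astype(np.uint8)
```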
From fb642a5c9422bb422f92a571dd3ae10daac10339 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:35:23 +0100
Subject: [PATCH 49/52] Re-saved the json files using the new settings

---
 .vscode/extensions.json | 2 +-
 .vscode/settings.json   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.vscode/extensions.json b/.vscode/extensions.json
index fcae843c2..939843f43 100644
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -4,4 +4,4 @@
         "charliermarsh.ruff",
         "twxs.cmake"
     ]
-}
\ No newline at end of file
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 893ebb174..906f28588 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,4 +4,4 @@
     "editor.codeActionsOnSave": {
         "source.fixAll": "always"
     }
-}
\ No newline at end of file
+}

From 2730dd94f7eb6f15a9c5e56df964fdf770a69228 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 11:36:44 +0100
Subject: [PATCH 50/52] Re-saved CMakeLists.txt to get formatting right

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80894a033..4a4090bb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,11 +128,11 @@ elseif(BUILD_MPS)
     string(APPEND BNB_OUTPUT_NAME "_mps")
     add_compile_definitions(BUILD_MPS)
     file(MAKE_DIRECTORY "build")
-    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
+    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
                 COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES}
                 COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib"
-                DEPENDS "${METAL_FILES}"
-                COMMENT "Compiling Metal kernels"
+                DEPENDS "${METAL_FILES}"
+                COMMENT "Compiling Metal kernels"
                 VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
 else()

From 2e3a1d81931d17756a23a11f0ee9b3010d2b35ba Mon Sep 17 00:00:00 2001
From: Rickard
Date: Sun, 4 Feb 2024 21:11:34 +0100
Subject: [PATCH 51/52] Add path filter

---
 .github/workflows/python-package.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4a34389af..8dfa5d7bd 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -4,6 +4,18 @@ on:
   push: {}
   pull_request:
     branches: [ main ]
+    paths:
+      - '.github/workflows/python-package.yml'
+      - 'bitsandbytes/**'
+      - 'csrc/**'
+      - 'include/**'
+      - 'tests/**'
+      - 'CMakeLists.txt'
+      - 'requirements*.txt'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'pytest.ini'
+      - '**/*.md'
   release:
     types: [ published ]

From 927f7167e3395ec26f859f294c1d4979a70a718a Mon Sep 17 00:00:00 2001
From: Rickard
Date: Mon, 5 Feb 2024 17:30:50 +0100
Subject: [PATCH 52/52] Formatting

---
 .github/workflows/python-package.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8dfa5d7bd..265128637 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -30,7 +30,7 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
         arch: [x86_64, aarch64]
         exclude:
-          - os: windows-latest # This probably requres arm64 Windows agents
+          - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
     runs-on: ${{ matrix.os }}  # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
@@ -90,7 +90,7 @@ jobs:
         arch: [x86_64, aarch64]
         cuda_version: ['12.1.0']
         exclude:
-          - os: windows-latest # This probably requres arm64 Windows agents
+          - os: windows-latest # This probably requires arm64 Windows agents
            arch: aarch64
     runs-on: ${{ matrix.os }}  # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
@@ -163,7 +163,7 @@ jobs:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
         arch: [x86_64, aarch64]
         exclude:
-          - os: windows-latest # This probably requres arm64 Windows agents
+          - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
     runs-on: ${{ matrix.os }}
     steps: