From 73d3e7b61307a7a8c05a8bab1be7a54d4ebd0156 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Mon, 5 Feb 2024 19:25:09 +0100
Subject: [PATCH] Make native code portable and add GitHub workflow for
 building (#949)

* Make native code portable and add GitHub workflow for building
* Removed deprecated Python versions
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
  Co-authored-by: Aarni Koskela
* Update python-package.yml
* Do not test on Python 3.13 until released
* Update python-package.yml
* Update python-package.yml
* Update python-package.yml
* Update python-package.yml
* Refactor build stage
* Fixed breaking actions change
* Slim down Windows cuda
* Create dependabot.yml
* Bespoke local dev requirements.txt
* Enable VS integration
* Group Dependabot updates
* Cleanup
* Update python-package.yml
* Reinstate file that was wrongly merged
* Fixed regression caused by new version of download-artifact
* Update python-package.yml
* Update python-package.yml
* Fix matrix
* Update python-package.yml
* Merge
* Pipeline
* Fixed conflict
* Fixed conflict
* Update CMakeLists.txt
* Fixed merge error
* cleanup
* cleanup
* Find CUDA
* Fix
* Fixing merge error from latest merge from main
* Fix setup.py
* Fixed typo in artifact name
* Remove linker flags
* Build nocublaslt versions
* Fixed formatting
* Fixed VS Code format on save
* Ran format on save from VScode
* Re-saved the json files using the new settings
* Re-saved CMakeLists.txt to get formatting right
* Add path filter
* Formatting

---------

Co-authored-by: Aarni Koskela
---
 .github/dependabot.yml                        |  11 +
 .github/workflows/cmake.yml                   | 271 ------------------
 .github/workflows/python-package.yml          | 227 +++++++++++++++
 CMakeLists.txt                                | 121 ++++++--
 Makefile                                      | 141 ---------
 csrc/mps_kernels.metal                        | 117 ++++++++
 csrc/mps_ops.h                                |   0
 csrc/mps_ops.mm                               |  67 +++++
 ...{pythonInterface.c => pythonInterface.cpp} |   4 +
 include/Algo-Direct-Common.h                  |   2 +-
 include/Algo-Direct2.h                        |   2 +
 include/Portable.h                            |  33 ++-
 include/SIMD.h                                |  77 +++--
 include/Type.h                                |   2 +-
 pyproject.toml                                |   5 +-
 requirements-ci.txt                           |   7 +
 requirements-dev.txt                          |   9 +
 requirements.txt                              |   3 -
 setup.py                                      |  22 +-
 19 files changed, 629 insertions(+), 492 deletions(-)
 create mode 100644 .github/dependabot.yml
 delete mode 100644 .github/workflows/cmake.yml
 create mode 100644 .github/workflows/python-package.yml
 delete mode 100644 Makefile
 create mode 100644 csrc/mps_kernels.metal
 create mode 100644 csrc/mps_ops.h
 create mode 100644 csrc/mps_ops.mm
 rename csrc/{pythonInterface.c => pythonInterface.cpp} (99%)
 create mode 100644 requirements-ci.txt
 create mode 100644 requirements-dev.txt
 delete mode 100644 requirements.txt
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..8a36c3689
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+version: 2
+updates:
+  - package-ecosystem: pip
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    groups:
+      major:
+        update-types: [major]
+      minor-patch:
+        update-types: [minor, patch]
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
deleted file mode 100644
index 06f08eb9d..000000000
--- a/.github/workflows/cmake.yml
+++ /dev/null
@@ -1,271 +0,0 @@
-name: CMake on multiple platforms
-
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
-
-concurrency:
-  group: cmake-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  build-shared-libs:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
-      fail-fast: false
-
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        arch: [x86_64, aarch64]
-        build_type: [Release]
-        exclude:
-          - os: windows-latest
-            arch: aarch64
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Set up MSVC
-        if: matrix.os == 'windows-latest'
-        uses: ilammy/msvc-dev-cmd@v1.13.0
-        with:
-          arch: amd64
-
-      - name: Set reusable strings
-        # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
-        id: strings
-        shell: bash
-        run: |
-          echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-      - name: Prep build
-        run: python3 -m pip install cmake==3.27.9 ninja setuptools wheel
-
-      - name: Prep Compilers
-        shell: bash -el {0}
-        run: |
-          if [ "${{ matrix.os }}" = "windows-latest" ]; then
-            echo CXX_COMPILER=cl >> "$GITHUB_ENV"
-            echo C_COMPILER=cl >> "$GITHUB_ENV"
-          else
-            echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
-            echo C_COMPILER=gcc >> "$GITHUB_ENV"
-          fi
-
-
-      - name: Configure CPU
-        run: >
-          cmake -B ${{ steps.strings.outputs.build-output-dir }}
-          -G Ninja
-          -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-          -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          -DBUILD_CUDA=OFF
-          -S ${{ github.workspace }}
-
-      - name: Build CPU
-        run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-      - name: Copy libraries
-        shell: bash
-        run: |
-          mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
-          ( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} )
-
-
-      - name: Upload Build Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: shared_library-${{ matrix.os }}-${{ matrix.arch }}
-          path: output/*
-
-
-  build-shared-libs-cuda:
-    runs-on: ${{ matrix.os }}
-
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
-      fail-fast: false
-
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        cuda-version: ['11.8', '12.1']
-        arch: [x86_64, aarch64]
-        build_type: [Release]
-        exclude:
-          - os: windows-latest
-            arch: aarch64
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-
-      - name: Set up MSVC
-        if: matrix.os == 'windows-latest'
-        uses: ilammy/msvc-dev-cmd@v1.13.0
-        with:
-          arch: amd64
-
-      - name: Setup Mambaforge
-        uses: conda-incubator/setup-miniconda@v3.0.1
-        with:
-          miniforge-variant: Mambaforge
-          miniforge-version: latest
-          activate-environment: bnb-env
-          use-mamba: true
-
-      - uses: conda-incubator/setup-miniconda@v3.0.1
-        with:
-          auto-update-conda: true
-          activate-environment: bnb-env
-          environment-file: environment-bnb.yml
-          use-only-tar-bz2: false
-          auto-activate-base: true
-          python-version: "3.10"
-          mamba-version: "*"
-
-      - name: Set reusable strings
-        # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
-        id: strings
-        shell: bash
-        run: |
-          echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-      - name: CUDA Toolkit
-        shell: bash -el {0}
-        run: |
-          if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
-            # to prepare space
-            sudo rm -rf /usr/share/dotnet
-            sudo rm -rf /opt/ghc
-            sudo rm -rf /usr/local/share/boost
-          fi
-          addon=""
-          cuda_version=${{ matrix.cuda-version }}
-          [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
-          [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
-          [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
-          [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
-
-          conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime
-          conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
-
-          [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
-
-          CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
-          echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
-          echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
-
-          if [ "${{ matrix.os }}" = "windows-latest" ]; then
-            echo CXX_COMPILER=cl >> "$GITHUB_ENV"
-            echo C_COMPILER=cl >> "$GITHUB_ENV"
-            # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
-            echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
-          else
-            echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
-            echo C_COMPILER=gcc >> "$GITHUB_ENV"
-          fi
-
-          nvcc --version
-
-      - name: Update environment
-        run: mamba env update -n bnb-env -f environment-bnb.yml
-
-      - name: Prep build
-        run: python -m pip install cmake==3.27.9 ninja setuptools wheel
-
-      # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel
-
-      - name: Configure CUDA
-        run: >
-          cmake -B ${{ steps.strings.outputs.build-output-dir }}
-          -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-          -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-          -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-          -S ${{ github.workspace }}
-
-      - name: Build CUDA
-        run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-      - name: Configure NOBLASLT
-        run: >
-          cmake -B ${{ steps.strings.outputs.build-output-dir }}
-          -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-          -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-          -DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-          -DNO_CUBLASLT=ON
-          -S ${{ github.workspace }}
-
-      - name: Build NOBLASLT
-        run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
-
-      - name: Copy libraries
-        shell: bash
-        run: |
-          mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
-          ( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} )
-
-
-      - name: Upload Build Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: shared_library_cuda-${{ matrix.os }}-${{ matrix.cuda-version }}-${{ matrix.arch }}
-          path: output/*
-
-
-  build-wheels:
-    needs:
-      - build-shared-libs
-      - build-shared-libs-cuda
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        arch: [x86_64, aarch64]
-        exclude:
-          - os: windows-latest
-            arch: aarch64
-
-    steps:
-      # Check out code
-      - uses: actions/checkout@v4
-      # Download shared libraries
-      - name: Download build artifact
-        uses: actions/download-artifact@v4
-        with:
-          merge-multiple: true
-          path: output/
-      - name: Copy correct platform shared libraries
-        shell: bash
-        run: |
-          cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
-      # Set up the Python version needed
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-          cache: pip
-
-      - name: Install build package
-        shell: bash
-        run: pip install build
-      - name: Build wheel
-        shell: bash
-        run: python -m build . --wheel
-      - name: Upload Build Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: bdist_wheel-${{ matrix.os }}-${{ matrix.arch }}
-          path: |
-            ${{ github.workspace }}/dist/
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 000000000..265128637
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,227 @@
+name: Python package
+
+on:
+  push: {}
+  pull_request:
+    branches: [ main ]
+    paths:
+      - '.github/workflows/python-package.yml'
+      - 'bitsandbytes/**'
+      - 'csrc/**'
+      - 'include/**'
+      - 'tests/**'
+      - 'CMakeLists.txt'
+      - 'requirements*.txt'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'pytest.ini'
+      - '**/*.md'
+  release:
+    types: [ published ]
+
+jobs:
+
+  ##
+  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+  ##
+  build-shared-libs:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+      - uses: actions/checkout@v4
+      # On Linux we use CMake within Docker
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.14
+        with:
+          cmake-version: '3.26.x'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.1
+        if: ${{ startsWith(matrix.os, 'windows') }}
+      # Check out dependencies code
+      - uses: actions/checkout@v4
+        name: Check out NVidia cub
+        with:
+          repository: nvidia/cub
+          ref: 1.11.0
+          path: dependencies/cub
+      # Compile C++ code
+      - name: Build C++
+        shell: bash
+        run: |
+          set -ex
+          build_os=${{ matrix.os }}
+          build_arch=${{ matrix.arch }}
+          if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
+            # Allow cross-compile on aarch64
+            sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
+          fi
+          if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
+            cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
+          else
+            cmake -DCOMPUTE_BACKEND=cpu .
+          fi
+          if [ ${build_os:0:7} == windows ]; then
+            pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+          else
+            make
+          fi
+          mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+          ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
+          path: output/*
+          retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        cuda_version: ['12.1.0']
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+      - uses: actions/checkout@v4
+      # Linux: We use Docker to build cross-platform CUDA (aarch64 is built in emulation)
+      - name: Set up Docker multiarch
+        if: startsWith(matrix.os, 'ubuntu')
+        uses: docker/setup-qemu-action@v2
+      # On Linux we use CMake within Docker
+      - name: Setup cmake
+        if: ${{ !startsWith(matrix.os, 'linux') }}
+        uses: jwlawson/actions-setup-cmake@v1.14
+        with:
+          cmake-version: '3.26.x'
+      # Windows: We install Cuda on the agent (slow)
+      - uses: Jimver/cuda-toolkit@v0.2.14
+        if: startsWith(matrix.os, 'windows')
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda_version }}
+          method: 'local'
+          # sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]'
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1.1
+        if: ${{ startsWith(matrix.os, 'windows') }}
+      # Check out dependencies code
+      - uses: actions/checkout@v4
+        name: Check out NVidia cub
+        with:
+          repository: nvidia/cub
+          ref: 1.11.0
+          path: dependencies/cub
+      # Compile C++ code
+      - name: Build C++
+        shell: bash
+        run: |
+          set -ex
+          build_os=${{ matrix.os }}
+          build_arch=${{ matrix.arch }}
+          for NO_CUBLASLT in ON OFF; do
+            if [ ${build_os:0:6} == ubuntu ]; then
+              image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
+              echo "Using image $image"
+              docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
+                "apt-get update \
+                && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+                && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
+                && make"
+            else
+              cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} .
+              pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+            fi
+          done
+          mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+          ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
+          path: output/*
+          retention-days: 7
+  build-wheels:
+    needs:
+      - build-shared-libs
+      - build-shared-libs-cuda
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        arch: [x86_64, aarch64]
+        exclude:
+          - os: windows-latest # This probably requires arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }}
+    steps:
+      # Check out code
+      - uses: actions/checkout@v4
+      # Download shared libraries
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          merge-multiple: true
+          pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
+          path: output/
+      - name: Copy correct platform shared library
+        shell: bash
+        run: |
+          ls -lR output/
+          cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
+      # Set up the Python version needed
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: Install build package
+        shell: bash
+        run: pip install build
+      - name: Install Python test dependencies
+        shell: bash
+        run: pip install -r requirements-ci.txt
+      # TODO: How to run CUDA tests on GitHub actions?
+      #- name: Run unit tests
+      #  if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents
+      #  run: |
+      #    PYTHONPATH=. pytest --log-cli-level=DEBUG tests
+      - name: Build wheel
+        shell: bash
+        run: python -m build .
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.python-version }}
+          path: dist/bitsandbytes-*.whl
+          retention-days: 7
+  publish:
+    needs: build-wheels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          path: dist/
+          merge-multiple: true
+          pattern: "bdist_wheel_*"
+      - run: |
+          ls -lR dist/
+      - name: Publish to PyPi
+        if: startsWith(github.ref, 'refs/tags')
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.pypi }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 140753af4..4a4090bb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,8 @@
 # Ensure the CUDA Toolkit is available on your path. Then run:
 # For GCC: `cmake -B build . && cmake --build build`
 # For MSVC: `cmake -B build . && cmake --build build --config Release`
-# You can also use the following options
-# - BUILD_CUDA: Default ON, will build with CUDA
+# You can also use the following options and variables
+# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend
 # - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
 # - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
 #   is whatever CMake finds on your path.
@@ -11,25 +11,53 @@
 #   Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90`
 # Check your compute capability here: https://developer.nvidia.com/cuda-gpus
 # - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.22.1)
 
-project(bitsandbytes LANGUAGES C CXX)
+project(bitsandbytes LANGUAGES CXX)
 
-option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON)
-option(NO_CUBLASLT "Disable CUBLAS" OFF)
-option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
-
-set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c)
-list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+# Define included source files
+set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
+set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+set(MPS_FILES csrc/mps_ops.mm)
+set(METAL_FILES csrc/mps_kernels.metal)
+# C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
-message(STATUS "BUILD_CUDA := ${BUILD_CUDA}")
-message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
+option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
+
+if(APPLE)
+    set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
+endif()
 
 set(BNB_OUTPUT_NAME "bitsandbytes")
 
+message(STATUS "Building with backend ${COMPUTE_BACKEND}")
+
+if(${COMPUTE_BACKEND} STREQUAL "cuda")
+    if(APPLE)
+        message(FATAL_ERROR "CUDA is not supported on macOS" )
+    endif()
+    option(NO_CUBLASLT "Disable CUBLAS" OFF)
+    set(BUILD_CUDA ON)
+    set(BUILD_MPS OFF)
+    message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+elseif(${COMPUTE_BACKEND} STREQUAL "mps")
+    if(NOT APPLE)
+        message(FATAL_ERROR "MPS is only supported on macOS" )
+    endif()
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS ON)
+else()
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS OFF)
+endif()
+
+
 if(BUILD_CUDA)
     enable_language(CUDA) # This will fail if CUDA is not found
+    find_package(CUDAToolkit REQUIRED)
 
     # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
     string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
@@ -87,28 +115,56 @@ if(BUILD_CUDA)
     if(NO_CUBLASLT)
         string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
     endif()
-else()
-    message(STATUS "Building CPU Only")
-    string(APPEND BNB_OUTPUT_NAME "_cpu")
-    if(NO_CUBLASLT)
-        message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
+    add_compile_definitions(BUILD_CUDA)
+elseif(BUILD_MPS)
+    if(NOT APPLE)
+        message(FATAL_ERROR "MPS is only supported on macOS" )
     endif()
+
+    enable_language(OBJCXX)
+
+    list(APPEND SRC_FILES ${MPS_FILES})
+
+    string(APPEND BNB_OUTPUT_NAME "_mps")
+    add_compile_definitions(BUILD_MPS)
+    file(MAKE_DIRECTORY "build")
+    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
+        COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES}
+        COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib"
+        DEPENDS "${METAL_FILES}"
+        COMMENT "Compiling Metal kernels"
+        VERBATIM)
+    add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+else()
+    set(LIBSUFFIX "cpu")
+    set(GPU_SOURCES)
+endif()
+
+
+if(WIN32)
+    # Export all symbols
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
+# Weird MSVC hacks
+if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()
 
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
-include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-target_include_directories(bitsandbytes PUBLIC csrc include)
 target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+target_include_directories(bitsandbytes PUBLIC csrc include)
 
 if(BUILD_CUDA)
-    target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA)
-    target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse)
+    target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)
     if(NO_CUBLASLT)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()
-        target_link_libraries(bitsandbytes PUBLIC cublasLt)
+        target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
    endif()
 
     set_target_properties(bitsandbytes
@@ -116,17 +172,20 @@ if(BUILD_CUDA)
             CUDA_SEPARABLE_COMPILATION ON
     )
 endif()
+if(BUILD_MPS)
+    add_dependencies(bitsandbytes metallib)
+    target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
+endif()
 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
 endif()
+set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
+if(MSVC)
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
+endif()
 
-set_target_properties(bitsandbytes
-    PROPERTIES
-        OUTPUT_NAME ${BNB_OUTPUT_NAME}
-        # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made
-        RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
-        LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
-        POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers
-        WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports
-)
+set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes)
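For readers tracing the build logic: COMPUTE_BACKEND and NO_CUBLASLT feed BNB_OUTPUT_NAME, which determines the file the Python side must later load. A minimal sketch of that mapping in Python — the `_cuda<XY>` suffix for the CUDA backend comes from the unchanged version-detection block above and is an assumption here, as is the helper's name:

```python
import platform

def native_library_name(backend: str, cuda_version: str = "", nocublaslt: bool = False) -> str:
    """Sketch of how the CMake BNB_OUTPUT_NAME logic translates to a filename."""
    name = "bitsandbytes"
    if backend == "cuda":
        name += f"_cuda{cuda_version}"  # e.g. "121"; appended by the unchanged CUDA branch
        if nocublaslt:
            name += "_nocublaslt"
    elif backend == "mps":
        name += "_mps"
    ext = {"Linux": ".so", "Darwin": ".dylib", "Windows": ".dll"}[platform.system()]
    # CMake prefixes "lib" by default on Linux/macOS; the diff forces PREFIX "lib" on Windows too.
    return "lib" + name + ext

print(native_library_name("cuda", "121"))  # libbitsandbytes_cuda121.so on Linux
```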
It will have no effect.") + add_compile_definitions(BUILD_CUDA) +elseif(BUILD_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) endif() + + enable_language(OBJCXX) + + list(APPEND SRC_FILES ${MPS_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_mps") + add_compile_definitions(BUILD_MPS) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_FILES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") +else() + set(LIBSUFFIX "cpu") + set(GPU_SOURCES) +endif() + + +if(WIN32) + # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast") endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) add_library(bitsandbytes SHARED ${SRC_FILES}) -include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -target_include_directories(bitsandbytes PUBLIC csrc include) target_compile_features(bitsandbytes PUBLIC cxx_std_14) +target_include_directories(bitsandbytes PUBLIC csrc include) if(BUILD_CUDA) - target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA) - target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse) + target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse) if(NO_CUBLASLT) target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT) else() - target_link_libraries(bitsandbytes PUBLIC cublasLt) + target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt) endif() set_target_properties(bitsandbytes @@ -116,17 +172,20 @@ if(BUILD_CUDA) CUDA_SEPARABLE_COMPILATION ON ) endif() +if(BUILD_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() if(WIN32) set_target_properties(bitsandbytes PROPERTIES PREFIX "lib") endif() +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME}) +if(MSVC) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() -set_target_properties(bitsandbytes - PROPERTIES - OUTPUT_NAME ${BNB_OUTPUT_NAME} - # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made - RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>" - LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>" - POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers - WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports -) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile deleted file mode 100644 index e16d24624..000000000 --- a/Makefile +++ /dev/null @@ -1,141 +0,0 @@ -MKFILE_PATH := $(abspath $(lastword 
-ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
-
-GPP:= /usr/bin/g++
-#GPP:= /sw/gcc/11.2.0/bin/g++
-ifeq ($(CUDA_HOME),)
-	CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
-endif
-
-ifndef CUDA_VERSION
-ifneq ($(MAKECMDGOALS),clean)
-$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
-CUDA_VERSION:=
-endif
-endif
-
-
-
-NVCC := $(CUDA_HOME)/bin/nvcc
-
-###########################################
-
-CSRC := $(ROOT_DIR)/csrc
-BUILD_DIR:= $(ROOT_DIR)/build
-
-FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
-FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
-
-INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
-LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
-
-# NVIDIA NVCC compilation flags
-COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
-COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
-COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
-COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
-COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-
-CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
-CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler
-
-# Later versions of CUDA support the new architectures
-CC_CUDA11x := -gencode arch=compute_75,code=sm_75
-CC_CUDA11x += -gencode arch=compute_80,code=sm_80
-CC_CUDA11x += -gencode arch=compute_86,code=sm_86
-
-
-CC_cublasLt110 := -gencode arch=compute_75,code=sm_75
-CC_cublasLt110 += -gencode arch=compute_80,code=sm_80
-
-CC_cublasLt111 := -gencode arch=compute_75,code=sm_75
-CC_cublasLt111 += -gencode arch=compute_80,code=sm_80
-CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
-
-CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
-CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
-
-
-all: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda110_nomatmul_kepler: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda11x_nomatmul_kepler: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-
-cuda110_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda11x_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda118_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda12x_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda110: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda11x: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda118: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda12x: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cpuonly: $(BUILD_DIR) env
-	$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so
-
-env:
-	@echo "ENVIRONMENT"
-	@echo "============================"
-	@echo "CUDA_VERSION: $(CUDA_VERSION)"
-	@echo "============================"
-	@echo "NVCC path: $(NVCC)"
-	@echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`"
-	@echo "CUDA_HOME: $(CUDA_HOME)"
-	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
-	@echo "PATH: $(PATH)"
-	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
-	@echo "============================"
-
-$(BUILD_DIR):
-	mkdir -p build
-	mkdir -p dependencies
-
-$(ROOT_DIR)/dependencies/cub:
-	git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub
-	cd dependencies/cub; git checkout 1.11.0
-
-clean:
-	rm -rf build/* *.egg*
-	rm -f bitsandbytes/libbitsandbytes*.so
diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal
new file mode 100644
index 000000000..63b3bf78c
--- /dev/null
+++ b/csrc/mps_kernels.metal
@@ -0,0 +1,117 @@
+#include <metal_stdlib>
+using namespace metal;
+
+#define HLF_MAX 65504
+#define TH 1024
+#define NUM 4
+#define NUM_BLOCK 4096
+
+template <bool STOCHASTIC>
+static unsigned char quantize_scalar(
+    float rand,
+    device float* code,
+    float x)
+{
+    int pivot = 127;
+    int upper_pivot = 255;
+    int lower_pivot = 0;
+
+    float lower = -1.0f;
+    float upper = 1.0f;
+
+    float val = code[pivot];
+    // i>>=1 = {32, 16, 8, 4, 2, 1}
+    for(int i = 64; i > 0; i>>=1)
+    {
+        if(x > val)
+        {
+            lower_pivot = pivot;
+            lower = val;
+            pivot+=i;
+        }
+        else
+        {
+            upper_pivot = pivot;
+            upper = val;
+            pivot-=i;
+        }
+        val = code[pivot];
+    }
+
+    if(upper_pivot == 255)
+        upper = code[upper_pivot];
+    if(lower_pivot == 0)
+        lower = code[lower_pivot];
+
+    if(!STOCHASTIC)
+    {
+        if(x > val)
+        {
+            float midpoint = (upper+val)*0.5f;
+            if(x > midpoint)
+            {
+                return upper_pivot;
+            }
+            else
+                return pivot;
+        }
+        else
+        {
+            float midpoint = (lower+val)*0.5f;
+            if(x < midpoint)
+                return lower_pivot;
+            else
+                return pivot;
+        }
+    }
+    else
+    {
+        if(x > val)
+        {
+            float dist_to_upper = fabs(upper-x);
+            float dist_full = upper-val;
+            if(rand >= dist_to_upper/dist_full) return upper_pivot;
+            else return pivot;
+        }
+        else
+        {
+            float dist_to_lower = fabs(lower-x);
+            float dist_full = val-lower;
+            if(rand >= dist_to_lower/dist_full) return lower_pivot;
+            else return pivot;
+        }
+    }
+}
+
+kernel void quantize(device float* code [[buffer(0)]],
+                     device float* A [[buffer(1)]],
+                     device uchar* out [[buffer(2)]],
+                     constant uint& n [[buffer(3)]],
+                     uint id [[thread_position_in_grid]]) {
+    const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
+    uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;
+    const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);
+
+    float vals[NUM];
+    uchar qvals[NUM];
+
+    for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {
+        valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        for (uint j = 0; j < valid_items; j++) {
+            vals[j] = A[i + j];
+        }
+
+        for (uint j = 0; j < valid_items; j++) {
+            qvals[j] = quantize_scalar<false>(0.0f, code, vals[j]);
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        for (uint j = 0; j < valid_items; j++) {
+            out[i + j] = qvals[j];
+        }
+    }
+}
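The quantize kernel above ports bitsandbytes' existing 8-bit code-table lookup to Metal. As a plain-language reference, this is the same non-stochastic nearest-code search written out in Python (an illustrative sketch, not part of the patch; `code` is the 256-entry quantization map, assumed sorted ascending):

```python
def quantize_scalar(code, x):
    """Mirror of the kernel's binary search over the code table."""
    pivot, lower_pivot, upper_pivot = 127, 0, 255
    lower, upper = -1.0, 1.0
    val = code[pivot]
    i = 64
    while i > 0:  # pivot steps: 64, 32, 16, 8, 4, 2, 1
        if x > val:
            lower_pivot, lower = pivot, val
            pivot += i
        else:
            upper_pivot, upper = pivot, val
            pivot -= i
        val = code[pivot]
        i >>= 1
    if upper_pivot == 255:
        upper = code[255]
    if lower_pivot == 0:
        lower = code[0]
    # Snap to whichever neighbour of the final pivot is closer to x.
    if x > val:
        return upper_pivot if x > (upper + val) / 2 else pivot
    return lower_pivot if x < (lower + val) / 2 else pivot
```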
diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h
new file mode 100644
index 000000000..e69de29bb
diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm
new file mode 100644
index 000000000..d198b3552
--- /dev/null
+++ b/csrc/mps_ops.mm
@@ -0,0 +1,67 @@
+#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
+
+#define HLF_MAX 65504
+#define TH 1024
+#define NUM 4
+#define NUM_BLOCK 4096
+
+static inline MPSGraph* get_graph()
+{
+    static MPSGraph* cur = nil;
+    if(!cur) {
+        cur = [[MPSGraph alloc] init];
+    }
+    return cur;
+}
+
+static inline id<MTLDevice> get_device()
+{
+    NSError *error = nil;
+    static id<MTLDevice> device = nil;
+    if(!device) {
+        device = MTLCreateSystemDefaultDevice();
+    }
+    if(!device) {
+        NSLog(@"Failed to get MPS device");
+        abort();
+    }
+    return device;
+}
+
+static inline id<MTLLibrary> get_library()
+{
+    NSError *error = nil;
+    static id<MTLLibrary> library = nil;
+    if(!library) {
+        library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error];
+    }
+    if(!library) {
+        NSLog(@"Failed to load bitsandbytes.metallib");
+        abort();
+    }
+    return library;
+}
+
+/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n)
+{
+    id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"];
+    return out;
+}*/
+
+
+// MPSGraph function for quantize
+extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n)
+{
+    id<MTLDevice> device = get_device();
+    id<MTLLibrary> library = get_library();
+    static id<MTLFunction> kernel = nil;
+    if(!kernel) {
+        kernel = [library newFunctionWithName:@"quantize"];
+        if(!kernel) {
+            NSLog(@"Failed to load bitsandbytes.metallib");
+            abort();
+        }
+    }
+    NSLog(@"Not implemented");
+    return nil;
+}
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp
similarity index 99%
rename from csrc/pythonInterface.c
rename to csrc/pythonInterface.cpp
index 087ae3921..ea2283504 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.cpp
@@ -6,6 +6,9 @@
 #if BUILD_CUDA
 #include <ops.cuh>
 #endif
+#if BUILD_MPS
+// #include <mps_ops.h>
+#endif
 #include <cpu_ops.h>
 
 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
@@ -412,6 +415,7 @@ extern "C"
     {
         gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize);
     }
 #endif
+
     void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
     void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
 }
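Because the renamed pythonInterface.cpp keeps a flat extern "C" surface, ctypes callers are unaffected by the C-to-C++ switch. A hedged sketch of exercising the CPU entry point above from Python — the signature is taken verbatim from the diff, but the library filename and path are assumptions for a Linux CPU build:

```python
import ctypes
import numpy as np

# Assumption: a CPU-only Linux build, so the library is bitsandbytes/libbitsandbytes.so.
lib = ctypes.CDLL("./bitsandbytes/libbitsandbytes.so")

n, blocksize = 8192, 4096
code = np.linspace(-1.0, 1.0, 256, dtype=np.float32)  # stand-in quantization map
A = np.random.randn(n).astype(np.float32)
absmax = np.zeros(n // blocksize, dtype=np.float32)   # one scale per block
out = np.zeros(n, dtype=np.uint8)

fptr = lambda a: a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
lib.cquantize_blockwise_cpu_fp32(
    fptr(code), fptr(A), fptr(absmax),
    out.ctypes.data_as(ctypes.POINTER(ctypes.c_ubyte)),
    ctypes.c_longlong(blocksize), ctypes.c_longlong(n),
)
```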
diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h
index c97084904..7b40edea9 100644
--- a/include/Algo-Direct-Common.h
+++ b/include/Algo-Direct-Common.h
@@ -190,7 +190,7 @@ struct DirectInfo
             xi = xws;
         }
         else {
-            myassert(Gap==1, "if Gap>1 then X workspace must be provided");
+            myassert((Gap==1), "if Gap>1 then X workspace must be provided");
             xi = x;
         }
diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index 4211c77bd..547ca9955 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -52,6 +52,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 private:
     typedef AlgoScalarBase<T, A> base_t;
 
+#ifdef USE_SSE2
     FORCE_INLINE
     //NO_INLINE
     void resolve(const FVec<SSE,float>& vz, const IVec<SSE,float>& bidx, uint32 *pr) const
@@ -135,6 +136,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         pr[0] = u.ui32[0];
         pr[1] = u.ui32[2];
     }
+#endif // USE_SSE2
 
 #ifdef USE_AVX
diff --git a/include/Portable.h b/include/Portable.h
index 2cec1e7de..090a25065 100644
--- a/include/Portable.h
+++ b/include/Portable.h
@@ -4,10 +4,40 @@
 #include <limits>
 #include <algorithm>
 
+#if defined(__aarch64__)
+#ifdef __CUDACC__
+#undef USE_NEON // Doesn't work with nvcc, undefined symbols
+#else
+#include <arm_neon.h>
+#undef USE_NEON // Not yet implemented
+#endif
+#undef USE_AVX // x86_64 only
+#undef USE_AVX2 // x86_64 only
+#undef USE_SSE2 // x86_64 only
+#undef USE_SSE41 // x86_64 only
+#undef USE_SSE42 // x86_64 only
+#undef USE_FMA // x86_64 only
+#ifdef USE_NEON
+typedef float32x4_t __m128;
+typedef int32x4_t __m128i;
+typedef float64x2_t __m128d;
+#else
+typedef struct {float a; float b; float c; float d;} __m128;
+typedef struct {int a; int b; int c; int d;} __m128i;
+typedef struct {double a; double b;} __m128d;
+#endif
+#else
+#undef USE_NEON // ARM64 only
 #ifdef __FMA__
 #define USE_FMA
 #endif
 
+#if !defined(__SSE2__) && !defined(_MSC_VER)
+#error Compiler must support SSE2
+#endif
+#define USE_SSE2
+
+#if defined(__aarch64__)
+#else
 #ifdef __AVX2__
 #define USE_AVX2
 #endif
@@ -24,7 +54,8 @@
 #ifdef __SSE4_2__
 #define USE_SSE42
 #endif
-
+#endif
+#endif
 
 #ifndef _MSC_VER
 #include <stdint.h>
diff --git a/include/SIMD.h b/include/SIMD.h
index a2639d3ac..9d1410c73 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -2,6 +2,46 @@
 
 #include "Portable.h"
 
+#ifdef USE_SSE2
+#include <emmintrin.h>
+#if defined(USE_AVX) || defined(USE_AVX2)
+#include <immintrin.h>
+#else
+#ifdef USE_SSE41
+#include <smmintrin.h>
+#endif
+#endif
+#endif
+
+namespace BinSearch {
+namespace Details {
+
+template <InstrSet I, class T>
+struct FTOITraits{};
+
+template <InstrSet I, class T>
+struct FVec;
+
+template <InstrSet I, class T>
+struct IVec;
+
+template <InstrSet I, class T>
+struct FVec1;
+
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef __m128 vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef __m128d vec_t;
+};
+
+}
+}
+
+#if !defined(__aarch64__)
 #ifdef USE_SSE42
 #ifndef _MSC_VER
 #include <popcntintrin.h>
@@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32)
 } // namespace
 #endif
 
-#if defined(USE_AVX) || defined(USE_AVX2)
-#include <immintrin.h>
-#else
-#include <emmintrin.h>
-#ifdef USE_SSE41
-#include <smmintrin.h>
-#endif
-#endif
-
 #include "Type.h"
 
 namespace BinSearch {
 namespace Details {
 
-template <InstrSet I, class T>
-struct FVec;
-
-template <InstrSet I, class T>
-struct IVec;
-
-template <InstrSet I, class T>
-struct FVec1;
-
 template <> struct InstrIntTraits<SSE>
 {
     typedef __m128i vec_t;
 };
@@ -64,18 +86,8 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
-template <> struct InstrFloatTraits<Scalar, float>
-{
-    typedef float vec_t;
-};
-
-template <> struct InstrFloatTraits<Scalar, double>
-{
-    typedef double vec_t;
-};
-
-template <InstrSet I>
-struct FTOITraits
+template <>
+struct FTOITraits<SSE, float>
 {
     typedef IVec<SSE, float> vec_t;
 };
@@ -295,9 +307,11 @@ FORCE_INLINE FVec<SSE,float> operator- (const FVec<SSE,float>& a, const FVec<SSE,float>& b)
 FORCE_INLINE FVec<SSE,float> operator* (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_mul_ps( a, b ); }
 FORCE_INLINE FVec<SSE,float> operator/ (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_div_ps( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,float>& a) { return _mm_cvttps_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,float> operator<= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator>= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator< (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE,float> mulSub(const FVec<SSE,float>& a, const FVec<SSE,float>& b, const FVec<SSE,float>& c) { return _mm_fmsub_ps(a, b, c); }
 #endif
@@ -349,9 +363,11 @@ FORCE_INLINE FVec<SSE,double> operator- (const FVec<SSE,double>& a, const FVec<SSE,double>& b)
 FORCE_INLINE FVec<SSE,double> operator* (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_mul_pd( a, b ); }
 FORCE_INLINE FVec<SSE,double> operator/ (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_div_pd( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,double>& a) { return _mm_cvttpd_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,double> operator<= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); }
 FORCE_INLINE IVec<SSE,double> operator< (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); }
 FORCE_INLINE IVec<SSE,double> operator>= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE,double> mulSub(const FVec<SSE,double>& a, const FVec<SSE,double>& b, const FVec<SSE,double>& c ) { return _mm_fmsub_pd(a, b, c); }
 #endif
@@ -570,3 +586,4 @@ FORCE_INLINE FVec<AVX,double> mulSub(const FVec<AVX,double>& a, const FVec<AVX,double>& b, const FVec<AVX,double>& c)
 #endif
 } // namespace Details
 } // namespace BinSearch
+#endif // !defined(__aarch64__)
diff --git a/pyproject.toml b/pyproject.toml
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,5 @@
 [build-system]
-requires = [
-    "setuptools>=42",
-    "wheel"
-]
+requires = [ "setuptools", "wheel" ]
 build-backend = "setuptools.build_meta"
 
 [tool.ruff]
diff --git a/requirements-ci.txt b/requirements-ci.txt
new file mode 100644
index 000000000..46bd5b9cd
--- /dev/null
+++ b/requirements-ci.txt
@@ -0,0 +1,7 @@
+# Requirements used for GitHub actions
+pytest==7.2.2
+einops==0.6.0
+wheel==0.40.0
+lion-pytorch==0.0.6
+scipy==1.11.4
+pandas==2.2.0
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 000000000..7ede5b061
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,9 @@
+# Requirements used for local development
+setuptools>=63
+pytest~=7.2.2
+einops~=0.6.0
+wheel~=0.40.0
+lion-pytorch~=0.0.6
+scipy~=1.11.4
+pandas~=2.2.0
+matplotlib~=3.8.2
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 3bde2dc6a..000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-lion-pytorch
-pytest
-scipy
diff --git a/setup.py b/setup.py
index c493b8b62..13af2a39b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,10 +5,10 @@
 import glob
 import os
 
-from setuptools import Extension, find_packages, setup
+from setuptools import find_packages, setup
+from setuptools.dist import Distribution
 
-libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
-libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll"))
+libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*"))
 libs = [os.path.basename(p) for p in libs]
 print("libs:", libs)
 
 
@@ -17,6 +17,12 @@ def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
 
+# Tested with wheel v0.29.0
+class BinaryDistribution(Distribution):
+    def has_ext_modules(self):
+        return True
+
+
 setup(
     name="bitsandbytes",
     version="0.43.0.dev0",
@@ -28,18 +34,16 @@ def read(fname):
     url="https://github.com/TimDettmers/bitsandbytes",
     packages=find_packages(),
     package_data={"": libs},
-    install_requires=['torch', 'numpy'],
+    install_requires=["torch", "numpy"],
     extras_require={
-        'benchmark': ['pandas', 'matplotlib'],
-        'test': ['scipy'],
+        "benchmark": ["pandas", "matplotlib"],
+        "test": ["scipy"],
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
-    # HACK: pretend we have a native extension module so the wheel is tagged
-    # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
-    ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
+    distclass=BinaryDistribution,
 )
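A closing note on the setup.py change: the old code registered an empty Extension purely so the wheel would carry a platform tag; overriding Distribution.has_ext_modules() achieves the same tag without declaring a fake native module. A minimal self-contained sketch of the pattern (hypothetical package name; the compiled library is assumed to ship via package_data, exactly as in the diff):

```python
from setuptools import setup
from setuptools.dist import Distribution

class BinaryDistribution(Distribution):
    def has_ext_modules(self):
        # bdist_wheel consults this when deciding whether the wheel is
        # "pure"; returning True forces a platform tag such as
        # linux_x86_64 even with no Extension objects registered.
        return True

setup(name="example-binary-pkg", version="0.0.1", distclass=BinaryDistribution)
```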