From 642a209c66fd2f82ae8265c58788a811bd05d8b6 Mon Sep 17 00:00:00 2001
From: Ruihang Lai
Date: Sun, 17 Nov 2024 23:13:06 -0500
Subject: [PATCH] [CUDA] JIT compilation for ApplyTokenBitmask kernel

This PR introduces JIT compilation for the ApplyTokenBitmask CUDA kernel.
The JIT compilation is enabled by the cuda-python package. With JIT
compilation, we can remove the AOT kernel compilation, which introduces an
extra dependency when building the package.
---
 CMakeLists.txt                                |   6 -
 cmake/config.cmake                            |   1 -
 cpp/kernels/CMakeLists.txt                    |  29 ---
 cpp/kernels/apply_token_mask_inplace.cu       | 110 ---------
 cpp/kernels/kernels.h                         |  20 --
 cpp/pybind/CMakeLists.txt                     |   9 -
 cpp/pybind/pybind.cc                          |   5 -
 cpp/pybind/python_methods.cc                  |  56 -----
 cpp/pybind/python_methods.h                   |   4 -
 .../xgrammar/{version.py => cuda/__init__.py} |   2 -
 .../xgrammar/cuda/apply_token_mask_inplace.py | 220 ++++++++++++++++++
 python/xgrammar/xgrammar.py                   |   5 +-
 scripts/build_xgrammar_lib_osx.sh             |   2 -
 scripts/build_xgrammar_lib_win.bat            |   2 -
 scripts/build_xgrammar_wheel_manylinux.sh     |   2 -
 15 files changed, 224 insertions(+), 249 deletions(-)
 delete mode 100644 cpp/kernels/CMakeLists.txt
 delete mode 100644 cpp/kernels/apply_token_mask_inplace.cu
 delete mode 100644 cpp/kernels/kernels.h
 rename python/xgrammar/{version.py => cuda/__init__.py} (97%)
 create mode 100644 python/xgrammar/cuda/apply_token_mask_inplace.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bda79b..52709d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,6 @@ endif()
 
 option(XGRAMMAR_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
 option(XGRAMMAR_BUILD_CXX_TESTS "Build C++ tests" ON)
-option(XGRAMMAR_BUILD_CUDA_KERNELS "Build CUDA kernels" ON)
 set(XGRAMMAR_CUDA_ARCHITECTURES
     native
     CACHE STRING "CUDA architectures"
@@ -37,7 +36,6 @@ endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Build Python bindings: ${XGRAMMAR_BUILD_PYTHON_BINDINGS}")
 message(STATUS "Build C++ tests: ${XGRAMMAR_BUILD_CXX_TESTS}")
-message(STATUS "Build CUDA kernels: ${XGRAMMAR_BUILD_CUDA_KERNELS}")
 message(STATUS "CUDA architectures: ${XGRAMMAR_CUDA_ARCHITECTURES}")
 
 if(MSVC)
@@ -63,10 +61,6 @@ list(FILTER XGRAMMAR_SOURCES_PATH EXCLUDE REGEX "${PROJECT_SOURCE_DIR}/cpp/pybin
 add_library(xgrammar STATIC ${XGRAMMAR_SOURCES_PATH})
 target_include_directories(xgrammar PUBLIC ${XGRAMMAR_INCLUDE_PATH})
 
-if(XGRAMMAR_BUILD_KERNELS)
-  add_subdirectory(${PROJECT_SOURCE_DIR}/cpp/kernels)
-endif()
-
 if(XGRAMMAR_BUILD_PYTHON_BINDINGS)
   add_subdirectory(${PROJECT_SOURCE_DIR}/cpp/pybind)
 endif()
diff --git a/cmake/config.cmake b/cmake/config.cmake
index bbd1713..2673c7d 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -1,7 +1,6 @@
 set(CMAKE_BUILD_TYPE RelWithDebInfo)
 set(XGRAMMAR_BUILD_PYTHON_BINDINGS ON)
 set(XGRAMMAR_BUILD_CXX_TESTS OFF)
-set(XGRAMMAR_BUILD_KERNELS ON)
 # set it to your own architecture
 set(XGRAMMAR_CUDA_ARCHITECTURES
     native
diff --git a/cpp/kernels/CMakeLists.txt b/cpp/kernels/CMakeLists.txt
deleted file mode 100644
index 3eddccc..0000000
--- a/cpp/kernels/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-enable_language(CUDA)
-
-include(CheckLanguage)
-check_language(CUDA)
-
-if(CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND")
-  message(
-    FATAL_ERROR "CUDA compiler not found but XGRAMMAR_BUILD_CUDA_KERNELS is ON. Please install "
-                "CUDA or set XGRAMMAR_BUILD_CUDA_KERNELS to OFF."
-  )
-endif()
-
-message(STATUS "Found cuda, building XGrammar kernels")
-
-if(DEFINED XGRAMMAR_CUDA_ARCHITECTURES)
-  message(STATUS "CUDA architectures are set to ${XGRAMMAR_CUDA_ARCHITECTURES}.")
-  set(CMAKE_CUDA_ARCHITECTURES ${XGRAMMAR_CUDA_ARCHITECTURES})
-else()
-  message(STATUS "CUDA architectures are set to default value ${CMAKE_CUDA_ARCHITECTURES}.")
-endif()
-
-file(GLOB_RECURSE XGRAMMAR_KERNELS_PATH ${PROJECT_SOURCE_DIR}/cpp/kernels/*.cu)
-add_library(xgrammar_kernels STATIC ${XGRAMMAR_KERNELS_PATH})
-
-# Expose CMAKE_CUDA_ARCHITECTURES to parent scope
-set(CMAKE_CUDA_ARCHITECTURES
-    ${CMAKE_CUDA_ARCHITECTURES}
-    PARENT_SCOPE
-)
diff --git a/cpp/kernels/apply_token_mask_inplace.cu b/cpp/kernels/apply_token_mask_inplace.cu
deleted file mode 100644
index 58b3850..0000000
--- a/cpp/kernels/apply_token_mask_inplace.cu
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include <cuda/std/limits>
-#include <sstream>
-
-#include "../support/logging.h"
-#include "kernels.h"
-
-#define XGRAMMAR_CUDA_CALL(...)                                                                    \
-  do {                                                                                             \
-    __VA_ARGS__;                                                                                   \
-    cudaError_t err = cudaGetLastError();                                                          \
-    XGRAMMAR_CHECK(err == cudaSuccess) << "CUDA Error: " << cudaGetErrorString(err) << " (" << err \
-                                       << ") " << __FILE__ << ": line " << __LINE__ << std::endl; \
-  } while (0)
-
-#define XGRAMMAR_DISPATCH_DTYPE(dtype_flag, c_type, ...)                                          \
-  do {                                                                                             \
-    switch (dtype_flag) {                                                                          \
-      case DTypeFlag::DTYPE_FLOAT16: {                                                             \
-        using c_type = half;                                                                       \
-        __VA_ARGS__;                                                                               \
-        break;                                                                                     \
-      }                                                                                            \
-      case DTypeFlag::DTYPE_FLOAT32: {                                                             \
-        using c_type = float;                                                                      \
-        __VA_ARGS__;                                                                               \
-        break;                                                                                     \
-      }                                                                                            \
-      case DTypeFlag::DTYPE_FLOAT64: {                                                             \
-        using c_type = double;                                                                     \
-        __VA_ARGS__;                                                                               \
-        break;                                                                                     \
-      }                                                                                            \
-      default:                                                                                     \
-        std::ostringstream oss;                                                                    \
-        oss << #__VA_ARGS__ << " failed to dispatch data type " << static_cast<int>(dtype_flag);   \
-        XGRAMMAR_LOG(FATAL) << oss.str();                                                          \
-        break;                                                                                     \
-    }                                                                                              \
-  } while (0)
-
-namespace xgrammar {
-
-#define BITS_PER_BLOCK 32
-#define GET_BIT(data_ptr, bit_idx) \
-  ((data_ptr[bit_idx / BITS_PER_BLOCK] >> (bit_idx % BITS_PER_BLOCK)) & 1)
-
-template <typename T>
-__device__ T GetNegativeInfinity() {
-  return -cuda::std::numeric_limits<T>::infinity();
-}
-
-template <>
-__device__ half GetNegativeInfinity<half>() {
-  return __float2half(-INFINITY);
-}
-
-template <typename T>
-__global__ void __launch_bounds__(512) ApplyTokenBitmaskInplaceKernel(
-    T* __restrict__ logits,
-    const int32_t* __restrict__ bitmask,
-    int vocab_size,
-    int bitmask_size,
-    int bitmask_row_size
-) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= bitmask_size) {
-    return;
-  }
-
-  int batch_id = gid / bitmask_row_size;
-  int bitmask_id = gid % bitmask_row_size;
-  int bitmask_val = bitmask[gid];
-  T* logits_ptr = logits + batch_id * vocab_size + bitmask_id * BITS_PER_BLOCK;
-  for (int i = 0; i < BITS_PER_BLOCK; ++i) {
-    if (bitmask_id * BITS_PER_BLOCK + i >= vocab_size) {
-      break;
-    }
-    if ((bitmask_val & 1) == 0) {
-      logits_ptr[i] = GetNegativeInfinity<T>();
-    }
-    bitmask_val >>= 1;
-  }
-}
-
-#define THREADS_PER_BLOCK 512
-
-void ApplyTokenBitmaskInplace(
-    void* logits, DTypeFlag dtype_flag, int32_t* bitmask, int batch_size, int vocab_size
-) {
-  int bitmask_size = (vocab_size + BITS_PER_BLOCK - 1) / BITS_PER_BLOCK;
-  int num_blocks = (batch_size * bitmask_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-  int num_threads = THREADS_PER_BLOCK;
-
-  XGRAMMAR_DISPATCH_DTYPE(dtype_flag, c_type, {
-    XGRAMMAR_CUDA_CALL({
-      ApplyTokenBitmaskInplaceKernel<<<num_blocks, num_threads>>>(
-          reinterpret_cast<c_type*>(logits),
-          bitmask,
-          vocab_size,
-          batch_size * bitmask_size,
-          bitmask_size
-      );
-    });
-  });
-}
-
-}  // namespace xgrammar
diff --git a/cpp/kernels/kernels.h b/cpp/kernels/kernels.h
deleted file mode 100644
index bd73c6a..0000000
--- a/cpp/kernels/kernels.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*!
- * Copyright (c) 2024 by Contributors
- * \file xgrammar/kernels/kernels.h
- * \brief The header for the support of grammar-guided generation.
- */
-
-#ifndef XGRAMMAR_KERNELS_KERNELS_H_
-#define XGRAMMAR_KERNELS_KERNELS_H_
-
-namespace xgrammar {
-
-enum class DTypeFlag : int { DTYPE_FLOAT16 = 0, DTYPE_FLOAT32 = 1, DTYPE_FLOAT64 = 2 };
-
-void ApplyTokenBitmaskInplace(
-    void* logits, DTypeFlag dtype_flag, int32_t* bitmask, int batch_size, int vocab_size
-);
-
-}  // namespace xgrammar
-
-#endif  // XGRAMMAR_KERNELS_KERNELS_H_
diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt
index 2d34ea5..cbd611c 100644
--- a/cpp/pybind/CMakeLists.txt
+++ b/cpp/pybind/CMakeLists.txt
@@ -39,18 +39,9 @@ find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib
 # -D_GLIBCXX_USE_CXX11_ABI=0. So we compile bindings separately.
 file(GLOB_RECURSE XGRAMMAR_BINDINGS_PATH ${PROJECT_SOURCE_DIR}/cpp/*.cc)
 
-if(XGRAMMAR_BUILD_KERNELS)
-  file(GLOB_RECURSE XGRAMMAR_KERNELS_PATH ${PROJECT_SOURCE_DIR}/cpp/kernels/*.cu)
-  list(APPEND XGRAMMAR_BINDINGS_PATH ${XGRAMMAR_KERNELS_PATH})
-endif()
-
 pybind11_add_module(xgrammar_bindings ${XGRAMMAR_BINDINGS_PATH})
 target_include_directories(xgrammar_bindings PUBLIC ${XGRAMMAR_INCLUDE_PATH})
 
-if(XGRAMMAR_BUILD_KERNELS)
-  target_compile_definitions(xgrammar_bindings PUBLIC -DXGRAMMAR_BUILD_KERNELS)
-endif()
-
 target_link_libraries(xgrammar_bindings PUBLIC ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
 set(LIB_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/python/xgrammar")
 set_target_properties(xgrammar_bindings PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIB_OUTPUT_DIRECTORY})
diff --git a/cpp/pybind/pybind.cc b/cpp/pybind/pybind.cc
index f45d391..6696ed6 100644
--- a/cpp/pybind/pybind.cc
+++ b/cpp/pybind/pybind.cc
@@ -74,9 +74,4 @@ PYBIND11_MODULE(xgrammar_bindings, m) {
       .def_property_readonly("vocab_size", &GrammarMatcher::GetVocabSize)
       .def_property_readonly("max_rollback_tokens", &GrammarMatcher::GetMaxRollbackTokens)
       .def_property_readonly("stop_token_ids", &GrammarMatcher::GetStopTokenIds);
-#ifdef XGRAMMAR_BUILD_KERNELS
-  pyGrammarMatcher.def_static(
-      "apply_token_bitmask_inplace", &GrammarMatcher_ApplyTokenBitmaskInplace
-  );
-#endif
 }
diff --git a/cpp/pybind/python_methods.cc b/cpp/pybind/python_methods.cc
index 432ea0f..6546a40 100644
--- a/cpp/pybind/python_methods.cc
+++ b/cpp/pybind/python_methods.cc
@@ -5,7 +5,6 @@
 
 #include "python_methods.h"
 
-#include
 #include
 #include
 
@@ -16,10 +15,6 @@
 #include "../support/dynamic_bitset.h"
 #include "../support/logging.h"
 
-#ifdef XGRAMMAR_BUILD_KERNELS
-#include "../kernels/kernels.h"
-#endif
-
 namespace xgrammar {
 
 // Parse the EBNF string but not normalize it
@@ -130,55 +125,4 @@ std::vector<int> GrammarMatcher_DebugGetMaskedTokensFromBitmask(
   return result;
 }
 
-#ifdef XGRAMMAR_BUILD_KERNELS
-void GrammarMatcher_ApplyTokenBitmaskInplace(torch::Tensor logits, torch::Tensor token_bitmask) {
-  auto logits_shape = logits.sizes();
-  int batch_size = 1;
-  int vocab_size;
-  if (logits_shape.size() == 1) {
-    vocab_size = logits_shape[0];
-  } else if (logits_shape.size() == 2) {
-    batch_size = logits_shape[0];
-    vocab_size = logits_shape[1];
-  } else {
-    XGRAMMAR_LOG(FATAL) << "logits tensor must be 1D or 2D";
-  }
-
-  auto bitmask_shape = token_bitmask.sizes();
-  int expected_bitmask_size = DynamicBitset::GetBufferSize(vocab_size);
-  if (bitmask_shape.size() == 1) {
-    XGRAMMAR_CHECK(bitmask_shape[0] == expected_bitmask_size)
-        << "The last dimension of the token bitmask tensor must be " << expected_bitmask_size
-        << ", but got " << bitmask_shape[0];
-  } else if (bitmask_shape.size() == 2) {
-    XGRAMMAR_CHECK(bitmask_shape[0] == batch_size)
-        << "The first dimension of the token bitmask tensor must be " << batch_size << ", but got "
-        << bitmask_shape[0];
-    XGRAMMAR_CHECK(bitmask_shape[1] == expected_bitmask_size)
-        << "The last dimension of the token bitmask tensor must be " << expected_bitmask_size
-        << ", but got " << bitmask_shape[1];
-  } else {
-    XGRAMMAR_LOG(FATAL) << "token_bitmask tensor must be 1D or 2D";
-  }
-
-  DTypeFlag dtype_flag;
-  if (logits.dtype() == torch::kFloat16) {
-    dtype_flag = DTypeFlag::DTYPE_FLOAT16;
-  } else if (logits.dtype() == torch::kFloat32) {
-    dtype_flag = DTypeFlag::DTYPE_FLOAT32;
-  } else if (logits.dtype() == torch::kFloat64) {
-    dtype_flag = DTypeFlag::DTYPE_FLOAT64;
-  } else {
-    XGRAMMAR_LOG(FATAL) << "logits tensor must be of type float16, float32, or float64";
-  }
-
-  XGRAMMAR_CHECK(token_bitmask.dtype() == torch::kInt32)
-      << "token bitmask tensor must be of type int32";
-
-  ApplyTokenBitmaskInplace(
-      logits.data_ptr(), dtype_flag, token_bitmask.data_ptr<int32_t>(), batch_size, vocab_size
-  );
-}
-#endif
-
 }  // namespace xgrammar
diff --git a/cpp/pybind/python_methods.h b/cpp/pybind/python_methods.h
index f65d75e..60f7ab3 100644
--- a/cpp/pybind/python_methods.h
+++ b/cpp/pybind/python_methods.h
@@ -40,10 +40,6 @@ std::vector<int> GrammarMatcher_DebugGetMaskedTokensFromBitmask(
     GrammarMatcher& matcher, torch::Tensor token_bitmask, int batch_id
 );
 
-#ifdef XGRAMMAR_BUILD_KERNELS
-void GrammarMatcher_ApplyTokenBitmaskInplace(torch::Tensor logits, torch::Tensor token_bitmask);
-#endif
-
 }  // namespace xgrammar
 
 #endif  // XGRAMMAR_PYBIND_PYTHON_METHODS_H_
diff --git a/python/xgrammar/version.py b/python/xgrammar/cuda/__init__.py
similarity index 97%
rename from python/xgrammar/version.py
rename to python/xgrammar/cuda/__init__.py
index bc3d15d..13a8339 100644
--- a/python/xgrammar/version.py
+++ b/python/xgrammar/cuda/__init__.py
@@ -14,5 +14,3 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-__version__ = "0.0.3"
diff --git a/python/xgrammar/cuda/apply_token_mask_inplace.py b/python/xgrammar/cuda/apply_token_mask_inplace.py
new file mode 100644
index 0000000..772b27b
--- /dev/null
+++ b/python/xgrammar/cuda/apply_token_mask_inplace.py
@@ -0,0 +1,220 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The CUDA kernel source code for in-place applying token mask.""" +import ctypes +import os + +import numpy as np +import torch + +try: + from cuda import cuda, cudart, nvrtc +except ImportError: + cuda = None + cudart = None + nvrtc = None + + +BITS_PER_BLOCK = 32 +THREADS_PER_BLOCK = 512 + +_apply_token_bitmask_inplace_kernel = """ +#include + +#include + +#define BITS_PER_BLOCK 32 + +extern "C" __global__ void __launch_bounds__(512) ApplyTokenBitmaskInplaceKernel( + float* __restrict__ logits, + const int32_t* __restrict__ bitmask, + int32_t vocab_size, + int64_t bitmask_size, + int32_t bitmask_row_size +) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= bitmask_size) { + return; + } + + int32_t batch_id = gid / bitmask_row_size; + int32_t bitmask_id = gid % bitmask_row_size; + int32_t bitmask_val = bitmask[gid]; + float* logits_ptr = logits + batch_id * vocab_size + bitmask_id * BITS_PER_BLOCK; + for (int i = 0; i < BITS_PER_BLOCK; ++i) { + if (bitmask_id * BITS_PER_BLOCK + i >= vocab_size) { + break; + } + if ((bitmask_val & 1) == 0) { + logits_ptr[i] = -cuda::std::numeric_limits::infinity(); + } + bitmask_val >>= 1; + } +} +""".strip() + + +# Adapted from https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples. +def _cudaGetErrorEnum(error): + if isinstance(error, cuda.CUresult): + err, name = cuda.cuGetErrorName(error) + return name if err == cuda.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, cudart.cudaError_t): + return cudart.cudaGetErrorName(error)[1] + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError("Unknown error type: {}".format(error)) + + +# Adapted from https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples. +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError( + "CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])) + ) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +# Adapted from https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples. 
+class KernelStore:
+    _module = None
+    _func = None
+
+    @classmethod
+    def compile(cls, device_id: int):
+        if cls._func is not None:
+            return cls._func
+
+        prog = checkCudaErrors(
+            nvrtc.nvrtcCreateProgram(
+                str.encode(_apply_token_bitmask_inplace_kernel), b"sourceCode.cu", 0, [], []
+            )
+        )
+        CUDA_HOME = os.getenv("CUDA_HOME")
+        if CUDA_HOME == None:
+            CUDA_HOME = os.getenv("CUDA_PATH")
+        if CUDA_HOME == None:
+            raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set")
+        include_dirs = os.path.join(CUDA_HOME, "include")
+
+        # Initialize CUDA
+        checkCudaErrors(cudart.cudaFree(0))
+
+        major = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(
+                cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device_id
+            )
+        )
+        minor = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(
+                cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device_id
+            )
+        )
+        prefix = "compute"
+        arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
+
+        try:
+            opts = [
+                b"--fmad=true",
+                arch_arg,
+                "--include-path={}".format(include_dirs).encode("UTF-8"),
+                b"--std=c++11",
+                b"-default-device",
+            ]
+            checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts))
+        except RuntimeError as err:
+            logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
+            log = b" " * logSize
+            checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
+            # NOTE: The prints below are intended to provide the kernel compilation error.
+            print(log.decode())
+            print(err)
+            raise RuntimeError("CUDA kernel compilation failure")
+
+        dataSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
+        data = b" " * dataSize
+        checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data))
+
+        # Store into `_module` and `_func`.
+        module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data)))
+        func = checkCudaErrors(cuda.cuModuleGetFunction(module, b"ApplyTokenBitmaskInplaceKernel"))
+        cls._module = module
+        cls._func = func
+        # Return the compiled kernel.
+        return func
+
+
+def apply_token_bitmask_inplace(logits: torch.Tensor, bitmask: torch.Tensor):
+    if cuda is None or cudart is None or nvrtc is None:
+        raise RuntimeError("cuda-python is not installed. Please install cuda-python first.")
+
+    # Check input tensor shapes.
+    if logits.ndim == 2:
+        batch_size, vocab_size = logits.shape
+    elif logits.ndim == 1:
+        batch_size = 1
+        (vocab_size,) = logits.shape
+    else:
+        raise ValueError(f"Invalid logits tensor shape {logits.shape}")
+    bitmask_size = (vocab_size + BITS_PER_BLOCK - 1) // BITS_PER_BLOCK
+
+    # Ensure that the tensors are contiguous in memory.
+    logits = logits.contiguous()
+    bitmask = bitmask.contiguous()
+
+    # Compile the kernel.
+    kernel = KernelStore.compile(logits.device.index)
+    # Setup kernel launching arguments.
+    grid_dims = (batch_size * bitmask_size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK, 1, 1
+    block_dims = THREADS_PER_BLOCK, 1, 1
+    shared_mem_bytes = 0
+    stream = cuda.CU_STREAM_LEGACY
+    extra = 0
+    kernelArgs = (
+        (
+            logits.data_ptr(),
+            bitmask.data_ptr(),
+            vocab_size,
+            batch_size * bitmask_size,
+            bitmask_size,
+        ),
+        (
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.c_int32,
+            ctypes.c_int64,
+            ctypes.c_int32,
+        ),
+    )
+    # Launch the kernel.
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            kernel,
+            *grid_dims,
+            *block_dims,
+            shared_mem_bytes,
+            stream,
+            kernelArgs,
+            extra,
+        )
+    )
diff --git a/python/xgrammar/xgrammar.py b/python/xgrammar/xgrammar.py
index 30b1f1e..6ea45d9 100644
--- a/python/xgrammar/xgrammar.py
+++ b/python/xgrammar/xgrammar.py
@@ -26,6 +26,9 @@ from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
 
 from . import xgrammar_bindings as _core
+from .cuda.apply_token_mask_inplace import (
+    apply_token_bitmask_inplace as apply_token_bitmask_inplace_cuda,
+)
 
 
 class XGObject:
@@ -792,7 +795,7 @@ def apply_token_bitmask_inplace(logits: torch.Tensor, bitmask: torch.Tensor):
         if bitmask.device != logits.device:
             bitmask = bitmask.to(logits.device)
 
-        _core.GrammarMatcher.apply_token_bitmask_inplace(logits, bitmask)
+        apply_token_bitmask_inplace_cuda(logits, bitmask)
 
     def debug_get_masked_tokens_from_bitmask(
         self, bitmask: torch.Tensor, batch_id: int = 0
diff --git a/scripts/build_xgrammar_lib_osx.sh b/scripts/build_xgrammar_lib_osx.sh
index 8b5ceb4..0602bc9 100755
--- a/scripts/build_xgrammar_lib_osx.sh
+++ b/scripts/build_xgrammar_lib_osx.sh
@@ -6,8 +6,6 @@ set -u
 
 # setup config.cmake
 rm -f config.cmake
 echo set\(XGRAMMAR_BUILD_PYTHON_BINDINGS ON\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_KERNELS OFF\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_CUDA_KERNELS OFF\) >>config.cmake
 echo set\(XGRAMMAR_BUILD_CXX_TESTS OFF\) >>config.cmake
 
diff --git a/scripts/build_xgrammar_lib_win.bat b/scripts/build_xgrammar_lib_win.bat
index 289ab7c..6cafeae 100644
--- a/scripts/build_xgrammar_lib_win.bat
+++ b/scripts/build_xgrammar_lib_win.bat
@@ -2,8 +2,6 @@ echo on
 
 del /f config.cmake
 echo set(XGRAMMAR_BUILD_PYTHON_BINDINGS ON) >> config.cmake
-echo set(XGRAMMAR_BUILD_KERNELS OFF) >> config.cmake
-echo set(XGRAMMAR_BUILD_CUDA_KERNELS OFF) >> config.cmake
 echo set(XGRAMMAR_BUILD_CXX_TESTS OFF) >> config.cmake
 
 rd /s /q build
diff --git a/scripts/build_xgrammar_wheel_manylinux.sh b/scripts/build_xgrammar_wheel_manylinux.sh
index eb95d93..55513f0 100755
--- a/scripts/build_xgrammar_wheel_manylinux.sh
+++ b/scripts/build_xgrammar_wheel_manylinux.sh
@@ -68,8 +68,6 @@ cd /workspace
 
 # setup config.cmake
 echo set\(XGRAMMAR_BUILD_PYTHON_BINDINGS ON\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_KERNELS OFF\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_CUDA_KERNELS OFF\) >>config.cmake
 echo set\(XGRAMMAR_BUILD_CXX_TESTS OFF\) >>config.cmake
 
 # compile the xgrammar
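
Note (illustrative sketch, not part of the patch): one way the new JIT path added by
python/xgrammar/cuda/apply_token_mask_inplace.py might be exercised end to end. It assumes a
CUDA-capable device, the cuda-python package installed, CUDA_HOME (or CUDA_PATH) pointing at a
CUDA toolkit so NVRTC can find its headers, and a wheel built from this branch; the shapes and
the masked token id below are arbitrary example values.

    import torch
    from xgrammar.cuda.apply_token_mask_inplace import apply_token_bitmask_inplace

    batch_size, vocab_size = 2, 128
    bits_per_block = 32
    bitmask_row_size = (vocab_size + bits_per_block - 1) // bits_per_block

    # float32 logits on the GPU; the JIT kernel in this patch is compiled for float only.
    logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda")

    # -1 as int32 has all bits set, which keeps every token; clearing bit 0 of the
    # first 32-token block masks token id 0 in each row.
    bitmask = torch.full((batch_size, bitmask_row_size), -1, dtype=torch.int32, device="cuda")
    bitmask[:, 0] &= ~1

    apply_token_bitmask_inplace(logits, bitmask)
    torch.cuda.synchronize()
    assert torch.isneginf(logits[:, 0]).all()  # the masked token's logit is now -inf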