From 642a209c66fd2f82ae8265c58788a811bd05d8b6 Mon Sep 17 00:00:00 2001
From: Ruihang Lai
Date: Sun, 17 Nov 2024 23:13:06 -0500
Subject: [PATCH] [CUDA] JIT compilation for ApplyTokenBitmask kernel

This PR introduces JIT compilation for the ApplyTokenBitmask CUDA kernel.
The JIT compilation is enabled by the cuda-python package. With JIT
compilation, we can remove the AOT kernel compilation, which introduces an
extra dependency when building the package.
---
 CMakeLists.txt                                |   6 -
 cmake/config.cmake                            |   1 -
 cpp/kernels/CMakeLists.txt                    |  29 ---
 cpp/kernels/apply_token_mask_inplace.cu       | 110 ---------
 cpp/kernels/kernels.h                         |  20 --
 cpp/pybind/CMakeLists.txt                     |   9 -
 cpp/pybind/pybind.cc                          |   5 -
 cpp/pybind/python_methods.cc                  |  56 -----
 cpp/pybind/python_methods.h                   |   4 -
 .../xgrammar/{version.py => cuda/__init__.py} |   2 -
 .../xgrammar/cuda/apply_token_mask_inplace.py | 220 ++++++++++++++++++
 python/xgrammar/xgrammar.py                   |   5 +-
 scripts/build_xgrammar_lib_osx.sh             |   2 -
 scripts/build_xgrammar_lib_win.bat            |   2 -
 scripts/build_xgrammar_wheel_manylinux.sh     |   2 -
 15 files changed, 224 insertions(+), 249 deletions(-)
 delete mode 100644 cpp/kernels/CMakeLists.txt
 delete mode 100644 cpp/kernels/apply_token_mask_inplace.cu
 delete mode 100644 cpp/kernels/kernels.h
 rename python/xgrammar/{version.py => cuda/__init__.py} (97%)
 create mode 100644 python/xgrammar/cuda/apply_token_mask_inplace.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bda79b..52709d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,6 @@ endif()
 
 option(XGRAMMAR_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
 option(XGRAMMAR_BUILD_CXX_TESTS "Build C++ tests" ON)
-option(XGRAMMAR_BUILD_CUDA_KERNELS "Build CUDA kernels" ON)
 set(XGRAMMAR_CUDA_ARCHITECTURES
     native
     CACHE STRING "CUDA architectures"
@@ -37,7 +36,6 @@ endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Build Python bindings: ${XGRAMMAR_BUILD_PYTHON_BINDINGS}")
 message(STATUS "Build C++ tests: ${XGRAMMAR_BUILD_CXX_TESTS}")
-message(STATUS "Build CUDA kernels: ${XGRAMMAR_BUILD_CUDA_KERNELS}")
 message(STATUS "CUDA architectures: ${XGRAMMAR_CUDA_ARCHITECTURES}")
 
 if(MSVC)
@@ -63,10 +61,6 @@ list(FILTER XGRAMMAR_SOURCES_PATH EXCLUDE REGEX "${PROJECT_SOURCE_DIR}/cpp/pybin
 add_library(xgrammar STATIC ${XGRAMMAR_SOURCES_PATH})
 target_include_directories(xgrammar PUBLIC ${XGRAMMAR_INCLUDE_PATH})
 
-if(XGRAMMAR_BUILD_KERNELS)
-  add_subdirectory(${PROJECT_SOURCE_DIR}/cpp/kernels)
-endif()
-
 if(XGRAMMAR_BUILD_PYTHON_BINDINGS)
   add_subdirectory(${PROJECT_SOURCE_DIR}/cpp/pybind)
 endif()
diff --git a/cmake/config.cmake b/cmake/config.cmake
index bbd1713..2673c7d 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -1,7 +1,6 @@
 set(CMAKE_BUILD_TYPE RelWithDebInfo)
 set(XGRAMMAR_BUILD_PYTHON_BINDINGS ON)
 set(XGRAMMAR_BUILD_CXX_TESTS OFF)
-set(XGRAMMAR_BUILD_KERNELS ON)
 # set it to your own architecture
 set(XGRAMMAR_CUDA_ARCHITECTURES
     native
diff --git a/cpp/kernels/CMakeLists.txt b/cpp/kernels/CMakeLists.txt
deleted file mode 100644
index 3eddccc..0000000
--- a/cpp/kernels/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-enable_language(CUDA)
-
-include(CheckLanguage)
-check_language(CUDA)
-
-if(CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND")
-  message(
-    FATAL_ERROR "CUDA compiler not found but XGRAMMAR_BUILD_CUDA_KERNELS is ON. Please install "
-                "CUDA or set XGRAMMAR_BUILD_CUDA_KERNELS to OFF."
-  )
-endif()
-
-message(STATUS "Found cuda, building XGrammar kernels")
-
-if(DEFINED XGRAMMAR_CUDA_ARCHITECTURES)
-  message(STATUS "CUDA architectures are set to ${XGRAMMAR_CUDA_ARCHITECTURES}.")
-  set(CMAKE_CUDA_ARCHITECTURES ${XGRAMMAR_CUDA_ARCHITECTURES})
-else()
-  message(STATUS "CUDA architectures are set to default value ${CMAKE_CUDA_ARCHITECTURES}.")
-endif()
-
-file(GLOB_RECURSE XGRAMMAR_KERNELS_PATH ${PROJECT_SOURCE_DIR}/cpp/kernels/*.cu)
-add_library(xgrammar_kernels STATIC ${XGRAMMAR_KERNELS_PATH})
-
-# Expose CMAKE_CUDA_ARCHITECTURES to parent scope
-set(CMAKE_CUDA_ARCHITECTURES
-    ${CMAKE_CUDA_ARCHITECTURES}
-    PARENT_SCOPE
-)
diff --git a/cpp/kernels/apply_token_mask_inplace.cu b/cpp/kernels/apply_token_mask_inplace.cu
deleted file mode 100644
index 58b3850..0000000
--- a/cpp/kernels/apply_token_mask_inplace.cu
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include <cuda/std/limits>
-#include <sstream>
-
-#include "../support/logging.h"
-#include "kernels.h"
-
-#define XGRAMMAR_CUDA_CALL(...)                                                                    \
-  do {                                                                                             \
-    __VA_ARGS__;                                                                                   \
-    cudaError_t err = cudaGetLastError();                                                          \
-    XGRAMMAR_CHECK(err == cudaSuccess) << "CUDA Error: " << cudaGetErrorString(err) << " (" << err \
-                                       << ") " << __FILE__ << ": line " << __LINE__ << std::endl; \
-  } while (0)
-
-#define XGRAMMAR_DISPATCH_DTYPE(dtype_flag, c_type, ...)                                          \
-  do {                                                                                             \
-    switch (dtype_flag) {                                                                          \
-      case DTypeFlag::DTYPE_FLOAT16: {                                                             \
-        using c_type = half;                                                                       \
-        __VA_ARGS__;                                                                               \
-        break;                                                                                     \
-      }                                                                                            \
-      case DTypeFlag::DTYPE_FLOAT32: {                                                             \
-        using c_type = float;                                                                      \
-        __VA_ARGS__;                                                                               \
-        break;                                                                                     \
-      }                                                                                            \
-      case DTypeFlag::DTYPE_FLOAT64: {                                                             \
-        using c_type = double;                                                                     \
-        __VA_ARGS__;                                                                               \
-        break;                                                                                     \
-      }                                                                                            \
-      default:                                                                                     \
-        std::ostringstream oss;                                                                    \
-        oss << #__VA_ARGS__ << " failed to dispatch data type " << static_cast<int>(dtype_flag);   \
-        XGRAMMAR_LOG(FATAL) << oss.str();                                                          \
-        break;                                                                                     \
-    }                                                                                              \
-  } while (0)
-
-namespace xgrammar {
-
-#define BITS_PER_BLOCK 32
-#define GET_BIT(data_ptr, bit_idx) \
-  ((data_ptr[bit_idx / BITS_PER_BLOCK] >> (bit_idx % BITS_PER_BLOCK)) & 1)
-
-template <typename T>
-__device__ T GetNegativeInfinity() {
-  return -cuda::std::numeric_limits<T>::infinity();
-}
-
-template <>
-__device__ half GetNegativeInfinity<half>() {
-  return __float2half(-INFINITY);
-}
-
-template <typename T>
-__global__ void __launch_bounds__(512) ApplyTokenBitmaskInplaceKernel(
-    T* __restrict__ logits,
-    const int32_t* __restrict__ bitmask,
-    int vocab_size,
-    int bitmask_size,
-    int bitmask_row_size
-) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (gid >= bitmask_size) {
-    return;
-  }
-
-  int batch_id = gid / bitmask_row_size;
-  int bitmask_id = gid % bitmask_row_size;
-  int bitmask_val = bitmask[gid];
-  T* logits_ptr = logits + batch_id * vocab_size + bitmask_id * BITS_PER_BLOCK;
-  for (int i = 0; i < BITS_PER_BLOCK; ++i) {
-    if (bitmask_id * BITS_PER_BLOCK + i >= vocab_size) {
-      break;
-    }
-    if ((bitmask_val & 1) == 0) {
-      logits_ptr[i] = GetNegativeInfinity<T>();
-    }
-    bitmask_val >>= 1;
-  }
-}
-
-#define THREADS_PER_BLOCK 512
-
-void ApplyTokenBitmaskInplace(
-    void* logits, DTypeFlag dtype_flag, int32_t* bitmask, int batch_size, int vocab_size
-) {
-  int bitmask_size = (vocab_size + BITS_PER_BLOCK - 1) / BITS_PER_BLOCK;
-  int num_blocks = (batch_size * bitmask_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-  int num_threads = THREADS_PER_BLOCK;
-
-  XGRAMMAR_DISPATCH_DTYPE(dtype_flag, c_type, {
-    XGRAMMAR_CUDA_CALL({
-      ApplyTokenBitmaskInplaceKernel<<<num_blocks, num_threads>>>(
-          reinterpret_cast<c_type*>(logits),
-          bitmask,
-          vocab_size,
-          batch_size * bitmask_size,
-          bitmask_size
-      );
-    });
-  });
-}
-
-}  // namespace xgrammar
diff --git a/cpp/kernels/kernels.h b/cpp/kernels/kernels.h
deleted file mode 100644
index bd73c6a..0000000
--- a/cpp/kernels/kernels.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*!
- * Copyright (c) 2024 by Contributors
- * \file xgrammar/kernels/kernels.h
- * \brief The header for the support of grammar-guided generation.
- */
-
-#ifndef XGRAMMAR_KERNELS_KERNELS_H_
-#define XGRAMMAR_KERNELS_KERNELS_H_
-
-namespace xgrammar {
-
-enum class DTypeFlag : int { DTYPE_FLOAT16 = 0, DTYPE_FLOAT32 = 1, DTYPE_FLOAT64 = 2 };
-
-void ApplyTokenBitmaskInplace(
-    void* logits, DTypeFlag dtype_flag, int32_t* bitmask, int batch_size, int vocab_size
-);
-
-}  // namespace xgrammar
-
-#endif  // XGRAMMAR_KERNELS_KERNELS_H_
diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt
index 2d34ea5..cbd611c 100644
--- a/cpp/pybind/CMakeLists.txt
+++ b/cpp/pybind/CMakeLists.txt
@@ -39,18 +39,9 @@ find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib
 # -D_GLIBCXX_USE_CXX11_ABI=0. So we compile bindings separately.
 file(GLOB_RECURSE XGRAMMAR_BINDINGS_PATH ${PROJECT_SOURCE_DIR}/cpp/*.cc)
 
-if(XGRAMMAR_BUILD_KERNELS)
-  file(GLOB_RECURSE XGRAMMAR_KERNELS_PATH ${PROJECT_SOURCE_DIR}/cpp/kernels/*.cu)
-  list(APPEND XGRAMMAR_BINDINGS_PATH ${XGRAMMAR_KERNELS_PATH})
-endif()
-
 pybind11_add_module(xgrammar_bindings ${XGRAMMAR_BINDINGS_PATH})
 target_include_directories(xgrammar_bindings PUBLIC ${XGRAMMAR_INCLUDE_PATH})
 
-if(XGRAMMAR_BUILD_KERNELS)
-  target_compile_definitions(xgrammar_bindings PUBLIC -DXGRAMMAR_BUILD_KERNELS)
-endif()
-
 target_link_libraries(xgrammar_bindings PUBLIC ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
 set(LIB_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/python/xgrammar")
 set_target_properties(xgrammar_bindings PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIB_OUTPUT_DIRECTORY})
diff --git a/cpp/pybind/pybind.cc b/cpp/pybind/pybind.cc
index f45d391..6696ed6 100644
--- a/cpp/pybind/pybind.cc
+++ b/cpp/pybind/pybind.cc
@@ -74,9 +74,4 @@ PYBIND11_MODULE(xgrammar_bindings, m) {
       .def_property_readonly("vocab_size", &GrammarMatcher::GetVocabSize)
       .def_property_readonly("max_rollback_tokens", &GrammarMatcher::GetMaxRollbackTokens)
       .def_property_readonly("stop_token_ids", &GrammarMatcher::GetStopTokenIds);
-#ifdef XGRAMMAR_BUILD_KERNELS
-  pyGrammarMatcher.def_static(
-      "apply_token_bitmask_inplace", &GrammarMatcher_ApplyTokenBitmaskInplace
-  );
-#endif
 }
diff --git a/cpp/pybind/python_methods.cc b/cpp/pybind/python_methods.cc
index 432ea0f..6546a40 100644
--- a/cpp/pybind/python_methods.cc
+++ b/cpp/pybind/python_methods.cc
@@ -5,7 +5,6 @@
 
 #include "python_methods.h"
 
-#include
 #include
 #include
 
@@ -16,10 +15,6 @@
 #include "../support/dynamic_bitset.h"
 #include "../support/logging.h"
 
-#ifdef XGRAMMAR_BUILD_KERNELS
-#include "../kernels/kernels.h"
-#endif
-
 namespace xgrammar {
 
 // Parse the EBNF string but not normalize it
@@ -130,55 +125,4 @@ std::vector<int> GrammarMatcher_DebugGetMaskedTokensFromBitmask(
   return result;
 }
 
-#ifdef XGRAMMAR_BUILD_KERNELS
-void GrammarMatcher_ApplyTokenBitmaskInplace(torch::Tensor logits, torch::Tensor token_bitmask) {
-  auto logits_shape = logits.sizes();
-  int batch_size = 1;
-  int vocab_size;
-  if (logits_shape.size() == 1) {
-    vocab_size = logits_shape[0];
-  } else if (logits_shape.size() == 2) {
-    batch_size = logits_shape[0];
-    vocab_size = logits_shape[1];
-  } else {
-    XGRAMMAR_LOG(FATAL) << "logits tensor must be 1D or 2D";
-  }
-
-  auto bitmask_shape = token_bitmask.sizes();
-  int expected_bitmask_size = DynamicBitset::GetBufferSize(vocab_size);
-  if (bitmask_shape.size() == 1) {
-    XGRAMMAR_CHECK(bitmask_shape[0] == expected_bitmask_size)
-        << "The last dimension of the token bitmask tensor must be " << expected_bitmask_size
-        << ", but got " << bitmask_shape[0];
-  } else if (bitmask_shape.size() == 2) {
-    XGRAMMAR_CHECK(bitmask_shape[0] == batch_size)
-        << "The first dimension of the token bitmask tensor must be " << batch_size << ", but got "
-        << bitmask_shape[0];
-    XGRAMMAR_CHECK(bitmask_shape[1] == expected_bitmask_size)
-        << "The last dimension of the token bitmask tensor must be " << expected_bitmask_size
-        << ", but got " << bitmask_shape[1];
-  } else {
-    XGRAMMAR_LOG(FATAL) << "token_bitmask tensor must be 1D or 2D";
-  }
-
-  DTypeFlag dtype_flag;
-  if (logits.dtype() == torch::kFloat16) {
-    dtype_flag = DTypeFlag::DTYPE_FLOAT16;
-  } else if (logits.dtype() == torch::kFloat32) {
-    dtype_flag = DTypeFlag::DTYPE_FLOAT32;
-  } else if (logits.dtype() == torch::kFloat64) {
-    dtype_flag = DTypeFlag::DTYPE_FLOAT64;
-  } else {
-    XGRAMMAR_LOG(FATAL) << "logits tensor must be of type float16, float32, or float64";
-  }
-
-  XGRAMMAR_CHECK(token_bitmask.dtype() == torch::kInt32)
-      << "token bitmask tensor must be of type int32";
-
-  ApplyTokenBitmaskInplace(
-      logits.data_ptr(), dtype_flag, token_bitmask.data_ptr<int32_t>(), batch_size, vocab_size
-  );
-}
-#endif
-
 }  // namespace xgrammar
diff --git a/cpp/pybind/python_methods.h b/cpp/pybind/python_methods.h
index f65d75e..60f7ab3 100644
--- a/cpp/pybind/python_methods.h
+++ b/cpp/pybind/python_methods.h
@@ -40,10 +40,6 @@ std::vector<int> GrammarMatcher_DebugGetMaskedTokensFromBitmask(
     GrammarMatcher& matcher, torch::Tensor token_bitmask, int batch_id
 );
 
-#ifdef XGRAMMAR_BUILD_KERNELS
-void GrammarMatcher_ApplyTokenBitmaskInplace(torch::Tensor logits, torch::Tensor token_bitmask);
-#endif
-
 }  // namespace xgrammar
 
 #endif  // XGRAMMAR_PYBIND_PYTHON_METHODS_H_
diff --git a/python/xgrammar/version.py b/python/xgrammar/cuda/__init__.py
similarity index 97%
rename from python/xgrammar/version.py
rename to python/xgrammar/cuda/__init__.py
index bc3d15d..13a8339 100644
--- a/python/xgrammar/version.py
+++ b/python/xgrammar/cuda/__init__.py
@@ -14,5 +14,3 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-__version__ = "0.0.3"
diff --git a/python/xgrammar/cuda/apply_token_mask_inplace.py b/python/xgrammar/cuda/apply_token_mask_inplace.py
new file mode 100644
index 0000000..772b27b
--- /dev/null
+++ b/python/xgrammar/cuda/apply_token_mask_inplace.py
@@ -0,0 +1,220 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The CUDA kernel source code for in-place applying token mask.""" +import ctypes +import os + +import numpy as np +import torch + +try: + from cuda import cuda, cudart, nvrtc +except ImportError: + cuda = None + cudart = None + nvrtc = None + + +BITS_PER_BLOCK = 32 +THREADS_PER_BLOCK = 512 + +_apply_token_bitmask_inplace_kernel = """ +#include + +#include + +#define BITS_PER_BLOCK 32 + +extern "C" __global__ void __launch_bounds__(512) ApplyTokenBitmaskInplaceKernel( + float* __restrict__ logits, + const int32_t* __restrict__ bitmask, + int32_t vocab_size, + int64_t bitmask_size, + int32_t bitmask_row_size +) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= bitmask_size) { + return; + } + + int32_t batch_id = gid / bitmask_row_size; + int32_t bitmask_id = gid % bitmask_row_size; + int32_t bitmask_val = bitmask[gid]; + float* logits_ptr = logits + batch_id * vocab_size + bitmask_id * BITS_PER_BLOCK; + for (int i = 0; i < BITS_PER_BLOCK; ++i) { + if (bitmask_id * BITS_PER_BLOCK + i >= vocab_size) { + break; + } + if ((bitmask_val & 1) == 0) { + logits_ptr[i] = -cuda::std::numeric_limits::infinity(); + } + bitmask_val >>= 1; + } +} +""".strip() + + +# Adapted from https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples. +def _cudaGetErrorEnum(error): + if isinstance(error, cuda.CUresult): + err, name = cuda.cuGetErrorName(error) + return name if err == cuda.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, cudart.cudaError_t): + return cudart.cudaGetErrorName(error)[1] + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError("Unknown error type: {}".format(error)) + + +# Adapted from https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples. +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError( + "CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])) + ) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +# Adapted from https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples. 
+class KernelStore:
+    _module = None
+    _func = None
+
+    @classmethod
+    def compile(cls, device_id: int):
+        if cls._func is not None:
+            return cls._func
+
+        prog = checkCudaErrors(
+            nvrtc.nvrtcCreateProgram(
+                str.encode(_apply_token_bitmask_inplace_kernel), b"sourceCode.cu", 0, [], []
+            )
+        )
+        CUDA_HOME = os.getenv("CUDA_HOME")
+        if CUDA_HOME == None:
+            CUDA_HOME = os.getenv("CUDA_PATH")
+        if CUDA_HOME == None:
+            raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set")
+        include_dirs = os.path.join(CUDA_HOME, "include")
+
+        # Initialize CUDA
+        checkCudaErrors(cudart.cudaFree(0))
+
+        major = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(
+                cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device_id
+            )
+        )
+        minor = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(
+                cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device_id
+            )
+        )
+        prefix = "compute"
+        arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
+
+        try:
+            opts = [
+                b"--fmad=true",
+                arch_arg,
+                "--include-path={}".format(include_dirs).encode("UTF-8"),
+                b"--std=c++11",
+                b"-default-device",
+            ]
+            checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts))
+        except RuntimeError as err:
+            logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
+            log = b" " * logSize
+            checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
+            # NOTE: The prints below are intended to provide the kernel compilation error.
+            print(log.decode())
+            print(err)
+            raise RuntimeError("CUDA kernel compilation failure")
+
+        dataSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
+        data = b" " * dataSize
+        checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data))
+
+        # Store into `_module` and `_func`.
+        module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data)))
+        func = checkCudaErrors(cuda.cuModuleGetFunction(module, b"ApplyTokenBitmaskInplaceKernel"))
+        cls._module = module
+        cls._func = func
+        # Return the compiled kernel.
+        return func
+
+
+def apply_token_bitmask_inplace(logits: torch.Tensor, bitmask: torch.Tensor):
+    if cuda is None or cudart is None or nvrtc is None:
+        raise RuntimeError("cuda-python is not installed. Please install cuda-python first.")
+
+    # Check input tensor shapes.
+    if logits.ndim == 2:
+        batch_size, vocab_size = logits.shape
+    elif logits.ndim == 1:
+        batch_size = 1
+        (vocab_size,) = logits.shape
+    else:
+        raise ValueError(f"Invalid logits tensor shape {logits.shape}")
+    bitmask_size = (vocab_size + BITS_PER_BLOCK - 1) // BITS_PER_BLOCK
+
+    # Ensure that the tensors are contiguous in memory.
+    logits = logits.contiguous()
+    bitmask = bitmask.contiguous()
+
+    # Compile the kernel.
+    kernel = KernelStore.compile(logits.device.index)
+    # Setup kernel launching arguments.
+    grid_dims = (batch_size * bitmask_size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK, 1, 1
+    block_dims = THREADS_PER_BLOCK, 1, 1
+    shared_mem_bytes = 0
+    stream = cuda.CU_STREAM_LEGACY
+    extra = 0
+    kernelArgs = (
+        (
+            logits.data_ptr(),
+            bitmask.data_ptr(),
+            vocab_size,
+            batch_size * bitmask_size,
+            bitmask_size,
+        ),
+        (
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.c_int32,
+            ctypes.c_int64,
+            ctypes.c_int32,
+        ),
+    )
+    # Launch the kernel.
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            kernel,
+            *grid_dims,
+            *block_dims,
+            shared_mem_bytes,
+            stream,
+            kernelArgs,
+            extra,
+        )
+    )
diff --git a/python/xgrammar/xgrammar.py b/python/xgrammar/xgrammar.py
index 30b1f1e..6ea45d9 100644
--- a/python/xgrammar/xgrammar.py
+++ b/python/xgrammar/xgrammar.py
@@ -26,6 +26,9 @@ from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
 
 from . import xgrammar_bindings as _core
+from .cuda.apply_token_mask_inplace import (
+    apply_token_bitmask_inplace as apply_token_bitmask_inplace_cuda,
+)
 
 
 class XGObject:
@@ -792,7 +795,7 @@ def apply_token_bitmask_inplace(logits: torch.Tensor, bitmask: torch.Tensor):
         if bitmask.device != logits.device:
             bitmask = bitmask.to(logits.device)
 
-        _core.GrammarMatcher.apply_token_bitmask_inplace(logits, bitmask)
+        apply_token_bitmask_inplace_cuda(logits, bitmask)
 
     def debug_get_masked_tokens_from_bitmask(
         self, bitmask: torch.Tensor, batch_id: int = 0
diff --git a/scripts/build_xgrammar_lib_osx.sh b/scripts/build_xgrammar_lib_osx.sh
index 8b5ceb4..0602bc9 100755
--- a/scripts/build_xgrammar_lib_osx.sh
+++ b/scripts/build_xgrammar_lib_osx.sh
@@ -6,8 +6,6 @@ set -u
 
 # setup config.cmake
 rm -f config.cmake
 echo set\(XGRAMMAR_BUILD_PYTHON_BINDINGS ON\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_KERNELS OFF\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_CUDA_KERNELS OFF\) >>config.cmake
 echo set\(XGRAMMAR_BUILD_CXX_TESTS OFF\) >>config.cmake
 
diff --git a/scripts/build_xgrammar_lib_win.bat b/scripts/build_xgrammar_lib_win.bat
index 289ab7c..6cafeae 100644
--- a/scripts/build_xgrammar_lib_win.bat
+++ b/scripts/build_xgrammar_lib_win.bat
@@ -2,8 +2,6 @@ echo on
 
 del /f config.cmake
 echo set(XGRAMMAR_BUILD_PYTHON_BINDINGS ON) >> config.cmake
-echo set(XGRAMMAR_BUILD_KERNELS OFF) >> config.cmake
-echo set(XGRAMMAR_BUILD_CUDA_KERNELS OFF) >> config.cmake
 echo set(XGRAMMAR_BUILD_CXX_TESTS OFF) >> config.cmake
 
 rd /s /q build
diff --git a/scripts/build_xgrammar_wheel_manylinux.sh b/scripts/build_xgrammar_wheel_manylinux.sh
index eb95d93..55513f0 100755
--- a/scripts/build_xgrammar_wheel_manylinux.sh
+++ b/scripts/build_xgrammar_wheel_manylinux.sh
@@ -68,8 +68,6 @@ cd /workspace
 
 # setup config.cmake
 echo set\(XGRAMMAR_BUILD_PYTHON_BINDINGS ON\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_KERNELS OFF\) >>config.cmake
-echo set\(XGRAMMAR_BUILD_CUDA_KERNELS OFF\) >>config.cmake
 echo set\(XGRAMMAR_BUILD_CXX_TESTS OFF\) >>config.cmake
 
 # compile the xgrammar
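
Note (illustrative sketch, not part of the patch): one way the new JIT path added by
python/xgrammar/cuda/apply_token_mask_inplace.py might be exercised end to end. It assumes a
CUDA-capable device, the cuda-python package installed, CUDA_HOME (or CUDA_PATH) pointing at a
CUDA toolkit so NVRTC can find its headers, and a wheel built from this branch; the shapes and
the masked token id below are arbitrary example values.

    import torch
    from xgrammar.cuda.apply_token_mask_inplace import apply_token_bitmask_inplace

    batch_size, vocab_size = 2, 128
    bits_per_block = 32
    bitmask_row_size = (vocab_size + bits_per_block - 1) // bits_per_block

    # float32 logits on the GPU; the JIT kernel in this patch is compiled for float only.
    logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda")

    # -1 as int32 has all bits set, which keeps every token; clearing bit 0 of the
    # first 32-token block masks token id 0 in each row.
    bitmask = torch.full((batch_size, bitmask_row_size), -1, dtype=torch.int32, device="cuda")
    bitmask[:, 0] &= ~1

    apply_token_bitmask_inplace(logits, bitmask)
    torch.cuda.synchronize()
    assert torch.isneginf(logits[:, 0]).all()  # the masked token's logit is now -inf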