From 035495fd537ded28284a7dc0f38f4f73a8250cca Mon Sep 17 00:00:00 2001
From: Charlie Ruan <53290280+CharlieFRuan@users.noreply.github.com>
Date: Thu, 21 Nov 2024 12:03:49 -0800
Subject: [PATCH] [Bitmask] Add allocate_token_bitmask to xgr from testing (#72)

This PR replaces `from xgrammar.testing import _allocate_token_bitmask` with
`from xgrammar import allocate_token_bitmask`.

Passed all tests with `pytest .`
---
 python/xgrammar/__init__.py               |  4 +-
 python/xgrammar/matcher.py                | 41 +++++++++++++------
 python/xgrammar/testing.py                | 10 +----
 tests/README.md                           | 11 +++++
 tests/python/test_builtin_grammar_json.py |  3 +-
 .../test_builtin_grammar_json_schema.py    |  3 +-
 tests/python/test_custom_grammar.py       |  3 +-
 tests/python/test_grammar_matcher.py      | 21 +++++-----
 tests/python/test_regex_converter.py      |  3 +-
 9 files changed, 58 insertions(+), 41 deletions(-)
 create mode 100644 tests/README.md

diff --git a/python/xgrammar/__init__.py b/python/xgrammar/__init__.py
index c5e1336..766a0a5 100644
--- a/python/xgrammar/__init__.py
+++ b/python/xgrammar/__init__.py
@@ -19,7 +19,9 @@
 from .matcher import (
     GrammarMatcher,
     apply_token_bitmask_inplace,
-    get_bitmask_dtype,
     get_bitmask_shape,
+    allocate_token_bitmask,
+    bitmask_dtype,
 )
 from .tokenizer_info import TokenizerInfo, VocabType
+from . import testing

diff --git a/python/xgrammar/matcher.py b/python/xgrammar/matcher.py
index 03a4960..b4bd7e3 100644
--- a/python/xgrammar/matcher.py
+++ b/python/xgrammar/matcher.py
@@ -27,31 +27,48 @@
     apply_token_bitmask_inplace as apply_token_bitmask_inplace_cuda,
 )
 
+bitmask_dtype = torch.int32
+
 
 def get_bitmask_shape(batch_size: int, vocab_size: int) -> Tuple[int, int]:
-    """Allocate the bitmask for the next token prediction. The bitmask is a int32 tensor on CPU
-    with shape (batch_size, ceil(vocab_size / 32)). If the batch size is None, the bitmask is
-    a 1D tensor with shape (ceil(vocab_size / 32),).
+    """Return the shape of the bitmask: (batch_size, ceil(vocab_size / 32))."""
+    return (batch_size, math.ceil(vocab_size / 32))
+
+
+def allocate_token_bitmask(batch_size: int, vocab_size: int) -> torch.Tensor:
+    """Allocate the bitmask for the next token prediction. The bitmask is an int32 tensor on CPU
+    with shape (batch_size, ceil(vocab_size / 32)). This function is equivalent to
+
+    .. code:: python
+
+        return torch.empty(
+            xgr.get_bitmask_shape(batch_size, vocab_size),
+            dtype=xgr.bitmask_dtype,
+            pin_memory=True,
+        )
 
     Parameters
     ----------
+    batch_size : int
+        The batch size of the bitmask.
+
     vocab_size : int
         The size of the vocabulary.
 
-    batch_size : Optional[int], default: None
-        The batch size of the bitmask. If None, the bitmask is a 1D tensor.
-
     Returns
     -------
     bitmask : torch.Tensor
-        The shape of the bitmask.
-    """
-    return (batch_size, math.ceil(vocab_size / 32))
-
-
-def get_bitmask_dtype() -> torch.dtype:
-    """Get the dtype of the bitmask."""
-    return torch.int32
+        The allocated token bitmask.
+
+    Note
+    ----
+    This is the default way of allocating a bitmask. You can also customize the implementation.
+ """ + return torch.empty( + get_bitmask_shape(batch_size, vocab_size), + dtype=bitmask_dtype, + pin_memory=True, + ) def apply_token_bitmask_inplace( diff --git a/python/xgrammar/testing.py b/python/xgrammar/testing.py index 5c16970..7f1fa5e 100644 --- a/python/xgrammar/testing.py +++ b/python/xgrammar/testing.py @@ -23,7 +23,7 @@ from .base import _core from .compiler import GrammarCompiler from .grammar import Grammar -from .matcher import GrammarMatcher, get_bitmask_dtype, get_bitmask_shape +from .matcher import GrammarMatcher, get_bitmask_shape from .tokenizer_info import TokenizerInfo @@ -110,14 +110,6 @@ def _match_grammar_with_string( return matcher.is_terminated() -def _allocate_token_bitmask(batch_size: int, vocab_size: int) -> torch.Tensor: - return torch.empty( - get_bitmask_shape(batch_size, vocab_size), - dtype=get_bitmask_dtype(), - pin_memory=True, - ) - - def _get_masked_tokens_from_bitmask( bitmask: torch.Tensor, vocab_size: int, index: int = 0 ) -> List[int]: diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..eef58d1 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,11 @@ +To test, run `pytest .` under `xgrammar` folder. You may need to do the following: + +```bash +pip install sentencepiece +pip install protobuf +pip install -U "huggingface_hub[cli]" +huggingface-cli login --token YOUR_HF_TOKEN +``` + +Make sure you also have access to the gated models, which should only require you to agree +some terms on the models' website on huggingface. diff --git a/tests/python/test_builtin_grammar_json.py b/tests/python/test_builtin_grammar_json.py index 32d13ad..2c9b9a3 100644 --- a/tests/python/test_builtin_grammar_json.py +++ b/tests/python/test_builtin_grammar_json.py @@ -10,7 +10,6 @@ import xgrammar as xgr from xgrammar.testing import ( - _allocate_token_bitmask, _get_masked_tokens_from_bitmask, _match_grammar_with_string, ) @@ -290,7 +289,7 @@ def test_fill_next_token_bitmask( time_end = time.monotonic_ns() print(f"Time to init GrammarMatcher: {(time_end - time_start) / 1e3} us") - token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size) + token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size) logits_gpu = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda") input_bytes = input_str.encode("utf-8") diff --git a/tests/python/test_builtin_grammar_json_schema.py b/tests/python/test_builtin_grammar_json_schema.py index aad4528..6a298fc 100644 --- a/tests/python/test_builtin_grammar_json_schema.py +++ b/tests/python/test_builtin_grammar_json_schema.py @@ -9,7 +9,6 @@ import xgrammar as xgr from xgrammar.testing import ( - _allocate_token_bitmask, _get_masked_tokens_from_bitmask, _get_matcher_from_grammar_and_tokenizer_info, ) @@ -98,7 +97,7 @@ def test_fill_next_token_bitmask(tokenizer_path: str): time_end = time.monotonic_ns() print(f"Time to init GrammarMatcher: {(time_end - time_start) / 1e3} us") - token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size) + token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size) logits_gpu = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda") input_bytes = instance_str.encode("utf-8") diff --git a/tests/python/test_custom_grammar.py b/tests/python/test_custom_grammar.py index 417659d..ab90f84 100644 --- a/tests/python/test_custom_grammar.py +++ b/tests/python/test_custom_grammar.py @@ -12,7 +12,6 @@ import xgrammar as xgr from xgrammar.testing import ( - _allocate_token_bitmask, 
     _get_masked_tokens_from_bitmask,
     _match_grammar_with_string,
 )
@@ -335,7 +334,7 @@ def test_fill_next_token_bitmask(
     time_end = time.monotonic_ns()
     print(f"Time to init GrammarMatcher: {(time_end - time_start) / 1e3} us")
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
     logits_gpu = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda")
 
     input_bytes = input_str.encode("utf-8")

diff --git a/tests/python/test_grammar_matcher.py b/tests/python/test_grammar_matcher.py
index 1e10286..6beb961 100644
--- a/tests/python/test_grammar_matcher.py
+++ b/tests/python/test_grammar_matcher.py
@@ -10,7 +10,6 @@
 import xgrammar as xgr
 from xgrammar.testing import (
-    _allocate_token_bitmask,
     _get_masked_tokens_from_bitmask,
     _get_matcher_from_grammar_and_tokenizer_info,
     _match_grammar_with_string,
@@ -84,7 +83,7 @@ def test_fill_next_token_bitmask(
     tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
     matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     input_bytes = input_str.encode("utf-8")
     rejected_sizes = []
@@ -121,7 +120,7 @@ def test_token_operations():
     tokenizer_info = xgr.TokenizerInfo(vocab)
     matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     expected = [
         ["{"],
@@ -241,22 +240,22 @@ def test_rollback():
 
     for i_1, i_2 in input_ids_splitted:
         orig_result = []
-        token_bitmask1 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask1 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask1)
         orig_result.append(token_bitmask1)
         assert matcher.accept_token(i_1)
-        token_bitmask2 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask2 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask2)
         orig_result.append(token_bitmask2)
         assert matcher.accept_token(i_2)
 
         matcher.rollback(2)
         result_after_rollback = []
-        new_token_bitmask1 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        new_token_bitmask1 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(new_token_bitmask1)
         result_after_rollback.append(new_token_bitmask1)
         assert matcher.accept_token(i_1)
-        new_token_bitmask2 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        new_token_bitmask2 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(new_token_bitmask2)
         result_after_rollback.append(new_token_bitmask2)
         assert matcher.accept_token(i_2)
@@ -278,7 +277,7 @@ def test_reset():
 
     orig_result = []
     for i in input_ids:
-        token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask)
         orig_result.append(token_bitmask)
         assert matcher.accept_token(i)
@@ -288,7 +287,7 @@
 
     result_after_reset = []
     for i in input_ids:
-        token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask)
         result_after_reset.append(token_bitmask)
         assert matcher.accept_token(i)
@@ -321,7 +320,7 @@ def test_termination():
     matcher = _get_matcher_from_grammar_and_tokenizer_info(
         json_grammar, tokenizer_info, max_rollback_tokens=5
     )
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     for i in input_ids:
         matcher.fill_next_token_bitmask(token_bitmask)
@@ -360,7 +359,7 @@ def test_vocab_size():
     tokenizer_info = xgr.TokenizerInfo(vocab, vocab_size=64)
     matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
     matcher.fill_next_token_bitmask(token_bitmask)
     assert token_bitmask.shape == (1, 2)

diff --git a/tests/python/test_regex_converter.py b/tests/python/test_regex_converter.py
index 36e55eb..a5a2e2d 100644
--- a/tests/python/test_regex_converter.py
+++ b/tests/python/test_regex_converter.py
@@ -7,7 +7,6 @@
 import xgrammar as xgr
 from xgrammar.testing import (
-    _allocate_token_bitmask,
     _match_grammar_with_string,
     _regex_to_ebnf,
 )
@@ -309,7 +308,7 @@ def test_mask_generation(tokenizer_path: str, regex: str, instance: str):
     time_end = time.monotonic_ns()
     print(f"Time for preprocessing: {(time_end - time_start) / 1e3} us")
     matcher = xgr.GrammarMatcher(matcher_compiled_grammar)
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     for c in instance.encode("utf-8"):
         time_start = time.monotonic_ns()
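
Usage note: after this patch, bitmask allocation is part of the public API instead of the testing module. Below is a minimal sketch of the new call pattern. The model name is illustrative, the grammar-compiler setup follows xgrammar's compiler API as used elsewhere in the repo (it is not part of this diff), and a CUDA device is assumed for the logits.

```python
import torch
from transformers import AutoTokenizer

import xgrammar as xgr

# Build tokenizer info and compile the builtin JSON grammar
# (mirrors the setup used in the tests above; model name is illustrative).
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
compiler = xgr.GrammarCompiler(tokenizer_info)
compiled_grammar = compiler.compile_builtin_json_grammar()
matcher = xgr.GrammarMatcher(compiled_grammar)

# New public entry point, replacing xgrammar.testing._allocate_token_bitmask:
# a pinned int32 CPU tensor of shape (1, ceil(vocab_size / 32)).
token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
matcher.fill_next_token_bitmask(token_bitmask)

# Mask disallowed tokens in the logits in place before sampling.
logits = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda")
xgr.apply_token_bitmask_inplace(logits, token_bitmask.to(logits.device))
```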