From 035495fd537ded28284a7dc0f38f4f73a8250cca Mon Sep 17 00:00:00 2001
From: Charlie Ruan <53290280+CharlieFRuan@users.noreply.github.com>
Date: Thu, 21 Nov 2024 12:03:49 -0800
Subject: [PATCH] [Bitmask] Add allocate_token_bitmask to xgr from testing (#72)

This PR replaces `from xgrammar.testing import _allocate_token_bitmask` with
`from xgrammar import allocate_token_bitmask`.

Passed all tests with `pytest .`
---
 python/xgrammar/__init__.py               |  4 +-
 python/xgrammar/matcher.py                | 41 +++++++++++++------
 python/xgrammar/testing.py                | 10 +----
 tests/README.md                           | 11 +++++
 tests/python/test_builtin_grammar_json.py |  3 +-
 .../test_builtin_grammar_json_schema.py    |  3 +-
 tests/python/test_custom_grammar.py       |  3 +-
 tests/python/test_grammar_matcher.py      | 21 +++++-----
 tests/python/test_regex_converter.py      |  3 +-
 9 files changed, 58 insertions(+), 41 deletions(-)
 create mode 100644 tests/README.md

diff --git a/python/xgrammar/__init__.py b/python/xgrammar/__init__.py
index c5e1336..766a0a5 100644
--- a/python/xgrammar/__init__.py
+++ b/python/xgrammar/__init__.py
@@ -19,7 +19,9 @@
 from .matcher import (
     GrammarMatcher,
     apply_token_bitmask_inplace,
-    get_bitmask_dtype,
     get_bitmask_shape,
+    allocate_token_bitmask,
+    bitmask_dtype,
 )
 from .tokenizer_info import TokenizerInfo, VocabType
+from . import testing

diff --git a/python/xgrammar/matcher.py b/python/xgrammar/matcher.py
index 03a4960..b4bd7e3 100644
--- a/python/xgrammar/matcher.py
+++ b/python/xgrammar/matcher.py
@@ -27,31 +27,48 @@
     apply_token_bitmask_inplace as apply_token_bitmask_inplace_cuda,
 )
 
+bitmask_dtype = torch.int32
+
 
 def get_bitmask_shape(batch_size: int, vocab_size: int) -> Tuple[int, int]:
-    """Allocate the bitmask for the next token prediction. The bitmask is a int32 tensor on CPU
-    with shape (batch_size, ceil(vocab_size / 32)). If the batch size is None, the bitmask is
-    a 1D tensor with shape (ceil(vocab_size / 32),).
+    """Return the shape of the bitmask: (batch_size, ceil(vocab_size / 32))."""
+    return (batch_size, math.ceil(vocab_size / 32))
+
+
+def allocate_token_bitmask(batch_size: int, vocab_size: int) -> torch.Tensor:
+    """Allocate the bitmask for the next token prediction. The bitmask is an int32 tensor on CPU
+    with shape (batch_size, ceil(vocab_size / 32)). This function is equivalent to
+
+    .. code:: python
+
+        return torch.empty(
+            xgr.get_bitmask_shape(batch_size, vocab_size),
+            dtype=xgr.bitmask_dtype,
+            pin_memory=True,
+        )
 
     Parameters
     ----------
+    batch_size : int
+        The batch size of the bitmask.
+
     vocab_size : int
         The size of the vocabulary.
 
-    batch_size : Optional[int], default: None
-        The batch size of the bitmask. If None, the bitmask is a 1D tensor.
-
     Returns
     -------
     bitmask : torch.Tensor
-        The shape of the bitmask.
-    """
-    return (batch_size, math.ceil(vocab_size / 32))
-
-
-def get_bitmask_dtype() -> torch.dtype:
-    """Get the dtype of the bitmask."""
-    return torch.int32
+        The allocated token bitmask.
+
+    Note
+    ----
+    This is the default way of allocating a bitmask. You can also customize the implementation.
+ """ + return torch.empty( + get_bitmask_shape(batch_size, vocab_size), + dtype=bitmask_dtype, + pin_memory=True, + ) def apply_token_bitmask_inplace( diff --git a/python/xgrammar/testing.py b/python/xgrammar/testing.py index 5c16970..7f1fa5e 100644 --- a/python/xgrammar/testing.py +++ b/python/xgrammar/testing.py @@ -23,7 +23,7 @@ from .base import _core from .compiler import GrammarCompiler from .grammar import Grammar -from .matcher import GrammarMatcher, get_bitmask_dtype, get_bitmask_shape +from .matcher import GrammarMatcher, get_bitmask_shape from .tokenizer_info import TokenizerInfo @@ -110,14 +110,6 @@ def _match_grammar_with_string( return matcher.is_terminated() -def _allocate_token_bitmask(batch_size: int, vocab_size: int) -> torch.Tensor: - return torch.empty( - get_bitmask_shape(batch_size, vocab_size), - dtype=get_bitmask_dtype(), - pin_memory=True, - ) - - def _get_masked_tokens_from_bitmask( bitmask: torch.Tensor, vocab_size: int, index: int = 0 ) -> List[int]: diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..eef58d1 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,11 @@ +To test, run `pytest .` under `xgrammar` folder. You may need to do the following: + +```bash +pip install sentencepiece +pip install protobuf +pip install -U "huggingface_hub[cli]" +huggingface-cli login --token YOUR_HF_TOKEN +``` + +Make sure you also have access to the gated models, which should only require you to agree +some terms on the models' website on huggingface. diff --git a/tests/python/test_builtin_grammar_json.py b/tests/python/test_builtin_grammar_json.py index 32d13ad..2c9b9a3 100644 --- a/tests/python/test_builtin_grammar_json.py +++ b/tests/python/test_builtin_grammar_json.py @@ -10,7 +10,6 @@ import xgrammar as xgr from xgrammar.testing import ( - _allocate_token_bitmask, _get_masked_tokens_from_bitmask, _match_grammar_with_string, ) @@ -290,7 +289,7 @@ def test_fill_next_token_bitmask( time_end = time.monotonic_ns() print(f"Time to init GrammarMatcher: {(time_end - time_start) / 1e3} us") - token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size) + token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size) logits_gpu = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda") input_bytes = input_str.encode("utf-8") diff --git a/tests/python/test_builtin_grammar_json_schema.py b/tests/python/test_builtin_grammar_json_schema.py index aad4528..6a298fc 100644 --- a/tests/python/test_builtin_grammar_json_schema.py +++ b/tests/python/test_builtin_grammar_json_schema.py @@ -9,7 +9,6 @@ import xgrammar as xgr from xgrammar.testing import ( - _allocate_token_bitmask, _get_masked_tokens_from_bitmask, _get_matcher_from_grammar_and_tokenizer_info, ) @@ -98,7 +97,7 @@ def test_fill_next_token_bitmask(tokenizer_path: str): time_end = time.monotonic_ns() print(f"Time to init GrammarMatcher: {(time_end - time_start) / 1e3} us") - token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size) + token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size) logits_gpu = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda") input_bytes = instance_str.encode("utf-8") diff --git a/tests/python/test_custom_grammar.py b/tests/python/test_custom_grammar.py index 417659d..ab90f84 100644 --- a/tests/python/test_custom_grammar.py +++ b/tests/python/test_custom_grammar.py @@ -12,7 +12,6 @@ import xgrammar as xgr from xgrammar.testing import ( - _allocate_token_bitmask, 
     _get_masked_tokens_from_bitmask,
     _match_grammar_with_string,
 )
@@ -335,7 +334,7 @@ def test_fill_next_token_bitmask(
     time_end = time.monotonic_ns()
     print(f"Time to init GrammarMatcher: {(time_end - time_start) / 1e3} us")
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
     logits_gpu = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda")
 
     input_bytes = input_str.encode("utf-8")

diff --git a/tests/python/test_grammar_matcher.py b/tests/python/test_grammar_matcher.py
index 1e10286..6beb961 100644
--- a/tests/python/test_grammar_matcher.py
+++ b/tests/python/test_grammar_matcher.py
@@ -10,7 +10,6 @@
 import xgrammar as xgr
 from xgrammar.testing import (
-    _allocate_token_bitmask,
     _get_masked_tokens_from_bitmask,
     _get_matcher_from_grammar_and_tokenizer_info,
     _match_grammar_with_string,
@@ -84,7 +83,7 @@ def test_fill_next_token_bitmask(
     tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
     matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     input_bytes = input_str.encode("utf-8")
     rejected_sizes = []
@@ -121,7 +120,7 @@ def test_token_operations():
     tokenizer_info = xgr.TokenizerInfo(vocab)
     matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     expected = [
         ["{"],
@@ -241,22 +240,22 @@ def test_rollback():
 
     for i_1, i_2 in input_ids_splitted:
         orig_result = []
-        token_bitmask1 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask1 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask1)
         orig_result.append(token_bitmask1)
         assert matcher.accept_token(i_1)
-        token_bitmask2 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask2 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask2)
         orig_result.append(token_bitmask2)
         assert matcher.accept_token(i_2)
 
         matcher.rollback(2)
         result_after_rollback = []
-        new_token_bitmask1 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        new_token_bitmask1 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(new_token_bitmask1)
         result_after_rollback.append(new_token_bitmask1)
         assert matcher.accept_token(i_1)
-        new_token_bitmask2 = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        new_token_bitmask2 = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(new_token_bitmask2)
         result_after_rollback.append(new_token_bitmask2)
         assert matcher.accept_token(i_2)
@@ -278,7 +277,7 @@ def test_reset():
 
     orig_result = []
     for i in input_ids:
-        token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask)
         orig_result.append(token_bitmask)
         assert matcher.accept_token(i)
@@ -288,7 +287,7 @@
 
     result_after_reset = []
     for i in input_ids:
-        token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+        token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
         matcher.fill_next_token_bitmask(token_bitmask)
         result_after_reset.append(token_bitmask)
         assert matcher.accept_token(i)
@@ -321,7 +320,7 @@ def test_termination():
     matcher = _get_matcher_from_grammar_and_tokenizer_info(
         json_grammar, tokenizer_info, max_rollback_tokens=5
     )
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     for i in input_ids:
         matcher.fill_next_token_bitmask(token_bitmask)
@@ -360,7 +359,7 @@ def test_vocab_size():
     tokenizer_info = xgr.TokenizerInfo(vocab, vocab_size=64)
     matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
 
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
     matcher.fill_next_token_bitmask(token_bitmask)
     assert token_bitmask.shape == (1, 2)

diff --git a/tests/python/test_regex_converter.py b/tests/python/test_regex_converter.py
index 36e55eb..a5a2e2d 100644
--- a/tests/python/test_regex_converter.py
+++ b/tests/python/test_regex_converter.py
@@ -7,7 +7,6 @@
 import xgrammar as xgr
 from xgrammar.testing import (
-    _allocate_token_bitmask,
     _match_grammar_with_string,
     _regex_to_ebnf,
 )
@@ -309,7 +308,7 @@ def test_mask_generation(tokenizer_path: str, regex: str, instance: str):
     time_end = time.monotonic_ns()
     print(f"Time for preprocessing: {(time_end - time_start) / 1e3} us")
     matcher = xgr.GrammarMatcher(matcher_compiled_grammar)
-    token_bitmask = _allocate_token_bitmask(1, tokenizer_info.vocab_size)
+    token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
     for c in instance.encode("utf-8"):
         time_start = time.monotonic_ns()
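
Usage note: after this patch, bitmask allocation is part of the public API instead of the testing module. Below is a minimal sketch of the new call pattern. The model name is illustrative, the grammar-compiler setup follows xgrammar's compiler API as used elsewhere in the repo (it is not part of this diff), and a CUDA device is assumed for the logits.

```python
import torch
from transformers import AutoTokenizer

import xgrammar as xgr

# Build tokenizer info and compile the builtin JSON grammar
# (mirrors the setup used in the tests above; model name is illustrative).
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
compiler = xgr.GrammarCompiler(tokenizer_info)
compiled_grammar = compiler.compile_builtin_json_grammar()
matcher = xgr.GrammarMatcher(compiled_grammar)

# New public entry point, replacing xgrammar.testing._allocate_token_bitmask:
# a pinned int32 CPU tensor of shape (1, ceil(vocab_size / 32)).
token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
matcher.fill_next_token_bitmask(token_bitmask)

# Mask disallowed tokens in the logits in place before sampling.
logits = torch.zeros(tokenizer_info.vocab_size, dtype=torch.float32, device="cuda")
xgr.apply_token_bitmask_inplace(logits, token_bitmask.to(logits.device))
```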