[WIP][Bug] Issue 934 #962

Open
riedgar-ms wants to merge 12 commits into main
4 changes: 4 additions & 0 deletions .github/workflows/workflow-pr-gate.yml
@@ -67,6 +67,7 @@ jobs:
- "transformers_phi2_cpu"
# - "transformers_mistral_7b_cpu" See Issue 713
- "llamacpp_llama2_7b_cpu"
- "llamacpp_llama3_9b_cpu"
- "llamacpp_mistral_7b_cpu"
- "transformers_phi3_mini_4k_instruct_cpu"
- "llamacpp_phi3_mini_4k_instruct_cpu"
@@ -108,6 +109,7 @@ jobs:
- "transformers_phi2_cpu"
# - "transformers_mistral_7b_cpu" See Issue 713
- "llamacpp_llama2_7b_cpu"
- "llamacpp_llama3_9b_cpu"
- "llamacpp_mistral_7b_cpu"
- "transformers_phi3_mini_4k_instruct_cpu"
- "llamacpp_phi3_mini_4k_instruct_cpu"
@@ -153,6 +155,7 @@ jobs:
- "transformers_phi2_cpu"
# - "transformers_mistral_7b_cpu" See Issue 713
- "llamacpp_llama2_7b_cpu"
- "llamacpp_llama3_9b_cpu"
- "llamacpp_mistral_7b_cpu"
# - "transformers_phi3_mini_4k_instruct_cpu" Gives trouble on MacOS
- "llamacpp_phi3_mini_4k_instruct_cpu"
@@ -193,6 +196,7 @@ jobs:
- "transformers_phi2_cpu"
# - "transformers_mistral_7b_cpu" See Issue 713
- "llamacpp_llama2_7b_cpu"
- "llamacpp_llama3_9b_cpu"
- "llamacpp_mistral_7b_cpu"
- "transformers_phi3_mini_4k_instruct_cpu"
- "llamacpp_phi3_mini_4k_instruct_cpu"
21 changes: 20 additions & 1 deletion guidance/models/llama_cpp/_llama_cpp.py
@@ -89,9 +89,28 @@ def __init__(self, model_obj, chat_template=None):
        )

    def encode(self, byte_string: bytes) -> Sequence[int]:
        # Some models (e.g. Llama3) can produce tokens which are only partial
        # UTF-8 codepoints. The underlying tokenizer can't cope with these
        # and has a tendency to segfault.
        # To address this, shorten the bytes sent to the tokenizer to a valid
        # UTF-8 value.
        # I hope this will not bite us with some subtle other bug in future
riedgar-ms (Collaborator, Author) commented on Jul 24, 2024:

I'm not very optimistic about this. This 'fix' almost certainly breaks recode() when said fix kicks in. And fundamentally, these bytes came from the LLM, so why on earth is its tokeniser refusing to have anything to do with them?

A collaborator commented:

Thanks so much for investigating this, Richard. I agree, this feels like potentially buggy behavior directly in llama-cpp. Do we have a repro for 934 that directly uses the llama-cpp (or maybe llama-cpp-python, if we need to) bindings? Instead of attempting to fix through this route, a lower-level repro might mean we could/should raise an issue on llama-cpp?

riedgar-ms (Collaborator, Author) commented on Jul 25, 2024:

@knilink had found:

printf '\xe6\xad' | ./llama-tokenize -m ./Meta-Llama-3-8B-Instruct.Q8_0.gguf --stdin

I have filed a llama-cpp-python-based issue (rewritten from @knilink's investigation) directly on the HF repo whence I'm grabbing the GGUF.
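(For reference, a minimal llama-cpp-python sketch of that kind of repro might look like the lines below; the local GGUF path and the vocab_only shortcut are assumptions for illustration, not the contents of the filed issue.)

from llama_cpp import Llama

# Load only the vocabulary/tokenizer; no weights are needed for this repro
llm = Llama(model_path="./Meta-Llama-3-8B-Instruct.Q8_0.gguf", vocab_only=True)
# b"\xe6\xad" is the first two bytes of the three-byte UTF-8 encoding of '歪'
tokens = llm.tokenize(b"\xe6\xad", add_bos=False, special=True)
print(tokens)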

riedgar-ms (Collaborator, Author) commented:

And filed the LlamaCpp bug:
ggerganov/llama.cpp#8691

bytes_to_encode = b""
got_bytes_to_encode = False
for i in range(0, 3):
# Recall that a UTF-8 codepoint can be up to 3 bytes long
try:
bytes_to_encode = byte_string[0 : len(byte_string) - i]
_ = bytes_to_encode.decode()
got_bytes_to_encode = True
break
except UnicodeDecodeError:
pass
if not got_bytes_to_encode:
raise ValueError(f"Failed to shorten {byte_string!r} to valid unicode")
# Workaround for the LlamaCpp prepending spaces on encoding
raw_tokens = self._model_obj.tokenize(
self._sentinel_bytes + byte_string, add_bos=False, special=True
self._sentinel_bytes + bytes_to_encode, add_bos=False, special=True
)
assert raw_tokens[: len(self._sentinel_tokens)] == self._sentinel_tokens
return raw_tokens[len(self._sentinel_tokens) :]
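As a standalone illustration of the shortening logic above (a minimal sketch, independent of guidance and llama-cpp; the helper name is invented for illustration):

def shorten_to_valid_utf8(byte_string: bytes) -> bytes:
    # Drop up to 3 trailing bytes until the remainder decodes as UTF-8
    for i in range(0, 4):
        candidate = byte_string[0 : len(byte_string) - i]
        try:
            candidate.decode()
            return candidate
        except UnicodeDecodeError:
            pass
    raise ValueError(f"Failed to shorten {byte_string!r} to valid unicode")

# "歪打" encodes to b'\xe6\xad\xaa\xe6\x89\x93'; chopping the final byte leaves a
# dangling partial codepoint, which gets trimmed back to "歪" (b'\xe6\xad\xaa')
assert shorten_to_valid_utf8("歪打".encode()[:-1]) == "歪".encode()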
4 changes: 4 additions & 0 deletions tests/_llms_for_testing.py
@@ -63,6 +63,10 @@
name="transformers:meta-llama/Meta-Llama-3-8B-Instruct",
kwargs={"trust_remote_code": True, "torch_dtype": torch.bfloat16, "device_map": "cuda:0"},
),
"llamacpp_llama3_9b_cpu": dict(
name="huggingface_hubllama:bartowski/Meta-Llama-3-8B-Instruct-GGUF:Meta-Llama-3-8B-Instruct-IQ3_S.gguf",
kwargs={"verbose": True, "n_ctx": 4096},
),
}

_MISTRAL = {
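For context, loading the GGUF from the new llamacpp_llama3_9b_cpu entry above outside the test harness might look roughly like this (a sketch; the hf_hub_download step and constructor call are assumptions, not what the test fixtures do internally):

from huggingface_hub import hf_hub_download
import guidance

gguf_path = hf_hub_download(
    repo_id="bartowski/Meta-Llama-3-8B-Instruct-GGUF",
    filename="Meta-Llama-3-8B-Instruct-IQ3_S.gguf",
)
# verbose and n_ctx mirror the kwargs in the entry above
lm = guidance.models.LlamaCpp(gguf_path, verbose=True, n_ctx=4096)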
1 change: 1 addition & 0 deletions tests/conftest.py
@@ -66,6 +66,7 @@ def llamacpp_model(selected_model, selected_model_name):
    if selected_model_name in [
        "llamacpp_llama2_7b_cpu",
        "llamacpp_llama2_7b_gpu",
        "llamacpp_llama3_9b_cpu",
        "llamacpp_gemma2_9b_cpu",
        "llamacpp_phi3_mini_4k_instruct_cpu",
        "llamacpp_mistral_7b_cpu",
11 changes: 8 additions & 3 deletions tests/model_integration/test_model.py
@@ -21,6 +21,13 @@ def my_function(lm):
    assert str(lm) in ["this is a test another item1", "this is a test another item2"]


def test_with_multitokenchars(selected_model: guidance.models.Model):
    # Taken from https://github.com/guidance-ai/guidance/issues/934
    lm = selected_model
    lm += "歪" + select(["打正着", "门邪道"])
riedgar-ms (Collaborator, Author) commented on Jul 24, 2024:

For the record:

>>> a
'打'
>>> b
'门'
>>> a.encode()
b'\xe6\x89\x93'
>>> b.encode()
b'\xe9\x97\xa8'

So the two leading characters in the select() do not share any common bytes here.

    assert str(lm) == "歪打正着" or str(lm) == "歪门邪道"


def test_token_count(selected_model):
    lm = selected_model
    lm2 = lm + " 1 1 1 1 1" + gen(max_tokens=9) + gen(max_tokens=9)
@@ -36,9 +43,7 @@ def test_token_healing(selected_model):
    if model_type != "GPT2LMHeadModel":
        pytest.skip("Test for GPT2 bug only")
    gpt2 = selected_model
-    lm = gpt2 + (
-        "This is a story of 10 or 5 or " + zero_or_more(byte_range(b"0", b"9"))
-    )
+    lm = gpt2 + ("This is a story of 10 or 5 or " + zero_or_more(byte_range(b"0", b"9")))
    assert len(lm) > len("This is a story of 10 or 5 or ")

