From 386cdd3a52f80a6734eb81e9f788fd8f9ca6285f Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Fri, 26 Apr 2024 09:07:27 -0400 Subject: [PATCH 01/29] Starting to think about what we need for AzureAI Studio --- aais_example.py | 60 +++++++++++++++++++++++++++++ tests/models/test_azureai_openai.py | 39 ++++++++----------- tests/models/test_azureai_studio.py | 34 ++++++++++++++++ tests/models/test_model.py | 2 + tests/utils.py | 6 +++ 5 files changed, 119 insertions(+), 22 deletions(-) create mode 100644 aais_example.py create mode 100644 tests/models/test_azureai_studio.py diff --git a/aais_example.py b/aais_example.py new file mode 100644 index 000000000..a8ffafbe7 --- /dev/null +++ b/aais_example.py @@ -0,0 +1,60 @@ +import urllib.request +import json +import os +import ssl + +def allowSelfSignedHttps(allowed): + # bypass the server certificate verification on client side + if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None): + ssl._create_default_https_context = ssl._create_unverified_context + +allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service. + +# Request data goes here +# The example below assumes JSON formatting which may be updated +# depending on the format your endpoint expects. +# More information can be found here: +# https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script +data = { + "input_data": { + "input_string": [ + { + "role": "user", + "content": "I am going to Gomorrah, give me a list of 10 places to visit" + } + ], + "parameters": { + "temperature": 0.7, + "top_p": 0.9, + "do_sample": True, + "max_new_tokens": 1000 + } + } +} + + +body = str.encode(json.dumps(data)) + +url = 'https://guidance-build-azureai-mo-qahti.eastus2.inference.ml.azure.com/score' +# Replace this with the primary/secondary key, AMLToken, or Microsoft Entra ID token for the endpoint +api_key = 'jnR8Q7wKkyHaV3xJwXVTT8rTN7MMT3SR' +if not api_key: + raise Exception("A key should be provided to invoke the endpoint") + +# The azureml-model-deployment header will force the request to go to a specific deployment. 
+# Remove this header to have the request observe the endpoint traffic rules
+headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key), 'azureml-model-deployment': 'phi-3-mini-4k-instruct-2' }
+
+req = urllib.request.Request(url, body, headers)
+
+try:
+    response = urllib.request.urlopen(req)
+
+    result = response.read()
+    print(result)
+except urllib.error.HTTPError as error:
+    print("The request failed with status code: " + str(error.code))
+
+    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
+    print(error.info())
+    print(error.read().decode("utf8", 'ignore'))
diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py
index 38ff6d8ed..6e2b2fd6d 100644
--- a/tests/models/test_azureai_openai.py
+++ b/tests/models/test_azureai_openai.py
@@ -7,23 +7,18 @@
 
 from guidance import assistant, gen, models, system, user
 
+from utils import env_or_fail
+
 # Everything in here needs credentials to work
 # Mark is configured in pyproject.toml
 pytestmark = pytest.mark.needs_credentials
 
 
-def _env_or_fail(var_name: str) -> str:
-    env_value = os.getenv(var_name, None)
-
-    assert env_value is not None, f"Env '{var_name}' not found"
-
-    return env_value
-
-
 def test_azureai_openai_chat_smoke(rate_limiter):
-    azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT")
-    azureai_key = _env_or_fail("AZUREAI_CHAT_KEY")
-    model = _env_or_fail("AZUREAI_CHAT_MODEL")
+    azureai_endpoint = env_or_fail("AZUREAI_CHAT_ENDPOINT")
+    azureai_key = env_or_fail("AZUREAI_CHAT_KEY")
+    model = env_or_fail("AZUREAI_CHAT_MODEL")
 
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
@@ -45,9 +40,9 @@ def test_azureai_openai_chat_smoke(rate_limiter):
 
 
 def test_azureai_openai_chat_alt_args(rate_limiter):
-    azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT")
-    azureai_key = _env_or_fail("AZUREAI_CHAT_KEY")
-    model = _env_or_fail("AZUREAI_CHAT_MODEL")
+    azureai_endpoint = env_or_fail("AZUREAI_CHAT_ENDPOINT")
+    azureai_key = env_or_fail("AZUREAI_CHAT_KEY")
+    model = env_or_fail("AZUREAI_CHAT_MODEL")
 
     parsed_url = urlparse(azureai_endpoint)
     parsed_query = parse_qs(parsed_url.query)
@@ -78,9 +73,9 @@ def test_azureai_openai_chat_alt_args(rate_limiter):
 
 
 def test_azureai_openai_completion_smoke(rate_limiter):
-    azureai_endpoint = _env_or_fail("AZUREAI_COMPLETION_ENDPOINT")
-    azureai_key = _env_or_fail("AZUREAI_COMPLETION_KEY")
-    model = _env_or_fail("AZUREAI_COMPLETION_MODEL")
+    azureai_endpoint = env_or_fail("AZUREAI_COMPLETION_ENDPOINT")
+    azureai_key = env_or_fail("AZUREAI_COMPLETION_KEY")
+    model = env_or_fail("AZUREAI_COMPLETION_MODEL")
 
     lm = models.AzureOpenAI(
         model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key
@@ -93,9 +88,9 @@ def test_azureai_openai_completion_smoke(rate_limiter):
 
 
 def test_azureai_openai_completion_alt_args(rate_limiter):
-    azureai_endpoint = _env_or_fail("AZUREAI_COMPLETION_ENDPOINT")
-    azureai_key = _env_or_fail("AZUREAI_COMPLETION_KEY")
-    model = _env_or_fail("AZUREAI_COMPLETION_MODEL")
+    azureai_endpoint = env_or_fail("AZUREAI_COMPLETION_ENDPOINT")
+    azureai_key = env_or_fail("AZUREAI_COMPLETION_KEY")
+    model = env_or_fail("AZUREAI_COMPLETION_MODEL")
 
     parsed_url = urlparse(azureai_endpoint)
     parsed_query = parse_qs(parsed_url.query)
@@ -118,9 +113,9 @@ def test_azureai_openai_completion_alt_args(rate_limiter):
 
 
 def test_azureai_openai_chat_loop(rate_limiter):
-    azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT")
-    azureai_key =
_env_or_fail("AZUREAI_CHAT_KEY") - model = _env_or_fail("AZUREAI_CHAT_MODEL") + azureai_endpoint = env_or_fail("AZUREAI_CHAT_ENDPOINT") + azureai_key = env_or_fail("AZUREAI_CHAT_KEY") + model = env_or_fail("AZUREAI_CHAT_MODEL") lm = models.AzureOpenAI( model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py new file mode 100644 index 000000000..72920e643 --- /dev/null +++ b/tests/models/test_azureai_studio.py @@ -0,0 +1,34 @@ +import pytest + +import pytest + +from guidance import assistant, gen, models, system, user + +from utils import env_or_fail + +# Everything in here needs credentials to work +# Mark is configured in pyproject.toml +pytestmark = pytest.mark.needs_credentials + + +def test_azureai_openai_chat_smoke(rate_limiter): + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_ENDPOINT") + azureai_studio_key = env_or_fail("AZUREAI_CHAT_KEY") + + lm = models.AzureOpenAI( + model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key + ) + assert isinstance(lm, models.AzureOpenAIChat) + + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text") + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 1dc4de113..c6f1478a2 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -6,6 +6,8 @@ def test_fstring(selected_model): lm = selected_model + print(f"{dir(lm.engine.tokenizer)=}") + assert hasattr(lm.engine.tokenizer,"sp_model") lm += f'this is a test {select(["item1", "item2"])}' assert str(lm) in ["this is a test item1", "this is a test item2"] diff --git a/tests/utils.py b/tests/utils.py index bf15ee59d..9a674e8bf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,6 +8,12 @@ opanai_model_cache = {} +def env_or_fail(var_name: str) -> str: + env_value = os.getenv(var_name, None) + + assert env_value is not None, f"Env '{var_name}' not found" + + return env_value def get_model(model_name, caching=False, **kwargs): """Get an LLM by name.""" From 176201cc5c5b3c9142c793a77f412c80971a0452 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Fri, 26 Apr 2024 09:38:26 -0400 Subject: [PATCH 02/29] Getting to the initially desired failure --- guidance/models/__init__.py | 1 + guidance/models/_azureai_studio.py | 43 +++++++++++++++++++++++++++++ tests/models/test_azureai_openai.py | 2 +- tests/models/test_azureai_studio.py | 11 ++++---- 4 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 guidance/models/_azureai_studio.py diff --git a/guidance/models/__init__.py b/guidance/models/__init__.py index 2599a98ec..c1d2e6767 100644 --- a/guidance/models/__init__.py +++ b/guidance/models/__init__.py @@ -19,6 +19,7 @@ AzureOpenAICompletion, AzureOpenAIInstruct, ) +from ._azureai_studio import AzureAIStudioChat from ._openai import OpenAI, OpenAIChat, OpenAIInstruct, OpenAICompletion from ._lite_llm import LiteLLM, LiteLLMChat, LiteLLMInstruct, LiteLLMCompletion from ._cohere import Cohere, CohereCompletion, CohereInstruct diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py new file mode 100644 index 000000000..4ec99cb02 --- /dev/null +++ b/guidance/models/_azureai_studio.py @@ -0,0 +1,43 @@ +from ._model import Chat +from ._grammarless import GrammarlessEngine, Grammarless + + +class AzureAIStudioChatEngine(GrammarlessEngine): + 
def __init__( + self, + *, + tokenizer, + max_streaming_tokens: int, + timeout: float, + compute_log_probs: bool, + azureai_studio_endpoint: str, + azureai_studio_key: str, + ): + self._endpoint = azureai_studio_endpoint + self._api_key = azureai_studio_key + + super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs) + + +class AzureAIStudioChat(Grammarless, Chat): + def __init__( + self, + azureai_studio_endpoint: str, + azureai_studio_key: str, + tokenizer=None, + echo: bool = True, + max_streaming_tokens: int = 1000, + timeout: float = 0.5, + compute_log_probs: bool = False, + ): + super().__init__( + AzureAIStudioChatEngine( + azureai_studio_endpoint=azureai_studio_endpoint, + azureai_studio_key=azureai_studio_key, + tokenizer=tokenizer, + max_streaming_tokens=max_streaming_tokens, + timeout=timeout, + compute_log_probs=compute_log_probs, + ), + echo=echo, + ) diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py index 6e2b2fd6d..c2bc818d7 100644 --- a/tests/models/test_azureai_openai.py +++ b/tests/models/test_azureai_openai.py @@ -7,7 +7,7 @@ from guidance import assistant, gen, models, system, user -from utils import env_or_fail +from ..utils import env_or_fail # Everything in here needs credentials to work # Mark is configured in pyproject.toml diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index 72920e643..0b8e8cc36 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -4,7 +4,7 @@ from guidance import assistant, gen, models, system, user -from utils import env_or_fail +from ..utils import env_or_fail # Everything in here needs credentials to work # Mark is configured in pyproject.toml @@ -13,12 +13,13 @@ def test_azureai_openai_chat_smoke(rate_limiter): azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_ENDPOINT") - azureai_studio_key = env_or_fail("AZUREAI_CHAT_KEY") + azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_KEY") - lm = models.AzureOpenAI( - model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key + lm = models.AzureAIStudioChat( + azureai_studio_endpoint=azureai_studio_endpoint, + azureai_studio_key=azureai_studio_key, ) - assert isinstance(lm, models.AzureOpenAIChat) + assert isinstance(lm, models.AzureAIStudioChat) with system(): lm += "You are a math wiz." From beac0cc0c3aa3aa5892050e9792521121392c3e5 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Fri, 26 Apr 2024 09:52:52 -0400 Subject: [PATCH 03/29] Very rough draft.... 
--- guidance/models/_azureai_studio.py | 94 ++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index 4ec99cb02..73ac6bbda 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -1,3 +1,6 @@ +import json +import urllib.request + from ._model import Chat from ._grammarless import GrammarlessEngine, Grammarless @@ -11,18 +14,108 @@ def __init__( timeout: float, compute_log_probs: bool, azureai_studio_endpoint: str, + azureai_model_deployment: str, azureai_studio_key: str, ): self._endpoint = azureai_studio_endpoint + self._deployment = azureai_model_deployment self._api_key = azureai_studio_key super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs) + def _generator(self, prompt, temperature: float): + # Initial parts of this straight up copied from OpenAIChatEngine + + # find the role tags + pos = 0 + role_end = b"<|im_end|>" + messages = [] + found = True + while found: + + # find the role text blocks + found = False + for role_name, start_bytes in ( + ("system", b"<|im_start|>system\n"), + ("user", b"<|im_start|>user\n"), + ("assistant", b"<|im_start|>assistant\n"), + ): + if prompt[pos:].startswith(start_bytes): + pos += len(start_bytes) + end_pos = prompt[pos:].find(role_end) + if end_pos < 0: + assert ( + role_name == "assistant" + ), "Bad chat format! Last role before gen needs to be assistant!" + break + btext = prompt[pos : pos + end_pos] + pos += end_pos + len(role_end) + messages.append( + {"role": role_name, "content": btext.decode("utf8")} + ) + found = True + break + + # Add nice exception if no role tags were used in the prompt. + # TODO: Move this somewhere more general for all chat models? + if messages == []: + raise ValueError( + f"The model is a Chat-based model and requires role tags in the prompt! \ + Make sure you are using guidance context managers like `with system():`, `with user():` and `with assistant():` \ + to appropriately format your guidance program for this type of model." + ) + + # Update shared data state + self._reset_shared_data(prompt[:pos], temperature) + + # Use cache only when temperature is 0 + if temperature == 0: + cache_key = self._hash_prompt(prompt) + + # Check if the result is already in the cache + if cache_key in self.cache: + for chunk in self.cache[cache_key]: + yield chunk + return + + # Now switch to the example code from AzureAI Studio + + # Prepare for the API call (this might be model specific....) 
+ parameters = dict(temperature=temperature) + payload = dict(input_data=dict(input_string=messages, parameters=parameters)) + + headers = { + "Content-Type": "application/json", + "Authorization": ("Bearer " + self._api_key), + "azureml-model-deployment": self._deployment, + } + + body = str.encode(json.dumps(payload)) + + req = urllib.request.Request(self._endpoint, body, headers) + + response = urllib.request.urlopen(req) + result = json.loads(response.read()) + + # Now back to OpenAIChatEngine + if temperature == 0: + cached_results = [] + + yield result["output"] + + if temperature == 0: + cached_results.append(result["output"]) + + # Cache the results after the generator is exhausted + if temperature == 0: + self.cache[cache_key] = cached_results + class AzureAIStudioChat(Grammarless, Chat): def __init__( self, azureai_studio_endpoint: str, + azureai_studio_deployment: str, azureai_studio_key: str, tokenizer=None, echo: bool = True, @@ -33,6 +126,7 @@ def __init__( super().__init__( AzureAIStudioChatEngine( azureai_studio_endpoint=azureai_studio_endpoint, + azureai_model_deployment=azureai_studio_deployment, azureai_studio_key=azureai_studio_key, tokenizer=tokenizer, max_streaming_tokens=max_streaming_tokens, From 3d90baaa3b000b4c293f47febd585057c9cf1c6e Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Fri, 26 Apr 2024 09:57:07 -0400 Subject: [PATCH 04/29] Inching along --- guidance/models/_azureai_studio.py | 5 +++++ tests/models/test_azureai_studio.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index 73ac6bbda..880379022 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -1,3 +1,4 @@ +import hashlib import json import urllib.request @@ -23,6 +24,10 @@ def __init__( super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs) + def _hash_prompt(self, prompt): + # Copied from OpenAIChatEngine + return hashlib.sha256(f"{prompt}".encode()).hexdigest() + def _generator(self, prompt, temperature: float): # Initial parts of this straight up copied from OpenAIChatEngine diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index 0b8e8cc36..ab51d6450 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -13,10 +13,12 @@ def test_azureai_openai_chat_smoke(rate_limiter): azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_ENDPOINT") + azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_DEPLOYMENT") azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_KEY") lm = models.AzureAIStudioChat( azureai_studio_endpoint=azureai_studio_endpoint, + azureai_studio_deployment=azureai_studio_deployment, azureai_studio_key=azureai_studio_key, ) assert isinstance(lm, models.AzureAIStudioChat) From 32bc793af3fa16350f3d1a5d7967fd5cda5a2425 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Fri, 26 Apr 2024 10:06:04 -0400 Subject: [PATCH 05/29] Trying to get things working :-/ --- guidance/models/_azureai_studio.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index 880379022..db6fc414c 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -1,7 +1,11 @@ import hashlib import json +import pathlib import urllib.request +import diskcache as dc +import platformdirs + from ._model import Chat from ._grammarless import 
GrammarlessEngine, Grammarless @@ -22,6 +26,12 @@ def __init__( self._deployment = azureai_model_deployment self._api_key = azureai_studio_key + path = ( + pathlib.Path(platformdirs.user_cache_dir("guidance")) + / "azureaistudio.tokens" + ) + self.cache = dc.Cache(path) + super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs) def _hash_prompt(self, prompt): @@ -31,6 +41,11 @@ def _hash_prompt(self, prompt): def _generator(self, prompt, temperature: float): # Initial parts of this straight up copied from OpenAIChatEngine + # The next loop (or one like it) appears in several places, + # and quite possibly belongs in a library function or superclass + # That said, I'm not _completely sure that there aren't subtle + # differences between the various versions + # find the role tags pos = 0 role_end = b"<|im_end|>" @@ -84,6 +99,7 @@ def _generator(self, prompt, temperature: float): return # Now switch to the example code from AzureAI Studio + # Might want to rewrite this to the requests package # Prepare for the API call (this might be model specific....) parameters = dict(temperature=temperature) @@ -102,14 +118,17 @@ def _generator(self, prompt, temperature: float): response = urllib.request.urlopen(req) result = json.loads(response.read()) - # Now back to OpenAIChatEngine + # Now back to OpenAIChatEngine, with slight modifications since + # this isn't a streaming API if temperature == 0: cached_results = [] - yield result["output"] + encoded_chunk = result["output"].encode("utf8") + + yield encoded_chunk if temperature == 0: - cached_results.append(result["output"]) + cached_results.append(encoded_chunk) # Cache the results after the generator is exhausted if temperature == 0: From 7840cfd427e5b66d9c8a0b41cbc4d499734daeec Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Fri, 26 Apr 2024 11:10:48 -0400 Subject: [PATCH 06/29] Didn't mean to check that in --- aais_example.py | 60 ------------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 aais_example.py diff --git a/aais_example.py b/aais_example.py deleted file mode 100644 index a8ffafbe7..000000000 --- a/aais_example.py +++ /dev/null @@ -1,60 +0,0 @@ -import urllib.request -import json -import os -import ssl - -def allowSelfSignedHttps(allowed): - # bypass the server certificate verification on client side - if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None): - ssl._create_default_https_context = ssl._create_unverified_context - -allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service. - -# Request data goes here -# The example below assumes JSON formatting which may be updated -# depending on the format your endpoint expects. 
-# More information can be found here:
-# https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script
-data = {
-    "input_data": {
-        "input_string": [
-            {
-                "role": "user",
-                "content": "I am going to Gomorrah, give me a list of 10 places to visit"
-            }
-        ],
-        "parameters": {
-            "temperature": 0.7,
-            "top_p": 0.9,
-            "do_sample": True,
-            "max_new_tokens": 1000
-        }
-    }
-}
-
-
-body = str.encode(json.dumps(data))
-
-url = 'https://guidance-build-azureai-mo-qahti.eastus2.inference.ml.azure.com/score'
-# Replace this with the primary/secondary key, AMLToken, or Microsoft Entra ID token for the endpoint
-api_key = 'jnR8Q7wKkyHaV3xJwXVTT8rTN7MMT3SR'
-if not api_key:
-    raise Exception("A key should be provided to invoke the endpoint")
-
-# The azureml-model-deployment header will force the request to go to a specific deployment.
-# Remove this header to have the request observe the endpoint traffic rules
-headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key), 'azureml-model-deployment': 'phi-3-mini-4k-instruct-2' }
-
-req = urllib.request.Request(url, body, headers)
-
-try:
-    response = urllib.request.urlopen(req)
-
-    result = response.read()
-    print(result)
-except urllib.error.HTTPError as error:
-    print("The request failed with status code: " + str(error.code))
-
-    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
-    print(error.info())
-    print(error.read().decode("utf8", 'ignore'))

From 04e45c749cd8199696d8e679bd18dca978fa56a1 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 26 Apr 2024 11:19:12 -0400
Subject: [PATCH 07/29] Erroneous addition

---
 tests/models/test_model.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index c6f1478a2..1dc4de113 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -6,8 +6,6 @@
 
 def test_fstring(selected_model):
     lm = selected_model
-    print(f"{dir(lm.engine.tokenizer)=}")
-    assert hasattr(lm.engine.tokenizer,"sp_model")
 
     lm += f'this is a test {select(["item1", "item2"])}'
     assert str(lm) in ["this is a test item1", "this is a test item2"]

From 25ecccfb2640abc8c3d7c171202b6bb65c1b0a20 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Mon, 29 Apr 2024 08:55:08 -0400
Subject: [PATCH 08/29] Switch to requests

---
 guidance/models/_azureai_studio.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index db6fc414c..97b2f31b1 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -1,10 +1,9 @@
 import hashlib
-import json
 import pathlib
-import urllib.request
 
 import diskcache as dc
 import platformdirs
+import requests
 
 from ._model import Chat
 from ._grammarless import GrammarlessEngine, Grammarless
@@ -98,9 +97,6 @@ def _generator(self, prompt, temperature: float):
             yield chunk
             return
 
-        # Now switch to the example code from AzureAI Studio
-        # Might want to rewrite this to the requests package
-
         # Prepare for the API call (this might be model specific....)
parameters = dict(temperature=temperature) payload = dict(input_data=dict(input_string=messages, parameters=parameters)) @@ -111,12 +107,13 @@ def _generator(self, prompt, temperature: float): "azureml-model-deployment": self._deployment, } - body = str.encode(json.dumps(payload)) - - req = urllib.request.Request(self._endpoint, body, headers) + response = requests.post( + self._endpoint, + json=payload, + headers=headers, + ) - response = urllib.request.urlopen(req) - result = json.loads(response.read()) + result = response.json() # Now back to OpenAIChatEngine, with slight modifications since # this isn't a streaming API From 1265346714ae9364f2350d199d213cfb104ed3a0 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Mon, 29 Apr 2024 10:40:06 -0400 Subject: [PATCH 09/29] Make sure that cache is unique to endpoint/deployment --- guidance/models/_azureai_studio.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index 97b2f31b1..e90f2430f 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -25,9 +25,13 @@ def __init__( self._deployment = azureai_model_deployment self._api_key = azureai_studio_key + # There is a cache... better make sure it's specific + # to the endpoint and deployment + deployment_id = self._hash_prompt(self._endpoint + self._deployment) + path = ( pathlib.Path(platformdirs.user_cache_dir("guidance")) - / "azureaistudio.tokens" + / f"azureaistudio.tokens.{deployment_id}" ) self.cache = dc.Cache(path) @@ -44,7 +48,7 @@ def _generator(self, prompt, temperature: float): # and quite possibly belongs in a library function or superclass # That said, I'm not _completely sure that there aren't subtle # differences between the various versions - + # find the role tags pos = 0 role_end = b"<|im_end|>" From f348880bb712b651031d48f7fe2ca8d7f970892c Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Mon, 29 Apr 2024 10:40:27 -0400 Subject: [PATCH 10/29] Starting to test mistral too.... not fully working yet --- tests/models/test_azureai_studio.py | 37 +++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index ab51d6450..49d0298e9 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -11,10 +11,10 @@ pytestmark = pytest.mark.needs_credentials -def test_azureai_openai_chat_smoke(rate_limiter): - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_ENDPOINT") - azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_DEPLOYMENT") - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_KEY") +def test_azureai_phi3_chat_smoke(rate_limiter): + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_PHI3_ENDPOINT") + azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_PHI3_DEPLOYMENT") + azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_PHI3_KEY") lm = models.AzureAIStudioChat( azureai_studio_endpoint=azureai_studio_endpoint, @@ -30,8 +30,35 @@ def test_azureai_openai_chat_smoke(rate_limiter): lm += "What is 1 + 1?" 
with assistant(): - lm += gen(max_tokens=10, name="text") + lm += gen(max_tokens=10, name="text", temperature=0.5) lm += "Pick a number: " print(str(lm)) assert len(lm["text"]) > 0 + + +def test_azureai_mistral_chat_smoke(rate_limiter): + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT") + azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT") + azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_KEY") + + lm = models.AzureAIStudioChat( + azureai_studio_endpoint=azureai_studio_endpoint, + azureai_studio_deployment=azureai_studio_deployment, + azureai_studio_key=azureai_studio_key, + ) + assert isinstance(lm, models.AzureAIStudioChat) + lm.engine.cache.clear() + + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text", temperature=0.5) + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) < 0 From 0fc4727afa71811d55816c1a56bc5353617d4b56 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Mon, 29 Apr 2024 11:00:31 -0400 Subject: [PATCH 11/29] Get the Mistral test working --- guidance/models/_azureai_studio.py | 2 +- tests/models/test_azureai_studio.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index e90f2430f..54ebbd4f1 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -48,7 +48,7 @@ def _generator(self, prompt, temperature: float): # and quite possibly belongs in a library function or superclass # That said, I'm not _completely sure that there aren't subtle # differences between the various versions - + # find the role tags pos = 0 role_end = b"<|im_end|>" diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index 49d0298e9..6fa6bfec6 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -50,15 +50,16 @@ def test_azureai_mistral_chat_smoke(rate_limiter): assert isinstance(lm, models.AzureAIStudioChat) lm.engine.cache.clear() - with system(): - lm += "You are a math wiz." + # No "system" role for Mistral? + # with system(): + # lm += "You are a math wiz." with user(): lm += "What is 1 + 1?" 
with assistant(): - lm += gen(max_tokens=10, name="text", temperature=0.5) - lm += "Pick a number: " + lm += gen(max_tokens=15, name="text", temperature=0.5) + lm += "\nPick a number: " print(str(lm)) - assert len(lm["text"]) < 0 + assert len(lm["text"]) > 0 From 2be7f58c34fd1a650d73de0bf1d8933129c5d804 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Mon, 29 Apr 2024 11:24:57 -0400 Subject: [PATCH 12/29] Add LLama3 --- tests/models/test_azureai_studio.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index 6fa6bfec6..45ba1bc3f 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -63,3 +63,29 @@ def test_azureai_mistral_chat_smoke(rate_limiter): print(str(lm)) assert len(lm["text"]) > 0 + + +def test_azureai_llama3_chat_smoke(rate_limiter): + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_ENDPOINT") + azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_DEPLOYMENT") + azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_KEY") + + lm = models.AzureAIStudioChat( + azureai_studio_endpoint=azureai_studio_endpoint, + azureai_studio_deployment=azureai_studio_deployment, + azureai_studio_key=azureai_studio_key, + ) + assert isinstance(lm, models.AzureAIStudioChat) + + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text", temperature=0.5) + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 From 3bcb48ea6ae3e553230a73606505c478adf384da Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Tue, 30 Apr 2024 08:25:51 -0400 Subject: [PATCH 13/29] Expand the endpoint configuration --- .github/workflows/ci_tests.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_tests.yml b/.github/workflows/ci_tests.yml index de28e840c..8cd45f378 100644 --- a/.github/workflows/ci_tests.yml +++ b/.github/workflows/ci_tests.yml @@ -58,13 +58,23 @@ jobs: python -c "import torch; assert torch.cuda.is_available()" - name: Test with pytest env: - # Configure endpoints + # Configure endpoints for Azure OpenAI AZUREAI_CHAT_ENDPOINT: ${{ secrets.AZUREAI_CHAT_ENDPOINT }} AZUREAI_CHAT_KEY: ${{ secrets.AZUREAI_CHAT_KEY }} AZUREAI_CHAT_MODEL: ${{ secrets.AZUREAI_CHAT_MODEL }} AZUREAI_COMPLETION_ENDPOINT: ${{ secrets.AZUREAI_COMPLETION_ENDPOINT }} AZUREAI_COMPLETION_KEY: ${{ secrets.AZUREAI_COMPLETION_KEY }} AZUREAI_COMPLETION_MODEL: ${{ secrets.AZUREAI_COMPLETION_MODEL }} + # Configure endpoints for Azure AI Studio + AZURE_AI_STUDIO_PHI3_ENDPOINT: ${{ vars.AZURE_AI_STUDIO_PHI3_ENDPOINT }} + AZURE_AI_STUDIO_PHI3_DEPLOYMENT: ${{ vars.AZURE_AI_STUDIO_PHI3_DEPLOYMENT }} + AZURE_AI_STUDIO_PHI3_KEY: ${{ secrets.AZURE_AI_STUDIO_PHI3_KEY }} + AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT: ${{ vars.AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT }} + AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT: ${{ vars.AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT }} + AZURE_AI_STUDIO_MISTRAL_CHAT_KEY: ${{ secrets.AZURE_AI_STUDIO_MISTRAL_CHAT_KEY }} + AZURE_AI_STUDIO_LLAMA3_CHAT_ENDPOINT: ${{ vars.AZURE_AI_STUDIO_LLAMA3_CHAT_ENDPOINT }} + AZURE_AI_STUDIO_LLAMA3_CHAT_DEPLOYMENT: ${{ vars.AZURE_AI_STUDIO_LLAMA3_CHAT_DEPLOYMENT }} + AZURE_AI_STUDIO_LLAMA3_CHAT_KEY: ${{ secrets.AZURE_AI_STUDIO_LLAMA3_CHAT_KEY }} run: | pytest --cov=guidance --cov-report=xml --cov-report=term-missing \ -m needs_credentials \ 
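
For reference, the tests wired up above exercise the new model class roughly as follows. This is a minimal usage sketch based on the tests in this series, assuming the AZURE_AI_STUDIO_PHI3_* variables from the workflow are set in the environment:

import os

from guidance import assistant, gen, models, system, user

# Endpoint details come from the environment variables configured above
lm = models.AzureAIStudioChat(
    azureai_studio_endpoint=os.environ["AZURE_AI_STUDIO_PHI3_ENDPOINT"],
    azureai_studio_deployment=os.environ["AZURE_AI_STUDIO_PHI3_DEPLOYMENT"],
    azureai_studio_key=os.environ["AZURE_AI_STUDIO_PHI3_KEY"],
)

# Build the chat transcript with guidance's role context managers,
# then let the endpoint generate the assistant turn
with system():
    lm += "You are a math wiz."
with user():
    lm += "What is 1 + 1?"
with assistant():
    lm += gen(max_tokens=10, name="text")

print(lm["text"])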
From 9e46101b240efe3985227652af59bf1f6abd38b1 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 1 May 2024 09:02:16 -0400
Subject: [PATCH 14/29] Add option to clear cache on instantiating model

---
 guidance/models/_azureai_studio.py  | 5 +++++
 tests/models/test_azureai_studio.py | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 54ebbd4f1..dc1e938c4 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -20,6 +20,7 @@ def __init__(
         azureai_studio_endpoint: str,
         azureai_model_deployment: str,
         azureai_studio_key: str,
+        clear_cache: bool,
     ):
         self._endpoint = azureai_studio_endpoint
         self._deployment = azureai_model_deployment
@@ -34,6 +35,8 @@ def __init__(
             / f"azureaistudio.tokens.{deployment_id}"
         )
         self.cache = dc.Cache(path)
+        if clear_cache:
+            self.cache.clear()
 
         super().__init__(tokenizer, max_streaming_tokens, timeout, compute_log_probs)
 
@@ -147,6 +150,7 @@ def __init__(
         max_streaming_tokens: int = 1000,
         timeout: float = 0.5,
         compute_log_probs: bool = False,
+        clear_cache: bool = False,
     ):
         super().__init__(
             AzureAIStudioChatEngine(
@@ -157,6 +161,7 @@ def __init__(
                 max_streaming_tokens=max_streaming_tokens,
                 timeout=timeout,
                 compute_log_probs=compute_log_probs,
+                clear_cache=False,
             ),
             echo=echo,
         )
diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py
index 45ba1bc3f..102f1889a 100644
--- a/tests/models/test_azureai_studio.py
+++ b/tests/models/test_azureai_studio.py
@@ -20,6 +20,7 @@ def test_azureai_phi3_chat_smoke(rate_limiter):
         azureai_studio_endpoint=azureai_studio_endpoint,
         azureai_studio_deployment=azureai_studio_deployment,
         azureai_studio_key=azureai_studio_key,
+        clear_cache=True,
     )
     assert isinstance(lm, models.AzureAIStudioChat)
 
@@ -46,6 +47,7 @@ def test_azureai_mistral_chat_smoke(rate_limiter):
         azureai_studio_endpoint=azureai_studio_endpoint,
         azureai_studio_deployment=azureai_studio_deployment,
         azureai_studio_key=azureai_studio_key,
+        clear_cache=True,
    )
     assert isinstance(lm, models.AzureAIStudioChat)
     lm.engine.cache.clear()
@@ -74,6 +76,7 @@ def test_azureai_llama3_chat_smoke(rate_limiter):
         azureai_studio_endpoint=azureai_studio_endpoint,
         azureai_studio_deployment=azureai_studio_deployment,
         azureai_studio_key=azureai_studio_key,
+        clear_cache=True,
     )
     assert isinstance(lm, models.AzureAIStudioChat)

From 0a7cc81a56368fe33238ea0d9cd1db7648f4d29d Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 1 May 2024 09:32:21 -0400
Subject: [PATCH 15/29] Some more experimenting

---
 tests/models/test_azureai_studio.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py
index 102f1889a..bb6f0e83a 100644
--- a/tests/models/test_azureai_studio.py
+++ b/tests/models/test_azureai_studio.py
@@ -36,6 +36,7 @@ def test_azureai_phi3_chat_smoke(rate_limiter):
 
     print(str(lm))
     assert len(lm["text"]) > 0
+    assert str(lm).endswith("Pick a number: <|im_end|>")
 
 
 def test_azureai_mistral_chat_smoke(rate_limiter):
@@ -65,6 +66,7 @@ def test_azureai_mistral_chat_smoke(rate_limiter):
 
     print(str(lm))
     assert len(lm["text"]) > 0
+    assert str(lm).endswith("Pick a number: <|im_end|>")
 
 
 def test_azureai_llama3_chat_smoke(rate_limiter):
@@ -86,9 +88,24 @@ def test_azureai_llama3_chat_smoke(rate_limiter):
     with user():
         lm += "What is 1 + 1?"
 
+    with assistant():
+        lm += "2"
+
+    with user():
+        lm += "What is 2 + 3?"
+ with assistant(): lm += gen(max_tokens=10, name="text", temperature=0.5) lm += "Pick a number: " - print(str(lm)) assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + with user(): + lm += "I pick 10. Can you pick a number between 0 and 20?" + + with assistant(): + lm += gen(max_tokens=2, name="number") + + print(str(lm)) + assert len(lm["number"]) < 0 From 1584d9ff5d1d47048252140cccfeee2b94094d6f Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Wed, 1 May 2024 09:58:47 -0400 Subject: [PATCH 16/29] Want some parallel Azure OpenAI tests --- tests/models/test_azureai_openai.py | 75 +++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py index c2bc818d7..1de6e5182 100644 --- a/tests/models/test_azureai_openai.py +++ b/tests/models/test_azureai_openai.py @@ -37,6 +37,81 @@ def test_azureai_openai_chat_smoke(rate_limiter): print(str(lm)) assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + +def test_azureai_openai_chat_longer_1(rate_limiter): + azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT") + azureai_key = _env_or_fail("AZUREAI_CHAT_KEY") + model = _env_or_fail("AZUREAI_CHAT_MODEL") + + lm = models.AzureOpenAI( + model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key + ) + assert isinstance(lm, models.AzureOpenAIChat) + + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text") + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + with user(): + lm += "10. Now you pick a number between 0 and 20" + + with assistant(): + lm += gen(max_tokens=2, name="number") + + print(str(lm)) + assert len(lm["number"]) > 0 + + +def test_azureai_openai_chat_longer_2(rate_limiter): + azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT") + azureai_key = _env_or_fail("AZUREAI_CHAT_KEY") + model = _env_or_fail("AZUREAI_CHAT_MODEL") + + lm = models.AzureOpenAI( + model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key + ) + assert isinstance(lm, models.AzureOpenAIChat) + + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += "2" + + with user(): + lm += "What is 2 + 3?" + + with assistant(): + lm += gen(max_tokens=10, name="text") + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + with user(): + lm += "10. 
Now you pick a number between 0 and 20" + + with assistant(): + lm += gen(max_tokens=2, name="number") + + print(str(lm)) + assert len(lm["number"]) < 0 def test_azureai_openai_chat_alt_args(rate_limiter): From 60b23c899740c86b29535d8f53ecb3fd78ec62e8 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Wed, 1 May 2024 10:08:32 -0400 Subject: [PATCH 17/29] Copy/paste error --- tests/models/test_azureai_openai.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py index 1de6e5182..11c3d91d7 100644 --- a/tests/models/test_azureai_openai.py +++ b/tests/models/test_azureai_openai.py @@ -41,9 +41,9 @@ def test_azureai_openai_chat_smoke(rate_limiter): def test_azureai_openai_chat_longer_1(rate_limiter): - azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT") - azureai_key = _env_or_fail("AZUREAI_CHAT_KEY") - model = _env_or_fail("AZUREAI_CHAT_MODEL") + azureai_endpoint = env_or_fail("AZUREAI_CHAT_ENDPOINT") + azureai_key = env_or_fail("AZUREAI_CHAT_KEY") + model = env_or_fail("AZUREAI_CHAT_MODEL") lm = models.AzureOpenAI( model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key @@ -75,9 +75,9 @@ def test_azureai_openai_chat_longer_1(rate_limiter): def test_azureai_openai_chat_longer_2(rate_limiter): - azureai_endpoint = _env_or_fail("AZUREAI_CHAT_ENDPOINT") - azureai_key = _env_or_fail("AZUREAI_CHAT_KEY") - model = _env_or_fail("AZUREAI_CHAT_MODEL") + azureai_endpoint = env_or_fail("AZUREAI_CHAT_ENDPOINT") + azureai_key = env_or_fail("AZUREAI_CHAT_KEY") + model = env_or_fail("AZUREAI_CHAT_MODEL") lm = models.AzureOpenAI( model=model, azure_endpoint=azureai_endpoint, api_key=azureai_key From 10fc9ba7d7fcdf676f2eb627be62ba76cb7f2b76 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Wed, 1 May 2024 10:14:44 -0400 Subject: [PATCH 18/29] Change test to passing --- tests/models/test_azureai_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py index 11c3d91d7..e64f5d040 100644 --- a/tests/models/test_azureai_openai.py +++ b/tests/models/test_azureai_openai.py @@ -111,7 +111,7 @@ def test_azureai_openai_chat_longer_2(rate_limiter): lm += gen(max_tokens=2, name="number") print(str(lm)) - assert len(lm["number"]) < 0 + assert len(lm["number"]) > 0 def test_azureai_openai_chat_alt_args(rate_limiter): From b68e9d716a252eb5bb6c2774ee6f2f1e7aaee773 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Wed, 1 May 2024 10:18:28 -0400 Subject: [PATCH 19/29] Expand Azure AI Studio testing --- tests/models/test_azureai_studio.py | 90 +++++++++++++++++++---------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index bb6f0e83a..33bf5a4c4 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -11,10 +11,23 @@ pytestmark = pytest.mark.needs_credentials -def test_azureai_phi3_chat_smoke(rate_limiter): - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_PHI3_ENDPOINT") - azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_PHI3_DEPLOYMENT") - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_PHI3_KEY") +def _get_chat_model(model_name: str): + if model_name == "phi3": + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_PHI3_ENDPOINT") + azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_PHI3_DEPLOYMENT") + azureai_studio_key = 
env_or_fail("AZURE_AI_STUDIO_PHI3_KEY") + elif model_name == "mistral": + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT") + azureai_studio_deployment = env_or_fail( + "AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT" + ) + azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_KEY") + elif model_name == "llama3": + azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_ENDPOINT") + azureai_studio_deployment = env_or_fail( + "AZURE_AI_STUDIO_LLAMA3_CHAT_DEPLOYMENT" + ) + azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_KEY") lm = models.AzureAIStudioChat( azureai_studio_endpoint=azureai_studio_endpoint, @@ -23,6 +36,12 @@ def test_azureai_phi3_chat_smoke(rate_limiter): clear_cache=True, ) assert isinstance(lm, models.AzureAIStudioChat) + return lm + + +@pytest.mark.parametrize("chat_model_name", ["phi3", "llama3"]) +def test_azureai_chat_smoke(rate_limiter, chat_model_name: str): + lm = _get_chat_model(chat_model_name) with system(): lm += "You are a math wiz." @@ -40,18 +59,7 @@ def test_azureai_phi3_chat_smoke(rate_limiter): def test_azureai_mistral_chat_smoke(rate_limiter): - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT") - azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT") - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_KEY") - - lm = models.AzureAIStudioChat( - azureai_studio_endpoint=azureai_studio_endpoint, - azureai_studio_deployment=azureai_studio_deployment, - azureai_studio_key=azureai_studio_key, - clear_cache=True, - ) - assert isinstance(lm, models.AzureAIStudioChat) - lm.engine.cache.clear() + lm = _get_chat_model("mistral") # No "system" role for Mistral? # with system(): @@ -69,18 +77,37 @@ def test_azureai_mistral_chat_smoke(rate_limiter): assert str(lm).endswith("Pick a number: <|im_end|>") -def test_azureai_llama3_chat_smoke(rate_limiter): - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_ENDPOINT") - azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_DEPLOYMENT") - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_KEY") +@pytest.mark.parametrize("chat_model_name", ["phi3", "llama3"]) +def test_azureai_chat_longer_1(rate_limiter, chat_model_name: str): + lm = _get_chat_model(chat_model_name) - lm = models.AzureAIStudioChat( - azureai_studio_endpoint=azureai_studio_endpoint, - azureai_studio_deployment=azureai_studio_deployment, - azureai_studio_key=azureai_studio_key, - clear_cache=True, - ) - assert isinstance(lm, models.AzureAIStudioChat) + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text") + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + with user(): + lm += "10. Now you pick a number between 0 and 20" + + with assistant(): + lm += gen(max_tokens=2, name="number") + + print(str(lm)) + assert len(lm["number"]) > 0 + + +@pytest.mark.parametrize("chat_model_name", ["phi3", "llama3"]) +def test_azureai_chat_longer_2(rate_limiter, chat_model_name: str): + lm = _get_chat_model(chat_model_name) with system(): lm += "You are a math wiz." @@ -90,22 +117,23 @@ def test_azureai_llama3_chat_smoke(rate_limiter): with assistant(): lm += "2" - + with user(): lm += "What is 2 + 3?" 
with assistant(): - lm += gen(max_tokens=10, name="text", temperature=0.5) + lm += gen(max_tokens=10, name="text") lm += "Pick a number: " + print(str(lm)) assert len(lm["text"]) > 0 assert str(lm).endswith("Pick a number: <|im_end|>") with user(): - lm += "I pick 10. Can you pick a number between 0 and 20?" + lm += "10. Now you pick a number between 0 and 20" with assistant(): lm += gen(max_tokens=2, name="number") print(str(lm)) - assert len(lm["number"]) < 0 + assert len(lm["number"]) > 0 From 9a4c1a87d7b0152e1f84353c68220876c8b36849 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Wed, 1 May 2024 10:36:04 -0400 Subject: [PATCH 20/29] Refactor tests --- tests/models/test_azureai_studio.py | 68 ++++++++++------------------- 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index 33bf5a4c4..0e0fff587 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -10,24 +10,17 @@ # Mark is configured in pyproject.toml pytestmark = pytest.mark.needs_credentials +# How to fill out the environment variables to +# set up the models +_chat_models = {"phi3": "PHI3", "mistral": "MISTRAL_CHAT", "llama3": "LLAMA3_CHAT"} + def _get_chat_model(model_name: str): - if model_name == "phi3": - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_PHI3_ENDPOINT") - azureai_studio_deployment = env_or_fail("AZURE_AI_STUDIO_PHI3_DEPLOYMENT") - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_PHI3_KEY") - elif model_name == "mistral": - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_ENDPOINT") - azureai_studio_deployment = env_or_fail( - "AZURE_AI_STUDIO_MISTRAL_CHAT_DEPLOYMENT" - ) - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_MISTRAL_CHAT_KEY") - elif model_name == "llama3": - azureai_studio_endpoint = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_ENDPOINT") - azureai_studio_deployment = env_or_fail( - "AZURE_AI_STUDIO_LLAMA3_CHAT_DEPLOYMENT" - ) - azureai_studio_key = env_or_fail("AZURE_AI_STUDIO_LLAMA3_CHAT_KEY") + env_string = _chat_models[model_name] + + azureai_studio_endpoint = env_or_fail(f"AZURE_AI_STUDIO_{env_string}_ENDPOINT") + azureai_studio_deployment = env_or_fail(f"AZURE_AI_STUDIO_{env_string}_DEPLOYMENT") + azureai_studio_key = env_or_fail(f"AZURE_AI_STUDIO_{env_string}_KEY") lm = models.AzureAIStudioChat( azureai_studio_endpoint=azureai_studio_endpoint, @@ -39,12 +32,14 @@ def _get_chat_model(model_name: str): return lm -@pytest.mark.parametrize("chat_model_name", ["phi3", "llama3"]) +@pytest.mark.parametrize("chat_model_name", _chat_models.keys()) def test_azureai_chat_smoke(rate_limiter, chat_model_name: str): lm = _get_chat_model(chat_model_name) - with system(): - lm += "You are a math wiz." + # This makes me unhappy + if chat_model_name != "mistral": + with system(): + lm += "You are a math wiz." with user(): lm += "What is 1 + 1?" @@ -58,31 +53,14 @@ def test_azureai_chat_smoke(rate_limiter, chat_model_name: str): assert str(lm).endswith("Pick a number: <|im_end|>") -def test_azureai_mistral_chat_smoke(rate_limiter): - lm = _get_chat_model("mistral") - - # No "system" role for Mistral? - # with system(): - # lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" 
- - with assistant(): - lm += gen(max_tokens=15, name="text", temperature=0.5) - lm += "\nPick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") - - -@pytest.mark.parametrize("chat_model_name", ["phi3", "llama3"]) +@pytest.mark.parametrize("chat_model_name", _chat_models.keys()) def test_azureai_chat_longer_1(rate_limiter, chat_model_name: str): lm = _get_chat_model(chat_model_name) - with system(): - lm += "You are a math wiz." + # This makes me unhappy + if chat_model_name != "mistral": + with system(): + lm += "You are a math wiz." with user(): lm += "What is 1 + 1?" @@ -105,12 +83,14 @@ def test_azureai_chat_longer_1(rate_limiter, chat_model_name: str): assert len(lm["number"]) > 0 -@pytest.mark.parametrize("chat_model_name", ["phi3", "llama3"]) +@pytest.mark.parametrize("chat_model_name", _chat_models.keys()) def test_azureai_chat_longer_2(rate_limiter, chat_model_name: str): lm = _get_chat_model(chat_model_name) - with system(): - lm += "You are a math wiz." + # This makes me unhappy + if chat_model_name != "mistral": + with system(): + lm += "You are a math wiz." with user(): lm += "What is 1 + 1?" From c0769b69efed6a73fc0c6a5fb7faee2bb088d048 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Wed, 1 May 2024 10:51:59 -0400 Subject: [PATCH 21/29] Refactor tests --- tests/models/common_chat_testing.py | 78 ++++++++++++++++++++++++++++ tests/models/test_azureai_openai.py | 80 ++--------------------------- tests/models/test_azureai_studio.py | 76 ++------------------------- 3 files changed, 88 insertions(+), 146 deletions(-) create mode 100644 tests/models/common_chat_testing.py diff --git a/tests/models/common_chat_testing.py b/tests/models/common_chat_testing.py new file mode 100644 index 000000000..99c45e860 --- /dev/null +++ b/tests/models/common_chat_testing.py @@ -0,0 +1,78 @@ +from guidance import assistant, gen, models, system, user + + +def smoke_chat(lm: models.Chat, has_system_role: bool = True): + if has_system_role: + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text", temperature=0.5) + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + +def longer_chat_1(lm: models.Chat, has_system_role: bool = True): + if has_system_role: + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + with assistant(): + lm += gen(max_tokens=10, name="text") + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + with user(): + lm += "10. Now you pick a number between 0 and 20" + + with assistant(): + lm += gen(max_tokens=2, name="number") + + print(str(lm)) + assert len(lm["number"]) > 0 + + +def longer_chat_2(lm: models.Chat, has_system_role: bool = True): + if has_system_role: + with system(): + lm += "You are a math wiz." + + with user(): + lm += "What is 1 + 1?" + + # This is the new part compared to longer_chat_1 + with assistant(): + lm += "2" + + with user(): + lm += "What is 2 + 3?" + + # Resume the previous + with assistant(): + lm += gen(max_tokens=10, name="text") + lm += "Pick a number: " + + print(str(lm)) + assert len(lm["text"]) > 0 + assert str(lm).endswith("Pick a number: <|im_end|>") + + with user(): + lm += "10. 
Now you pick a number between 0 and 20" + + with assistant(): + lm += gen(max_tokens=2, name="number") + + print(str(lm)) + assert len(lm["number"]) > 0 diff --git a/tests/models/test_azureai_openai.py b/tests/models/test_azureai_openai.py index e64f5d040..1a70d759f 100644 --- a/tests/models/test_azureai_openai.py +++ b/tests/models/test_azureai_openai.py @@ -7,6 +7,7 @@ from guidance import assistant, gen, models, system, user +from . import common_chat_testing from ..utils import env_or_fail # Everything in here needs credentials to work @@ -25,19 +26,7 @@ def test_azureai_openai_chat_smoke(rate_limiter): ) assert isinstance(lm, models.AzureOpenAIChat) - with system(): - lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += gen(max_tokens=10, name="text") - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") + common_chat_testing.smoke_chat(lm) def test_azureai_openai_chat_longer_1(rate_limiter): @@ -50,28 +39,7 @@ def test_azureai_openai_chat_longer_1(rate_limiter): ) assert isinstance(lm, models.AzureOpenAIChat) - with system(): - lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += gen(max_tokens=10, name="text") - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") - - with user(): - lm += "10. Now you pick a number between 0 and 20" - - with assistant(): - lm += gen(max_tokens=2, name="number") - - print(str(lm)) - assert len(lm["number"]) > 0 + common_chat_testing.longer_chat_1(lm) def test_azureai_openai_chat_longer_2(rate_limiter): @@ -84,34 +52,7 @@ def test_azureai_openai_chat_longer_2(rate_limiter): ) assert isinstance(lm, models.AzureOpenAIChat) - with system(): - lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += "2" - - with user(): - lm += "What is 2 + 3?" - - with assistant(): - lm += gen(max_tokens=10, name="text") - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") - - with user(): - lm += "10. Now you pick a number between 0 and 20" - - with assistant(): - lm += gen(max_tokens=2, name="number") - - print(str(lm)) - assert len(lm["number"]) > 0 + common_chat_testing.longer_chat_2(lm) def test_azureai_openai_chat_alt_args(rate_limiter): @@ -133,18 +74,7 @@ def test_azureai_openai_chat_alt_args(rate_limiter): azure_deployment=azureai_deployment, ) - with system(): - lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += gen(max_tokens=10, name="text") - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 + common_chat_testing.smoke_chat(lm) def test_azureai_openai_completion_smoke(rate_limiter): diff --git a/tests/models/test_azureai_studio.py b/tests/models/test_azureai_studio.py index 0e0fff587..c13dbb2f5 100644 --- a/tests/models/test_azureai_studio.py +++ b/tests/models/test_azureai_studio.py @@ -4,6 +4,8 @@ from guidance import assistant, gen, models, system, user + +from . import common_chat_testing from ..utils import env_or_fail # Everything in here needs credentials to work @@ -36,84 +38,16 @@ def _get_chat_model(model_name: str): def test_azureai_chat_smoke(rate_limiter, chat_model_name: str): lm = _get_chat_model(chat_model_name) - # This makes me unhappy - if chat_model_name != "mistral": - with system(): - lm += "You are a math wiz." 
- - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += gen(max_tokens=10, name="text", temperature=0.5) - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") + common_chat_testing.smoke_chat(lm, chat_model_name != "mistral") @pytest.mark.parametrize("chat_model_name", _chat_models.keys()) def test_azureai_chat_longer_1(rate_limiter, chat_model_name: str): lm = _get_chat_model(chat_model_name) - - # This makes me unhappy - if chat_model_name != "mistral": - with system(): - lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += gen(max_tokens=10, name="text") - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") - - with user(): - lm += "10. Now you pick a number between 0 and 20" - - with assistant(): - lm += gen(max_tokens=2, name="number") - - print(str(lm)) - assert len(lm["number"]) > 0 + common_chat_testing.longer_chat_1(lm, chat_model_name != "mistral") @pytest.mark.parametrize("chat_model_name", _chat_models.keys()) def test_azureai_chat_longer_2(rate_limiter, chat_model_name: str): lm = _get_chat_model(chat_model_name) - - # This makes me unhappy - if chat_model_name != "mistral": - with system(): - lm += "You are a math wiz." - - with user(): - lm += "What is 1 + 1?" - - with assistant(): - lm += "2" - - with user(): - lm += "What is 2 + 3?" - - with assistant(): - lm += gen(max_tokens=10, name="text") - lm += "Pick a number: " - - print(str(lm)) - assert len(lm["text"]) > 0 - assert str(lm).endswith("Pick a number: <|im_end|>") - - with user(): - lm += "10. Now you pick a number between 0 and 20" - - with assistant(): - lm += gen(max_tokens=2, name="number") - - print(str(lm)) - assert len(lm["number"]) > 0 + common_chat_testing.longer_chat_2(lm, chat_model_name != "mistral") From 9c755d6a0a45064811d1f579636da24979612933 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Thu, 2 May 2024 06:38:15 -0400 Subject: [PATCH 22/29] Start doc writing --- guidance/models/_azureai_studio.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index dc1e938c4..1bf5a5082 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -152,6 +152,10 @@ def __init__( compute_log_probs: bool = False, clear_cache: bool = False, ): + """Create a model object for interacting with Azure AI Studio chat endpoints + + + """ super().__init__( AzureAIStudioChatEngine( azureai_studio_endpoint=azureai_studio_endpoint, @@ -161,7 +165,7 @@ def __init__( max_streaming_tokens=max_streaming_tokens, timeout=timeout, compute_log_probs=compute_log_probs, - clear_cache=False, + clear_cache=clear_cache, ), echo=echo, ) From 21ee13fdfacf3ee148da2f61f25c030da6f41d2a Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Thu, 2 May 2024 06:43:23 -0400 Subject: [PATCH 23/29] Add some basic docs --- guidance/models/_azureai_studio.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py index 1bf5a5082..8ce88ddca 100644 --- a/guidance/models/_azureai_studio.py +++ b/guidance/models/_azureai_studio.py @@ -152,9 +152,25 @@ def __init__( compute_log_probs: bool = False, clear_cache: bool = False, ): - """Create a model object for interacting with Azure AI Studio chat 
From 9c755d6a0a45064811d1f579636da24979612933 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 2 May 2024 06:38:15 -0400
Subject: [PATCH 22/29] Start doc writing

---
 guidance/models/_azureai_studio.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index dc1e938c4..1bf5a5082 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -152,6 +152,10 @@ def __init__(
         compute_log_probs: bool = False,
         clear_cache: bool = False,
     ):
+        """Create a model object for interacting with Azure AI Studio chat endpoints
+
+
+        """
         super().__init__(
             AzureAIStudioChatEngine(
                 azureai_studio_endpoint=azureai_studio_endpoint,
@@ -161,7 +165,7 @@ def __init__(
                 max_streaming_tokens=max_streaming_tokens,
                 timeout=timeout,
                 compute_log_probs=compute_log_probs,
-                clear_cache=False,
+                clear_cache=clear_cache,
             ),
             echo=echo,
         )

From 21ee13fdfacf3ee148da2f61f25c030da6f41d2a Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 2 May 2024 06:43:23 -0400
Subject: [PATCH 23/29] Add some basic docs

---
 guidance/models/_azureai_studio.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 1bf5a5082..8ce88ddca 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -152,9 +152,25 @@ def __init__(
         compute_log_probs: bool = False,
         clear_cache: bool = False,
     ):
-        """Create a model object for interacting with Azure AI Studio chat endpoints
-
-
+        """Create a model object for interacting with Azure AI Studio chat endpoints.
+
+        The required information about the deployed endpoint can
+        be obtained from Azure AI Studio.
+
+        A `diskcache`-based caching system is used to speed up
+        repeated calls when the temperature is specified to be
+        zero.
+
+        Parameters
+        ----------
+        azureai_studio_endpoint : str
+            The HTTPS endpoint deployed by Azure AI Studio
+        azureai_studio_deployment : str
+            The specific model deployed to the endpoint
+        azureai_studio_key : str
+            The key required for access to the API
+        clear_cache : bool
+            Whether to empty the internal cache
         """
         super().__init__(
             AzureAIStudioChatEngine(
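
For orientation, constructing and using the class documented above might look like the following sketch. The exported name `models.AzureAIStudioChat` is inferred from the engine class name and is not confirmed by this excerpt; the environment variable names are placeholders.

import os

from guidance import assistant, gen, models, system, user

# Endpoint details come from the deployment page in Azure AI Studio;
# these environment variable names are illustrative only.
lm = models.AzureAIStudioChat(
    azureai_studio_endpoint=os.environ["AZUREAI_STUDIO_ENDPOINT"],
    azureai_studio_deployment=os.environ["AZUREAI_STUDIO_DEPLOYMENT"],
    azureai_studio_key=os.environ["AZUREAI_STUDIO_KEY"],
)

with system():
    lm += "You are a helpful assistant."

with user():
    lm += "What is 1 + 1?"

with assistant():
    lm += gen(max_tokens=10, name="answer")

print(lm["answer"])
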
From cdf679c7373e41035008bd0cef02c26b089228cf Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 3 May 2024 09:44:24 -0400
Subject: [PATCH 24/29] Use the new endpoint

---
 guidance/models/_azureai_studio.py | 58 +++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 8ce88ddca..53bd56e07 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -1,7 +1,9 @@
 import hashlib
 import pathlib
+import urllib.parse
 
 import diskcache as dc
+import openai
 import platformdirs
 import requests
 
@@ -22,7 +24,13 @@ def __init__(
         azureai_studio_key: str,
         clear_cache: bool,
     ):
-        self._endpoint = azureai_studio_endpoint
+        endpoint_parts = urllib.parse.urlparse(azureai_studio_endpoint)
+        if endpoint_parts.path == "/score":
+            self._is_openai_compatible = False
+            self._endpoint = azureai_studio_endpoint
+        else:
+            self._is_openai_compatible = True
+            self._endpoint = f"{endpoint_parts.scheme}://{endpoint_parts.hostname}"
         self._deployment = azureai_model_deployment
         self._api_key = azureai_studio_key
 
@@ -104,31 +112,47 @@ def _generator(self, prompt, temperature: float):
                     yield chunk
                 return
 
-        # Prepare for the API call (this might be model specific....)
-        parameters = dict(temperature=temperature)
-        payload = dict(input_data=dict(input_string=messages, parameters=parameters))
+        # Call the actual API and extract the next chunk
+        if self._is_openai_compatible:
+            client = openai.OpenAI(api_key=self._api_key, base_url=self._endpoint)
+            response = client.chat.completions.create(
+                model=self._deployment,
+                messages=messages,
+                # max_tokens=self.max_streaming_tokens,
+                n=1,
+                top_p=1.0,  # TODO: this should be controllable like temp (from the grammar)
+                temperature=temperature,
+                # stream=True,
+            )
 
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": ("Bearer " + self._api_key),
-            "azureml-model-deployment": self._deployment,
-        }
+            result = response.choices[0]
+            encoded_chunk = result.message.content.encode("utf8")
+        else:
+            parameters = dict(temperature=temperature)
+            payload = dict(
+                input_data=dict(input_string=messages, parameters=parameters)
+            )
 
-        response = requests.post(
-            self._endpoint,
-            json=payload,
-            headers=headers,
-        )
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": ("Bearer " + self._api_key),
+                "azureml-model-deployment": self._deployment,
+            }
+            response = requests.post(
+                self._endpoint,
+                json=payload,
+                headers=headers,
+            )
+
+            result = response.json()
 
-        result = response.json()
+            encoded_chunk = result["output"].encode("utf8")
 
         # Now back to OpenAIChatEngine, with slight modifications since
         # this isn't a streaming API
         if temperature == 0:
             cached_results = []
 
-        encoded_chunk = result["output"].encode("utf8")
-
         yield encoded_chunk
 
         if temperature == 0:

From 4281d7f0513a1b562871ac507b0c048308e556b2 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 3 May 2024 10:55:44 -0400
Subject: [PATCH 25/29] Handle optional import

---
 guidance/models/_azureai_studio.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 53bd56e07..3d28e7635 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -3,13 +3,16 @@
 import urllib.parse
 
 import diskcache as dc
-import openai
 import platformdirs
 import requests
 
 from ._model import Chat
 from ._grammarless import GrammarlessEngine, Grammarless
 
+try:
+    import openai
+except ImportError:
+    openai = None
 
 class AzureAIStudioChatEngine(GrammarlessEngine):
     def __init__(

From 7bf3d0792d1d2c1cfcc58752b841157070aa8f23 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 3 May 2024 10:57:19 -0400
Subject: [PATCH 26/29] OpenAI guard mk II

---
 guidance/models/_azureai_studio.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 3d28e7635..9f50c1340 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -9,10 +9,13 @@
 from ._model import Chat
 from ._grammarless import GrammarlessEngine, Grammarless
 
+
 try:
     import openai
-except ImportError:
-    openai = None
+
+    is_openai = True
+except ModuleNotFoundError:
+    is_openai = False
 
 class AzureAIStudioChatEngine(GrammarlessEngine):
     def __init__(
@@ -32,6 +35,8 @@ def __init__(
             self._is_openai_compatible = False
             self._endpoint = azureai_studio_endpoint
         else:
+            if not is_openai:
+                raise ValueError("Detected OpenAI compatible model; please install openai package")
             self._is_openai_compatible = True
             self._endpoint = f"{endpoint_parts.scheme}://{endpoint_parts.hostname}"
         self._deployment = azureai_model_deployment
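
The branch added in PATCH 24 keys entirely off the URL path: legacy AzureML "score" endpoints keep the full URL for `requests.post`, while anything else is treated as OpenAI-compatible and reduced to scheme plus host for use as an `openai.OpenAI` base URL. A standalone illustration of that test follows; both URLs are made up for the example.

import urllib.parse

for url in [
    "https://example-host.eastus2.inference.ml.azure.com/score",
    "https://example-host.eastus2.inference.ai.azure.com",
]:
    parts = urllib.parse.urlparse(url)
    if parts.path == "/score":
        # Legacy scoring API: post to the full URL
        print("score endpoint:", url)
    else:
        # OpenAI-compatible: strip to scheme://host for openai.OpenAI(base_url=...)
        print("openai-compatible base_url:", f"{parts.scheme}://{parts.hostname}")
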
From 50847c514ad8e22c27285b9002f4ae8fb8138aad Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 3 May 2024 11:11:40 -0400
Subject: [PATCH 27/29] Small fixes for mypy

---
 guidance/models/_azureai_studio.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 9f50c1340..6a8bea201 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -146,13 +146,13 @@ def _generator(self, prompt, temperature: float):
                 "Authorization": ("Bearer " + self._api_key),
                 "azureml-model-deployment": self._deployment,
             }
-            response = requests.post(
+            response_score = requests.post(
                 self._endpoint,
                 json=payload,
                 headers=headers,
             )
 
-            result = response.json()
+            result = response_score.json()
 
             encoded_chunk = result["output"].encode("utf8")

From 3d73c424f7983059e37b014f20426647fe676ed4 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 3 May 2024 11:20:29 -0400
Subject: [PATCH 28/29] One suppression....

---
 guidance/models/_azureai_studio.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 6a8bea201..52a731c62 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -17,6 +17,7 @@
 except ModuleNotFoundError:
     is_openai = False
 
+
 class AzureAIStudioChatEngine(GrammarlessEngine):
     def __init__(
         self,
@@ -36,7 +37,9 @@ def __init__(
             self._endpoint = azureai_studio_endpoint
         else:
             if not is_openai:
-                raise ValueError("Detected OpenAI compatible model; please install openai package")
+                raise ValueError(
+                    "Detected OpenAI compatible model; please install openai package"
+                )
             self._is_openai_compatible = True
             self._endpoint = f"{endpoint_parts.scheme}://{endpoint_parts.hostname}"
         self._deployment = azureai_model_deployment
@@ -125,7 +128,7 @@ def _generator(self, prompt, temperature: float):
             client = openai.OpenAI(api_key=self._api_key, base_url=self._endpoint)
             response = client.chat.completions.create(
                 model=self._deployment,
-                messages=messages,
+                messages=messages,  # type: ignore[arg-type]
                 # max_tokens=self.max_streaming_tokens,
                 n=1,
                 top_p=1.0,  # TODO: this should be controllable like temp (from the grammar)

From 64ec23227ddb1bddfd8e1ccb5b461c71530413cc Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Fri, 3 May 2024 11:27:11 -0400
Subject: [PATCH 29/29] More mypy fixing

---
 guidance/models/_azureai_studio.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/guidance/models/_azureai_studio.py b/guidance/models/_azureai_studio.py
index 52a731c62..9eb1aae72 100644
--- a/guidance/models/_azureai_studio.py
+++ b/guidance/models/_azureai_studio.py
@@ -137,7 +137,7 @@ def _generator(self, prompt, temperature: float):
             )
 
             result = response.choices[0]
-            encoded_chunk = result.message.content.encode("utf8")
+            encoded_chunk = result.message.content.encode("utf8")  # type: ignore[union-attr]
         else:
             parameters = dict(temperature=temperature)
             payload = dict(
@@ -155,9 +155,9 @@ def _generator(self, prompt, temperature: float):
                 headers=headers,
            )
 
-            result = response_score.json()
+            result_score = response_score.json()
 
-            encoded_chunk = result["output"].encode("utf8")
+            encoded_chunk = result_score["output"].encode("utf8")
 
         # Now back to OpenAIChatEngine, with slight modifications since
         # this isn't a streaming API
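
The "Now back to OpenAIChatEngine" tail above feeds the single non-streamed chunk through the same temperature-zero cache bookkeeping the streaming engines use. In isolation, that pattern is roughly the following sketch; the cache location, key scheme, and function shape are illustrative assumptions, not the engine's exact code.

import hashlib

import diskcache as dc
import platformdirs


def cached_generate(prompt: str, temperature: float, call_api):
    # Responses are cached (and replayed) only for deterministic calls,
    # i.e. temperature == 0
    cache_dir = platformdirs.user_cache_dir("example_cache")
    with dc.Cache(cache_dir) as cache:
        key = hashlib.sha256(prompt.encode("utf8")).hexdigest()
        if temperature == 0 and key in cache:
            yield cache[key]
            return
        encoded_chunk = call_api(prompt, temperature)
        yield encoded_chunk
        if temperature == 0:
            cache[key] = encoded_chunk
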