Added automatic json extraction from the response #21

Merged · 4 commits · Mar 22, 2024
7 changes: 6 additions & 1 deletion allms/domain/response.py
@@ -5,13 +5,18 @@
from allms.domain.input_data import InputData


class ResponseParsingOutput(BaseModel):
response: typing.Optional[typing.Any]
error_message: typing.Optional[str]


class ResponseData(BaseModel):
response: typing.Optional[typing.Any] = None
input_data: typing.Optional[InputData] = None

number_of_prompt_tokens: typing.Optional[int] = None
number_of_generated_tokens: typing.Optional[int] = None
error: typing.Optional[typing.Union[str, Exception]] = None
error: typing.Optional[str] = None

# Without this, only classes inheriting from the pydantic BaseModel are allowed as field types. Exception isn't
# such a class and that's why we need it.
45 changes: 10 additions & 35 deletions allms/models/abstract.py
@@ -13,7 +13,6 @@
from langchain.chat_models.base import BaseChatModel
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.schema import OutputParserException
from langchain_core.language_models.llms import create_base_retry_decorator
from langchain_core.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
@@ -34,6 +33,7 @@
from allms.domain.prompt_dto import SummaryOutputClass, KeywordsOutputClass
from allms.domain.response import ResponseData
from allms.utils.long_text_processing_utils import get_max_allowed_number_of_tokens
from allms.utils.response_parsing_utils import ResponseParser

logger = logging.getLogger(__name__)

@@ -58,6 +58,8 @@ def __init__(
self._is_long_text_bypass_enabled: bool = False # Should be false till we fully implement support for long sequences in our package
self._aggregation_strategy: AggregationLogicForLongInputData = AggregationLogicForLongInputData.SIMPLE_CONCATENATION
self._parser: typing.Optional[PydanticOutputParser] = None
self._json_pattern = re.compile(r"{.*?}", re.DOTALL)
self._is_json_format_injected_into_prompt: bool = True

if max_output_tokens >= model_total_max_tokens:
raise ValueError("max_output_tokens has to be lower than model_total_max_tokens")
@@ -103,38 +105,9 @@ def generate(
)

if output_data_model_class:
return self._parse_model_output(model_responses)
return ResponseParser(self._parser).parse_model_output(model_responses)
return model_responses

def _parse_response(self, model_response_data: ResponseData) -> typing.Tuple[str, typing.Optional[str]]:
try:
return self._parser.parse(model_response_data.response), None
except OutputParserException as output_parser_exception:
return None, OutputParserException(
f"An OutputParserException has occurred for "
f"The response from model: {model_response_data.response}\n"
f"The exception message: {output_parser_exception}"
)

def _parse_model_output(self, model_responses_data: typing.List[ResponseData]) -> typing.List[ResponseData]:
parsed_responses = []
for model_response_data in model_responses_data:
if not model_response_data.error:
response, error_message = self._parse_response(model_response_data)

parsed_responses.append(ResponseData(
input_data=model_response_data.input_data,
response=response,
error=error_message,
number_of_prompt_tokens=model_response_data.number_of_prompt_tokens,
number_of_generated_tokens=model_response_data.number_of_generated_tokens

))
else:
parsed_responses.append(model_response_data)

return parsed_responses

async def _generate(
self,
prompt: str,
@@ -155,10 +128,12 @@

if output_data_model_class:
self._parser = PydanticOutputParser(pydantic_object=output_data_model_class)
prompt_template_args[PromptConstants.PARTIAL_VARIABLES_STR] = {
PromptConstants.OUTPUT_DATA_MODEL: self._parser.get_format_instructions(),
}
prompt_template_args[PromptConstants.TEMPLATE_STR] = self._add_output_data_format(prompt=prompt)

if self._is_json_format_injected_into_prompt:
prompt_template_args[PromptConstants.PARTIAL_VARIABLES_STR] = {
PromptConstants.OUTPUT_DATA_MODEL: self._parser.get_format_instructions(),
}
prompt_template_args[PromptConstants.TEMPLATE_STR] = self._add_output_data_format(prompt=prompt)

chat_prompts = await self._build_chat_prompts(prompt_template_args, system_prompt)

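A minimal usage sketch of the `generate` flow this file implements: `ReviewSummary`, `summarize_reviews`, and the prompt text are made-up names for illustration, and `model` is assumed to be an already-configured `allms` model instance. When `output_data_model_class` is passed, the raw responses are now routed through `ResponseParser`.

```python
import typing

from pydantic import BaseModel

from allms.domain.response import ResponseData


class ReviewSummary(BaseModel):
    # Hypothetical output schema used only for this sketch.
    summary: str
    keywords: typing.List[str]


def summarize_reviews(model) -> typing.List[ResponseData]:
    # `model` is any configured allms model (e.g. AzureOpenAIModel); its construction
    # is omitted here because it depends on deployment-specific configuration.
    responses = model.generate(
        prompt="Summarize this review and list its keywords: great phone, but the battery drains fast.",
        output_data_model_class=ReviewSummary,
    )
    for response in responses:
        if response.error is None:
            parsed: ReviewSummary = response.response  # already parsed into ReviewSummary
        else:
            print(response.error)  # generation or parsing failed; the reason is kept as a string
    return responses
```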
6 changes: 6 additions & 0 deletions allms/models/azure_llama2.py
@@ -1,11 +1,15 @@
import typing
from asyncio import AbstractEventLoop
from typing import List, Type

from langchain_community.chat_models.azureml_endpoint import LlamaChatContentFormatter
from pydantic import BaseModel

from allms.defaults.azure_defaults import AzureLlama2Defaults
from allms.defaults.general_defaults import GeneralDefaults
from allms.domain.configuration import AzureSelfDeployedConfiguration
from allms.domain.input_data import InputData
from allms.domain.response import ResponseData
from allms.models.abstract import AbstractModel
from allms.models.azure_base import AzureMLOnlineEndpointAsync

@@ -35,6 +39,8 @@ def __init__(
event_loop=event_loop
)

self._is_json_format_injected_into_prompt = False

def _create_llm(self) -> AzureMLOnlineEndpointAsync:
model_kwargs = {"max_new_tokens": self._max_output_tokens, "top_p": self._top_p, "do_sample": False}
if self._temperature > 0:
2 changes: 2 additions & 0 deletions allms/models/azure_mistral.py
@@ -35,6 +35,8 @@ def __init__(
event_loop=event_loop
)

self._is_json_format_injected_into_prompt = False

def _create_llm(self) -> AzureMLOnlineEndpointAsync:
model_kwargs = {
"max_new_tokens": self._max_output_tokens, "top_p": self._top_p, "do_sample": False,
2 changes: 2 additions & 0 deletions allms/models/vertexai_gemma.py
@@ -38,6 +38,8 @@ def __init__(
event_loop=event_loop
)

self._is_json_format_injected_into_prompt = False

def _create_llm(self) -> VertexAIModelGarden:
return VertexAIModelGardenWrapper(
model_name=GemmaModelDefaults.GCP_MODEL_NAME,
70 changes: 70 additions & 0 deletions allms/utils/response_parsing_utils.py
@@ -0,0 +1,70 @@
import re
import typing

from langchain.output_parsers import PydanticOutputParser
from langchain.schema import OutputParserException

from allms.domain.response import ResponseData, ResponseParsingOutput


class ResponseParser:
def __init__(self, parser: PydanticOutputParser) -> None:
self._json_pattern = re.compile(r"{.*?}", re.DOTALL)
self._parser = parser

def _clean_extracted_json(self, extracted_json: str) -> str:
json_without_newlines = extracted_json.replace("\\n", "")
json_without_backslashes = json_without_newlines.replace("\\", "")

return json_without_backslashes

def _extract_json_from_response(self, model_response_data: ResponseData) -> str:
search_results = self._json_pattern.findall(model_response_data.response)

if len(search_results) == 0:
return model_response_data.response

return self._clean_extracted_json(search_results[0])

def _parse_response(
self,
model_response_data: ResponseData
) -> ResponseParsingOutput:
raw_response = self._extract_json_from_response(model_response_data)

try:
return ResponseParsingOutput(
response=self._parser.parse(raw_response),
error_message=None
)
except OutputParserException as output_parser_exception:
return ResponseParsingOutput(
response=None,
error_message=f"""
An OutputParserException has occurred for the model response: {raw_response}
The exception message: {output_parser_exception}
"""
)

def parse_model_output(
self,
model_responses_data: typing.List[ResponseData]
) -> typing.List[ResponseData]:
parsed_responses = []

for model_response_data in model_responses_data:
if not model_response_data.error:
response_with_error = self._parse_response(model_response_data)

parsed_responses.append(ResponseData(
input_data=model_response_data.input_data,
response=response_with_error.response,
error=response_with_error.error_message,
number_of_prompt_tokens=model_response_data.number_of_prompt_tokens,
number_of_generated_tokens=model_response_data.number_of_generated_tokens

))
else:
parsed_responses.append(model_response_data)

return parsed_responses
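A short sketch of how this parser behaves on a typical "chatty" completion. `KeywordsOutput` is an illustrative schema (the package ships similar classes in `allms.domain.prompt_dto`); the regex extracts the first `{...}` span from the response before handing it to the pydantic parser.

```python
import typing

from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel

from allms.domain.response import ResponseData
from allms.utils.response_parsing_utils import ResponseParser


class KeywordsOutput(BaseModel):
    # Illustrative schema; flat JSON objects like this match the non-greedy {.*?} pattern.
    keywords: typing.List[str]


parser = ResponseParser(PydanticOutputParser(pydantic_object=KeywordsOutput))

# The JSON object is wrapped in extra prose, which the regex-based extraction strips away.
noisy_response = 'Sure, here is the JSON you asked for: {"keywords": ["fast", "cheap"]}'
parsed = parser.parse_model_output([ResponseData(response=noisy_response)])

assert parsed[0].error is None
print(parsed[0].response)  # a KeywordsOutput instance with keywords=["fast", "cheap"]
```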
3 changes: 1 addition & 2 deletions docs/api/models/azure_llama2_model.md
@@ -39,8 +39,7 @@ generate(
- `input_data` (`Optional[List[InputData]]`): If the prompt contains symbolic variables, you can use this parameter to
generate model responses for a batch of examples. Each symbolic variable from the prompt should have a mapping provided
in the `input_mappings` of `InputData`.
- `output_data_model_class` (`Optional[Type[BaseModel]]`): If provided forces the model to generate output in the
format defined by the passed class. Generated response is automatically parsed to this class.
- `output_data_model_class` (`Optional[Type[BaseModel]]`): The generated response is automatically parsed into this class. WARNING: you need to provide the JSON format instructions in the prompt manually; they are not injected for this model (see the sketch below).
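A minimal sketch of such a manually provided instruction. The `Opinion` schema, the prompt wording, and the `azure_llama2_model` / `reviews` variables are placeholders; curly brackets in the JSON example are doubled so they are not treated as prompt variables.

```python
from pydantic import BaseModel


class Opinion(BaseModel):
    summary: str
    is_positive: bool


# Format instructions are written by hand because they are not injected for this model.
prompt = (
    "Summarize the review and say whether it is positive. "
    'Answer with a JSON object only, e.g. {{"summary": "...", "is_positive": true}}.\n\n'
    "Review: {review}"
)

responses = azure_llama2_model.generate(
    prompt=prompt,
    input_data=reviews,                # InputData examples providing a mapping for {review}
    output_data_model_class=Opinion,   # the response is still parsed into this class
)
```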

#### Returns
`List[ResponseData]`: Each `ResponseData` contains the response for a single example from `input_data`. If `input_data`
3 changes: 1 addition & 2 deletions docs/api/models/azure_mistral_model.md
@@ -37,8 +37,7 @@ generate(
- `input_data` (`Optional[List[InputData]]`): If the prompt contains symbolic variables, you can use this parameter to
generate model responses for a batch of examples. Each symbolic variable from the prompt should have a mapping provided
in the `input_mappings` of `InputData`.
- `output_data_model_class` (`Optional[Type[BaseModel]]`): If provided forces the model to generate output in the
format defined by the passed class. Generated response is automatically parsed to this class.
- `output_data_model_class` (`Optional[Type[BaseModel]]`): The generated response is automatically parsed into this class. WARNING: you need to provide the JSON format instructions in the prompt manually; they are not injected for this model.

Note that Mistral-based models currently don't support system prompts.

3 changes: 1 addition & 2 deletions docs/api/models/vertexai_gemma.md
@@ -44,8 +44,7 @@ generate(
- `input_data` (`Optional[List[InputData]]`): If the prompt contains symbolic variables, you can use this parameter to
generate model responses for a batch of examples. Each symbolic variable from the prompt should have a mapping provided
in the `input_mappings` of `InputData`.
- `output_data_model_class` (`Optional[Type[BaseModel]]`): If provided forces the model to generate output in the
format defined by the passed class. Generated response is automatically parsed to this class.
- `output_data_model_class` (`Optional[Type[BaseModel]]`): The generated response is automatically parsed into this class. WARNING: you need to provide the JSON format instructions in the prompt manually; they are not injected for this model.

#### Returns
`List[ResponseData]`: Each `ResponseData` contains the response for a single example from `input_data`. If `input_data`
11 changes: 4 additions & 7 deletions docs/usage/forcing_response_format.md
@@ -66,13 +66,10 @@ False

## What to do when output formatting doesn't work?

The feature described above works best with advanced proprietary models like GPT and PaLM/Gemini. Less capable models like Llama2 or Mistral
may not able to understand instructions passed as output_dataclasses, and in most cases the returned response won't be compatible
with the defined format, resulting in an unexpected response.
The feature described above works only with advanced proprietary models like GPT and PaLM/Gemini. Less capable models like Llama2 or Mistral
are unable to understand instructions passed as output_dataclasses.

In such cases, we recommend to address the issue by specifying in the prompt how the response should look like. Using
few-shot learning techniques is also advisable. In the case of JSON-like output, use double curly brackets to escape them in order
to use them in the JSON example.
For these less capable models, you need to manually specify in the prompt what the response should look like. You can then pass `output_data_model_class` to parse the output into that class. Using few-shot learning techniques is also advisable. In the case of JSON-like output, use double curly brackets instead of single ones, e.g. `{{"key": "value"}}` instead of `{"key": "value"}`, as shown in the sketch below.
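For example, a sketch of such a prompt (the keyword-extraction task and the wording are only an illustration):

```python
from langchain.prompts import PromptTemplate

# Doubled curly brackets keep the JSON example literal, while {text} remains a prompt variable.
prompt = (
    "Extract the keywords from the text below.\n"
    'Answer with JSON only, for example: {{"keywords": ["pizza", "delivery"]}}\n\n'
    "Text: {text}"
)

# Rendering the template shows the doubled brackets collapsing back into a plain JSON example.
print(PromptTemplate.from_template(prompt).format(text="Great pasta, slow service."))
```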

## How does forcing the response format work under the hood?
To force the model to provide output in a desired format, under the hood `allms` automatically adds a description
@@ -90,7 +87,7 @@ Here is the output schema:
```
````

This feature is really helpful, but you have to bear in mind that by using it you increase the number or prompt tokens
This feature is really helpful, but you have to keep in mind that by using it you increase the number of prompt tokens,
so it'll make the requests more costly (if you're using a model with per-token pricing).

If the model returns an output that doesn't conform to the defined data model, the raw model response will be returned
21 changes: 19 additions & 2 deletions poetry.lock


3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "allms"
version = "1.0.1"
version = "1.0.2"
description = ""
authors = ["Allegro Opensource <[email protected]>"]
readme = "README.md"
@@ -17,6 +17,7 @@ langchain = "^0.0.351"
aioresponses = "^0.7.6"
tiktoken = "^0.6.0"
openai = "^0.27.8"
pytest-mock = "^3.14.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
10 changes: 6 additions & 4 deletions tests/conftest.py
@@ -27,7 +27,7 @@ class GenerativeModels:
vertex_palm: typing.Optional[VertexAIPalmModel] = None


class VertexAIMock(FakeListLLM):
class ModelWithoutAsyncRequestsMock(FakeListLLM):
def __init__(self, *args, **kwargs):
super().__init__(responses=["{}"])

@@ -37,9 +37,11 @@
event_loop = asyncio.new_event_loop()

with (
patch("allms.models.vertexai_palm.CustomVertexAI", VertexAIMock),
patch("allms.models.vertexai_gemini.CustomVertexAI", VertexAIMock),
patch("allms.models.vertexai_gemma.VertexAIModelGardenWrapper", VertexAIMock)
patch("allms.models.vertexai_palm.CustomVertexAI", ModelWithoutAsyncRequestsMock),
patch("allms.models.vertexai_gemini.CustomVertexAI", ModelWithoutAsyncRequestsMock),
patch("allms.models.vertexai_gemma.VertexAIModelGardenWrapper", ModelWithoutAsyncRequestsMock),
patch("allms.models.azure_llama2.AzureMLOnlineEndpointAsync", ModelWithoutAsyncRequestsMock),
patch("allms.models.azure_mistral.AzureMLOnlineEndpointAsync", ModelWithoutAsyncRequestsMock)
):
return {
"azure_open_ai": AzureOpenAIModel(