Merge branch 'main' into llmevaluator
julian-risch committed Mar 24, 2024
2 parents: 45e9bd9 + 42b587a, commit 903363c
Showing 15 changed files with 126 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/project.yml
@@ -10,7 +10,7 @@ jobs:
name: Add new issues to project for triage
runs-on: ubuntu-latest
steps:
- uses: actions/add-to-project@v0.5.0
- uses: actions/add-to-project@v0.6.1
with:
project-url: https://github.com/orgs/deepset-ai/projects/5
github-token: ${{ secrets.GH_PROJECT_PAT }}
2 changes: 1 addition & 1 deletion README.md
@@ -57,7 +57,7 @@ Some examples of what you can do with Haystack:
- Use **user feedback** to evaluate, benchmark, and continuously improve your models.

> [!TIP]
><img src="docs/img/deepset-cloud-logo-lightblue.png" width=30% height=30%>
><img src="https://github.com/deepset-ai/haystack/raw/main/docs/img/deepset-cloud-logo-lightblue.png" width=30% height=30%>
>
> Are you looking for a managed solution that benefits from Haystack? [deepset Cloud](https://www.deepset.ai/deepset-cloud?utm_campaign=developer-relations&utm_source=haystack&utm_medium=readme) is our fully managed, end-to-end platform to integrate LLMs with your data, which uses Haystack for the LLM pipelines architecture.
4 changes: 3 additions & 1 deletion haystack/components/generators/chat/hugging_face_tgi.py
@@ -16,6 +16,7 @@
logger = logging.getLogger(__name__)


@component
class HuggingFaceTGIChatGenerator:
"""
Enables text generation using HuggingFace Hub hosted chat-based LLMs. This component is designed to seamlessly
@@ -227,8 +228,9 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
raise RuntimeError("Please call warm_up() before running LLM inference.")

# apply either model's chat template or the user-provided one
formatted_messages = [message.to_openai_format() for message in messages]
prepared_prompt: str = self.tokenizer.apply_chat_template(
conversation=messages, chat_template=self.chat_template, tokenize=False
conversation=formatted_messages, chat_template=self.chat_template, tokenize=False
)
prompt_token_count: int = len(self.tokenizer.encode(prepared_prompt, add_special_tokens=False))

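For context, here is a minimal sketch of the call pattern this hunk introduces: the tokenizer's chat template expects plain role/content dictionaries rather than `ChatMessage` dataclasses, so the messages are converted first. The model name and message text below are illustrative assumptions, not taken from this diff.

```python
from transformers import AutoTokenizer
from haystack.dataclasses import ChatMessage

# Any chat model with a chat template works here; zephyr is just an example.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [ChatMessage.from_user("What is the capital of Italy?")]

# Convert ChatMessage objects to OpenAI-style dicts before templating,
# mirroring the change in this commit.
formatted_messages = [message.to_openai_format() for message in messages]
prompt = tokenizer.apply_chat_template(conversation=formatted_messages, tokenize=False)
print(prompt)
```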
19 changes: 2 additions & 17 deletions haystack/components/generators/chat/openai.py
@@ -1,9 +1,8 @@
import copy
import dataclasses
import json
from typing import Any, Callable, Dict, List, Optional, Union

from openai import OpenAI, Stream # type: ignore
from openai import OpenAI, Stream
from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage
from openai.types.chat.chat_completion import Choice
from openai.types.chat.chat_completion_chunk import Choice as ChunkChoice
@@ -169,7 +168,7 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

# adapt ChatMessage(s) to the format expected by the OpenAI API
openai_formatted_messages = self._convert_to_openai_format(messages)
openai_formatted_messages = [message.to_openai_format() for message in messages]

chat_completion: Union[Stream[ChatCompletionChunk], ChatCompletion] = self.client.chat.completions.create(
model=self.model,
@@ -204,20 +203,6 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,

return {"replies": completions}

def _convert_to_openai_format(self, messages: List[ChatMessage]) -> List[Dict[str, Any]]:
"""
Converts the list of ChatMessage to the list of messages in the format expected by the OpenAI API.
:param messages: The list of ChatMessage.
:return: The list of messages in the format expected by the OpenAI API.
"""
openai_chat_message_format = {"role", "content", "name"}
openai_formatted_messages = []
for m in messages:
message_dict = dataclasses.asdict(m)
filtered_message = {k: v for k, v in message_dict.items() if k in openai_chat_message_format and v}
openai_formatted_messages.append(filtered_message)
return openai_formatted_messages

def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage:
"""
Connects the streaming chunks into a single ChatMessage.
42 changes: 39 additions & 3 deletions haystack/components/generators/hugging_face_local.py
@@ -1,8 +1,15 @@
from typing import Any, Dict, List, Literal, Optional
from typing import Any, Callable, Dict, List, Literal, Optional

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses import StreamingChunk
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
from haystack.utils import (
ComponentDevice,
Secret,
deserialize_callable,
deserialize_secrets_inplace,
serialize_callable,
)
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs

logger = logging.getLogger(__name__)
@@ -12,7 +19,11 @@
with LazyImport(message="Run 'pip install transformers[torch]'") as transformers_import:
from transformers import StoppingCriteriaList, pipeline

from haystack.utils.hf import StopWordsCriteria, resolve_hf_pipeline_kwargs # pylint: disable=ungrouped-imports
from haystack.utils.hf import ( # pylint: disable=ungrouped-imports
HFTokenStreamingHandler,
StopWordsCriteria,
resolve_hf_pipeline_kwargs,
)


@component
@@ -47,6 +58,7 @@ def __init__(
generation_kwargs: Optional[Dict[str, Any]] = None,
huggingface_pipeline_kwargs: Optional[Dict[str, Any]] = None,
stop_words: Optional[List[str]] = None,
streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
):
"""
Creates an instance of a HuggingFaceLocalGenerator.
@@ -80,6 +92,7 @@ def __init__(
If you provide this parameter, you should not specify the `stopping_criteria` in `generation_kwargs`.
For some chat models, the output includes both the new text and the original prompt.
In these cases, it's important to make sure your prompt has no stop words.
:param streaming_callback: An optional callable for handling streaming responses.
"""
transformers_import.check()

@@ -113,6 +126,7 @@ def __init__(
self.stop_words = stop_words
self.pipeline = None
self.stopping_criteria_list = None
self.streaming_callback = streaming_callback

def _get_telemetry_data(self) -> Dict[str, Any]:
"""
@@ -142,10 +156,12 @@ def to_dict(self) -> Dict[str, Any]:
:returns:
Dictionary with serialized data.
"""
callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
serialization_dict = default_to_dict(
self,
huggingface_pipeline_kwargs=self.huggingface_pipeline_kwargs,
generation_kwargs=self.generation_kwargs,
streaming_callback=callback_name,
stop_words=self.stop_words,
token=self.token.to_dict() if self.token else None,
)
@@ -168,6 +184,11 @@ def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceLocalGenerator":
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
deserialize_hf_model_kwargs(data["init_parameters"]["huggingface_pipeline_kwargs"])
init_params = data.get("init_parameters", {})
serialized_callback_handler = init_params.get("streaming_callback")
if serialized_callback_handler:
data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)

return default_from_dict(cls, data)

@component.output_types(replies=List[str])
Expand All @@ -193,6 +214,21 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
# merge generation kwargs from init method with those from run method
updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

if self.streaming_callback:
num_responses = updated_generation_kwargs.get("num_return_sequences", 1)
if num_responses > 1:
logger.warning(
"Streaming is enabled, but the number of responses is set to %d. "
"Streaming is only supported for single response generation. "
"Setting the number of responses to 1.",
num_responses,
)
updated_generation_kwargs["num_return_sequences"] = 1
# the `streamer` parameter hooks into Hugging Face streaming; HFTokenStreamingHandler adapts it to our streaming callback
updated_generation_kwargs["streamer"] = HFTokenStreamingHandler(
self.pipeline.tokenizer, self.streaming_callback, self.stop_words
)

output = self.pipeline(prompt, stopping_criteria=self.stopping_criteria_list, **updated_generation_kwargs)
replies = [o["generated_text"] for o in output if "generated_text" in o]

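A minimal sketch of the serialization round-trip these hunks enable, assuming a module-level callback function so it can be serialized by its dotted import path; the model and task values mirror the tests further below.

```python
from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator
from haystack.dataclasses import StreamingChunk

def print_chunk(chunk: StreamingChunk):
    # Called once per streamed token; here we simply print it.
    print(chunk.content, end="", flush=True)

generator = HuggingFaceLocalGenerator(
    model="google/flan-t5-base",
    task="text2text-generation",
    streaming_callback=print_chunk,
)

# to_dict() stores the callback as its dotted path (via serialize_callable);
# from_dict() resolves it back to the function (via deserialize_callable).
data = generator.to_dict()
restored = HuggingFaceLocalGenerator.from_dict(data)
assert restored.streaming_callback is print_chunk
```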
20 changes: 1 addition & 19 deletions haystack/components/generators/openai.py
@@ -1,4 +1,3 @@
import dataclasses
from typing import Any, Callable, Dict, List, Optional, Union

from openai import OpenAI, Stream
@@ -164,7 +163,7 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

# adapt ChatMessage(s) to the format expected by the OpenAI API
openai_formatted_messages = self._convert_to_openai_format(messages)
openai_formatted_messages = [message.to_openai_format() for message in messages]

completion: Union[Stream[ChatCompletionChunk], ChatCompletion] = self.client.chat.completions.create(
model=self.model,
@@ -200,23 +199,6 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
"meta": [message.meta for message in completions],
}

def _convert_to_openai_format(self, messages: List[ChatMessage]) -> List[Dict[str, Any]]:
"""
Converts the list of ChatMessage to the list of messages in the format expected by the OpenAI API.
:param messages:
The list of ChatMessage.
:returns:
The list of messages in the format expected by the OpenAI API.
"""
openai_chat_message_format = {"role", "content", "name"}
openai_formatted_messages = []
for m in messages:
message_dict = dataclasses.asdict(m)
filtered_message = {k: v for k, v in message_dict.items() if k in openai_chat_message_format and v}
openai_formatted_messages.append(filtered_message)
return openai_formatted_messages

def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage:
"""
Connects the streaming chunks into a single ChatMessage.
6 changes: 0 additions & 6 deletions haystack/core/pipeline/pipeline.py
@@ -86,13 +86,7 @@ def __eq__(self, other) -> bool:
def __repr__(self) -> str:
"""
Returns a text representation of the Pipeline.
If this runs in a Jupyter notebook, it will instead display the Pipeline image.
"""
if is_in_jupyter():
# If we're in a Jupyter notebook we want to display the image instead of the text repr.
self.show()
return ""

res = f"{object.__repr__(self)}\n"
if self.metadata:
res += "🧱 Metadata\n"
16 changes: 16 additions & 0 deletions haystack/dataclasses/chat_message.py
@@ -28,6 +28,22 @@ class ChatMessage:
name: Optional[str]
meta: Dict[str, Any] = field(default_factory=dict, hash=False)

def to_openai_format(self) -> Dict[str, Any]:
"""
Convert the message to the format expected by OpenAI's Chat API.
See the [API reference](https://platform.openai.com/docs/api-reference/chat/create) for details.
:returns: A dictionary with the following keys:
- `role`
- `content`
- `name` (optional)
"""
msg = {"role": self.role.value, "content": self.content}
if self.name:
msg["name"] = self.name

return msg

def is_from(self, role: ChatRole) -> bool:
"""
Check if the message is from a specific role.
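A quick usage sketch of the new helper (the message texts are illustrative):

```python
from haystack.dataclasses import ChatMessage

msg = ChatMessage.from_user("What is the capital of Italy?")
print(msg.to_openai_format())
# {'role': 'user', 'content': 'What is the capital of Italy?'}

system_msg = ChatMessage.from_system("You are a helpful assistant.")
print(system_msg.to_openai_format())
# {'role': 'system', 'content': 'You are a helpful assistant.'}
```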
3 changes: 1 addition & 2 deletions haystack/utils/hf.py
@@ -222,8 +222,7 @@ def check_generation_params(kwargs: Optional[Dict[str, Any]], additional_accepte
with LazyImport(message="Run 'pip install transformers[torch]'") as torch_and_transformers_import:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, StoppingCriteria, TextStreamer

transformers_import.check()
torch_import.check()
torch_and_transformers_import.check()

class StopWordsCriteria(StoppingCriteria):
"""
New release note file (path not shown):
@@ -0,0 +1,4 @@
---
features:
- |
Adds 'streaming_callback' parameter to 'HuggingFaceLocalGenerator', allowing users to handle streaming responses.
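A minimal usage sketch of the new parameter; the model, task, and prompt are illustrative, and running it downloads the model locally.

```python
from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator

generator = HuggingFaceLocalGenerator(
    model="google/flan-t5-base",
    task="text2text-generation",
    # Print each token as soon as it is generated.
    streaming_callback=lambda chunk: print(chunk.content, end="", flush=True),
)
generator.warm_up()
result = generator.run(prompt="What is the capital of Italy?")
print(result["replies"])
```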
New release note file (path not shown):
@@ -0,0 +1,7 @@
---
enhancements:
- |
In Jupyter notebooks, the Pipeline image is no longer displayed automatically; the textual representation of the Pipeline is shown instead.
To display the Pipeline image, use the `show` method of the Pipeline object.
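A short sketch of the changed behavior; the pipeline is left empty here for brevity, and `show()` assumes a notebook session.

```python
from haystack import Pipeline

pipe = Pipeline()
# ... add_component() and connect() calls would go here ...

print(repr(pipe))  # always the textual representation, also inside Jupyter
pipe.show()        # explicitly renders the Pipeline image in a notebook
```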
New release note file (path not shown):
@@ -0,0 +1,5 @@
---
fixes:
- |
Add the `@component` decorator to `HuggingFaceTGIChatGenerator`.
The lack of this decorator made it impossible to use the `HuggingFaceTGIChatGenerator` in a pipeline.
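A sketch of what the fix enables; the constructor arguments are assumptions (not taken from this diff) and constructing the component requires access to the Hugging Face Hub.

```python
from haystack import Pipeline
from haystack.components.generators.chat.hugging_face_tgi import HuggingFaceTGIChatGenerator

pipe = Pipeline()
# Without the @component decorator, Pipeline.add_component() rejected this
# instance because the class was not registered as a component.
pipe.add_component("generator", HuggingFaceTGIChatGenerator(model="HuggingFaceH4/zephyr-7b-beta"))
```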
27 changes: 27 additions & 0 deletions test/components/generators/test_hugging_face_local_generator.py
@@ -8,6 +8,7 @@
from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator, StopWordsCriteria
from haystack.utils import ComponentDevice
from haystack.utils.auth import Secret
from haystack.utils.hf import HFTokenStreamingHandler


class TestHuggingFaceLocalGenerator:
@@ -154,6 +155,7 @@ def test_to_dict_default(self, model_info_mock):
"device": ComponentDevice.resolve_device(None).to_hf(),
},
"generation_kwargs": {"max_new_tokens": 512},
"streaming_callback": None,
"stop_words": None,
},
}
@@ -194,6 +196,7 @@ def test_to_dict_with_parameters(self):
},
},
"generation_kwargs": {"max_new_tokens": 100, "return_full_text": False},
"streaming_callback": None,
"stop_words": ["coca", "cola"],
},
}
@@ -238,6 +241,7 @@ def test_to_dict_with_quantization_config(self):
},
},
"generation_kwargs": {"max_new_tokens": 100, "return_full_text": False},
"streaming_callback": None,
"stop_words": ["coca", "cola"],
},
}
@@ -350,6 +354,29 @@ def test_run_with_generation_kwargs(self):
"irrelevant", max_new_tokens=200, temperature=0.5, stopping_criteria=None
)

def test_run_with_streaming(self):
def streaming_callback_handler(x):
return x

generator = HuggingFaceLocalGenerator(
model="google/flan-t5-base", task="text2text-generation", streaming_callback=streaming_callback_handler
)

# create the pipeline object (simulating the warm_up)
generator.pipeline = Mock(return_value=[{"generated_text": "Rome"}])

generator.run(prompt="irrelevant")

# when we use streaming, the pipeline should be called with the `streamer` argument being an instance of
# of our adapter class HFTokenStreamingHandler
assert isinstance(generator.pipeline.call_args.kwargs["streamer"], HFTokenStreamingHandler)
streamer = generator.pipeline.call_args.kwargs["streamer"]

# check that the streaming callback is set
assert streamer.token_handler == streaming_callback_handler
# the tokenizer should be set, here it is a mock
assert streamer.tokenizer

def test_run_fails_without_warm_up(self):
generator = HuggingFaceLocalGenerator(
model="google/flan-t5-base", task="text2text-generation", generation_kwargs={"max_new_tokens": 100}
23 changes: 2 additions & 21 deletions test/core/pipeline/test_pipeline.py
@@ -281,8 +281,7 @@ def test_get_component_name_not_added_to_pipeline():
assert pipe.get_component_name(some_component) == ""


@patch("haystack.core.pipeline.pipeline.is_in_jupyter")
def test_repr(mock_is_in_jupyter):
def test_repr():
pipe = Pipeline(metadata={"test": "test"}, max_loops_allowed=42)
pipe.add_component("add_two", AddFixedValue(add=2))
pipe.add_component("add_default", AddFixedValue())
@@ -302,26 +301,8 @@ def test_repr(mock_is_in_jupyter):
" - add_two.result -> double.value (int)\n"
" - double.value -> add_default.value (int)\n"
)
# Simulate not being in a notebook
mock_is_in_jupyter.return_value = False
assert repr(pipe) == expected_repr


@patch("haystack.core.pipeline.pipeline.is_in_jupyter")
def test_repr_in_notebook(mock_is_in_jupyter):
pipe = Pipeline(metadata={"test": "test"}, max_loops_allowed=42)
pipe.add_component("add_two", AddFixedValue(add=2))
pipe.add_component("add_default", AddFixedValue())
pipe.add_component("double", Double())
pipe.connect("add_two", "double")
pipe.connect("double", "add_default")

# Simulate being in a notebook
mock_is_in_jupyter.return_value = True

with patch.object(Pipeline, "show") as mock_show:
assert repr(pipe) == ""
mock_show.assert_called_once_with()
assert repr(pipe) == expected_repr


def test_run_raises_if_max_visits_reached():
(1 more changed file not shown)
