Merge branch 'main' into llmevaluator
julian-risch committed Mar 24, 2024
2 parents: 45e9bd9 + 42b587a, commit 903363c
Showing 15 changed files with 126 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/project.yml
@@ -10,7 +10,7 @@ jobs:
name: Add new issues to project for triage
runs-on: ubuntu-latest
steps:
- uses: actions/add-to-project@v0.5.0
- uses: actions/add-to-project@v0.6.1
with:
project-url: https://github.com/orgs/deepset-ai/projects/5
github-token: ${{ secrets.GH_PROJECT_PAT }}
2 changes: 1 addition & 1 deletion README.md
@@ -57,7 +57,7 @@ Some examples of what you can do with Haystack:
- Use **user feedback** to evaluate, benchmark, and continuously improve your models.

> [!TIP]
><img src="docs/img/deepset-cloud-logo-lightblue.png" width=30% height=30%>
><img src="https://github.com/deepset-ai/haystack/raw/main/docs/img/deepset-cloud-logo-lightblue.png" width=30% height=30%>
>
> Are you looking for a managed solution that benefits from Haystack? [deepset Cloud](https://www.deepset.ai/deepset-cloud?utm_campaign=developer-relations&utm_source=haystack&utm_medium=readme) is our fully managed, end-to-end platform to integrate LLMs with your data, which uses Haystack for the LLM pipelines architecture.
4 changes: 3 additions & 1 deletion haystack/components/generators/chat/hugging_face_tgi.py
@@ -16,6 +16,7 @@
logger = logging.getLogger(__name__)


@component
class HuggingFaceTGIChatGenerator:
"""
Enables text generation using HuggingFace Hub hosted chat-based LLMs. This component is designed to seamlessly
@@ -227,8 +228,9 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
raise RuntimeError("Please call warm_up() before running LLM inference.")

# apply either model's chat template or the user-provided one
formatted_messages = [message.to_openai_format() for message in messages]
prepared_prompt: str = self.tokenizer.apply_chat_template(
conversation=messages, chat_template=self.chat_template, tokenize=False
conversation=formatted_messages, chat_template=self.chat_template, tokenize=False
)
prompt_token_count: int = len(self.tokenizer.encode(prepared_prompt, add_special_tokens=False))

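For context, here is a minimal sketch of the call pattern this hunk introduces: the tokenizer's chat template expects plain role/content dictionaries rather than `ChatMessage` dataclasses, so the messages are converted first. The model name and message text below are illustrative assumptions, not taken from this diff.

```python
from transformers import AutoTokenizer
from haystack.dataclasses import ChatMessage

# Any chat model with a chat template works here; zephyr is just an example.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [ChatMessage.from_user("What is the capital of Italy?")]

# Convert ChatMessage objects to OpenAI-style dicts before templating,
# mirroring the change in this commit.
formatted_messages = [message.to_openai_format() for message in messages]
prompt = tokenizer.apply_chat_template(conversation=formatted_messages, tokenize=False)
print(prompt)
```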
19 changes: 2 additions & 17 deletions haystack/components/generators/chat/openai.py
@@ -1,9 +1,8 @@
import copy
import dataclasses
import json
from typing import Any, Callable, Dict, List, Optional, Union

from openai import OpenAI, Stream # type: ignore
from openai import OpenAI, Stream
from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionMessage
from openai.types.chat.chat_completion import Choice
from openai.types.chat.chat_completion_chunk import Choice as ChunkChoice
@@ -169,7 +168,7 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,
generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

# adapt ChatMessage(s) to the format expected by the OpenAI API
openai_formatted_messages = self._convert_to_openai_format(messages)
openai_formatted_messages = [message.to_openai_format() for message in messages]

chat_completion: Union[Stream[ChatCompletionChunk], ChatCompletion] = self.client.chat.completions.create(
model=self.model,
@@ -204,20 +203,6 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str,

return {"replies": completions}

def _convert_to_openai_format(self, messages: List[ChatMessage]) -> List[Dict[str, Any]]:
"""
Converts the list of ChatMessage to the list of messages in the format expected by the OpenAI API.
:param messages: The list of ChatMessage.
:return: The list of messages in the format expected by the OpenAI API.
"""
openai_chat_message_format = {"role", "content", "name"}
openai_formatted_messages = []
for m in messages:
message_dict = dataclasses.asdict(m)
filtered_message = {k: v for k, v in message_dict.items() if k in openai_chat_message_format and v}
openai_formatted_messages.append(filtered_message)
return openai_formatted_messages

def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage:
"""
Connects the streaming chunks into a single ChatMessage.
42 changes: 39 additions & 3 deletions haystack/components/generators/hugging_face_local.py
@@ -1,8 +1,15 @@
from typing import Any, Dict, List, Literal, Optional
from typing import Any, Callable, Dict, List, Literal, Optional

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses import StreamingChunk
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
from haystack.utils import (
ComponentDevice,
Secret,
deserialize_callable,
deserialize_secrets_inplace,
serialize_callable,
)
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs

logger = logging.getLogger(__name__)
@@ -12,7 +19,11 @@
with LazyImport(message="Run 'pip install transformers[torch]'") as transformers_import:
from transformers import StoppingCriteriaList, pipeline

from haystack.utils.hf import StopWordsCriteria, resolve_hf_pipeline_kwargs # pylint: disable=ungrouped-imports
from haystack.utils.hf import ( # pylint: disable=ungrouped-imports
HFTokenStreamingHandler,
StopWordsCriteria,
resolve_hf_pipeline_kwargs,
)


@component
@@ -47,6 +58,7 @@ def __init__(
generation_kwargs: Optional[Dict[str, Any]] = None,
huggingface_pipeline_kwargs: Optional[Dict[str, Any]] = None,
stop_words: Optional[List[str]] = None,
streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
):
"""
Creates an instance of a HuggingFaceLocalGenerator.
@@ -80,6 +92,7 @@ def __init__(
If you provide this parameter, you should not specify the `stopping_criteria` in `generation_kwargs`.
For some chat models, the output includes both the new text and the original prompt.
In these cases, it's important to make sure your prompt has no stop words.
:param streaming_callback: An optional callable for handling streaming responses.
"""
transformers_import.check()

@@ -113,6 +126,7 @@ def __init__(
self.stop_words = stop_words
self.pipeline = None
self.stopping_criteria_list = None
self.streaming_callback = streaming_callback

def _get_telemetry_data(self) -> Dict[str, Any]:
"""
@@ -142,10 +156,12 @@ def to_dict(self) -> Dict[str, Any]:
:returns:
Dictionary with serialized data.
"""
callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
serialization_dict = default_to_dict(
self,
huggingface_pipeline_kwargs=self.huggingface_pipeline_kwargs,
generation_kwargs=self.generation_kwargs,
streaming_callback=callback_name,
stop_words=self.stop_words,
token=self.token.to_dict() if self.token else None,
)
@@ -168,6 +184,11 @@ def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceLocalGenerator":
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
deserialize_hf_model_kwargs(data["init_parameters"]["huggingface_pipeline_kwargs"])
init_params = data.get("init_parameters", {})
serialized_callback_handler = init_params.get("streaming_callback")
if serialized_callback_handler:
data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)

return default_from_dict(cls, data)

@component.output_types(replies=List[str])
Expand All @@ -193,6 +214,21 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
# merge generation kwargs from init method with those from run method
updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

if self.streaming_callback:
num_responses = updated_generation_kwargs.get("num_return_sequences", 1)
if num_responses > 1:
logger.warning(
"Streaming is enabled, but the number of responses is set to %d. "
"Streaming is only supported for single response generation. "
"Setting the number of responses to 1.",
num_responses,
)
updated_generation_kwargs["num_return_sequences"] = 1
# the `streamer` parameter hooks into Hugging Face streaming; HFTokenStreamingHandler adapts it to our streaming callback
updated_generation_kwargs["streamer"] = HFTokenStreamingHandler(
self.pipeline.tokenizer, self.streaming_callback, self.stop_words
)

output = self.pipeline(prompt, stopping_criteria=self.stopping_criteria_list, **updated_generation_kwargs)
replies = [o["generated_text"] for o in output if "generated_text" in o]

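A minimal sketch of the serialization round-trip these hunks enable, assuming a module-level callback function so it can be serialized by its dotted import path; the model and task values mirror the tests further below.

```python
from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator
from haystack.dataclasses import StreamingChunk

def print_chunk(chunk: StreamingChunk):
    # Called once per streamed token; here we simply print it.
    print(chunk.content, end="", flush=True)

generator = HuggingFaceLocalGenerator(
    model="google/flan-t5-base",
    task="text2text-generation",
    streaming_callback=print_chunk,
)

# to_dict() stores the callback as its dotted path (via serialize_callable);
# from_dict() resolves it back to the function (via deserialize_callable).
data = generator.to_dict()
restored = HuggingFaceLocalGenerator.from_dict(data)
assert restored.streaming_callback is print_chunk
```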
20 changes: 1 addition & 19 deletions haystack/components/generators/openai.py
@@ -1,4 +1,3 @@
import dataclasses
from typing import Any, Callable, Dict, List, Optional, Union

from openai import OpenAI, Stream
@@ -164,7 +163,7 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

# adapt ChatMessage(s) to the format expected by the OpenAI API
openai_formatted_messages = self._convert_to_openai_format(messages)
openai_formatted_messages = [message.to_openai_format() for message in messages]

completion: Union[Stream[ChatCompletionChunk], ChatCompletion] = self.client.chat.completions.create(
model=self.model,
@@ -200,23 +199,6 @@ def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
"meta": [message.meta for message in completions],
}

def _convert_to_openai_format(self, messages: List[ChatMessage]) -> List[Dict[str, Any]]:
"""
Converts the list of ChatMessage to the list of messages in the format expected by the OpenAI API.
:param messages:
The list of ChatMessage.
:returns:
The list of messages in the format expected by the OpenAI API.
"""
openai_chat_message_format = {"role", "content", "name"}
openai_formatted_messages = []
for m in messages:
message_dict = dataclasses.asdict(m)
filtered_message = {k: v for k, v in message_dict.items() if k in openai_chat_message_format and v}
openai_formatted_messages.append(filtered_message)
return openai_formatted_messages

def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage:
"""
Connects the streaming chunks into a single ChatMessage.
6 changes: 0 additions & 6 deletions haystack/core/pipeline/pipeline.py
@@ -86,13 +86,7 @@ def __eq__(self, other) -> bool:
def __repr__(self) -> str:
"""
Returns a text representation of the Pipeline.
If this runs in a Jupyter notebook, it will instead display the Pipeline image.
"""
if is_in_jupyter():
# If we're in a Jupyter notebook we want to display the image instead of the text repr.
self.show()
return ""

res = f"{object.__repr__(self)}\n"
if self.metadata:
res += "🧱 Metadata\n"
16 changes: 16 additions & 0 deletions haystack/dataclasses/chat_message.py
@@ -28,6 +28,22 @@ class ChatMessage:
name: Optional[str]
meta: Dict[str, Any] = field(default_factory=dict, hash=False)

def to_openai_format(self) -> Dict[str, Any]:
"""
Convert the message to the format expected by OpenAI's Chat API.
See the [API reference](https://platform.openai.com/docs/api-reference/chat/create) for details.
:returns: A dictionary with the following keys:
- `role`
- `content`
- `name` (optional)
"""
msg = {"role": self.role.value, "content": self.content}
if self.name:
msg["name"] = self.name

return msg

def is_from(self, role: ChatRole) -> bool:
"""
Check if the message is from a specific role.
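A quick usage sketch of the new helper (the message texts are illustrative):

```python
from haystack.dataclasses import ChatMessage

msg = ChatMessage.from_user("What is the capital of Italy?")
print(msg.to_openai_format())
# {'role': 'user', 'content': 'What is the capital of Italy?'}

system_msg = ChatMessage.from_system("You are a helpful assistant.")
print(system_msg.to_openai_format())
# {'role': 'system', 'content': 'You are a helpful assistant.'}
```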
3 changes: 1 addition & 2 deletions haystack/utils/hf.py
@@ -222,8 +222,7 @@ def check_generation_params(kwargs: Optional[Dict[str, Any]], additional_accepte
with LazyImport(message="Run 'pip install transformers[torch]'") as torch_and_transformers_import:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, StoppingCriteria, TextStreamer

transformers_import.check()
torch_import.check()
torch_and_transformers_import.check()

class StopWordsCriteria(StoppingCriteria):
"""
New release note file (path not shown):
@@ -0,0 +1,4 @@
---
features:
- |
Adds 'streaming_callback' parameter to 'HuggingFaceLocalGenerator', allowing users to handle streaming responses.
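A minimal usage sketch of the new parameter; the model, task, and prompt are illustrative, and running it downloads the model locally.

```python
from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator

generator = HuggingFaceLocalGenerator(
    model="google/flan-t5-base",
    task="text2text-generation",
    # Print each token as soon as it is generated.
    streaming_callback=lambda chunk: print(chunk.content, end="", flush=True),
)
generator.warm_up()
result = generator.run(prompt="What is the capital of Italy?")
print(result["replies"])
```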
New release note file (path not shown):
@@ -0,0 +1,7 @@
---
enhancements:
- |
In Jupyter notebooks, the Pipeline image is no longer displayed automatically; the textual representation of the Pipeline is shown instead.
To display the Pipeline image, use the `show` method of the Pipeline object.
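A short sketch of the changed behavior; the pipeline is left empty here for brevity, and `show()` assumes a notebook session.

```python
from haystack import Pipeline

pipe = Pipeline()
# ... add_component() and connect() calls would go here ...

print(repr(pipe))  # always the textual representation, also inside Jupyter
pipe.show()        # explicitly renders the Pipeline image in a notebook
```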
New release note file (path not shown):
@@ -0,0 +1,5 @@
---
fixes:
- |
Add the `@component` decorator to `HuggingFaceTGIChatGenerator`.
The lack of this decorator made it impossible to use the `HuggingFaceTGIChatGenerator` in a pipeline.
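A sketch of what the fix enables; the constructor arguments are assumptions (not taken from this diff) and constructing the component requires access to the Hugging Face Hub.

```python
from haystack import Pipeline
from haystack.components.generators.chat.hugging_face_tgi import HuggingFaceTGIChatGenerator

pipe = Pipeline()
# Without the @component decorator, Pipeline.add_component() rejected this
# instance because the class was not registered as a component.
pipe.add_component("generator", HuggingFaceTGIChatGenerator(model="HuggingFaceH4/zephyr-7b-beta"))
```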
27 changes: 27 additions & 0 deletions test/components/generators/test_hugging_face_local_generator.py
@@ -8,6 +8,7 @@
from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator, StopWordsCriteria
from haystack.utils import ComponentDevice
from haystack.utils.auth import Secret
from haystack.utils.hf import HFTokenStreamingHandler


class TestHuggingFaceLocalGenerator:
@@ -154,6 +155,7 @@ def test_to_dict_default(self, model_info_mock):
"device": ComponentDevice.resolve_device(None).to_hf(),
},
"generation_kwargs": {"max_new_tokens": 512},
"streaming_callback": None,
"stop_words": None,
},
}
@@ -194,6 +196,7 @@ def test_to_dict_with_parameters(self):
},
},
"generation_kwargs": {"max_new_tokens": 100, "return_full_text": False},
"streaming_callback": None,
"stop_words": ["coca", "cola"],
},
}
@@ -238,6 +241,7 @@ def test_to_dict_with_quantization_config(self):
},
},
"generation_kwargs": {"max_new_tokens": 100, "return_full_text": False},
"streaming_callback": None,
"stop_words": ["coca", "cola"],
},
}
@@ -350,6 +354,29 @@ def test_run_with_generation_kwargs(self):
"irrelevant", max_new_tokens=200, temperature=0.5, stopping_criteria=None
)

def test_run_with_streaming(self):
def streaming_callback_handler(x):
return x

generator = HuggingFaceLocalGenerator(
model="google/flan-t5-base", task="text2text-generation", streaming_callback=streaming_callback_handler
)

# create the pipeline object (simulating the warm_up)
generator.pipeline = Mock(return_value=[{"generated_text": "Rome"}])

generator.run(prompt="irrelevant")

# when we use streaming, the pipeline should be called with the `streamer` argument being an instance of
# of our adapter class HFTokenStreamingHandler
assert isinstance(generator.pipeline.call_args.kwargs["streamer"], HFTokenStreamingHandler)
streamer = generator.pipeline.call_args.kwargs["streamer"]

# check that the streaming callback is set
assert streamer.token_handler == streaming_callback_handler
# the tokenizer should be set, here it is a mock
assert streamer.tokenizer

def test_run_fails_without_warm_up(self):
generator = HuggingFaceLocalGenerator(
model="google/flan-t5-base", task="text2text-generation", generation_kwargs={"max_new_tokens": 100}
23 changes: 2 additions & 21 deletions test/core/pipeline/test_pipeline.py
@@ -281,8 +281,7 @@ def test_get_component_name_not_added_to_pipeline():
assert pipe.get_component_name(some_component) == ""


@patch("haystack.core.pipeline.pipeline.is_in_jupyter")
def test_repr(mock_is_in_jupyter):
def test_repr():
pipe = Pipeline(metadata={"test": "test"}, max_loops_allowed=42)
pipe.add_component("add_two", AddFixedValue(add=2))
pipe.add_component("add_default", AddFixedValue())
@@ -302,26 +301,8 @@ def test_repr(mock_is_in_jupyter):
" - add_two.result -> double.value (int)\n"
" - double.value -> add_default.value (int)\n"
)
# Simulate not being in a notebook
mock_is_in_jupyter.return_value = False
assert repr(pipe) == expected_repr


@patch("haystack.core.pipeline.pipeline.is_in_jupyter")
def test_repr_in_notebook(mock_is_in_jupyter):
pipe = Pipeline(metadata={"test": "test"}, max_loops_allowed=42)
pipe.add_component("add_two", AddFixedValue(add=2))
pipe.add_component("add_default", AddFixedValue())
pipe.add_component("double", Double())
pipe.connect("add_two", "double")
pipe.connect("double", "add_default")

# Simulate being in a notebook
mock_is_in_jupyter.return_value = True

with patch.object(Pipeline, "show") as mock_show:
assert repr(pipe) == ""
mock_show.assert_called_once_with()
assert repr(pipe) == expected_repr


def test_run_raises_if_max_visits_reached():
(1 more changed file not shown)
