From 1bef6457c755c3901be40292cedee313ab0540d3 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Fri, 8 Nov 2024 19:34:22 +0530 Subject: [PATCH] Litellm dev 11 07 2024 (#6649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(streaming_handler.py): save finish_reasons which might show up mid-stream (store last received one) Fixes https://github.com/BerriAI/litellm/issues/6104 * refactor: add readme to litellm_core_utils/ make it easier to navigate * fix(team_endpoints.py): return team id + object for invalid team in `/team/list` * fix(streaming_handler.py): remove import * fix(pattern_match_deployments.py): default to user input if unable to map based on wildcards (#6646) * fix(pattern_match_deployments.py): default to user input if unable to… (#6632) * fix(pattern_match_deployments.py): default to user input if unable to map based on wildcards * test: fix test * test: reset test name * test: update conftest to reload proxy server module between tests * ci(config.yml): move langfuse out of local_testing reduce ci/cd time * ci(config.yml): cleanup langfuse ci/cd tests * fix: update test to not use global proxy_server app module * ci: move caching to a separate test pipeline speed up ci pipeline * test: update conftest to check if proxy_server attr exists before reloading * build(conftest.py): don't block on inability to reload proxy_server * ci(config.yml): update caching unit test filter to work on 'cache' keyword as well * fix(encrypt_decrypt_utils.py): use function to get salt key * test: mark flaky test * test: handle anthropic overloaded errors * refactor: create separate ci/cd pipeline for proxy unit tests make ci/cd faster * ci(config.yml): add litellm_proxy_unit_testing to build_and_test jobs * ci(config.yml): generate prisma binaries for proxy unit tests * test: readd vertex_key.json * ci(config.yml): remove `-s` from proxy_unit_test cmd speed up test * ci: remove any 'debug' logging flag speed up ci pipeline * 
test: fix test * test(test_braintrust.py): rerun * test: add delay for braintrust test * chore: comment for maritalk (#6607) * Update gpt-4o-2024-08-06, and o1-preview, o1-mini models in model cost map (#6654) * Adding supports_response_schema to gpt-4o-2024-08-06 models * o1 models do not support vision --------- Co-authored-by: Emerson Gomes * (QOL improvement) add unit testing for all static_methods in litellm_logging.py (#6640) * add unit testing for standard logging payload * unit testing for static methods in litellm_logging * add code coverage check for litellm_logging * litellm_logging_code_coverage * test_get_final_response_obj * fix validate_redacted_message_span_attributes * test validate_redacted_message_span_attributes * (feat) log error class, function_name on prometheus service failure hook + only log DB related failures on DB service hook (#6650) * log error on prometheus service failure hook * use a more accurate function name for wrapper that handles logging db metrics * fix log_db_metrics * test_log_db_metrics_failure_error_types * fix linting * fix auth checks * Update several Azure AI models in model cost map (#6655) * Adding Azure Phi 3/3.5 models to model cost map * Update gpt-4o-mini models * Adding missing Azure Mistral models to model cost map * Adding Azure Llama3.2 models to model cost map * Fix Gemini-1.5-flash pricing * Fix Gemini-1.5-flash output pricing * Fix Gemini-1.5-pro prices * Fix Gemini-1.5-flash output prices * Correct gemini-1.5-pro prices * Correction on Vertex Llama3.2 entry --------- Co-authored-by: Emerson Gomes * fix(streaming_handler.py): fix linting error * test: remove duplicate test causes gemini ratelimit error --------- Co-authored-by: nobuo kawasaki Co-authored-by: Emerson Gomes Co-authored-by: Emerson Gomes Co-authored-by: Ishaan Jaff --- litellm/litellm_core_utils/README.md | 11 + litellm/litellm_core_utils/core_helpers.py | 27 + .../litellm_core_utils/default_encoding.py | 21 + 
litellm/litellm_core_utils/rules.py | 50 + .../litellm_core_utils/streaming_handler.py | 2020 ++++++++ litellm/litellm_core_utils/streaming_utils.py | 14 - litellm/llms/databricks/streaming_utils.py | 4 +- .../management_endpoints/team_endpoints.py | 20 +- litellm/utils.py | 4153 ++++++++--------- tests/local_testing/test_streaming.py | 80 + 10 files changed, 4251 insertions(+), 2149 deletions(-) create mode 100644 litellm/litellm_core_utils/README.md create mode 100644 litellm/litellm_core_utils/default_encoding.py create mode 100644 litellm/litellm_core_utils/rules.py create mode 100644 litellm/litellm_core_utils/streaming_handler.py delete mode 100644 litellm/litellm_core_utils/streaming_utils.py diff --git a/litellm/litellm_core_utils/README.md b/litellm/litellm_core_utils/README.md new file mode 100644 index 000000000000..9cd3514536bc --- /dev/null +++ b/litellm/litellm_core_utils/README.md @@ -0,0 +1,11 @@ +## Folder Contents + +This folder contains general-purpose utilities that are used in multiple places in the codebase. + +Core files: +- `streaming_handler.py`: The core streaming logic + streaming related helper utils +- `core_helpers.py`: code used in `types/` - e.g. `map_finish_reason`. +- `exception_mapping_utils.py`: utils for mapping exceptions to openai-compatible error types. +- `default_encoding.py`: code for loading the default encoding (tiktoken) +- `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name. 
def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict:
    """
    Build the header mapping that is attached to a response.

    Headers whose name is listed in `OPENAI_RESPONSE_HEADERS` are kept under
    their original name. Independently of that, a header that already starts
    with `llm_provider-` is kept as-is, and any other header is stored again
    under `llm_provider-<name>`.

    Args:
        response_headers: Raw response headers (anything exposing `.items()`).

    Returns:
        One dict merged in this order: OpenAI-compatible headers, headers that
        were already prefixed, newly prefixed headers. Later groups win on
        key collisions.
    """
    from litellm.types.utils import OPENAI_RESPONSE_HEADERS

    passthrough = {}
    already_prefixed = {}
    newly_prefixed = {}

    for name, value in response_headers.items():
        if name in OPENAI_RESPONSE_HEADERS:
            # openai-compatible headers are returned under their own name
            passthrough[name] = value
        if not name.startswith("llm_provider-"):
            # raw provider header: expose it under the llm_provider- namespace
            newly_prefixed["llm_provider-{}".format(name)] = value
            continue
        already_prefixed[name] = value

    merged = dict(passthrough)
    merged.update(already_prefixed)
    merged.update(newly_prefixed)
    return merged
class Rules:
    """
    Run the user-registered rule callbacks and fail the call when one rejects it.

    Callbacks are read from `litellm.pre_call_rules` and
    `litellm.post_call_rules`; entries that are not callable are ignored.

    Example usage:
        import litellm

        def my_custom_rule(input):  # receives the model response
            if "i don't think i can answer" in input:  # model refused to answer
                return False  # fails the call, which can trigger a fallback
            return True

        litellm.post_call_rules = [my_custom_rule]

        response = litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
            fallbacks=["openrouter/mythomax"],
        )
    """

    def __init__(self) -> None:
        pass

    def pre_call_rules(self, input: str, model: str):
        """
        Apply every callable in `litellm.pre_call_rules` to `input`.

        Returns True when no rule returned exactly `False`.

        Raises:
            litellm.APIResponseValidationError: as soon as a rule returns `False`.
        """
        for rule in litellm.pre_call_rules:
            if not callable(rule):
                continue
            if rule(input) is False:
                # NOTE(review): the message says "post-call" although this is the
                # pre-call check - wording kept unchanged, confirm whether intended.
                raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model)  # type: ignore
        return True

    def post_call_rules(self, input: Optional[str], model: str) -> bool:
        """
        Apply every callable in `litellm.post_call_rules` to `input`.

        A rule may return a bool, or a dict with an optional `decision` (bool)
        and an optional `message` used as the error text. Any other return
        value is ignored. A `None` input passes without running any rule.

        Raises:
            litellm.APIResponseValidationError: when a rule returns `False` or a
                dict whose `decision` is `False`.
        """
        if input is None:
            return True

        for rule in litellm.post_call_rules:
            if not callable(rule):
                continue
            verdict = rule(input)
            if isinstance(verdict, bool):
                rejected = verdict is False
                failure_message = "LLM Response failed post-call-rule check"
            elif isinstance(verdict, dict):
                rejected = verdict.get("decision", True) is False
                failure_message = verdict.get(
                    "message", "LLM Response failed post-call-rule check"
                )
            else:
                continue
            if rejected:
                raise litellm.APIResponseValidationError(message=failure_message, llm_provider="", model=model)  # type: ignore
        return True
def check_send_stream_usage(self, stream_options: Optional[dict]):
    """
    Tell whether the caller asked for usage to be included in the stream.

    Returns True only when `stream_options` is given and its `include_usage`
    entry is exactly `True` (truthy non-bool values do not count).
    """
    if stream_options is None:
        return False
    return stream_options.get("include_usage", False) is True
def process_chunk(self, chunk: str):
    """
    Reduce a cumulative chunk to the part that has not been returned yet.

    NLP Cloud streaming returns the entire response so far in every chunk;
    this keeps the accumulated text in `self.complete_response` and returns
    only the new suffix.

    Args:
        chunk: The text received for the current streaming event.

    Returns:
        The stripped chunk with the already-accumulated text removed from its
        start. When the chunk does not start with the accumulated text it is
        returned (stripped) in full.

    Side effects:
        `self.complete_response` is stripped and extended with the returned delta.
    """
    # The former `try/except Exception as e: raise e` wrapper was a no-op and
    # has been removed; exceptions propagate exactly as before.
    delta = chunk.strip()
    accumulated = self.complete_response.strip()

    if delta.startswith(accumulated):
        # drop the previously sent text only when it is a prefix of the new chunk
        delta = delta[len(accumulated) :]

    self.complete_response = accumulated + delta
    return delta
def handle_anthropic_text_chunk(self, chunk):
    """
    Parse one streaming line from the legacy Anthropic text-completion API.

    For old anthropic models - claude-1, claude-2.
    Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator()

    Args:
        chunk: A str or utf-8 encoded bytes line of the event stream.

    Returns:
        dict with `text`, `is_finished` and `finish_reason`. Only `data:` lines
        whose JSON `type` is `"completion"` fill these in; everything else
        yields the empty defaults.

    Raises:
        ValueError: for a non-`data:` line that contains the word "error".
    """
    line = chunk.decode("utf-8") if isinstance(chunk, bytes) else chunk
    parsed = {"text": "", "is_finished": False, "finish_reason": None}

    if not line.startswith("data:"):
        if "error" in line:
            raise ValueError(f"Unable to parse response. Original response: {line}")
        return parsed

    payload = json.loads(line[5:])  # everything after the "data:" prefix
    if payload.get("type", None) == "completion":
        stop_reason = payload.get("stop_reason")
        parsed["text"] = payload.get("completion")
        parsed["finish_reason"] = stop_reason
        parsed["is_finished"] = stop_reason is not None
    return parsed
def handle_ai21_chunk(self, chunk):  # fake streaming
    """
    Turn a complete AI21 response body into a single, final streaming result.

    Args:
        chunk: utf-8 encoded bytes holding the JSON response.

    Returns:
        dict with the text of the first completion, `is_finished=True` and
        `finish_reason="stop"`.

    Raises:
        ValueError: if the JSON lacks `completions[0].data.text`.
    """
    decoded = chunk.decode("utf-8")
    payload = json.loads(decoded)
    try:
        completion_text = payload["completions"][0]["data"]["text"]
    except Exception:
        raise ValueError(f"Unable to parse response. Original response: {decoded}")
    return {
        "text": completion_text,
        "is_finished": True,
        "finish_reason": "stop",
    }
def handle_cohere_chunk(self, chunk):
    """
    Parse one Cohere streaming event.

    Args:
        chunk: utf-8 encoded bytes holding one JSON event.

    Returns:
        dict with `index` (None when absent), `text`, `is_finished` and
        `finish_reason`. An event carrying `text` is a content event; otherwise
        an event carrying `is_finished` supplies the finish fields.

    Raises:
        ValueError: if the event has neither `text` nor `is_finished`, or the
            finish event lacks `finish_reason`.
    """
    raw = chunk.decode("utf-8")
    payload = json.loads(raw)
    try:
        parsed = {
            "index": None,
            "text": "",
            "is_finished": False,
            "finish_reason": "",
        }
        if "index" in payload:
            parsed["index"] = payload.get("index")
        if "text" in payload:
            parsed["text"] = payload["text"]
        elif "is_finished" in payload:
            parsed["is_finished"] = payload["is_finished"]
            parsed["finish_reason"] = payload["finish_reason"]
        else:
            # neither a content event nor a finish event
            raise Exception(payload)
        return parsed
    except Exception:
        raise ValueError(f"Unable to parse response. Original response: {raw}")
def handle_replicate_chunk(self, chunk):
    """
    Parse one Replicate streaming chunk.

    Args:
        chunk: Mapping that may carry `output`, `status` and `error`.

    Returns:
        dict with `text` (the `output`, if any), `is_finished` and
        `finish_reason`; the stream is finished once `status` is `"succeeded"`.

    Raises:
        ValueError: if the chunk has no `status` but a truthy `error`, or if it
            cannot be read as a mapping.
    """
    try:
        parsed = {"text": "", "is_finished": False, "finish_reason": ""}
        if "output" in chunk:
            parsed["text"] = chunk["output"]

        if "status" not in chunk:
            # the error field is only consulted when no status is reported
            if chunk.get("error", None):
                raise Exception(chunk["error"])
        elif chunk["status"] == "succeeded":
            parsed["is_finished"] = True
            parsed["finish_reason"] = "stop"
        return parsed
    except Exception:
        raise ValueError(f"Unable to parse response. Original response: {chunk}")
def handle_baseten_chunk(self, chunk):
    """
    Extract the generated text from one Baseten streaming chunk.

    Args:
        chunk: utf-8 encoded bytes; either a `data:`-prefixed JSON event with a
            `token.text` field, or a plain JSON body with `model_output`.

    Returns:
        The extracted text as returned by the provider, or `""` when the chunk
        is empty, carries no recognised field, or cannot be parsed (the failure
        is logged rather than raised).
    """
    try:
        decoded = chunk.decode("utf-8")
        if not decoded:
            return ""

        if decoded.startswith("data:"):
            event = json.loads(decoded[5:])
            has_token_text = "token" in event and "text" in event["token"]
            return event["token"]["text"] if has_token_text else ""

        body = json.loads(decoded)
        if "model_output" not in body:
            return ""

        model_output = body["model_output"]
        if (
            isinstance(model_output, dict)
            and "data" in model_output
            and isinstance(model_output["data"], list)
        ):
            return model_output["data"][0]
        if isinstance(model_output, str):
            return model_output
        # `completion` is only consulted when `model_output` is present but unusable
        if "completion" in body and isinstance(body["completion"], str):
            return body["completion"]
        raise ValueError(f"Unable to parse response. Original response: {decoded}")
    except Exception as e:
        verbose_logger.exception(
            "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occured - {}".format(
                str(e)
            )
        )
        return ""
def handle_watsonx_stream(self, chunk):
    """
    Parse one watsonx streaming chunk into the wrapper's normalized dict.

    Args:
        chunk: An already-parsed dict, or a str / utf-8 bytes server-sent-event
            line (optionally prefixed with `data:`) holding a JSON document.

    Returns:
        dict with `text`, `is_finished`, `finish_reason`, `prompt_tokens` and
        `completion_tokens` read from `results[0]`; the stream counts as
        finished unless `stop_reason` is `"not_finished"`. A str/bytes line
        that does not mention `generated_text` yields an empty, unfinished
        result with zero token counts. A payload without results yields
        `{"text": "", "is_finished": False}`.

    Raises:
        ValueError: if `chunk` is not a dict, str or bytes, or if the payload
            is not valid JSON (`json.JSONDecodeError`).
    """
    if isinstance(chunk, dict):
        parsed_response = chunk
    elif isinstance(chunk, (str, bytes)):
        if isinstance(chunk, bytes):
            chunk = chunk.decode("utf-8")
        if "generated_text" not in chunk:
            return {
                "text": "",
                "is_finished": False,
                "prompt_tokens": 0,
                "completion_tokens": 0,
            }
        payload = chunk.strip()
        # Strip only the leading SSE field name. The previous
        # `chunk.replace("data: ", "")` removed every occurrence and therefore
        # also deleted "data: " from inside the generated text.
        if payload.startswith("data:"):
            payload = payload[len("data:") :]
        parsed_response = json.loads(payload)
    else:
        print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
        raise ValueError(f"Unable to parse response. Original response: {chunk}")

    results = parsed_response.get("results", [])
    if len(results) > 0:
        first = results[0]
        finish_reason = first.get("stop_reason")
        return {
            "text": first.get("generated_text", ""),
            "is_finished": finish_reason != "not_finished",
            "finish_reason": finish_reason,
            "prompt_tokens": first.get("input_token_count", 0),
            "completion_tokens": first.get("generated_token_count", 0),
        }
    return {"text": "", "is_finished": False}
def handle_clarifai_completion_chunk(self, chunk):
    """
    Extract the generated text from a Clarifai response chunk.

    Args:
        chunk: A parsed dict, or a str / utf-8 bytes JSON document, that holds
            the text at `outputs[0].data.text.raw`.

    Returns:
        `{"text": <raw text>, "is_finished": True}` on success (`raw` defaults
        to `""` when absent). On any failure the exception is logged and the
        empty string `""` is returned instead of a dict.
    """
    try:
        if isinstance(chunk, dict):
            data_json = chunk
        elif isinstance(chunk, bytes):
            data_json = json.loads(chunk.decode("utf-8"))
        elif isinstance(chunk, str):
            data_json = json.loads(chunk)
        else:
            raise ValueError("Unable to parse streaming chunk")

        # The "" defaults are deliberate: a missing level makes the next
        # lookup fail, which routes the chunk to the error path below.
        text = (
            data_json.get("outputs", "")[0]
            .get("data", "")
            .get("text", "")
            .get("raw", "")
        )
        # The token counts of the echoed input and of `text` used to be
        # computed here and thrown away. That tokenized on every chunk for no
        # effect and made chunks without an echoed `input` fail; removed.
        return {
            "text": text,
            "is_finished": True,
        }
    except Exception as e:
        verbose_logger.exception(
            "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occured - {}".format(
                str(e)
            )
        )
        return ""
chunk.pop("model", None) + + model_response = ModelResponse( + stream=True, model=_model, stream_options=self.stream_options, **chunk + ) + if self.response_id is not None: + model_response.id = self.response_id + else: + self.response_id = model_response.id # type: ignore + if self.system_fingerprint is not None: + model_response.system_fingerprint = self.system_fingerprint + if hidden_params is not None: + model_response._hidden_params = hidden_params + model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider + model_response._hidden_params["created_at"] = time.time() + model_response._hidden_params = { + **model_response._hidden_params, + **self._hidden_params, + } + + if ( + len(model_response.choices) > 0 + and getattr(model_response.choices[0], "delta") is not None + ): + # do nothing, if object instantiated + pass + else: + model_response.choices = [StreamingChoices(finish_reason=None)] + return model_response + + def is_delta_empty(self, delta: Delta) -> bool: + is_empty = True + if delta.content is not None: + is_empty = False + elif delta.tool_calls is not None: + is_empty = False + elif delta.function_call is not None: + is_empty = False + return is_empty + + def return_processed_chunk_logic( # noqa + self, + completion_obj: dict, + model_response: ModelResponseStream, + response_obj: dict, + ): + + print_verbose( + f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}" + ) + if ( + "content" in completion_obj + and ( + isinstance(completion_obj["content"], str) + and len(completion_obj["content"]) > 0 + ) + or ( + "tool_calls" in completion_obj + and completion_obj["tool_calls"] is not None + and len(completion_obj["tool_calls"]) > 0 + ) + or ( + "function_call" in completion_obj + and completion_obj["function_call"] is not None + ) + ): # cannot set content of an OpenAI Object to be an empty string + self.safety_checker() + hold, model_response_str = 
self.check_special_tokens( + chunk=completion_obj["content"], + finish_reason=model_response.choices[0].finish_reason, + ) # filter out bos/eos tokens from openai-compatible hf endpoints + print_verbose(f"hold - {hold}, model_response_str - {model_response_str}") + if hold is False: + ## check if openai/azure chunk + original_chunk = response_obj.get("original_chunk", None) + if original_chunk: + model_response.id = original_chunk.id + self.response_id = original_chunk.id + if len(original_chunk.choices) > 0: + choices = [] + for choice in original_chunk.choices: + try: + if isinstance(choice, BaseModel): + choice_json = choice.model_dump() + choice_json.pop( + "finish_reason", None + ) # for mistral etc. which return a value in their last chunk (not-openai compatible). + print_verbose(f"choice_json: {choice_json}") + choices.append(StreamingChoices(**choice_json)) + except Exception: + choices.append(StreamingChoices()) + print_verbose(f"choices in streaming: {choices}") + setattr(model_response, "choices", choices) + else: + return + model_response.system_fingerprint = ( + original_chunk.system_fingerprint + ) + setattr( + model_response, + "citations", + getattr(original_chunk, "citations", None), + ) + print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") + if self.sent_first_chunk is False: + model_response.choices[0].delta["role"] = "assistant" + self.sent_first_chunk = True + elif self.sent_first_chunk is True and hasattr( + model_response.choices[0].delta, "role" + ): + _initial_delta = model_response.choices[0].delta.model_dump() + _initial_delta.pop("role", None) + model_response.choices[0].delta = Delta(**_initial_delta) + print_verbose( + f"model_response.choices[0].delta: {model_response.choices[0].delta}" + ) + else: + ## else + completion_obj["content"] = model_response_str + if self.sent_first_chunk is False: + completion_obj["role"] = "assistant" + self.sent_first_chunk = True + + model_response.choices[0].delta = Delta(**completion_obj) 
+ _index: Optional[int] = completion_obj.get("index") + if _index is not None: + model_response.choices[0].index = _index + print_verbose(f"returning model_response: {model_response}") + return model_response + else: + return + elif self.received_finish_reason is not None: + if self.sent_last_chunk is True: + # Bedrock returns the guardrail trace in the last chunk - we want to return this here + if self.custom_llm_provider == "bedrock" and "trace" in model_response: + return model_response + + # Default - return StopIteration + raise StopIteration + # flush any remaining holding chunk + if len(self.holding_chunk) > 0: + if model_response.choices[0].delta.content is None: + model_response.choices[0].delta.content = self.holding_chunk + else: + model_response.choices[0].delta.content = ( + self.holding_chunk + model_response.choices[0].delta.content + ) + self.holding_chunk = "" + # if delta is None + _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta) + + if _is_delta_empty: + # get any function call arguments + model_response.choices[0].finish_reason = map_finish_reason( + finish_reason=self.received_finish_reason + ) # ensure consistent output to openai + + self.sent_last_chunk = True + + return model_response + elif ( + model_response.choices[0].delta.tool_calls is not None + or model_response.choices[0].delta.function_call is not None + ): + if self.sent_first_chunk is False: + model_response.choices[0].delta["role"] = "assistant" + self.sent_first_chunk = True + return model_response + elif ( + len(model_response.choices) > 0 + and hasattr(model_response.choices[0].delta, "audio") + and model_response.choices[0].delta.audio is not None + ): + return model_response + else: + if hasattr(model_response, "usage"): + self.chunks.append(model_response) + return + + def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915 + model_response = self.model_response_creator() + response_obj: dict = {} + try: + # return this for all models + 
completion_obj = {"content": ""} + from litellm.types.utils import GenericStreamingChunk as GChunk + + if ( + isinstance(chunk, dict) + and generic_chunk_has_all_required_fields( + chunk=chunk + ) # check if chunk is a generic streaming chunk + ) or ( + self.custom_llm_provider + and ( + self.custom_llm_provider == "anthropic" + or self.custom_llm_provider in litellm._custom_providers + ) + ): + + if self.received_finish_reason is not None: + if "provider_specific_fields" not in chunk: + raise StopIteration + anthropic_response_obj: GChunk = chunk + completion_obj["content"] = anthropic_response_obj["text"] + if anthropic_response_obj["is_finished"]: + self.received_finish_reason = anthropic_response_obj[ + "finish_reason" + ] + + if anthropic_response_obj["finish_reason"]: + self.intermittent_finish_reason = anthropic_response_obj[ + "finish_reason" + ] + + if anthropic_response_obj["usage"] is not None: + model_response.usage = litellm.Usage( + **anthropic_response_obj["usage"] + ) + + if ( + "tool_use" in anthropic_response_obj + and anthropic_response_obj["tool_use"] is not None + ): + completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]] + + if ( + "provider_specific_fields" in anthropic_response_obj + and anthropic_response_obj["provider_specific_fields"] is not None + ): + for key, value in anthropic_response_obj[ + "provider_specific_fields" + ].items(): + setattr(model_response, key, value) + + response_obj = anthropic_response_obj + elif ( + self.custom_llm_provider + and self.custom_llm_provider == "anthropic_text" + ): + response_obj = self.handle_anthropic_text_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "clarifai": + response_obj = self.handle_clarifai_completion_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + 
self.received_finish_reason = response_obj["finish_reason"] + elif self.model == "replicate" or self.custom_llm_provider == "replicate": + response_obj = self.handle_replicate_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": + response_obj = self.handle_huggingface_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "predibase": + response_obj = self.handle_predibase_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif ( + self.custom_llm_provider and self.custom_llm_provider == "baseten" + ): # baseten doesn't provide streaming + completion_obj["content"] = self.handle_baseten_chunk(chunk) + elif ( + self.custom_llm_provider and self.custom_llm_provider == "ai21" + ): # ai21 doesn't provide streaming + response_obj = self.handle_ai21_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": + response_obj = self.handle_maritalk_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "vllm": + completion_obj["content"] = chunk[0].outputs[0].text + elif ( + self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha" + ): # aleph alpha doesn't provide streaming + response_obj = self.handle_aleph_alpha_chunk(chunk) + completion_obj["content"] = 
response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "nlp_cloud": + try: + response_obj = self.handle_nlp_cloud_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + except Exception as e: + if self.received_finish_reason: + raise e + else: + if self.sent_first_chunk is False: + raise Exception("An unknown error occurred with the stream") + self.received_finish_reason = "stop" + elif self.custom_llm_provider == "vertex_ai": + import proto # type: ignore + + if hasattr(chunk, "candidates") is True: + try: + try: + completion_obj["content"] = chunk.text + except Exception as e: + if "Part has no text." in str(e): + ## check for function calling + function_call = ( + chunk.candidates[0].content.parts[0].function_call + ) + + args_dict = {} + + # Check if it's a RepeatedComposite instance + for key, val in function_call.args.items(): + if isinstance( + val, + proto.marshal.collections.repeated.RepeatedComposite, + ): + # If so, convert to list + args_dict[key] = [v for v in val] + else: + args_dict[key] = val + + try: + args_str = json.dumps(args_dict) + except Exception as e: + raise e + _delta_obj = litellm.utils.Delta( + content=None, + tool_calls=[ + { + "id": f"call_{str(uuid.uuid4())}", + "function": { + "arguments": args_str, + "name": function_call.name, + }, + "type": "function", + } + ], + ) + _streaming_response = StreamingChoices(delta=_delta_obj) + _model_response = ModelResponse(stream=True) + _model_response.choices = [_streaming_response] + response_obj = {"original_chunk": _model_response} + else: + raise e + if ( + hasattr(chunk.candidates[0], "finish_reason") + and chunk.candidates[0].finish_reason.name + != "FINISH_REASON_UNSPECIFIED" + ): # every non-final chunk in vertex ai has this + self.received_finish_reason = chunk.candidates[ + 0 + 
].finish_reason.name + except Exception: + if chunk.candidates[0].finish_reason.name == "SAFETY": + raise Exception( + f"The response was blocked by VertexAI. {str(chunk)}" + ) + else: + completion_obj["content"] = str(chunk) + elif self.custom_llm_provider == "cohere": + response_obj = self.handle_cohere_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "cohere_chat": + response_obj = self.handle_cohere_chat_chunk(chunk) + if response_obj is None: + return + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + + elif self.custom_llm_provider == "petals": + if len(self.completion_stream) == 0: + if self.received_finish_reason is not None: + raise StopIteration + else: + self.received_finish_reason = "stop" + chunk_size = 30 + new_chunk = self.completion_stream[:chunk_size] + completion_obj["content"] = new_chunk + self.completion_stream = self.completion_stream[chunk_size:] + elif self.custom_llm_provider == "palm": + # fake streaming + response_obj = {} + if len(self.completion_stream) == 0: + if self.received_finish_reason is not None: + raise StopIteration + else: + self.received_finish_reason = "stop" + chunk_size = 30 + new_chunk = self.completion_stream[:chunk_size] + completion_obj["content"] = new_chunk + self.completion_stream = self.completion_stream[chunk_size:] + elif self.custom_llm_provider == "ollama": + response_obj = self.handle_ollama_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "ollama_chat": + response_obj = self.handle_ollama_chat_stream(chunk) + completion_obj["content"] = 
response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "cloudflare": + response_obj = self.handle_cloudlfare_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "watsonx": + response_obj = self.handle_watsonx_stream(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "triton": + response_obj = self.handle_triton_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "text-completion-openai": + response_obj = self.handle_openai_text_completion_chunk(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + if response_obj["usage"] is not None: + model_response.usage = litellm.Usage( + prompt_tokens=response_obj["usage"].prompt_tokens, + completion_tokens=response_obj["usage"].completion_tokens, + total_tokens=response_obj["usage"].total_tokens, + ) + elif self.custom_llm_provider == "text-completion-codestral": + response_obj = litellm.MistralTextCompletionConfig()._chunk_parser( + chunk + ) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = 
response_obj["finish_reason"] + if "usage" in response_obj is not None: + model_response.usage = litellm.Usage( + prompt_tokens=response_obj["usage"].prompt_tokens, + completion_tokens=response_obj["usage"].completion_tokens, + total_tokens=response_obj["usage"].total_tokens, + ) + elif self.custom_llm_provider == "azure_text": + response_obj = self.handle_azure_text_completion_chunk(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "cached_response": + response_obj = { + "text": chunk.choices[0].delta.content, + "is_finished": True, + "finish_reason": chunk.choices[0].finish_reason, + "original_chunk": chunk, + "tool_calls": ( + chunk.choices[0].delta.tool_calls + if hasattr(chunk.choices[0].delta, "tool_calls") + else None + ), + } + + completion_obj["content"] = response_obj["text"] + if response_obj["tool_calls"] is not None: + completion_obj["tool_calls"] = response_obj["tool_calls"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if hasattr(chunk, "id"): + model_response.id = chunk.id + self.response_id = chunk.id + if hasattr(chunk, "system_fingerprint"): + self.system_fingerprint = chunk.system_fingerprint + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + else: # openai / azure chat model + if self.custom_llm_provider == "azure": + if hasattr(chunk, "model"): + # for azure, we need to pass the model from the orignal chunk + self.model = chunk.model + response_obj = self.handle_openai_chat_completion_chunk(chunk) + if response_obj is None: + return + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + if response_obj["finish_reason"] == "error": + raise Exception( + "{} 
raised a streaming error - finish_reason: error, no content string given. Received Chunk={}".format( + self.custom_llm_provider, response_obj + ) + ) + self.received_finish_reason = response_obj["finish_reason"] + if response_obj.get("original_chunk", None) is not None: + if hasattr(response_obj["original_chunk"], "id"): + model_response.id = response_obj["original_chunk"].id + self.response_id = model_response.id + if hasattr(response_obj["original_chunk"], "system_fingerprint"): + model_response.system_fingerprint = response_obj[ + "original_chunk" + ].system_fingerprint + self.system_fingerprint = response_obj[ + "original_chunk" + ].system_fingerprint + if response_obj["logprobs"] is not None: + model_response.choices[0].logprobs = response_obj["logprobs"] + + if response_obj["usage"] is not None: + if isinstance(response_obj["usage"], dict): + model_response.usage = litellm.Usage( + prompt_tokens=response_obj["usage"].get( + "prompt_tokens", None + ) + or None, + completion_tokens=response_obj["usage"].get( + "completion_tokens", None + ) + or None, + total_tokens=response_obj["usage"].get("total_tokens", None) + or None, + ) + elif isinstance(response_obj["usage"], BaseModel): + model_response.usage = litellm.Usage( + **response_obj["usage"].model_dump() + ) + + model_response.model = self.model + print_verbose( + f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}" + ) + ## FUNCTION CALL PARSING + if ( + response_obj is not None + and response_obj.get("original_chunk", None) is not None + ): # function / tool calling branch - only set for openai/azure compatible endpoints + # enter this branch when no content has been passed in response + original_chunk = response_obj.get("original_chunk", None) + model_response.id = original_chunk.id + self.response_id = original_chunk.id + if original_chunk.choices and len(original_chunk.choices) > 0: + delta = original_chunk.choices[0].delta + if delta is not None and ( + 
delta.function_call is not None or delta.tool_calls is not None + ): + try: + model_response.system_fingerprint = ( + original_chunk.system_fingerprint + ) + ## AZURE - check if arguments is not None + if ( + original_chunk.choices[0].delta.function_call + is not None + ): + if ( + getattr( + original_chunk.choices[0].delta.function_call, + "arguments", + ) + is None + ): + original_chunk.choices[ + 0 + ].delta.function_call.arguments = "" + elif original_chunk.choices[0].delta.tool_calls is not None: + if isinstance( + original_chunk.choices[0].delta.tool_calls, list + ): + for t in original_chunk.choices[0].delta.tool_calls: + if hasattr(t, "functions") and hasattr( + t.functions, "arguments" + ): + if ( + getattr( + t.function, + "arguments", + ) + is None + ): + t.function.arguments = "" + _json_delta = delta.model_dump() + print_verbose(f"_json_delta: {_json_delta}") + if "role" not in _json_delta or _json_delta["role"] is None: + _json_delta["role"] = ( + "assistant" # mistral's api returns role as None + ) + if "tool_calls" in _json_delta and isinstance( + _json_delta["tool_calls"], list + ): + for tool in _json_delta["tool_calls"]: + if ( + isinstance(tool, dict) + and "function" in tool + and isinstance(tool["function"], dict) + and ("type" not in tool or tool["type"] is None) + ): + # if function returned but type set to None - mistral's api returns type: None + tool["type"] = "function" + model_response.choices[0].delta = Delta(**_json_delta) + except Exception as e: + verbose_logger.exception( + "litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format( + str(e) + ) + ) + model_response.choices[0].delta = Delta() + elif ( + delta is not None and getattr(delta, "audio", None) is not None + ): + model_response.choices[0].delta.audio = delta.audio + else: + try: + delta = ( + dict() + if original_chunk.choices[0].delta is None + else dict(original_chunk.choices[0].delta) + ) + print_verbose(f"original delta: {delta}") + 
model_response.choices[0].delta = Delta(**delta) + print_verbose( + f"new delta: {model_response.choices[0].delta}" + ) + except Exception: + model_response.choices[0].delta = Delta() + else: + if ( + self.stream_options is not None + and self.stream_options["include_usage"] is True + ): + return model_response + return + print_verbose( + f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}" + ) + print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") + + ## CHECK FOR TOOL USE + if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0: + if self.is_function_call is True: # user passed in 'functions' param + completion_obj["function_call"] = completion_obj["tool_calls"][0][ + "function" + ] + completion_obj["tool_calls"] = None + + self.tool_call = True + + ## RETURN ARG + return self.return_processed_chunk_logic( + completion_obj=completion_obj, + model_response=model_response, # type: ignore + response_obj=response_obj, + ) + + except StopIteration: + raise StopIteration + except Exception as e: + traceback.format_exc() + e.message = str(e) + raise exception_type( + model=self.model, + custom_llm_provider=self.custom_llm_provider, + original_exception=e, + ) + + def set_logging_event_loop(self, loop): + """ + import litellm, asyncio + + loop = asyncio.get_event_loop() # πŸ‘ˆ gets the current event loop + + response = litellm.completion(.., stream=True) + + response.set_logging_event_loop(loop=loop) # πŸ‘ˆ enables async_success callbacks for sync logging + + for chunk in response: + ... + """ + self.logging_loop = loop + + def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool): + """ + Runs success logging in a thread and adds the response to the cache + """ + if litellm.disable_streaming_logging is True: + """ + [NOT RECOMMENDED] + Set this via `litellm.disable_streaming_logging = True`. + + Disables streaming logging. 
+ """ + return + ## ASYNC LOGGING + # Create an event loop for the new thread + if self.logging_loop is not None: + future = asyncio.run_coroutine_threadsafe( + self.logging_obj.async_success_handler( + processed_chunk, None, None, cache_hit + ), + loop=self.logging_loop, + ) + future.result() + else: + asyncio.run( + self.logging_obj.async_success_handler( + processed_chunk, None, None, cache_hit + ) + ) + ## SYNC LOGGING + self.logging_obj.success_handler(processed_chunk, None, None, cache_hit) + + ## Sync store in cache + if self.logging_obj._llm_caching_handler is not None: + self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache( + processed_chunk + ) + + def finish_reason_handler(self): + model_response = self.model_response_creator() + _finish_reason = self.received_finish_reason or self.intermittent_finish_reason + if _finish_reason is not None: + model_response.choices[0].finish_reason = _finish_reason + else: + model_response.choices[0].finish_reason = "stop" + + ## if tool use + if ( + model_response.choices[0].finish_reason == "stop" and self.tool_call + ): # don't overwrite for other - potential error finish reasons + model_response.choices[0].finish_reason = "tool_calls" + return model_response + + def __next__(self): # noqa: PLR0915 + cache_hit = False + if ( + self.custom_llm_provider is not None + and self.custom_llm_provider == "cached_response" + ): + cache_hit = True + try: + if self.completion_stream is None: + self.fetch_sync_stream() + while True: + if ( + isinstance(self.completion_stream, str) + or isinstance(self.completion_stream, bytes) + or isinstance(self.completion_stream, ModelResponse) + ): + chunk = self.completion_stream + else: + chunk = next(self.completion_stream) + if chunk is not None and chunk != b"": + print_verbose( + f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}" + ) + response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk) + 
print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}") + + if response is None: + continue + ## LOGGING + threading.Thread( + target=self.run_success_logging_and_cache_storage, + args=(response, cache_hit), + ).start() # log response + choice = response.choices[0] + if isinstance(choice, StreamingChoices): + self.response_uptil_now += choice.delta.get("content", "") or "" + else: + self.response_uptil_now += "" + self.rules.post_call_rules( + input=self.response_uptil_now, model=self.model + ) + # HANDLE STREAM OPTIONS + self.chunks.append(response) + if hasattr( + response, "usage" + ): # remove usage from chunk, only send on final chunk + # Convert the object to a dictionary + obj_dict = response.dict() + + # Remove an attribute (e.g., 'attr2') + if "usage" in obj_dict: + del obj_dict["usage"] + + # Create a new object without the removed attribute + response = self.model_response_creator( + chunk=obj_dict, hidden_params=response._hidden_params + ) + # add usage as hidden param + if self.sent_last_chunk is True and self.stream_options is None: + usage = calculate_total_usage(chunks=self.chunks) + response._hidden_params["usage"] = usage + # RETURN RESULT + return response + + except StopIteration: + if self.sent_last_chunk is True: + complete_streaming_response = litellm.stream_chunk_builder( + chunks=self.chunks, messages=self.messages + ) + response = self.model_response_creator() + if complete_streaming_response is not None: + setattr( + response, + "usage", + getattr(complete_streaming_response, "usage"), + ) + + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(response, None, None, cache_hit), + ).start() # log response + + if self.sent_stream_usage is False and self.send_stream_usage is True: + self.sent_stream_usage = True + return response + raise # Re-raise StopIteration + else: + self.sent_last_chunk = True + processed_chunk = self.finish_reason_handler() + if self.stream_options is None: # add usage as hidden 
param + usage = calculate_total_usage(chunks=self.chunks) + processed_chunk._hidden_params["usage"] = usage + ## LOGGING + threading.Thread( + target=self.run_success_logging_and_cache_storage, + args=(processed_chunk, cache_hit), + ).start() # log response + return processed_chunk + except Exception as e: + traceback_exception = traceback.format_exc() + # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated + threading.Thread( + target=self.logging_obj.failure_handler, args=(e, traceback_exception) + ).start() + if isinstance(e, OpenAIError): + raise e + else: + raise exception_type( + model=self.model, + original_exception=e, + custom_llm_provider=self.custom_llm_provider, + ) + + def fetch_sync_stream(self): + if self.completion_stream is None and self.make_call is not None: + # Call make_call to get the completion stream + self.completion_stream = self.make_call(client=litellm.module_level_client) + self._stream_iter = self.completion_stream.__iter__() + + return self.completion_stream + + async def fetch_stream(self): + if self.completion_stream is None and self.make_call is not None: + # Call make_call to get the completion stream + self.completion_stream = await self.make_call( + client=litellm.module_level_aclient + ) + self._stream_iter = self.completion_stream.__aiter__() + + return self.completion_stream + + async def __anext__(self): # noqa: PLR0915 + cache_hit = False + if ( + self.custom_llm_provider is not None + and self.custom_llm_provider == "cached_response" + ): + cache_hit = True + try: + if self.completion_stream is None: + await self.fetch_stream() + + if ( + self.custom_llm_provider == "openai" + or self.custom_llm_provider == "azure" + or self.custom_llm_provider == "custom_openai" + or self.custom_llm_provider == "text-completion-openai" + or self.custom_llm_provider == "text-completion-codestral" + or self.custom_llm_provider == "azure_text" + or self.custom_llm_provider == 
"anthropic" + or self.custom_llm_provider == "anthropic_text" + or self.custom_llm_provider == "huggingface" + or self.custom_llm_provider == "ollama" + or self.custom_llm_provider == "ollama_chat" + or self.custom_llm_provider == "vertex_ai" + or self.custom_llm_provider == "vertex_ai_beta" + or self.custom_llm_provider == "sagemaker" + or self.custom_llm_provider == "sagemaker_chat" + or self.custom_llm_provider == "gemini" + or self.custom_llm_provider == "replicate" + or self.custom_llm_provider == "cached_response" + or self.custom_llm_provider == "predibase" + or self.custom_llm_provider == "databricks" + or self.custom_llm_provider == "bedrock" + or self.custom_llm_provider == "triton" + or self.custom_llm_provider == "watsonx" + or self.custom_llm_provider in litellm.openai_compatible_endpoints + or self.custom_llm_provider in litellm._custom_providers + ): + async for chunk in self.completion_stream: + if chunk == "None" or chunk is None: + raise Exception + elif ( + self.custom_llm_provider == "gemini" + and hasattr(chunk, "parts") + and len(chunk.parts) == 0 + ): + continue + # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. 
+ # __anext__ also calls async_success_handler, which does logging + print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}") + + processed_chunk: Optional[ModelResponse] = self.chunk_creator( + chunk=chunk + ) + print_verbose( + f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}" + ) + if processed_chunk is None: + continue + ## LOGGING + ## LOGGING + executor.submit( + self.logging_obj.success_handler, + result=processed_chunk, + start_time=None, + end_time=None, + cache_hit=cache_hit, + ) + + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, cache_hit=cache_hit + ) + ) + + if self.logging_obj._llm_caching_handler is not None: + asyncio.create_task( + self.logging_obj._llm_caching_handler._add_streaming_response_to_cache( + processed_chunk=processed_chunk, + ) + ) + + choice = processed_chunk.choices[0] + if isinstance(choice, StreamingChoices): + self.response_uptil_now += choice.delta.get("content", "") or "" + else: + self.response_uptil_now += "" + self.rules.post_call_rules( + input=self.response_uptil_now, model=self.model + ) + self.chunks.append(processed_chunk) + if hasattr( + processed_chunk, "usage" + ): # remove usage from chunk, only send on final chunk + # Convert the object to a dictionary + obj_dict = processed_chunk.dict() + + # Remove an attribute (e.g., 'attr2') + if "usage" in obj_dict: + del obj_dict["usage"] + + # Create a new object without the removed attribute + processed_chunk = self.model_response_creator(chunk=obj_dict) + print_verbose(f"final returned processed chunk: {processed_chunk}") + return processed_chunk + raise StopAsyncIteration + else: # temporary patch for non-aiohttp async calls + # example - boto3 bedrock llms + while True: + if isinstance(self.completion_stream, str) or isinstance( + self.completion_stream, bytes + ): + chunk = self.completion_stream + else: + chunk = next(self.completion_stream) + if chunk is not None and chunk != b"": + print_verbose(f"PROCESSED 
CHUNK PRE CHUNK CREATOR: {chunk}") + processed_chunk: Optional[ModelResponse] = self.chunk_creator( + chunk=chunk + ) + print_verbose( + f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" + ) + if processed_chunk is None: + continue + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(processed_chunk, None, None, cache_hit), + ).start() # log processed_chunk + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, cache_hit=cache_hit + ) + ) + + choice = processed_chunk.choices[0] + if isinstance(choice, StreamingChoices): + self.response_uptil_now += ( + choice.delta.get("content", "") or "" + ) + else: + self.response_uptil_now += "" + self.rules.post_call_rules( + input=self.response_uptil_now, model=self.model + ) + # RETURN RESULT + self.chunks.append(processed_chunk) + return processed_chunk + except (StopAsyncIteration, StopIteration): + if self.sent_last_chunk is True: + # log the final chunk with accurate streaming values + complete_streaming_response = litellm.stream_chunk_builder( + chunks=self.chunks, messages=self.messages + ) + response = self.model_response_creator() + if complete_streaming_response is not None: + setattr( + response, + "usage", + getattr(complete_streaming_response, "usage"), + ) + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(response, None, None, cache_hit), + ).start() # log response + asyncio.create_task( + self.logging_obj.async_success_handler( + response, cache_hit=cache_hit + ) + ) + if self.sent_stream_usage is False and self.send_stream_usage is True: + self.sent_stream_usage = True + return response + raise StopAsyncIteration # Re-raise StopIteration + else: + self.sent_last_chunk = True + processed_chunk = self.finish_reason_handler() + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(processed_chunk, None, None, cache_hit), + ).start() # log response + asyncio.create_task( + 
self.logging_obj.async_success_handler( + processed_chunk, cache_hit=cache_hit + ) + ) + return processed_chunk + except httpx.TimeoutException as e: # if httpx read timeout error occues + traceback_exception = traceback.format_exc() + ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT + traceback_exception += "\nLiteLLM Default Request Timeout - {}".format( + litellm.request_timeout + ) + if self.logging_obj is not None: + ## LOGGING + threading.Thread( + target=self.logging_obj.failure_handler, + args=(e, traceback_exception), + ).start() # log response + # Handle any exceptions that might occur during streaming + asyncio.create_task( + self.logging_obj.async_failure_handler(e, traceback_exception) + ) + raise e + except Exception as e: + traceback_exception = traceback.format_exc() + if self.logging_obj is not None: + ## LOGGING + threading.Thread( + target=self.logging_obj.failure_handler, + args=(e, traceback_exception), + ).start() # log response + # Handle any exceptions that might occur during streaming + asyncio.create_task( + self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore + ) + ## Map to OpenAI Exception + raise exception_type( + model=self.model, + custom_llm_provider=self.custom_llm_provider, + original_exception=e, + completion_kwargs={}, + extra_kwargs={}, + ) + + +def calculate_total_usage(chunks: List[ModelResponse]) -> Usage: + """Assume most recent usage chunk has total usage uptil then.""" + prompt_tokens: int = 0 + completion_tokens: int = 0 + for chunk in chunks: + if "usage" in chunk: + if "prompt_tokens" in chunk["usage"]: + prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0 + if "completion_tokens" in chunk["usage"]: + completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0 + + returned_usage_chunk = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + return returned_usage_chunk + + +def 
generic_chunk_has_all_required_fields(chunk: dict) -> bool: + """ + Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk. + + :param chunk: The dictionary to check. + :return: True if all required fields are present, False otherwise. + """ + _all_fields = GChunk.__annotations__ + + decision = all(key in _all_fields for key in chunk) + return decision diff --git a/litellm/litellm_core_utils/streaming_utils.py b/litellm/litellm_core_utils/streaming_utils.py deleted file mode 100644 index c41b4f64c4df..000000000000 --- a/litellm/litellm_core_utils/streaming_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -from litellm.types.utils import GenericStreamingChunk as GChunk - - -def generic_chunk_has_all_required_fields(chunk: dict) -> bool: - """ - Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk. - - :param chunk: The dictionary to check. - :return: True if all required fields are present, False otherwise. - """ - _all_fields = GChunk.__annotations__ - - decision = all(key in _all_fields for key in chunk) - return decision diff --git a/litellm/llms/databricks/streaming_utils.py b/litellm/llms/databricks/streaming_utils.py index a87ab39bba6a..502f4a091299 100644 --- a/litellm/llms/databricks/streaming_utils.py +++ b/litellm/llms/databricks/streaming_utils.py @@ -1,5 +1,5 @@ import json -from typing import Optional +from typing import List, Optional import litellm from litellm import verbose_logger @@ -10,7 +10,7 @@ ChatCompletionToolCallFunctionChunk, ChatCompletionUsageBlock, ) -from litellm.types.utils import GenericStreamingChunk +from litellm.types.utils import GenericStreamingChunk, ModelResponse, Usage class ModelResponseIterator: diff --git a/litellm/proxy/management_endpoints/team_endpoints.py b/litellm/proxy/management_endpoints/team_endpoints.py index 74289c90a350..8dcd0c7ebb79 100644 --- a/litellm/proxy/management_endpoints/team_endpoints.py +++ 
b/litellm/proxy/management_endpoints/team_endpoints.py @@ -1281,12 +1281,20 @@ async def list_team( where={"team_id": team.team_id} ) - returned_responses.append( - TeamListResponseObject( - **team.model_dump(), - team_memberships=_team_memberships, - keys=keys, + try: + returned_responses.append( + TeamListResponseObject( + **team.model_dump(), + team_memberships=_team_memberships, + keys=keys, + ) ) - ) + except Exception as e: + team_exception = """Invalid team object for team_id: {}. team_object={}. + Error: {} + """.format( + team.team_id, team.model_dump(), str(e) + ) + raise HTTPException(status_code=400, detail={"error": team_exception}) return returned_responses diff --git a/litellm/utils.py b/litellm/utils.py index efda579d672d..f2360884c093 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -35,6 +35,7 @@ import uuid from dataclasses import dataclass, field from functools import lru_cache, wraps +from importlib import resources from inspect import iscoroutine from os.path import abspath, dirname, join @@ -49,6 +50,7 @@ from openai.lib import _parsing, _pydantic from openai.types.chat.completion_create_params import ResponseFormat from pydantic import BaseModel +from tiktoken import Encoding from tokenizers import Tokenizer import litellm @@ -59,7 +61,11 @@ from litellm.caching.caching import DualCache from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler from litellm.integrations.custom_logger import CustomLogger -from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.litellm_core_utils.core_helpers import ( + map_finish_reason, + process_response_headers, +) +from litellm.litellm_core_utils.default_encoding import encoding from litellm.litellm_core_utils.exception_mapping_utils import ( _get_response_headers, exception_type, @@ -87,6 +93,8 @@ LiteLLMLoggingObject, redact_message_input_output_from_logging, ) +from litellm.litellm_core_utils.rules import Rules +from 
litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper from litellm.litellm_core_utils.token_counter import get_modified_max_tokens from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.secret_managers.main import get_secret @@ -123,25 +131,6 @@ Usage, ) -try: - # New and recommended way to access resources - from importlib import resources - - filename = str(resources.files(litellm).joinpath("llms/tokenizers")) -except (ImportError, AttributeError): - # Old way to access resources, which setuptools deprecated some time ago - import pkg_resources # type: ignore - - filename = pkg_resources.resource_filename(__name__, "llms/tokenizers") - -os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv( - "CUSTOM_TIKTOKEN_CACHE_DIR", filename -) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071 -from tiktoken import Encoding - -encoding = tiktoken.get_encoding("cl100k_base") -from importlib import resources - with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f: json_data = json.load(f) # Convert to str (if necessary) @@ -276,56 +265,6 @@ def print_verbose( pass -####### RULES ################### - - -class Rules: - """ - Fail calls based on the input or llm api output - - Example usage: - import litellm - def my_custom_rule(input): # receives the model response - if "i don't think i can answer" in input: # trigger fallback if the model refuses to answer - return False - return True - - litellm.post_call_rules = [my_custom_rule] # have these be functions that can be called to fail a call - - response = litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user", - "content": "Hey, how's it going?"}], fallbacks=["openrouter/mythomax"]) - """ - - def __init__(self) -> None: - pass - - def pre_call_rules(self, input: str, model: str): - for rule in litellm.pre_call_rules: - if callable(rule): - decision = rule(input) - if decision is False: - raise 
litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore - return True - - def post_call_rules(self, input: Optional[str], model: str) -> bool: - if input is None: - return True - for rule in litellm.post_call_rules: - if callable(rule): - decision = rule(input) - if isinstance(decision, bool): - if decision is False: - raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore - elif isinstance(decision, dict): - decision_val = decision.get("decision", True) - decision_message = decision.get( - "message", "LLM Response failed post-call-rule check" - ) - if decision_val is False: - raise litellm.APIResponseValidationError(message=decision_message, llm_provider="", model=model) # type: ignore - return True - - ####### CLIENT ################### # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking def custom_llm_setup(): @@ -5568,2042 +5507,2025 @@ def get_model_list(): # wraps the completion stream to return the correct format for the model # replicate/anthropic/cohere +# class CustomStreamWrapper: +# def __init__( +# self, +# completion_stream, +# model, +# logging_obj: Any, +# custom_llm_provider: Optional[str] = None, +# stream_options=None, +# make_call: Optional[Callable] = None, +# _response_headers: Optional[dict] = None, +# ): +# self.model = model +# self.make_call = make_call +# self.custom_llm_provider = custom_llm_provider +# self.logging_obj: LiteLLMLoggingObject = logging_obj +# self.completion_stream = completion_stream +# self.sent_first_chunk = False +# self.sent_last_chunk = False +# self.system_fingerprint: Optional[str] = None +# self.received_finish_reason: Optional[str] = None +# self.special_tokens = [ +# "<|assistant|>", +# "<|system|>", +# "<|user|>", +# "", +# "", +# "<|im_end|>", +# "<|im_start|>", +# ] +# self.holding_chunk = "" +# 
self.complete_response = "" +# self.response_uptil_now = "" +# _model_info = ( +# self.logging_obj.model_call_details.get("litellm_params", {}).get( +# "model_info", {} +# ) +# or {} +# ) +# self._hidden_params = { +# "model_id": (_model_info.get("id", None)), +# } # returned as x-litellm-model-id response header in proxy + +# self._hidden_params["additional_headers"] = process_response_headers( +# _response_headers or {} +# ) # GUARANTEE OPENAI HEADERS IN RESPONSE + +# self._response_headers = _response_headers +# self.response_id = None +# self.logging_loop = None +# self.rules = Rules() +# self.stream_options = stream_options or getattr( +# logging_obj, "stream_options", None +# ) +# self.messages = getattr(logging_obj, "messages", None) +# self.sent_stream_usage = False +# self.send_stream_usage = ( +# True if self.check_send_stream_usage(self.stream_options) else False +# ) +# self.tool_call = False +# self.chunks: List = ( +# [] +# ) # keep track of the returned chunks - used for calculating the input/output tokens for stream options +# self.is_function_call = self.check_is_function_call(logging_obj=logging_obj) + +# def __iter__(self): +# return self + +# def __aiter__(self): +# return self + +# def check_send_stream_usage(self, stream_options: Optional[dict]): +# return ( +# stream_options is not None +# and stream_options.get("include_usage", False) is True +# ) + +# def check_is_function_call(self, logging_obj) -> bool: +# if hasattr(logging_obj, "optional_params") and isinstance( +# logging_obj.optional_params, dict +# ): +# if ( +# "litellm_param_is_function_call" in logging_obj.optional_params +# and logging_obj.optional_params["litellm_param_is_function_call"] +# is True +# ): +# return True + +# return False + +# def process_chunk(self, chunk: str): +# """ +# NLP Cloud streaming returns the entire response, for each chunk. Process this, to only return the delta. 
+# """ +# try: +# chunk = chunk.strip() +# self.complete_response = self.complete_response.strip() + +# if chunk.startswith(self.complete_response): +# # Remove last_sent_chunk only if it appears at the start of the new chunk +# chunk = chunk[len(self.complete_response) :] + +# self.complete_response += chunk +# return chunk +# except Exception as e: +# raise e + +# def safety_checker(self) -> None: +# """ +# Fixes - https://github.com/BerriAI/litellm/issues/5158 + +# if the model enters a loop and starts repeating the same chunk again, break out of loop and raise an internalservererror - allows for retries. + +# Raises - InternalServerError, if LLM enters infinite loop while streaming +# """ +# if len(self.chunks) >= litellm.REPEATED_STREAMING_CHUNK_LIMIT: +# # Get the last n chunks +# last_chunks = self.chunks[-litellm.REPEATED_STREAMING_CHUNK_LIMIT :] + +# # Extract the relevant content from the chunks +# last_contents = [chunk.choices[0].delta.content for chunk in last_chunks] + +# # Check if all extracted contents are identical +# if all(content == last_contents[0] for content in last_contents): +# if ( +# last_contents[0] is not None +# and isinstance(last_contents[0], str) +# and len(last_contents[0]) > 2 +# ): # ignore empty content - https://github.com/BerriAI/litellm/issues/5158#issuecomment-2287156946 +# # All last n chunks are identical +# raise litellm.InternalServerError( +# message="The model is repeating the same chunk = {}.".format( +# last_contents[0] +# ), +# model="", +# llm_provider="", +# ) + +# def check_special_tokens(self, chunk: str, finish_reason: Optional[str]): +# """ +# Output parse / special tokens for sagemaker + hf streaming. 
+# """ +# hold = False +# if ( +# self.custom_llm_provider != "huggingface" +# and self.custom_llm_provider != "sagemaker" +# ): +# return hold, chunk + +# if finish_reason: +# for token in self.special_tokens: +# if token in chunk: +# chunk = chunk.replace(token, "") +# return hold, chunk + +# if self.sent_first_chunk is True: +# return hold, chunk + +# curr_chunk = self.holding_chunk + chunk +# curr_chunk = curr_chunk.strip() + +# for token in self.special_tokens: +# if len(curr_chunk) < len(token) and curr_chunk in token: +# hold = True +# self.holding_chunk = curr_chunk +# elif len(curr_chunk) >= len(token): +# if token in curr_chunk: +# self.holding_chunk = curr_chunk.replace(token, "") +# hold = True +# else: +# pass + +# if hold is False: # reset +# self.holding_chunk = "" +# return hold, curr_chunk + +# def handle_anthropic_text_chunk(self, chunk): +# """ +# For old anthropic models - claude-1, claude-2. + +# Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator() +# """ +# str_line = chunk +# if isinstance(chunk, bytes): # Handle binary data +# str_line = chunk.decode("utf-8") # Convert bytes to string +# text = "" +# is_finished = False +# finish_reason = None +# if str_line.startswith("data:"): +# data_json = json.loads(str_line[5:]) +# type_chunk = data_json.get("type", None) +# if type_chunk == "completion": +# text = data_json.get("completion") +# finish_reason = data_json.get("stop_reason") +# if finish_reason is not None: +# is_finished = True +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# elif "error" in str_line: +# raise ValueError(f"Unable to parse response. 
Original response: {str_line}") +# else: +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } + +# def handle_vertexai_anthropic_chunk(self, chunk): +# """ +# - MessageStartEvent(message=Message(id='msg_01LeRRgvX4gwkX3ryBVgtuYZ', content=[], model='claude-3-sonnet-20240229', role='assistant', stop_reason=None, stop_sequence=None, type='message', usage=Usage(input_tokens=8, output_tokens=1)), type='message_start'); custom_llm_provider: vertex_ai +# - ContentBlockStartEvent(content_block=ContentBlock(text='', type='text'), index=0, type='content_block_start'); custom_llm_provider: vertex_ai +# - ContentBlockDeltaEvent(delta=TextDelta(text='Hello', type='text_delta'), index=0, type='content_block_delta'); custom_llm_provider: vertex_ai +# """ +# text = "" +# prompt_tokens = None +# completion_tokens = None +# is_finished = False +# finish_reason = None +# type_chunk = getattr(chunk, "type", None) +# if type_chunk == "message_start": +# message = getattr(chunk, "message", None) +# text = "" # lets us return a chunk with usage to user +# _usage = getattr(message, "usage", None) +# if _usage is not None: +# prompt_tokens = getattr(_usage, "input_tokens", None) +# completion_tokens = getattr(_usage, "output_tokens", None) +# elif type_chunk == "content_block_delta": +# """ +# Anthropic content chunk +# chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}} +# """ +# delta = getattr(chunk, "delta", None) +# if delta is not None: +# text = getattr(delta, "text", "") +# else: +# text = "" +# elif type_chunk == "message_delta": +# """ +# Anthropic +# chunk = {'type': 'message_delta', 'delta': {'stop_reason': 'max_tokens', 'stop_sequence': None}, 'usage': {'output_tokens': 10}} +# """ +# # TODO - get usage from this chunk, set in response +# delta = getattr(chunk, "delta", None) +# if delta is not None: +# finish_reason = getattr(delta, "stop_reason", "stop") +# is_finished = True +# 
_usage = getattr(chunk, "usage", None) +# if _usage is not None: +# prompt_tokens = getattr(_usage, "input_tokens", None) +# completion_tokens = getattr(_usage, "output_tokens", None) + +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# "prompt_tokens": prompt_tokens, +# "completion_tokens": completion_tokens, +# } -def calculate_total_usage(chunks: List[ModelResponse]) -> Usage: - """Assume most recent usage chunk has total usage uptil then.""" - prompt_tokens: int = 0 - completion_tokens: int = 0 - for chunk in chunks: - if "usage" in chunk: - if "prompt_tokens" in chunk["usage"]: - prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0 - if "completion_tokens" in chunk["usage"]: - completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0 - - returned_usage_chunk = Usage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - - return returned_usage_chunk - - -class CustomStreamWrapper: - def __init__( - self, - completion_stream, - model, - logging_obj: Any, - custom_llm_provider: Optional[str] = None, - stream_options=None, - make_call: Optional[Callable] = None, - _response_headers: Optional[dict] = None, - ): - self.model = model - self.make_call = make_call - self.custom_llm_provider = custom_llm_provider - self.logging_obj: LiteLLMLoggingObject = logging_obj - self.completion_stream = completion_stream - self.sent_first_chunk = False - self.sent_last_chunk = False - self.system_fingerprint: Optional[str] = None - self.received_finish_reason: Optional[str] = None - self.special_tokens = [ - "<|assistant|>", - "<|system|>", - "<|user|>", - "", - "", - "<|im_end|>", - "<|im_start|>", - ] - self.holding_chunk = "" - self.complete_response = "" - self.response_uptil_now = "" - _model_info = ( - self.logging_obj.model_call_details.get("litellm_params", {}).get( - "model_info", {} - ) - or {} - ) - self._hidden_params = { - 
"model_id": (_model_info.get("id", None)), - } # returned as x-litellm-model-id response header in proxy - - self._hidden_params["additional_headers"] = process_response_headers( - _response_headers or {} - ) # GUARANTEE OPENAI HEADERS IN RESPONSE - - self._response_headers = _response_headers - self.response_id = None - self.logging_loop = None - self.rules = Rules() - self.stream_options = stream_options or getattr( - logging_obj, "stream_options", None - ) - self.messages = getattr(logging_obj, "messages", None) - self.sent_stream_usage = False - self.send_stream_usage = ( - True if self.check_send_stream_usage(self.stream_options) else False - ) - self.tool_call = False - self.chunks: List = ( - [] - ) # keep track of the returned chunks - used for calculating the input/output tokens for stream options - self.is_function_call = self.check_is_function_call(logging_obj=logging_obj) - - def __iter__(self): - return self - - def __aiter__(self): - return self - - def check_send_stream_usage(self, stream_options: Optional[dict]): - return ( - stream_options is not None - and stream_options.get("include_usage", False) is True - ) - - def check_is_function_call(self, logging_obj) -> bool: - if hasattr(logging_obj, "optional_params") and isinstance( - logging_obj.optional_params, dict - ): - if ( - "litellm_param_is_function_call" in logging_obj.optional_params - and logging_obj.optional_params["litellm_param_is_function_call"] - is True - ): - return True - - return False - - def process_chunk(self, chunk: str): - """ - NLP Cloud streaming returns the entire response, for each chunk. Process this, to only return the delta. 
- """ - try: - chunk = chunk.strip() - self.complete_response = self.complete_response.strip() - - if chunk.startswith(self.complete_response): - # Remove last_sent_chunk only if it appears at the start of the new chunk - chunk = chunk[len(self.complete_response) :] - - self.complete_response += chunk - return chunk - except Exception as e: - raise e - - def safety_checker(self) -> None: - """ - Fixes - https://github.com/BerriAI/litellm/issues/5158 - - if the model enters a loop and starts repeating the same chunk again, break out of loop and raise an internalservererror - allows for retries. - - Raises - InternalServerError, if LLM enters infinite loop while streaming - """ - if len(self.chunks) >= litellm.REPEATED_STREAMING_CHUNK_LIMIT: - # Get the last n chunks - last_chunks = self.chunks[-litellm.REPEATED_STREAMING_CHUNK_LIMIT :] - - # Extract the relevant content from the chunks - last_contents = [chunk.choices[0].delta.content for chunk in last_chunks] - - # Check if all extracted contents are identical - if all(content == last_contents[0] for content in last_contents): - if ( - last_contents[0] is not None - and isinstance(last_contents[0], str) - and len(last_contents[0]) > 2 - ): # ignore empty content - https://github.com/BerriAI/litellm/issues/5158#issuecomment-2287156946 - # All last n chunks are identical - raise litellm.InternalServerError( - message="The model is repeating the same chunk = {}.".format( - last_contents[0] - ), - model="", - llm_provider="", - ) - - def check_special_tokens(self, chunk: str, finish_reason: Optional[str]): - """ - Output parse / special tokens for sagemaker + hf streaming. 
- """ - hold = False - if ( - self.custom_llm_provider != "huggingface" - and self.custom_llm_provider != "sagemaker" - ): - return hold, chunk - - if finish_reason: - for token in self.special_tokens: - if token in chunk: - chunk = chunk.replace(token, "") - return hold, chunk - - if self.sent_first_chunk is True: - return hold, chunk - - curr_chunk = self.holding_chunk + chunk - curr_chunk = curr_chunk.strip() - - for token in self.special_tokens: - if len(curr_chunk) < len(token) and curr_chunk in token: - hold = True - self.holding_chunk = curr_chunk - elif len(curr_chunk) >= len(token): - if token in curr_chunk: - self.holding_chunk = curr_chunk.replace(token, "") - hold = True - else: - pass - - if hold is False: # reset - self.holding_chunk = "" - return hold, curr_chunk - - def handle_anthropic_text_chunk(self, chunk): - """ - For old anthropic models - claude-1, claude-2. - - Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator() - """ - str_line = chunk - if isinstance(chunk, bytes): # Handle binary data - str_line = chunk.decode("utf-8") # Convert bytes to string - text = "" - is_finished = False - finish_reason = None - if str_line.startswith("data:"): - data_json = json.loads(str_line[5:]) - type_chunk = data_json.get("type", None) - if type_chunk == "completion": - text = data_json.get("completion") - finish_reason = data_json.get("stop_reason") - if finish_reason is not None: - is_finished = True - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - elif "error" in str_line: - raise ValueError(f"Unable to parse response. 
Original response: {str_line}") - else: - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - - def handle_vertexai_anthropic_chunk(self, chunk): - """ - - MessageStartEvent(message=Message(id='msg_01LeRRgvX4gwkX3ryBVgtuYZ', content=[], model='claude-3-sonnet-20240229', role='assistant', stop_reason=None, stop_sequence=None, type='message', usage=Usage(input_tokens=8, output_tokens=1)), type='message_start'); custom_llm_provider: vertex_ai - - ContentBlockStartEvent(content_block=ContentBlock(text='', type='text'), index=0, type='content_block_start'); custom_llm_provider: vertex_ai - - ContentBlockDeltaEvent(delta=TextDelta(text='Hello', type='text_delta'), index=0, type='content_block_delta'); custom_llm_provider: vertex_ai - """ - text = "" - prompt_tokens = None - completion_tokens = None - is_finished = False - finish_reason = None - type_chunk = getattr(chunk, "type", None) - if type_chunk == "message_start": - message = getattr(chunk, "message", None) - text = "" # lets us return a chunk with usage to user - _usage = getattr(message, "usage", None) - if _usage is not None: - prompt_tokens = getattr(_usage, "input_tokens", None) - completion_tokens = getattr(_usage, "output_tokens", None) - elif type_chunk == "content_block_delta": - """ - Anthropic content chunk - chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}} - """ - delta = getattr(chunk, "delta", None) - if delta is not None: - text = getattr(delta, "text", "") - else: - text = "" - elif type_chunk == "message_delta": - """ - Anthropic - chunk = {'type': 'message_delta', 'delta': {'stop_reason': 'max_tokens', 'stop_sequence': None}, 'usage': {'output_tokens': 10}} - """ - # TODO - get usage from this chunk, set in response - delta = getattr(chunk, "delta", None) - if delta is not None: - finish_reason = getattr(delta, "stop_reason", "stop") - is_finished = True - _usage = getattr(chunk, "usage", None) - if 
_usage is not None: - prompt_tokens = getattr(_usage, "input_tokens", None) - completion_tokens = getattr(_usage, "output_tokens", None) - - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - } - - def handle_predibase_chunk(self, chunk): - try: - if not isinstance(chunk, str): - chunk = chunk.decode( - "utf-8" - ) # DO NOT REMOVE this: This is required for HF inference API + Streaming - text = "" - is_finished = False - finish_reason = "" - print_verbose(f"chunk: {chunk}") - if chunk.startswith("data:"): - data_json = json.loads(chunk[5:]) - print_verbose(f"data json: {data_json}") - if "token" in data_json and "text" in data_json["token"]: - text = data_json["token"]["text"] - if data_json.get("details", False) and data_json["details"].get( - "finish_reason", False - ): - is_finished = True - finish_reason = data_json["details"]["finish_reason"] - elif data_json.get( - "generated_text", False - ): # if full generated text exists, then stream is complete - text = "" # don't return the final bos token - is_finished = True - finish_reason = "stop" - elif data_json.get("error", False): - raise Exception(data_json.get("error")) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - elif "error" in chunk: - raise ValueError(chunk) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception as e: - raise e - - def handle_huggingface_chunk(self, chunk): - try: - if not isinstance(chunk, str): - chunk = chunk.decode( - "utf-8" - ) # DO NOT REMOVE this: This is required for HF inference API + Streaming - text = "" - is_finished = False - finish_reason = "" - print_verbose(f"chunk: {chunk}") - if chunk.startswith("data:"): - data_json = json.loads(chunk[5:]) - print_verbose(f"data json: {data_json}") - if "token" in data_json and "text" in data_json["token"]: - text 
= data_json["token"]["text"] - if data_json.get("details", False) and data_json["details"].get( - "finish_reason", False - ): - is_finished = True - finish_reason = data_json["details"]["finish_reason"] - elif data_json.get( - "generated_text", False - ): # if full generated text exists, then stream is complete - text = "" # don't return the final bos token - is_finished = True - finish_reason = "stop" - elif data_json.get("error", False): - raise Exception(data_json.get("error")) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - elif "error" in chunk: - raise ValueError(chunk) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception as e: - raise e - - def handle_ai21_chunk(self, chunk): # fake streaming - chunk = chunk.decode("utf-8") - data_json = json.loads(chunk) - try: - text = data_json["completions"][0]["data"]["text"] - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. Original response: {chunk}") - - def handle_maritalk_chunk(self, chunk): # fake streaming - chunk = chunk.decode("utf-8") - data_json = json.loads(chunk) - try: - text = data_json["answer"] - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. 
Original response: {chunk}") - - def handle_nlp_cloud_chunk(self, chunk): - text = "" - is_finished = False - finish_reason = "" - try: - if "dolphin" in self.model: - chunk = self.process_chunk(chunk=chunk) - else: - data_json = json.loads(chunk) - chunk = data_json["generated_text"] - text = chunk - if "[DONE]" in text: - text = text.replace("[DONE]", "") - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. Original response: {chunk}") - - def handle_aleph_alpha_chunk(self, chunk): - chunk = chunk.decode("utf-8") - data_json = json.loads(chunk) - try: - text = data_json["completions"][0]["completion"] - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. Original response: {chunk}") - - def handle_cohere_chunk(self, chunk): - chunk = chunk.decode("utf-8") - data_json = json.loads(chunk) - try: - text = "" - is_finished = False - finish_reason = "" - index: Optional[int] = None - if "index" in data_json: - index = data_json.get("index") - if "text" in data_json: - text = data_json["text"] - elif "is_finished" in data_json: - is_finished = data_json["is_finished"] - finish_reason = data_json["finish_reason"] - else: - raise Exception(data_json) - return { - "index": index, - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. 
Original response: {chunk}") - - def handle_cohere_chat_chunk(self, chunk): - chunk = chunk.decode("utf-8") - data_json = json.loads(chunk) - print_verbose(f"chunk: {chunk}") - try: - text = "" - is_finished = False - finish_reason = "" - if "text" in data_json: - text = data_json["text"] - elif "is_finished" in data_json and data_json["is_finished"] is True: - is_finished = data_json["is_finished"] - finish_reason = data_json["finish_reason"] - else: - return - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. Original response: {chunk}") - - def handle_azure_chunk(self, chunk): - is_finished = False - finish_reason = "" - text = "" - print_verbose(f"chunk: {chunk}") - if "data: [DONE]" in chunk: - text = "" - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - elif chunk.startswith("data:"): - data_json = json.loads(chunk[5:]) # chunk.startswith("data:"): - try: - if len(data_json["choices"]) > 0: - delta = data_json["choices"][0]["delta"] - text = "" if delta is None else delta.get("content", "") - if data_json["choices"][0].get("finish_reason", None): - is_finished = True - finish_reason = data_json["choices"][0]["finish_reason"] - print_verbose( - f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}" - ) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError( - f"Unable to parse response. Original response: {chunk}" - ) - elif "error" in chunk: - raise ValueError(f"Unable to parse response. 
Original response: {chunk}") - else: - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - - def handle_replicate_chunk(self, chunk): - try: - text = "" - is_finished = False - finish_reason = "" - if "output" in chunk: - text = chunk["output"] - if "status" in chunk: - if chunk["status"] == "succeeded": - is_finished = True - finish_reason = "stop" - elif chunk.get("error", None): - raise Exception(chunk["error"]) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - except Exception: - raise ValueError(f"Unable to parse response. Original response: {chunk}") - - def handle_openai_chat_completion_chunk(self, chunk): - try: - print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") - str_line = chunk - text = "" - is_finished = False - finish_reason = None - logprobs = None - usage = None - if str_line and str_line.choices and len(str_line.choices) > 0: - if ( - str_line.choices[0].delta is not None - and str_line.choices[0].delta.content is not None - ): - text = str_line.choices[0].delta.content - else: # function/tool calling chunk - when content is None. 
in this case we just return the original chunk from openai - pass - if str_line.choices[0].finish_reason: - is_finished = True - finish_reason = str_line.choices[0].finish_reason - - # checking for logprobs - if ( - hasattr(str_line.choices[0], "logprobs") - and str_line.choices[0].logprobs is not None - ): - logprobs = str_line.choices[0].logprobs - else: - logprobs = None - - usage = getattr(str_line, "usage", None) - - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - "logprobs": logprobs, - "original_chunk": str_line, - "usage": usage, - } - except Exception as e: - raise e - - def handle_azure_text_completion_chunk(self, chunk): - try: - print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") - text = "" - is_finished = False - finish_reason = None - choices = getattr(chunk, "choices", []) - if len(choices) > 0: - text = choices[0].text - if choices[0].finish_reason is not None: - is_finished = True - finish_reason = choices[0].finish_reason - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - - except Exception as e: - raise e - - def handle_openai_text_completion_chunk(self, chunk): - try: - print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") - text = "" - is_finished = False - finish_reason = None - usage = None - choices = getattr(chunk, "choices", []) - if len(choices) > 0: - text = choices[0].text - if choices[0].finish_reason is not None: - is_finished = True - finish_reason = choices[0].finish_reason - usage = getattr(chunk, "usage", None) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - "usage": usage, - } - - except Exception as e: - raise e - - def handle_baseten_chunk(self, chunk): - try: - chunk = chunk.decode("utf-8") - if len(chunk) > 0: - if chunk.startswith("data:"): - data_json = json.loads(chunk[5:]) - if "token" in data_json and "text" in data_json["token"]: - return data_json["token"]["text"] - else: - return "" - data_json 
= json.loads(chunk) - if "model_output" in data_json: - if ( - isinstance(data_json["model_output"], dict) - and "data" in data_json["model_output"] - and isinstance(data_json["model_output"]["data"], list) - ): - return data_json["model_output"]["data"][0] - elif isinstance(data_json["model_output"], str): - return data_json["model_output"] - elif "completion" in data_json and isinstance( - data_json["completion"], str - ): - return data_json["completion"] - else: - raise ValueError( - f"Unable to parse response. Original response: {chunk}" - ) - else: - return "" - else: - return "" - except Exception as e: - verbose_logger.exception( - "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occured - {}".format( - str(e) - ) - ) - return "" - - def handle_cloudlfare_stream(self, chunk): - try: - print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") - chunk = chunk.decode("utf-8") - str_line = chunk - text = "" - is_finished = False - finish_reason = None - - if "[DONE]" in chunk: - return {"text": text, "is_finished": True, "finish_reason": "stop"} - elif str_line.startswith("data:"): - data_json = json.loads(str_line[5:]) - print_verbose(f"delta content: {data_json}") - text = data_json["response"] - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - else: - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - - except Exception as e: - raise e - - def handle_ollama_stream(self, chunk): - try: - if isinstance(chunk, dict): - json_chunk = chunk - else: - json_chunk = json.loads(chunk) - if "error" in json_chunk: - raise Exception(f"Ollama Error - {json_chunk}") - - text = "" - is_finished = False - finish_reason = None - if json_chunk["done"] is True: - text = "" - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - elif json_chunk["response"]: - print_verbose(f"delta content: 
{json_chunk}") - text = json_chunk["response"] - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - else: - raise Exception(f"Ollama Error - {json_chunk}") - except Exception as e: - raise e - - def handle_ollama_chat_stream(self, chunk): - # for ollama_chat/ provider - try: - if isinstance(chunk, dict): - json_chunk = chunk - else: - json_chunk = json.loads(chunk) - if "error" in json_chunk: - raise Exception(f"Ollama Error - {json_chunk}") - - text = "" - is_finished = False - finish_reason = None - if json_chunk["done"] is True: - text = "" - is_finished = True - finish_reason = "stop" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - elif "message" in json_chunk: - print_verbose(f"delta content: {json_chunk}") - text = json_chunk["message"]["content"] - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - } - else: - raise Exception(f"Ollama Error - {json_chunk}") - except Exception as e: - raise e - - def handle_watsonx_stream(self, chunk): - try: - if isinstance(chunk, dict): - parsed_response = chunk - elif isinstance(chunk, (str, bytes)): - if isinstance(chunk, bytes): - chunk = chunk.decode("utf-8") - if "generated_text" in chunk: - response = chunk.replace("data: ", "").strip() - parsed_response = json.loads(response) - else: - return { - "text": "", - "is_finished": False, - "prompt_tokens": 0, - "completion_tokens": 0, - } - else: - print_verbose(f"chunk: {chunk} (Type: {type(chunk)})") - raise ValueError( - f"Unable to parse response. 
Original response: {chunk}" - ) - results = parsed_response.get("results", []) - if len(results) > 0: - text = results[0].get("generated_text", "") - finish_reason = results[0].get("stop_reason") - is_finished = finish_reason != "not_finished" - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - "prompt_tokens": results[0].get("input_token_count", 0), - "completion_tokens": results[0].get("generated_token_count", 0), - } - return {"text": "", "is_finished": False} - except Exception as e: - raise e - - def handle_triton_stream(self, chunk): - try: - if isinstance(chunk, dict): - parsed_response = chunk - elif isinstance(chunk, (str, bytes)): - if isinstance(chunk, bytes): - chunk = chunk.decode("utf-8") - if "text_output" in chunk: - response = chunk.replace("data: ", "").strip() - parsed_response = json.loads(response) - else: - return { - "text": "", - "is_finished": False, - "prompt_tokens": 0, - "completion_tokens": 0, - } - else: - print_verbose(f"chunk: {chunk} (Type: {type(chunk)})") - raise ValueError( - f"Unable to parse response. 
Original response: {chunk}" - ) - text = parsed_response.get("text_output", "") - finish_reason = parsed_response.get("stop_reason") - is_finished = parsed_response.get("is_finished", False) - return { - "text": text, - "is_finished": is_finished, - "finish_reason": finish_reason, - "prompt_tokens": parsed_response.get("input_token_count", 0), - "completion_tokens": parsed_response.get("generated_token_count", 0), - } - return {"text": "", "is_finished": False} - except Exception as e: - raise e - - def handle_clarifai_completion_chunk(self, chunk): - try: - if isinstance(chunk, dict): - parsed_response = chunk - elif isinstance(chunk, (str, bytes)): - if isinstance(chunk, bytes): - parsed_response = chunk.decode("utf-8") - else: - parsed_response = chunk - else: - raise ValueError("Unable to parse streaming chunk") - if isinstance(parsed_response, dict): - data_json = parsed_response - else: - data_json = json.loads(parsed_response) - text = ( - data_json.get("outputs", "")[0] - .get("data", "") - .get("text", "") - .get("raw", "") - ) - len( - encoding.encode( - data_json.get("outputs", "")[0] - .get("input", "") - .get("data", "") - .get("text", "") - .get("raw", "") - ) - ) - len(encoding.encode(text)) - return { - "text": text, - "is_finished": True, - } - except Exception as e: - verbose_logger.exception( - "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occured - {}".format( - str(e) - ) - ) - return "" - - def model_response_creator( - self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None - ): - _model = self.model - _received_llm_provider = self.custom_llm_provider - _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore - if ( - _received_llm_provider == "openai" - and _received_llm_provider != _logging_obj_llm_provider - ): - _model = "{}/{}".format(_logging_obj_llm_provider, _model) - if chunk is None: - chunk = {} - else: - # pop model keyword - 
chunk.pop("model", None) - - model_response = ModelResponse( - stream=True, model=_model, stream_options=self.stream_options, **chunk - ) - if self.response_id is not None: - model_response.id = self.response_id - else: - self.response_id = model_response.id # type: ignore - if self.system_fingerprint is not None: - model_response.system_fingerprint = self.system_fingerprint - if hidden_params is not None: - model_response._hidden_params = hidden_params - model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider - model_response._hidden_params["created_at"] = time.time() - model_response._hidden_params = { - **model_response._hidden_params, - **self._hidden_params, - } - - if ( - len(model_response.choices) > 0 - and getattr(model_response.choices[0], "delta") is not None - ): - # do nothing, if object instantiated - pass - else: - model_response.choices = [StreamingChoices(finish_reason=None)] - return model_response - - def is_delta_empty(self, delta: Delta) -> bool: - is_empty = True - if delta.content is not None: - is_empty = False - elif delta.tool_calls is not None: - is_empty = False - elif delta.function_call is not None: - is_empty = False - return is_empty - - def return_processed_chunk_logic( # noqa - self, - completion_obj: dict, - model_response: ModelResponseStream, - response_obj: dict, - ): - - print_verbose( - f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}" - ) - if ( - "content" in completion_obj - and ( - isinstance(completion_obj["content"], str) - and len(completion_obj["content"]) > 0 - ) - or ( - "tool_calls" in completion_obj - and completion_obj["tool_calls"] is not None - and len(completion_obj["tool_calls"]) > 0 - ) - or ( - "function_call" in completion_obj - and completion_obj["function_call"] is not None - ) - ): # cannot set content of an OpenAI Object to be an empty string - self.safety_checker() - hold, model_response_str = 
self.check_special_tokens( - chunk=completion_obj["content"], - finish_reason=model_response.choices[0].finish_reason, - ) # filter out bos/eos tokens from openai-compatible hf endpoints - print_verbose(f"hold - {hold}, model_response_str - {model_response_str}") - if hold is False: - ## check if openai/azure chunk - original_chunk = response_obj.get("original_chunk", None) - if original_chunk: - model_response.id = original_chunk.id - self.response_id = original_chunk.id - if len(original_chunk.choices) > 0: - choices = [] - for choice in original_chunk.choices: - try: - if isinstance(choice, BaseModel): - choice_json = choice.model_dump() - choice_json.pop( - "finish_reason", None - ) # for mistral etc. which return a value in their last chunk (not-openai compatible). - print_verbose(f"choice_json: {choice_json}") - choices.append(StreamingChoices(**choice_json)) - except Exception: - choices.append(StreamingChoices()) - print_verbose(f"choices in streaming: {choices}") - setattr(model_response, "choices", choices) - else: - return - model_response.system_fingerprint = ( - original_chunk.system_fingerprint - ) - setattr( - model_response, - "citations", - getattr(original_chunk, "citations", None), - ) - print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") - if self.sent_first_chunk is False: - model_response.choices[0].delta["role"] = "assistant" - self.sent_first_chunk = True - elif self.sent_first_chunk is True and hasattr( - model_response.choices[0].delta, "role" - ): - _initial_delta = model_response.choices[0].delta.model_dump() - _initial_delta.pop("role", None) - model_response.choices[0].delta = Delta(**_initial_delta) - print_verbose( - f"model_response.choices[0].delta: {model_response.choices[0].delta}" - ) - else: - ## else - completion_obj["content"] = model_response_str - if self.sent_first_chunk is False: - completion_obj["role"] = "assistant" - self.sent_first_chunk = True - - model_response.choices[0].delta = Delta(**completion_obj) 
- _index: Optional[int] = completion_obj.get("index") - if _index is not None: - model_response.choices[0].index = _index - print_verbose(f"returning model_response: {model_response}") - return model_response - else: - return - elif self.received_finish_reason is not None: - if self.sent_last_chunk is True: - # Bedrock returns the guardrail trace in the last chunk - we want to return this here - if self.custom_llm_provider == "bedrock" and "trace" in model_response: - return model_response - - # Default - return StopIteration - raise StopIteration - # flush any remaining holding chunk - if len(self.holding_chunk) > 0: - if model_response.choices[0].delta.content is None: - model_response.choices[0].delta.content = self.holding_chunk - else: - model_response.choices[0].delta.content = ( - self.holding_chunk + model_response.choices[0].delta.content - ) - self.holding_chunk = "" - # if delta is None - _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta) - - if _is_delta_empty: - # get any function call arguments - model_response.choices[0].finish_reason = map_finish_reason( - finish_reason=self.received_finish_reason - ) # ensure consistent output to openai - - self.sent_last_chunk = True - - return model_response - elif ( - model_response.choices[0].delta.tool_calls is not None - or model_response.choices[0].delta.function_call is not None - ): - if self.sent_first_chunk is False: - model_response.choices[0].delta["role"] = "assistant" - self.sent_first_chunk = True - return model_response - elif ( - len(model_response.choices) > 0 - and hasattr(model_response.choices[0].delta, "audio") - and model_response.choices[0].delta.audio is not None - ): - return model_response - else: - if hasattr(model_response, "usage"): - self.chunks.append(model_response) - return - - def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915 - model_response = self.model_response_creator() - response_obj: dict = {} - try: - # return this for all models - 
completion_obj = {"content": ""} - from litellm.litellm_core_utils.streaming_utils import ( - generic_chunk_has_all_required_fields, - ) - from litellm.types.utils import GenericStreamingChunk as GChunk - - if ( - isinstance(chunk, dict) - and generic_chunk_has_all_required_fields( - chunk=chunk - ) # check if chunk is a generic streaming chunk - ) or ( - self.custom_llm_provider - and ( - self.custom_llm_provider == "anthropic" - or self.custom_llm_provider in litellm._custom_providers - ) - ): - - if self.received_finish_reason is not None: - if "provider_specific_fields" not in chunk: - raise StopIteration - anthropic_response_obj: GChunk = chunk - completion_obj["content"] = anthropic_response_obj["text"] - if anthropic_response_obj["is_finished"]: - self.received_finish_reason = anthropic_response_obj[ - "finish_reason" - ] - - if anthropic_response_obj["usage"] is not None: - model_response.usage = litellm.Usage( - **anthropic_response_obj["usage"] - ) - - if ( - "tool_use" in anthropic_response_obj - and anthropic_response_obj["tool_use"] is not None - ): - completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]] - - if ( - "provider_specific_fields" in anthropic_response_obj - and anthropic_response_obj["provider_specific_fields"] is not None - ): - for key, value in anthropic_response_obj[ - "provider_specific_fields" - ].items(): - setattr(model_response, key, value) - - response_obj = anthropic_response_obj - elif ( - self.custom_llm_provider - and self.custom_llm_provider == "anthropic_text" - ): - response_obj = self.handle_anthropic_text_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "clarifai": - response_obj = self.handle_clarifai_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - 
self.received_finish_reason = response_obj["finish_reason"] - elif self.model == "replicate" or self.custom_llm_provider == "replicate": - response_obj = self.handle_replicate_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": - response_obj = self.handle_huggingface_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "predibase": - response_obj = self.handle_predibase_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif ( - self.custom_llm_provider and self.custom_llm_provider == "baseten" - ): # baseten doesn't provide streaming - completion_obj["content"] = self.handle_baseten_chunk(chunk) - elif ( - self.custom_llm_provider and self.custom_llm_provider == "ai21" - ): # ai21 doesn't provide streaming - response_obj = self.handle_ai21_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": - response_obj = self.handle_maritalk_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "vllm": - completion_obj["content"] = chunk[0].outputs[0].text - elif ( - self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha" - ): # aleph alpha doesn't provide streaming - response_obj = self.handle_aleph_alpha_chunk(chunk) - completion_obj["content"] = 
response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "nlp_cloud": - try: - response_obj = self.handle_nlp_cloud_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - except Exception as e: - if self.received_finish_reason: - raise e - else: - if self.sent_first_chunk is False: - raise Exception("An unknown error occurred with the stream") - self.received_finish_reason = "stop" - elif self.custom_llm_provider == "vertex_ai": - import proto # type: ignore - - if self.model.startswith("claude-3"): - response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - setattr(model_response, "usage", Usage()) - if response_obj.get("prompt_tokens", None) is not None: - model_response.usage.prompt_tokens = response_obj[ - "prompt_tokens" - ] - if response_obj.get("completion_tokens", None) is not None: - model_response.usage.completion_tokens = response_obj[ - "completion_tokens" - ] - if hasattr(model_response.usage, "prompt_tokens"): - model_response.usage.total_tokens = ( - getattr(model_response.usage, "total_tokens", 0) - + model_response.usage.prompt_tokens - ) - if hasattr(model_response.usage, "completion_tokens"): - model_response.usage.total_tokens = ( - getattr(model_response.usage, "total_tokens", 0) - + model_response.usage.completion_tokens - ) - - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif hasattr(chunk, "candidates") is True: - try: - try: - completion_obj["content"] = chunk.text - except Exception as e: - if "Part has no text." 
in str(e): - ## check for function calling - function_call = ( - chunk.candidates[0].content.parts[0].function_call - ) - - args_dict = {} - - # Check if it's a RepeatedComposite instance - for key, val in function_call.args.items(): - if isinstance( - val, - proto.marshal.collections.repeated.RepeatedComposite, - ): - # If so, convert to list - args_dict[key] = [v for v in val] - else: - args_dict[key] = val - - try: - args_str = json.dumps(args_dict) - except Exception as e: - raise e - _delta_obj = litellm.utils.Delta( - content=None, - tool_calls=[ - { - "id": f"call_{str(uuid.uuid4())}", - "function": { - "arguments": args_str, - "name": function_call.name, - }, - "type": "function", - } - ], - ) - _streaming_response = StreamingChoices(delta=_delta_obj) - _model_response = ModelResponse(stream=True) - _model_response.choices = [_streaming_response] - response_obj = {"original_chunk": _model_response} - else: - raise e - if ( - hasattr(chunk.candidates[0], "finish_reason") - and chunk.candidates[0].finish_reason.name - != "FINISH_REASON_UNSPECIFIED" - ): # every non-final chunk in vertex ai has this - self.received_finish_reason = chunk.candidates[ - 0 - ].finish_reason.name - except Exception: - if chunk.candidates[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. 
{str(chunk)}" - ) - else: - completion_obj["content"] = str(chunk) - elif self.custom_llm_provider == "cohere": - response_obj = self.handle_cohere_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "cohere_chat": - response_obj = self.handle_cohere_chat_chunk(chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - - elif self.custom_llm_provider == "petals": - if len(self.completion_stream) == 0: - if self.received_finish_reason is not None: - raise StopIteration - else: - self.received_finish_reason = "stop" - chunk_size = 30 - new_chunk = self.completion_stream[:chunk_size] - completion_obj["content"] = new_chunk - self.completion_stream = self.completion_stream[chunk_size:] - elif self.custom_llm_provider == "palm": - # fake streaming - response_obj = {} - if len(self.completion_stream) == 0: - if self.received_finish_reason is not None: - raise StopIteration - else: - self.received_finish_reason = "stop" - chunk_size = 30 - new_chunk = self.completion_stream[:chunk_size] - completion_obj["content"] = new_chunk - self.completion_stream = self.completion_stream[chunk_size:] - elif self.custom_llm_provider == "ollama": - response_obj = self.handle_ollama_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "ollama_chat": - response_obj = self.handle_ollama_chat_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = 
response_obj["finish_reason"] - elif self.custom_llm_provider == "cloudflare": - response_obj = self.handle_cloudlfare_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "watsonx": - response_obj = self.handle_watsonx_stream(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "triton": - response_obj = self.handle_triton_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "text-completion-openai": - response_obj = self.handle_openai_text_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - if response_obj["usage"] is not None: - model_response.usage = litellm.Usage( - prompt_tokens=response_obj["usage"].prompt_tokens, - completion_tokens=response_obj["usage"].completion_tokens, - total_tokens=response_obj["usage"].total_tokens, - ) - elif self.custom_llm_provider == "text-completion-codestral": - response_obj = litellm.MistralTextCompletionConfig()._chunk_parser( - chunk - ) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - if "usage" in response_obj is not None: - model_response.usage = litellm.Usage( - 
prompt_tokens=response_obj["usage"].prompt_tokens, - completion_tokens=response_obj["usage"].completion_tokens, - total_tokens=response_obj["usage"].total_tokens, - ) - elif self.custom_llm_provider == "azure_text": - response_obj = self.handle_azure_text_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "cached_response": - response_obj = { - "text": chunk.choices[0].delta.content, - "is_finished": True, - "finish_reason": chunk.choices[0].finish_reason, - "original_chunk": chunk, - "tool_calls": ( - chunk.choices[0].delta.tool_calls - if hasattr(chunk.choices[0].delta, "tool_calls") - else None - ), - } - - completion_obj["content"] = response_obj["text"] - if response_obj["tool_calls"] is not None: - completion_obj["tool_calls"] = response_obj["tool_calls"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if hasattr(chunk, "id"): - model_response.id = chunk.id - self.response_id = chunk.id - if hasattr(chunk, "system_fingerprint"): - self.system_fingerprint = chunk.system_fingerprint - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - else: # openai / azure chat model - if self.custom_llm_provider == "azure": - if hasattr(chunk, "model"): - # for azure, we need to pass the model from the orignal chunk - self.model = chunk.model - response_obj = self.handle_openai_chat_completion_chunk(chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - if response_obj["finish_reason"] == "error": - raise Exception( - "{} raised a streaming error - finish_reason: error, no content string given. 
Received Chunk={}".format( - self.custom_llm_provider, response_obj - ) - ) - self.received_finish_reason = response_obj["finish_reason"] - if response_obj.get("original_chunk", None) is not None: - if hasattr(response_obj["original_chunk"], "id"): - model_response.id = response_obj["original_chunk"].id - self.response_id = model_response.id - if hasattr(response_obj["original_chunk"], "system_fingerprint"): - model_response.system_fingerprint = response_obj[ - "original_chunk" - ].system_fingerprint - self.system_fingerprint = response_obj[ - "original_chunk" - ].system_fingerprint - if response_obj["logprobs"] is not None: - model_response.choices[0].logprobs = response_obj["logprobs"] - - if response_obj["usage"] is not None: - if isinstance(response_obj["usage"], dict): - model_response.usage = litellm.Usage( - prompt_tokens=response_obj["usage"].get( - "prompt_tokens", None - ) - or None, - completion_tokens=response_obj["usage"].get( - "completion_tokens", None - ) - or None, - total_tokens=response_obj["usage"].get("total_tokens", None) - or None, - ) - elif isinstance(response_obj["usage"], BaseModel): - model_response.usage = litellm.Usage( - **response_obj["usage"].model_dump() - ) - - model_response.model = self.model - print_verbose( - f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}" - ) - ## FUNCTION CALL PARSING - if ( - response_obj is not None - and response_obj.get("original_chunk", None) is not None - ): # function / tool calling branch - only set for openai/azure compatible endpoints - # enter this branch when no content has been passed in response - original_chunk = response_obj.get("original_chunk", None) - model_response.id = original_chunk.id - self.response_id = original_chunk.id - if original_chunk.choices and len(original_chunk.choices) > 0: - delta = original_chunk.choices[0].delta - if delta is not None and ( - delta.function_call is not None or delta.tool_calls is not None - ): - try: - 
model_response.system_fingerprint = ( - original_chunk.system_fingerprint - ) - ## AZURE - check if arguments is not None - if ( - original_chunk.choices[0].delta.function_call - is not None - ): - if ( - getattr( - original_chunk.choices[0].delta.function_call, - "arguments", - ) - is None - ): - original_chunk.choices[ - 0 - ].delta.function_call.arguments = "" - elif original_chunk.choices[0].delta.tool_calls is not None: - if isinstance( - original_chunk.choices[0].delta.tool_calls, list - ): - for t in original_chunk.choices[0].delta.tool_calls: - if hasattr(t, "functions") and hasattr( - t.functions, "arguments" - ): - if ( - getattr( - t.function, - "arguments", - ) - is None - ): - t.function.arguments = "" - _json_delta = delta.model_dump() - print_verbose(f"_json_delta: {_json_delta}") - if "role" not in _json_delta or _json_delta["role"] is None: - _json_delta["role"] = ( - "assistant" # mistral's api returns role as None - ) - if "tool_calls" in _json_delta and isinstance( - _json_delta["tool_calls"], list - ): - for tool in _json_delta["tool_calls"]: - if ( - isinstance(tool, dict) - and "function" in tool - and isinstance(tool["function"], dict) - and ("type" not in tool or tool["type"] is None) - ): - # if function returned but type set to None - mistral's api returns type: None - tool["type"] = "function" - model_response.choices[0].delta = Delta(**_json_delta) - except Exception as e: - verbose_logger.exception( - "litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format( - str(e) - ) - ) - model_response.choices[0].delta = Delta() - elif ( - delta is not None and getattr(delta, "audio", None) is not None - ): - model_response.choices[0].delta.audio = delta.audio - else: - try: - delta = ( - dict() - if original_chunk.choices[0].delta is None - else dict(original_chunk.choices[0].delta) - ) - print_verbose(f"original delta: {delta}") - model_response.choices[0].delta = Delta(**delta) - print_verbose( - f"new delta: 
{model_response.choices[0].delta}" - ) - except Exception: - model_response.choices[0].delta = Delta() - else: - if ( - self.stream_options is not None - and self.stream_options["include_usage"] is True - ): - return model_response - return - print_verbose( - f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}" - ) - print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") - - ## CHECK FOR TOOL USE - if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0: - if self.is_function_call is True: # user passed in 'functions' param - completion_obj["function_call"] = completion_obj["tool_calls"][0][ - "function" - ] - completion_obj["tool_calls"] = None - - self.tool_call = True - - ## RETURN ARG - return self.return_processed_chunk_logic( - completion_obj=completion_obj, - model_response=model_response, # type: ignore - response_obj=response_obj, - ) - - except StopIteration: - raise StopIteration - except Exception as e: - traceback.format_exc() - e.message = str(e) - raise exception_type( - model=self.model, - custom_llm_provider=self.custom_llm_provider, - original_exception=e, - ) - - def set_logging_event_loop(self, loop): - """ - import litellm, asyncio - - loop = asyncio.get_event_loop() # πŸ‘ˆ gets the current event loop - - response = litellm.completion(.., stream=True) - - response.set_logging_event_loop(loop=loop) # πŸ‘ˆ enables async_success callbacks for sync logging - - for chunk in response: - ... - """ - self.logging_loop = loop - - def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool): - """ - Runs success logging in a thread and adds the response to the cache - """ - if litellm.disable_streaming_logging is True: - """ - [NOT RECOMMENDED] - Set this via `litellm.disable_streaming_logging = True`. - - Disables streaming logging. 
- """ - return - ## ASYNC LOGGING - # Create an event loop for the new thread - if self.logging_loop is not None: - future = asyncio.run_coroutine_threadsafe( - self.logging_obj.async_success_handler( - processed_chunk, None, None, cache_hit - ), - loop=self.logging_loop, - ) - future.result() - else: - asyncio.run( - self.logging_obj.async_success_handler( - processed_chunk, None, None, cache_hit - ) - ) - ## SYNC LOGGING - self.logging_obj.success_handler(processed_chunk, None, None, cache_hit) - - ## Sync store in cache - if self.logging_obj._llm_caching_handler is not None: - self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache( - processed_chunk - ) - - def finish_reason_handler(self): - model_response = self.model_response_creator() - if self.received_finish_reason is not None: - model_response.choices[0].finish_reason = map_finish_reason( - finish_reason=self.received_finish_reason - ) - else: - model_response.choices[0].finish_reason = "stop" - - ## if tool use - if ( - model_response.choices[0].finish_reason == "stop" and self.tool_call - ): # don't overwrite for other - potential error finish reasons - model_response.choices[0].finish_reason = "tool_calls" - return model_response - - def __next__(self): # noqa: PLR0915 - cache_hit = False - if ( - self.custom_llm_provider is not None - and self.custom_llm_provider == "cached_response" - ): - cache_hit = True - try: - if self.completion_stream is None: - self.fetch_sync_stream() - while True: - if ( - isinstance(self.completion_stream, str) - or isinstance(self.completion_stream, bytes) - or isinstance(self.completion_stream, ModelResponse) - ): - chunk = self.completion_stream - else: - chunk = next(self.completion_stream) - if chunk is not None and chunk != b"": - print_verbose( - f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}" - ) - response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk) - print_verbose(f"PROCESSED CHUNK 
POST CHUNK CREATOR: {response}") - - if response is None: - continue - ## LOGGING - threading.Thread( - target=self.run_success_logging_and_cache_storage, - args=(response, cache_hit), - ).start() # log response - choice = response.choices[0] - if isinstance(choice, StreamingChoices): - self.response_uptil_now += choice.delta.get("content", "") or "" - else: - self.response_uptil_now += "" - self.rules.post_call_rules( - input=self.response_uptil_now, model=self.model - ) - # HANDLE STREAM OPTIONS - self.chunks.append(response) - if hasattr( - response, "usage" - ): # remove usage from chunk, only send on final chunk - # Convert the object to a dictionary - obj_dict = response.dict() - - # Remove an attribute (e.g., 'attr2') - if "usage" in obj_dict: - del obj_dict["usage"] - - # Create a new object without the removed attribute - response = self.model_response_creator( - chunk=obj_dict, hidden_params=response._hidden_params - ) - # add usage as hidden param - if self.sent_last_chunk is True and self.stream_options is None: - usage = calculate_total_usage(chunks=self.chunks) - response._hidden_params["usage"] = usage - # RETURN RESULT - return response - - except StopIteration: - if self.sent_last_chunk is True: - complete_streaming_response = litellm.stream_chunk_builder( - chunks=self.chunks, messages=self.messages - ) - response = self.model_response_creator() - if complete_streaming_response is not None: - setattr( - response, - "usage", - getattr(complete_streaming_response, "usage"), - ) - - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, - args=(response, None, None, cache_hit), - ).start() # log response - - if self.sent_stream_usage is False and self.send_stream_usage is True: - self.sent_stream_usage = True - return response - raise # Re-raise StopIteration - else: - self.sent_last_chunk = True - processed_chunk = self.finish_reason_handler() - if self.stream_options is None: # add usage as hidden param - usage = 
calculate_total_usage(chunks=self.chunks) - processed_chunk._hidden_params["usage"] = usage - ## LOGGING - threading.Thread( - target=self.run_success_logging_and_cache_storage, - args=(processed_chunk, cache_hit), - ).start() # log response - return processed_chunk - except Exception as e: - traceback_exception = traceback.format_exc() - # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated - threading.Thread( - target=self.logging_obj.failure_handler, args=(e, traceback_exception) - ).start() - if isinstance(e, OpenAIError): - raise e - else: - raise exception_type( - model=self.model, - original_exception=e, - custom_llm_provider=self.custom_llm_provider, - ) - - def fetch_sync_stream(self): - if self.completion_stream is None and self.make_call is not None: - # Call make_call to get the completion stream - self.completion_stream = self.make_call(client=litellm.module_level_client) - self._stream_iter = self.completion_stream.__iter__() - - return self.completion_stream - - async def fetch_stream(self): - if self.completion_stream is None and self.make_call is not None: - # Call make_call to get the completion stream - self.completion_stream = await self.make_call( - client=litellm.module_level_aclient - ) - self._stream_iter = self.completion_stream.__aiter__() - - return self.completion_stream - - async def __anext__(self): # noqa: PLR0915 - cache_hit = False - if ( - self.custom_llm_provider is not None - and self.custom_llm_provider == "cached_response" - ): - cache_hit = True - try: - if self.completion_stream is None: - await self.fetch_stream() - - if ( - self.custom_llm_provider == "openai" - or self.custom_llm_provider == "azure" - or self.custom_llm_provider == "custom_openai" - or self.custom_llm_provider == "text-completion-openai" - or self.custom_llm_provider == "text-completion-codestral" - or self.custom_llm_provider == "azure_text" - or self.custom_llm_provider == "anthropic" - or 
self.custom_llm_provider == "anthropic_text" - or self.custom_llm_provider == "huggingface" - or self.custom_llm_provider == "ollama" - or self.custom_llm_provider == "ollama_chat" - or self.custom_llm_provider == "vertex_ai" - or self.custom_llm_provider == "vertex_ai_beta" - or self.custom_llm_provider == "sagemaker" - or self.custom_llm_provider == "sagemaker_chat" - or self.custom_llm_provider == "gemini" - or self.custom_llm_provider == "replicate" - or self.custom_llm_provider == "cached_response" - or self.custom_llm_provider == "predibase" - or self.custom_llm_provider == "databricks" - or self.custom_llm_provider == "bedrock" - or self.custom_llm_provider == "triton" - or self.custom_llm_provider == "watsonx" - or self.custom_llm_provider in litellm.openai_compatible_endpoints - or self.custom_llm_provider in litellm._custom_providers - ): - async for chunk in self.completion_stream: - if chunk == "None" or chunk is None: - raise Exception - elif ( - self.custom_llm_provider == "gemini" - and hasattr(chunk, "parts") - and len(chunk.parts) == 0 - ): - continue - # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. 
- # __anext__ also calls async_success_handler, which does logging - print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}") - - processed_chunk: Optional[ModelResponse] = self.chunk_creator( - chunk=chunk - ) - print_verbose( - f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}" - ) - if processed_chunk is None: - continue - ## LOGGING - ## LOGGING - executor.submit( - self.logging_obj.success_handler, - result=processed_chunk, - start_time=None, - end_time=None, - cache_hit=cache_hit, - ) - - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, cache_hit=cache_hit - ) - ) - - if self.logging_obj._llm_caching_handler is not None: - asyncio.create_task( - self.logging_obj._llm_caching_handler._add_streaming_response_to_cache( - processed_chunk=processed_chunk, - ) - ) - - choice = processed_chunk.choices[0] - if isinstance(choice, StreamingChoices): - self.response_uptil_now += choice.delta.get("content", "") or "" - else: - self.response_uptil_now += "" - self.rules.post_call_rules( - input=self.response_uptil_now, model=self.model - ) - self.chunks.append(processed_chunk) - if hasattr( - processed_chunk, "usage" - ): # remove usage from chunk, only send on final chunk - # Convert the object to a dictionary - obj_dict = processed_chunk.dict() - - # Remove an attribute (e.g., 'attr2') - if "usage" in obj_dict: - del obj_dict["usage"] - - # Create a new object without the removed attribute - processed_chunk = self.model_response_creator(chunk=obj_dict) - print_verbose(f"final returned processed chunk: {processed_chunk}") - return processed_chunk - raise StopAsyncIteration - else: # temporary patch for non-aiohttp async calls - # example - boto3 bedrock llms - while True: - if isinstance(self.completion_stream, str) or isinstance( - self.completion_stream, bytes - ): - chunk = self.completion_stream - else: - chunk = next(self.completion_stream) - if chunk is not None and chunk != b"": - print_verbose(f"PROCESSED 
CHUNK PRE CHUNK CREATOR: {chunk}") - processed_chunk: Optional[ModelResponse] = self.chunk_creator( - chunk=chunk - ) - print_verbose( - f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" - ) - if processed_chunk is None: - continue - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, - args=(processed_chunk, None, None, cache_hit), - ).start() # log processed_chunk - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, cache_hit=cache_hit - ) - ) +# def handle_predibase_chunk(self, chunk): +# try: +# if not isinstance(chunk, str): +# chunk = chunk.decode( +# "utf-8" +# ) # DO NOT REMOVE this: This is required for HF inference API + Streaming +# text = "" +# is_finished = False +# finish_reason = "" +# print_verbose(f"chunk: {chunk}") +# if chunk.startswith("data:"): +# data_json = json.loads(chunk[5:]) +# print_verbose(f"data json: {data_json}") +# if "token" in data_json and "text" in data_json["token"]: +# text = data_json["token"]["text"] +# if data_json.get("details", False) and data_json["details"].get( +# "finish_reason", False +# ): +# is_finished = True +# finish_reason = data_json["details"]["finish_reason"] +# elif data_json.get( +# "generated_text", False +# ): # if full generated text exists, then stream is complete +# text = "" # don't return the final bos token +# is_finished = True +# finish_reason = "stop" +# elif data_json.get("error", False): +# raise Exception(data_json.get("error")) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# elif "error" in chunk: +# raise ValueError(chunk) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception as e: +# raise e + +# def handle_huggingface_chunk(self, chunk): +# try: +# if not isinstance(chunk, str): +# chunk = chunk.decode( +# "utf-8" +# ) # DO NOT REMOVE this: This is required for HF inference API + Streaming +# text = "" +# 
is_finished = False +# finish_reason = "" +# print_verbose(f"chunk: {chunk}") +# if chunk.startswith("data:"): +# data_json = json.loads(chunk[5:]) +# print_verbose(f"data json: {data_json}") +# if "token" in data_json and "text" in data_json["token"]: +# text = data_json["token"]["text"] +# if data_json.get("details", False) and data_json["details"].get( +# "finish_reason", False +# ): +# is_finished = True +# finish_reason = data_json["details"]["finish_reason"] +# elif data_json.get( +# "generated_text", False +# ): # if full generated text exists, then stream is complete +# text = "" # don't return the final bos token +# is_finished = True +# finish_reason = "stop" +# elif data_json.get("error", False): +# raise Exception(data_json.get("error")) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# elif "error" in chunk: +# raise ValueError(chunk) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception as e: +# raise e + +# def handle_ai21_chunk(self, chunk): # fake streaming +# chunk = chunk.decode("utf-8") +# data_json = json.loads(chunk) +# try: +# text = data_json["completions"][0]["data"]["text"] +# is_finished = True +# finish_reason = "stop" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. Original response: {chunk}") + +# def handle_maritalk_chunk(self, chunk): # fake streaming +# chunk = chunk.decode("utf-8") +# data_json = json.loads(chunk) +# try: +# text = data_json["answer"] +# is_finished = True +# finish_reason = "stop" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. 
Original response: {chunk}") + +# def handle_nlp_cloud_chunk(self, chunk): +# text = "" +# is_finished = False +# finish_reason = "" +# try: +# if "dolphin" in self.model: +# chunk = self.process_chunk(chunk=chunk) +# else: +# data_json = json.loads(chunk) +# chunk = data_json["generated_text"] +# text = chunk +# if "[DONE]" in text: +# text = text.replace("[DONE]", "") +# is_finished = True +# finish_reason = "stop" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. Original response: {chunk}") + +# def handle_aleph_alpha_chunk(self, chunk): +# chunk = chunk.decode("utf-8") +# data_json = json.loads(chunk) +# try: +# text = data_json["completions"][0]["completion"] +# is_finished = True +# finish_reason = "stop" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. Original response: {chunk}") + +# def handle_cohere_chunk(self, chunk): +# chunk = chunk.decode("utf-8") +# data_json = json.loads(chunk) +# try: +# text = "" +# is_finished = False +# finish_reason = "" +# index: Optional[int] = None +# if "index" in data_json: +# index = data_json.get("index") +# if "text" in data_json: +# text = data_json["text"] +# elif "is_finished" in data_json: +# is_finished = data_json["is_finished"] +# finish_reason = data_json["finish_reason"] +# else: +# raise Exception(data_json) +# return { +# "index": index, +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. 
Original response: {chunk}") + +# def handle_cohere_chat_chunk(self, chunk): +# chunk = chunk.decode("utf-8") +# data_json = json.loads(chunk) +# print_verbose(f"chunk: {chunk}") +# try: +# text = "" +# is_finished = False +# finish_reason = "" +# if "text" in data_json: +# text = data_json["text"] +# elif "is_finished" in data_json and data_json["is_finished"] is True: +# is_finished = data_json["is_finished"] +# finish_reason = data_json["finish_reason"] +# else: +# return +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. Original response: {chunk}") + +# def handle_azure_chunk(self, chunk): +# is_finished = False +# finish_reason = "" +# text = "" +# print_verbose(f"chunk: {chunk}") +# if "data: [DONE]" in chunk: +# text = "" +# is_finished = True +# finish_reason = "stop" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# elif chunk.startswith("data:"): +# data_json = json.loads(chunk[5:]) # chunk.startswith("data:"): +# try: +# if len(data_json["choices"]) > 0: +# delta = data_json["choices"][0]["delta"] +# text = "" if delta is None else delta.get("content", "") +# if data_json["choices"][0].get("finish_reason", None): +# is_finished = True +# finish_reason = data_json["choices"][0]["finish_reason"] +# print_verbose( +# f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}" +# ) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError( +# f"Unable to parse response. Original response: {chunk}" +# ) +# elif "error" in chunk: +# raise ValueError(f"Unable to parse response. 
Original response: {chunk}") +# else: +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } + +# def handle_replicate_chunk(self, chunk): +# try: +# text = "" +# is_finished = False +# finish_reason = "" +# if "output" in chunk: +# text = chunk["output"] +# if "status" in chunk: +# if chunk["status"] == "succeeded": +# is_finished = True +# finish_reason = "stop" +# elif chunk.get("error", None): +# raise Exception(chunk["error"]) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# except Exception: +# raise ValueError(f"Unable to parse response. Original response: {chunk}") + +# def handle_openai_chat_completion_chunk(self, chunk): +# try: +# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") +# str_line = chunk +# text = "" +# is_finished = False +# finish_reason = None +# logprobs = None +# usage = None +# if str_line and str_line.choices and len(str_line.choices) > 0: +# if ( +# str_line.choices[0].delta is not None +# and str_line.choices[0].delta.content is not None +# ): +# text = str_line.choices[0].delta.content +# else: # function/tool calling chunk - when content is None. 
in this case we just return the original chunk from openai +# pass +# if str_line.choices[0].finish_reason: +# is_finished = True +# finish_reason = str_line.choices[0].finish_reason + +# # checking for logprobs +# if ( +# hasattr(str_line.choices[0], "logprobs") +# and str_line.choices[0].logprobs is not None +# ): +# logprobs = str_line.choices[0].logprobs +# else: +# logprobs = None + +# usage = getattr(str_line, "usage", None) + +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# "logprobs": logprobs, +# "original_chunk": str_line, +# "usage": usage, +# } +# except Exception as e: +# raise e + +# def handle_azure_text_completion_chunk(self, chunk): +# try: +# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") +# text = "" +# is_finished = False +# finish_reason = None +# choices = getattr(chunk, "choices", []) +# if len(choices) > 0: +# text = choices[0].text +# if choices[0].finish_reason is not None: +# is_finished = True +# finish_reason = choices[0].finish_reason +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } + +# except Exception as e: +# raise e + +# def handle_openai_text_completion_chunk(self, chunk): +# try: +# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") +# text = "" +# is_finished = False +# finish_reason = None +# usage = None +# choices = getattr(chunk, "choices", []) +# if len(choices) > 0: +# text = choices[0].text +# if choices[0].finish_reason is not None: +# is_finished = True +# finish_reason = choices[0].finish_reason +# usage = getattr(chunk, "usage", None) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# "usage": usage, +# } + +# except Exception as e: +# raise e + +# def handle_baseten_chunk(self, chunk): +# try: +# chunk = chunk.decode("utf-8") +# if len(chunk) > 0: +# if chunk.startswith("data:"): +# data_json = json.loads(chunk[5:]) +# if "token" in data_json and "text" in 
data_json["token"]: +# return data_json["token"]["text"] +# else: +# return "" +# data_json = json.loads(chunk) +# if "model_output" in data_json: +# if ( +# isinstance(data_json["model_output"], dict) +# and "data" in data_json["model_output"] +# and isinstance(data_json["model_output"]["data"], list) +# ): +# return data_json["model_output"]["data"][0] +# elif isinstance(data_json["model_output"], str): +# return data_json["model_output"] +# elif "completion" in data_json and isinstance( +# data_json["completion"], str +# ): +# return data_json["completion"] +# else: +# raise ValueError( +# f"Unable to parse response. Original response: {chunk}" +# ) +# else: +# return "" +# else: +# return "" +# except Exception as e: +# verbose_logger.exception( +# "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occured - {}".format( +# str(e) +# ) +# ) +# return "" + +# def handle_cloudlfare_stream(self, chunk): +# try: +# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") +# chunk = chunk.decode("utf-8") +# str_line = chunk +# text = "" +# is_finished = False +# finish_reason = None + +# if "[DONE]" in chunk: +# return {"text": text, "is_finished": True, "finish_reason": "stop"} +# elif str_line.startswith("data:"): +# data_json = json.loads(str_line[5:]) +# print_verbose(f"delta content: {data_json}") +# text = data_json["response"] +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# else: +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } + +# except Exception as e: +# raise e + +# def handle_ollama_stream(self, chunk): +# try: +# if isinstance(chunk, dict): +# json_chunk = chunk +# else: +# json_chunk = json.loads(chunk) +# if "error" in json_chunk: +# raise Exception(f"Ollama Error - {json_chunk}") + +# text = "" +# is_finished = False +# finish_reason = None +# if json_chunk["done"] is True: +# text = "" +# is_finished = True +# finish_reason = "stop" +# return 
{ +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# elif json_chunk["response"]: +# print_verbose(f"delta content: {json_chunk}") +# text = json_chunk["response"] +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# else: +# raise Exception(f"Ollama Error - {json_chunk}") +# except Exception as e: +# raise e + +# def handle_ollama_chat_stream(self, chunk): +# # for ollama_chat/ provider +# try: +# if isinstance(chunk, dict): +# json_chunk = chunk +# else: +# json_chunk = json.loads(chunk) +# if "error" in json_chunk: +# raise Exception(f"Ollama Error - {json_chunk}") + +# text = "" +# is_finished = False +# finish_reason = None +# if json_chunk["done"] is True: +# text = "" +# is_finished = True +# finish_reason = "stop" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# elif "message" in json_chunk: +# print_verbose(f"delta content: {json_chunk}") +# text = json_chunk["message"]["content"] +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# } +# else: +# raise Exception(f"Ollama Error - {json_chunk}") +# except Exception as e: +# raise e + +# def handle_watsonx_stream(self, chunk): +# try: +# if isinstance(chunk, dict): +# parsed_response = chunk +# elif isinstance(chunk, (str, bytes)): +# if isinstance(chunk, bytes): +# chunk = chunk.decode("utf-8") +# if "generated_text" in chunk: +# response = chunk.replace("data: ", "").strip() +# parsed_response = json.loads(response) +# else: +# return { +# "text": "", +# "is_finished": False, +# "prompt_tokens": 0, +# "completion_tokens": 0, +# } +# else: +# print_verbose(f"chunk: {chunk} (Type: {type(chunk)})") +# raise ValueError( +# f"Unable to parse response. 
Original response: {chunk}" +# ) +# results = parsed_response.get("results", []) +# if len(results) > 0: +# text = results[0].get("generated_text", "") +# finish_reason = results[0].get("stop_reason") +# is_finished = finish_reason != "not_finished" +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# "prompt_tokens": results[0].get("input_token_count", 0), +# "completion_tokens": results[0].get("generated_token_count", 0), +# } +# return {"text": "", "is_finished": False} +# except Exception as e: +# raise e + +# def handle_triton_stream(self, chunk): +# try: +# if isinstance(chunk, dict): +# parsed_response = chunk +# elif isinstance(chunk, (str, bytes)): +# if isinstance(chunk, bytes): +# chunk = chunk.decode("utf-8") +# if "text_output" in chunk: +# response = chunk.replace("data: ", "").strip() +# parsed_response = json.loads(response) +# else: +# return { +# "text": "", +# "is_finished": False, +# "prompt_tokens": 0, +# "completion_tokens": 0, +# } +# else: +# print_verbose(f"chunk: {chunk} (Type: {type(chunk)})") +# raise ValueError( +# f"Unable to parse response. 
Original response: {chunk}" +# ) +# text = parsed_response.get("text_output", "") +# finish_reason = parsed_response.get("stop_reason") +# is_finished = parsed_response.get("is_finished", False) +# return { +# "text": text, +# "is_finished": is_finished, +# "finish_reason": finish_reason, +# "prompt_tokens": parsed_response.get("input_token_count", 0), +# "completion_tokens": parsed_response.get("generated_token_count", 0), +# } +# return {"text": "", "is_finished": False} +# except Exception as e: +# raise e + +# def handle_clarifai_completion_chunk(self, chunk): +# try: +# if isinstance(chunk, dict): +# parsed_response = chunk +# elif isinstance(chunk, (str, bytes)): +# if isinstance(chunk, bytes): +# parsed_response = chunk.decode("utf-8") +# else: +# parsed_response = chunk +# else: +# raise ValueError("Unable to parse streaming chunk") +# if isinstance(parsed_response, dict): +# data_json = parsed_response +# else: +# data_json = json.loads(parsed_response) +# text = ( +# data_json.get("outputs", "")[0] +# .get("data", "") +# .get("text", "") +# .get("raw", "") +# ) +# len( +# encoding.encode( +# data_json.get("outputs", "")[0] +# .get("input", "") +# .get("data", "") +# .get("text", "") +# .get("raw", "") +# ) +# ) +# len(encoding.encode(text)) +# return { +# "text": text, +# "is_finished": True, +# } +# except Exception as e: +# verbose_logger.exception( +# "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occured - {}".format( +# str(e) +# ) +# ) +# return "" + +# def model_response_creator( +# self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None +# ): +# _model = self.model +# _received_llm_provider = self.custom_llm_provider +# _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore +# if ( +# _received_llm_provider == "openai" +# and _received_llm_provider != _logging_obj_llm_provider +# ): +# _model = "{}/{}".format(_logging_obj_llm_provider, _model) +# if chunk is 
None: +# chunk = {} +# else: +# # pop model keyword +# chunk.pop("model", None) + +# model_response = ModelResponse( +# stream=True, model=_model, stream_options=self.stream_options, **chunk +# ) +# if self.response_id is not None: +# model_response.id = self.response_id +# else: +# self.response_id = model_response.id # type: ignore +# if self.system_fingerprint is not None: +# model_response.system_fingerprint = self.system_fingerprint +# if hidden_params is not None: +# model_response._hidden_params = hidden_params +# model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider +# model_response._hidden_params["created_at"] = time.time() +# model_response._hidden_params = { +# **model_response._hidden_params, +# **self._hidden_params, +# } - choice = processed_chunk.choices[0] - if isinstance(choice, StreamingChoices): - self.response_uptil_now += ( - choice.delta.get("content", "") or "" - ) - else: - self.response_uptil_now += "" - self.rules.post_call_rules( - input=self.response_uptil_now, model=self.model - ) - # RETURN RESULT - self.chunks.append(processed_chunk) - return processed_chunk - except (StopAsyncIteration, StopIteration): - if self.sent_last_chunk is True: - # log the final chunk with accurate streaming values - complete_streaming_response = litellm.stream_chunk_builder( - chunks=self.chunks, messages=self.messages - ) - response = self.model_response_creator() - if complete_streaming_response is not None: - setattr( - response, - "usage", - getattr(complete_streaming_response, "usage"), - ) - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, - args=(response, None, None, cache_hit), - ).start() # log response - asyncio.create_task( - self.logging_obj.async_success_handler( - response, cache_hit=cache_hit - ) - ) - if self.sent_stream_usage is False and self.send_stream_usage is True: - self.sent_stream_usage = True - return response - raise StopAsyncIteration # Re-raise StopIteration - else: - 
self.sent_last_chunk = True - processed_chunk = self.finish_reason_handler() - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, - args=(processed_chunk, None, None, cache_hit), - ).start() # log response - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, cache_hit=cache_hit - ) - ) - return processed_chunk - except httpx.TimeoutException as e: # if httpx read timeout error occues - traceback_exception = traceback.format_exc() - ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT - traceback_exception += "\nLiteLLM Default Request Timeout - {}".format( - litellm.request_timeout - ) - if self.logging_obj is not None: - ## LOGGING - threading.Thread( - target=self.logging_obj.failure_handler, - args=(e, traceback_exception), - ).start() # log response - # Handle any exceptions that might occur during streaming - asyncio.create_task( - self.logging_obj.async_failure_handler(e, traceback_exception) - ) - raise e - except Exception as e: - traceback_exception = traceback.format_exc() - if self.logging_obj is not None: - ## LOGGING - threading.Thread( - target=self.logging_obj.failure_handler, - args=(e, traceback_exception), - ).start() # log response - # Handle any exceptions that might occur during streaming - asyncio.create_task( - self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore - ) - ## Map to OpenAI Exception - raise exception_type( - model=self.model, - custom_llm_provider=self.custom_llm_provider, - original_exception=e, - completion_kwargs={}, - extra_kwargs={}, - ) +# if ( +# len(model_response.choices) > 0 +# and getattr(model_response.choices[0], "delta") is not None +# ): +# # do nothing, if object instantiated +# pass +# else: +# model_response.choices = [StreamingChoices(finish_reason=None)] +# return model_response + +# def is_delta_empty(self, delta: Delta) -> bool: +# is_empty = True +# if delta.content is not None: +# is_empty = False +# elif delta.tool_calls is 
not None: +# is_empty = False +# elif delta.function_call is not None: +# is_empty = False +# return is_empty + +# def return_processed_chunk_logic( # noqa +# self, +# completion_obj: dict, +# model_response: ModelResponseStream, +# response_obj: dict, +# ): + +# print_verbose( +# f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}" +# ) +# if ( +# "content" in completion_obj +# and ( +# isinstance(completion_obj["content"], str) +# and len(completion_obj["content"]) > 0 +# ) +# or ( +# "tool_calls" in completion_obj +# and completion_obj["tool_calls"] is not None +# and len(completion_obj["tool_calls"]) > 0 +# ) +# or ( +# "function_call" in completion_obj +# and completion_obj["function_call"] is not None +# ) +# ): # cannot set content of an OpenAI Object to be an empty string +# self.safety_checker() +# hold, model_response_str = self.check_special_tokens( +# chunk=completion_obj["content"], +# finish_reason=model_response.choices[0].finish_reason, +# ) # filter out bos/eos tokens from openai-compatible hf endpoints +# print_verbose(f"hold - {hold}, model_response_str - {model_response_str}") +# if hold is False: +# ## check if openai/azure chunk +# original_chunk = response_obj.get("original_chunk", None) +# if original_chunk: +# model_response.id = original_chunk.id +# self.response_id = original_chunk.id +# if len(original_chunk.choices) > 0: +# choices = [] +# for choice in original_chunk.choices: +# try: +# if isinstance(choice, BaseModel): +# choice_json = choice.model_dump() +# choice_json.pop( +# "finish_reason", None +# ) # for mistral etc. which return a value in their last chunk (not-openai compatible). 
+# print_verbose(f"choice_json: {choice_json}") +# choices.append(StreamingChoices(**choice_json)) +# except Exception: +# choices.append(StreamingChoices()) +# print_verbose(f"choices in streaming: {choices}") +# setattr(model_response, "choices", choices) +# else: +# return +# model_response.system_fingerprint = ( +# original_chunk.system_fingerprint +# ) +# setattr( +# model_response, +# "citations", +# getattr(original_chunk, "citations", None), +# ) +# print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") +# if self.sent_first_chunk is False: +# model_response.choices[0].delta["role"] = "assistant" +# self.sent_first_chunk = True +# elif self.sent_first_chunk is True and hasattr( +# model_response.choices[0].delta, "role" +# ): +# _initial_delta = model_response.choices[0].delta.model_dump() +# _initial_delta.pop("role", None) +# model_response.choices[0].delta = Delta(**_initial_delta) +# print_verbose( +# f"model_response.choices[0].delta: {model_response.choices[0].delta}" +# ) +# else: +# ## else +# completion_obj["content"] = model_response_str +# if self.sent_first_chunk is False: +# completion_obj["role"] = "assistant" +# self.sent_first_chunk = True + +# model_response.choices[0].delta = Delta(**completion_obj) +# _index: Optional[int] = completion_obj.get("index") +# if _index is not None: +# model_response.choices[0].index = _index +# print_verbose(f"returning model_response: {model_response}") +# return model_response +# else: +# return +# elif self.received_finish_reason is not None: +# if self.sent_last_chunk is True: +# # Bedrock returns the guardrail trace in the last chunk - we want to return this here +# if self.custom_llm_provider == "bedrock" and "trace" in model_response: +# return model_response + +# # Default - return StopIteration +# raise StopIteration +# # flush any remaining holding chunk +# if len(self.holding_chunk) > 0: +# if model_response.choices[0].delta.content is None: +# model_response.choices[0].delta.content = 
self.holding_chunk +# else: +# model_response.choices[0].delta.content = ( +# self.holding_chunk + model_response.choices[0].delta.content +# ) +# self.holding_chunk = "" +# # if delta is None +# _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta) + +# if _is_delta_empty: +# # get any function call arguments +# model_response.choices[0].finish_reason = map_finish_reason( +# finish_reason=self.received_finish_reason +# ) # ensure consistent output to openai + +# self.sent_last_chunk = True + +# return model_response +# elif ( +# model_response.choices[0].delta.tool_calls is not None +# or model_response.choices[0].delta.function_call is not None +# ): +# if self.sent_first_chunk is False: +# model_response.choices[0].delta["role"] = "assistant" +# self.sent_first_chunk = True +# return model_response +# elif ( +# len(model_response.choices) > 0 +# and hasattr(model_response.choices[0].delta, "audio") +# and model_response.choices[0].delta.audio is not None +# ): +# return model_response +# else: +# if hasattr(model_response, "usage"): +# self.chunks.append(model_response) +# return + +# def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915 +# model_response = self.model_response_creator() +# response_obj: dict = {} +# try: +# # return this for all models +# completion_obj = {"content": ""} +# from litellm.litellm_core_utils.streaming_utils import ( +# generic_chunk_has_all_required_fields, +# ) +# from litellm.types.utils import GenericStreamingChunk as GChunk + +# if ( +# isinstance(chunk, dict) +# and generic_chunk_has_all_required_fields( +# chunk=chunk +# ) # check if chunk is a generic streaming chunk +# ) or ( +# self.custom_llm_provider +# and ( +# self.custom_llm_provider == "anthropic" +# or self.custom_llm_provider in litellm._custom_providers +# ) +# ): + +# if self.received_finish_reason is not None: +# if "provider_specific_fields" not in chunk: +# raise StopIteration +# anthropic_response_obj: GChunk = chunk +# 
completion_obj["content"] = anthropic_response_obj["text"] +# if anthropic_response_obj["is_finished"]: +# self.received_finish_reason = anthropic_response_obj[ +# "finish_reason" +# ] + +# if anthropic_response_obj["usage"] is not None: +# model_response.usage = litellm.Usage( +# **anthropic_response_obj["usage"] +# ) + +# if ( +# "tool_use" in anthropic_response_obj +# and anthropic_response_obj["tool_use"] is not None +# ): +# completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]] + +# if ( +# "provider_specific_fields" in anthropic_response_obj +# and anthropic_response_obj["provider_specific_fields"] is not None +# ): +# for key, value in anthropic_response_obj[ +# "provider_specific_fields" +# ].items(): +# setattr(model_response, key, value) + +# response_obj = anthropic_response_obj +# elif ( +# self.custom_llm_provider +# and self.custom_llm_provider == "anthropic_text" +# ): +# response_obj = self.handle_anthropic_text_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider and self.custom_llm_provider == "clarifai": +# response_obj = self.handle_clarifai_completion_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.model == "replicate" or self.custom_llm_provider == "replicate": +# response_obj = self.handle_replicate_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": +# response_obj = self.handle_huggingface_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif 
self.custom_llm_provider and self.custom_llm_provider == "predibase": +# response_obj = self.handle_predibase_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif ( +# self.custom_llm_provider and self.custom_llm_provider == "baseten" +# ): # baseten doesn't provide streaming +# completion_obj["content"] = self.handle_baseten_chunk(chunk) +# elif ( +# self.custom_llm_provider and self.custom_llm_provider == "ai21" +# ): # ai21 doesn't provide streaming +# response_obj = self.handle_ai21_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": +# response_obj = self.handle_maritalk_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider and self.custom_llm_provider == "vllm": +# completion_obj["content"] = chunk[0].outputs[0].text +# elif ( +# self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha" +# ): # aleph alpha doesn't provide streaming +# response_obj = self.handle_aleph_alpha_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "nlp_cloud": +# try: +# response_obj = self.handle_nlp_cloud_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# except Exception as e: +# if self.received_finish_reason: +# raise e +# else: +# if self.sent_first_chunk is False: +# raise Exception("An unknown error occurred with the stream") +# self.received_finish_reason = "stop" 
+# elif self.custom_llm_provider == "vertex_ai": +# import proto # type: ignore + +# if self.model.startswith("claude-3"): +# response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk) +# if response_obj is None: +# return +# completion_obj["content"] = response_obj["text"] +# setattr(model_response, "usage", Usage()) +# if response_obj.get("prompt_tokens", None) is not None: +# model_response.usage.prompt_tokens = response_obj[ +# "prompt_tokens" +# ] +# if response_obj.get("completion_tokens", None) is not None: +# model_response.usage.completion_tokens = response_obj[ +# "completion_tokens" +# ] +# if hasattr(model_response.usage, "prompt_tokens"): +# model_response.usage.total_tokens = ( +# getattr(model_response.usage, "total_tokens", 0) +# + model_response.usage.prompt_tokens +# ) +# if hasattr(model_response.usage, "completion_tokens"): +# model_response.usage.total_tokens = ( +# getattr(model_response.usage, "total_tokens", 0) +# + model_response.usage.completion_tokens +# ) + +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif hasattr(chunk, "candidates") is True: +# try: +# try: +# completion_obj["content"] = chunk.text +# except Exception as e: +# if "Part has no text." 
in str(e): +# ## check for function calling +# function_call = ( +# chunk.candidates[0].content.parts[0].function_call +# ) + +# args_dict = {} + +# # Check if it's a RepeatedComposite instance +# for key, val in function_call.args.items(): +# if isinstance( +# val, +# proto.marshal.collections.repeated.RepeatedComposite, +# ): +# # If so, convert to list +# args_dict[key] = [v for v in val] +# else: +# args_dict[key] = val + +# try: +# args_str = json.dumps(args_dict) +# except Exception as e: +# raise e +# _delta_obj = litellm.utils.Delta( +# content=None, +# tool_calls=[ +# { +# "id": f"call_{str(uuid.uuid4())}", +# "function": { +# "arguments": args_str, +# "name": function_call.name, +# }, +# "type": "function", +# } +# ], +# ) +# _streaming_response = StreamingChoices(delta=_delta_obj) +# _model_response = ModelResponse(stream=True) +# _model_response.choices = [_streaming_response] +# response_obj = {"original_chunk": _model_response} +# else: +# raise e +# if ( +# hasattr(chunk.candidates[0], "finish_reason") +# and chunk.candidates[0].finish_reason.name +# != "FINISH_REASON_UNSPECIFIED" +# ): # every non-final chunk in vertex ai has this +# self.received_finish_reason = chunk.candidates[ +# 0 +# ].finish_reason.name +# except Exception: +# if chunk.candidates[0].finish_reason.name == "SAFETY": +# raise Exception( +# f"The response was blocked by VertexAI. 
{str(chunk)}" +# ) +# else: +# completion_obj["content"] = str(chunk) +# elif self.custom_llm_provider == "cohere": +# response_obj = self.handle_cohere_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "cohere_chat": +# response_obj = self.handle_cohere_chat_chunk(chunk) +# if response_obj is None: +# return +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] + +# elif self.custom_llm_provider == "petals": +# if len(self.completion_stream) == 0: +# if self.received_finish_reason is not None: +# raise StopIteration +# else: +# self.received_finish_reason = "stop" +# chunk_size = 30 +# new_chunk = self.completion_stream[:chunk_size] +# completion_obj["content"] = new_chunk +# self.completion_stream = self.completion_stream[chunk_size:] +# elif self.custom_llm_provider == "palm": +# # fake streaming +# response_obj = {} +# if len(self.completion_stream) == 0: +# if self.received_finish_reason is not None: +# raise StopIteration +# else: +# self.received_finish_reason = "stop" +# chunk_size = 30 +# new_chunk = self.completion_stream[:chunk_size] +# completion_obj["content"] = new_chunk +# self.completion_stream = self.completion_stream[chunk_size:] +# elif self.custom_llm_provider == "ollama": +# response_obj = self.handle_ollama_stream(chunk) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "ollama_chat": +# response_obj = self.handle_ollama_chat_stream(chunk) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: 
+# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "cloudflare": +# response_obj = self.handle_cloudlfare_stream(chunk) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "watsonx": +# response_obj = self.handle_watsonx_stream(chunk) +# completion_obj["content"] = response_obj["text"] +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "triton": +# response_obj = self.handle_triton_stream(chunk) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "text-completion-openai": +# response_obj = self.handle_openai_text_completion_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# if response_obj["usage"] is not None: +# model_response.usage = litellm.Usage( +# prompt_tokens=response_obj["usage"].prompt_tokens, +# completion_tokens=response_obj["usage"].completion_tokens, +# total_tokens=response_obj["usage"].total_tokens, +# ) +# elif self.custom_llm_provider == "text-completion-codestral": +# response_obj = litellm.MistralTextCompletionConfig()._chunk_parser( +# chunk +# ) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# if "usage" in response_obj is not None: +# 
model_response.usage = litellm.Usage( +# prompt_tokens=response_obj["usage"].prompt_tokens, +# completion_tokens=response_obj["usage"].completion_tokens, +# total_tokens=response_obj["usage"].total_tokens, +# ) +# elif self.custom_llm_provider == "azure_text": +# response_obj = self.handle_azure_text_completion_chunk(chunk) +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# elif self.custom_llm_provider == "cached_response": +# response_obj = { +# "text": chunk.choices[0].delta.content, +# "is_finished": True, +# "finish_reason": chunk.choices[0].finish_reason, +# "original_chunk": chunk, +# "tool_calls": ( +# chunk.choices[0].delta.tool_calls +# if hasattr(chunk.choices[0].delta, "tool_calls") +# else None +# ), +# } + +# completion_obj["content"] = response_obj["text"] +# if response_obj["tool_calls"] is not None: +# completion_obj["tool_calls"] = response_obj["tool_calls"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if hasattr(chunk, "id"): +# model_response.id = chunk.id +# self.response_id = chunk.id +# if hasattr(chunk, "system_fingerprint"): +# self.system_fingerprint = chunk.system_fingerprint +# if response_obj["is_finished"]: +# self.received_finish_reason = response_obj["finish_reason"] +# else: # openai / azure chat model +# if self.custom_llm_provider == "azure": +# if hasattr(chunk, "model"): +# # for azure, we need to pass the model from the orignal chunk +# self.model = chunk.model +# response_obj = self.handle_openai_chat_completion_chunk(chunk) +# if response_obj is None: +# return +# completion_obj["content"] = response_obj["text"] +# print_verbose(f"completion obj content: {completion_obj['content']}") +# if response_obj["is_finished"]: +# if response_obj["finish_reason"] == "error": +# raise Exception( +# "{} raised a streaming error - 
finish_reason: error, no content string given. Received Chunk={}".format( +# self.custom_llm_provider, response_obj +# ) +# ) +# self.received_finish_reason = response_obj["finish_reason"] +# if response_obj.get("original_chunk", None) is not None: +# if hasattr(response_obj["original_chunk"], "id"): +# model_response.id = response_obj["original_chunk"].id +# self.response_id = model_response.id +# if hasattr(response_obj["original_chunk"], "system_fingerprint"): +# model_response.system_fingerprint = response_obj[ +# "original_chunk" +# ].system_fingerprint +# self.system_fingerprint = response_obj[ +# "original_chunk" +# ].system_fingerprint +# if response_obj["logprobs"] is not None: +# model_response.choices[0].logprobs = response_obj["logprobs"] + +# if response_obj["usage"] is not None: +# if isinstance(response_obj["usage"], dict): +# model_response.usage = litellm.Usage( +# prompt_tokens=response_obj["usage"].get( +# "prompt_tokens", None +# ) +# or None, +# completion_tokens=response_obj["usage"].get( +# "completion_tokens", None +# ) +# or None, +# total_tokens=response_obj["usage"].get("total_tokens", None) +# or None, +# ) +# elif isinstance(response_obj["usage"], BaseModel): +# model_response.usage = litellm.Usage( +# **response_obj["usage"].model_dump() +# ) + +# model_response.model = self.model +# print_verbose( +# f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}" +# ) +# ## FUNCTION CALL PARSING +# if ( +# response_obj is not None +# and response_obj.get("original_chunk", None) is not None +# ): # function / tool calling branch - only set for openai/azure compatible endpoints +# # enter this branch when no content has been passed in response +# original_chunk = response_obj.get("original_chunk", None) +# model_response.id = original_chunk.id +# self.response_id = original_chunk.id +# if original_chunk.choices and len(original_chunk.choices) > 0: +# delta = original_chunk.choices[0].delta +# if delta is 
not None and ( +# delta.function_call is not None or delta.tool_calls is not None +# ): +# try: +# model_response.system_fingerprint = ( +# original_chunk.system_fingerprint +# ) +# ## AZURE - check if arguments is not None +# if ( +# original_chunk.choices[0].delta.function_call +# is not None +# ): +# if ( +# getattr( +# original_chunk.choices[0].delta.function_call, +# "arguments", +# ) +# is None +# ): +# original_chunk.choices[ +# 0 +# ].delta.function_call.arguments = "" +# elif original_chunk.choices[0].delta.tool_calls is not None: +# if isinstance( +# original_chunk.choices[0].delta.tool_calls, list +# ): +# for t in original_chunk.choices[0].delta.tool_calls: +# if hasattr(t, "functions") and hasattr( +# t.functions, "arguments" +# ): +# if ( +# getattr( +# t.function, +# "arguments", +# ) +# is None +# ): +# t.function.arguments = "" +# _json_delta = delta.model_dump() +# print_verbose(f"_json_delta: {_json_delta}") +# if "role" not in _json_delta or _json_delta["role"] is None: +# _json_delta["role"] = ( +# "assistant" # mistral's api returns role as None +# ) +# if "tool_calls" in _json_delta and isinstance( +# _json_delta["tool_calls"], list +# ): +# for tool in _json_delta["tool_calls"]: +# if ( +# isinstance(tool, dict) +# and "function" in tool +# and isinstance(tool["function"], dict) +# and ("type" not in tool or tool["type"] is None) +# ): +# # if function returned but type set to None - mistral's api returns type: None +# tool["type"] = "function" +# model_response.choices[0].delta = Delta(**_json_delta) +# except Exception as e: +# verbose_logger.exception( +# "litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format( +# str(e) +# ) +# ) +# model_response.choices[0].delta = Delta() +# elif ( +# delta is not None and getattr(delta, "audio", None) is not None +# ): +# model_response.choices[0].delta.audio = delta.audio +# else: +# try: +# delta = ( +# dict() +# if original_chunk.choices[0].delta is None +# else 
dict(original_chunk.choices[0].delta) +# ) +# print_verbose(f"original delta: {delta}") +# model_response.choices[0].delta = Delta(**delta) +# print_verbose( +# f"new delta: {model_response.choices[0].delta}" +# ) +# except Exception: +# model_response.choices[0].delta = Delta() +# else: +# if ( +# self.stream_options is not None +# and self.stream_options["include_usage"] is True +# ): +# return model_response +# return +# print_verbose( +# f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}" +# ) +# print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") + +# ## CHECK FOR TOOL USE +# if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0: +# if self.is_function_call is True: # user passed in 'functions' param +# completion_obj["function_call"] = completion_obj["tool_calls"][0][ +# "function" +# ] +# completion_obj["tool_calls"] = None + +# self.tool_call = True + +# ## RETURN ARG +# return self.return_processed_chunk_logic( +# completion_obj=completion_obj, +# model_response=model_response, # type: ignore +# response_obj=response_obj, +# ) + +# except StopIteration: +# raise StopIteration +# except Exception as e: +# traceback.format_exc() +# e.message = str(e) +# raise exception_type( +# model=self.model, +# custom_llm_provider=self.custom_llm_provider, +# original_exception=e, +# ) + +# def set_logging_event_loop(self, loop): +# """ +# import litellm, asyncio + +# loop = asyncio.get_event_loop() # πŸ‘ˆ gets the current event loop + +# response = litellm.completion(.., stream=True) + +# response.set_logging_event_loop(loop=loop) # πŸ‘ˆ enables async_success callbacks for sync logging + +# for chunk in response: +# ... 
+# """ +# self.logging_loop = loop + +# def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool): +# """ +# Runs success logging in a thread and adds the response to the cache +# """ +# if litellm.disable_streaming_logging is True: +# """ +# [NOT RECOMMENDED] +# Set this via `litellm.disable_streaming_logging = True`. + +# Disables streaming logging. +# """ +# return +# ## ASYNC LOGGING +# # Create an event loop for the new thread +# if self.logging_loop is not None: +# future = asyncio.run_coroutine_threadsafe( +# self.logging_obj.async_success_handler( +# processed_chunk, None, None, cache_hit +# ), +# loop=self.logging_loop, +# ) +# future.result() +# else: +# asyncio.run( +# self.logging_obj.async_success_handler( +# processed_chunk, None, None, cache_hit +# ) +# ) +# ## SYNC LOGGING +# self.logging_obj.success_handler(processed_chunk, None, None, cache_hit) + +# ## Sync store in cache +# if self.logging_obj._llm_caching_handler is not None: +# self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache( +# processed_chunk +# ) + +# def finish_reason_handler(self): +# model_response = self.model_response_creator() +# complete_streaming_response = litellm.stream_chunk_builder( +# chunks=self.chunks +# ) +# _finish_reason = complete_streaming_response.choices[0].finish_reason + +# print(f"_finish_reason: {_finish_reason}") +# if _finish_reason is not None: +# model_response.choices[0].finish_reason = _finish_reason +# else: +# model_response.choices[0].finish_reason = "stop" + +# ## if tool use +# if ( +# model_response.choices[0].finish_reason == "stop" and self.tool_call +# ): # don't overwrite for other - potential error finish reasons +# model_response.choices[0].finish_reason = "tool_calls" +# return model_response + +# def __next__(self): # noqa: PLR0915 +# cache_hit = False +# if ( +# self.custom_llm_provider is not None +# and self.custom_llm_provider == "cached_response" +# ): +# cache_hit = True +# try: +# if 
self.completion_stream is None: +# self.fetch_sync_stream() +# while True: +# if ( +# isinstance(self.completion_stream, str) +# or isinstance(self.completion_stream, bytes) +# or isinstance(self.completion_stream, ModelResponse) +# ): +# chunk = self.completion_stream +# else: +# chunk = next(self.completion_stream) +# if chunk is not None and chunk != b"": +# print_verbose( +# f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}" +# ) +# response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk) +# print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}") + +# if response is None: +# continue +# ## LOGGING +# threading.Thread( +# target=self.run_success_logging_and_cache_storage, +# args=(response, cache_hit), +# ).start() # log response +# choice = response.choices[0] +# if isinstance(choice, StreamingChoices): +# self.response_uptil_now += choice.delta.get("content", "") or "" +# else: +# self.response_uptil_now += "" +# self.rules.post_call_rules( +# input=self.response_uptil_now, model=self.model +# ) +# # HANDLE STREAM OPTIONS +# self.chunks.append(response) +# if hasattr( +# response, "usage" +# ): # remove usage from chunk, only send on final chunk +# # Convert the object to a dictionary +# obj_dict = response.dict() + +# # Remove an attribute (e.g., 'attr2') +# if "usage" in obj_dict: +# del obj_dict["usage"] + +# # Create a new object without the removed attribute +# response = self.model_response_creator( +# chunk=obj_dict, hidden_params=response._hidden_params +# ) +# # add usage as hidden param +# if self.sent_last_chunk is True and self.stream_options is None: +# usage = calculate_total_usage(chunks=self.chunks) +# response._hidden_params["usage"] = usage +# # RETURN RESULT +# return response + +# except StopIteration: +# if self.sent_last_chunk is True: +# complete_streaming_response = litellm.stream_chunk_builder( +# chunks=self.chunks, messages=self.messages +# ) +# response = 
self.model_response_creator() +# if complete_streaming_response is not None: +# setattr( +# response, +# "usage", +# getattr(complete_streaming_response, "usage"), +# ) + +# ## LOGGING +# threading.Thread( +# target=self.logging_obj.success_handler, +# args=(response, None, None, cache_hit), +# ).start() # log response + +# if self.sent_stream_usage is False and self.send_stream_usage is True: +# self.sent_stream_usage = True +# return response +# raise # Re-raise StopIteration +# else: +# self.sent_last_chunk = True +# processed_chunk = self.finish_reason_handler() +# if self.stream_options is None: # add usage as hidden param +# usage = calculate_total_usage(chunks=self.chunks) +# processed_chunk._hidden_params["usage"] = usage +# ## LOGGING +# threading.Thread( +# target=self.run_success_logging_and_cache_storage, +# args=(processed_chunk, cache_hit), +# ).start() # log response +# return processed_chunk +# except Exception as e: +# traceback_exception = traceback.format_exc() +# # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated +# threading.Thread( +# target=self.logging_obj.failure_handler, args=(e, traceback_exception) +# ).start() +# if isinstance(e, OpenAIError): +# raise e +# else: +# raise exception_type( +# model=self.model, +# original_exception=e, +# custom_llm_provider=self.custom_llm_provider, +# ) + +# def fetch_sync_stream(self): +# if self.completion_stream is None and self.make_call is not None: +# # Call make_call to get the completion stream +# self.completion_stream = self.make_call(client=litellm.module_level_client) +# self._stream_iter = self.completion_stream.__iter__() + +# return self.completion_stream + +# async def fetch_stream(self): +# if self.completion_stream is None and self.make_call is not None: +# # Call make_call to get the completion stream +# self.completion_stream = await self.make_call( +# client=litellm.module_level_aclient +# ) +# self._stream_iter = 
self.completion_stream.__aiter__() + +# return self.completion_stream + +# async def __anext__(self): # noqa: PLR0915 +# cache_hit = False +# if ( +# self.custom_llm_provider is not None +# and self.custom_llm_provider == "cached_response" +# ): +# cache_hit = True +# try: +# if self.completion_stream is None: +# await self.fetch_stream() + +# if ( +# self.custom_llm_provider == "openai" +# or self.custom_llm_provider == "azure" +# or self.custom_llm_provider == "custom_openai" +# or self.custom_llm_provider == "text-completion-openai" +# or self.custom_llm_provider == "text-completion-codestral" +# or self.custom_llm_provider == "azure_text" +# or self.custom_llm_provider == "anthropic" +# or self.custom_llm_provider == "anthropic_text" +# or self.custom_llm_provider == "huggingface" +# or self.custom_llm_provider == "ollama" +# or self.custom_llm_provider == "ollama_chat" +# or self.custom_llm_provider == "vertex_ai" +# or self.custom_llm_provider == "vertex_ai_beta" +# or self.custom_llm_provider == "sagemaker" +# or self.custom_llm_provider == "sagemaker_chat" +# or self.custom_llm_provider == "gemini" +# or self.custom_llm_provider == "replicate" +# or self.custom_llm_provider == "cached_response" +# or self.custom_llm_provider == "predibase" +# or self.custom_llm_provider == "databricks" +# or self.custom_llm_provider == "bedrock" +# or self.custom_llm_provider == "triton" +# or self.custom_llm_provider == "watsonx" +# or self.custom_llm_provider in litellm.openai_compatible_endpoints +# or self.custom_llm_provider in litellm._custom_providers +# ): +# async for chunk in self.completion_stream: +# if chunk == "None" or chunk is None: +# raise Exception +# elif ( +# self.custom_llm_provider == "gemini" +# and hasattr(chunk, "parts") +# and len(chunk.parts) == 0 +# ): +# continue +# # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. 
+# # __anext__ also calls async_success_handler, which does logging +# print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}") + +# processed_chunk: Optional[ModelResponse] = self.chunk_creator( +# chunk=chunk +# ) +# print_verbose( +# f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}" +# ) +# if processed_chunk is None: +# continue +# ## LOGGING +# ## LOGGING +# executor.submit( +# self.logging_obj.success_handler, +# result=processed_chunk, +# start_time=None, +# end_time=None, +# cache_hit=cache_hit, +# ) + +# asyncio.create_task( +# self.logging_obj.async_success_handler( +# processed_chunk, cache_hit=cache_hit +# ) +# ) + +# if self.logging_obj._llm_caching_handler is not None: +# asyncio.create_task( +# self.logging_obj._llm_caching_handler._add_streaming_response_to_cache( +# processed_chunk=processed_chunk, +# ) +# ) + +# choice = processed_chunk.choices[0] +# if isinstance(choice, StreamingChoices): +# self.response_uptil_now += choice.delta.get("content", "") or "" +# else: +# self.response_uptil_now += "" +# self.rules.post_call_rules( +# input=self.response_uptil_now, model=self.model +# ) +# self.chunks.append(processed_chunk) +# if hasattr( +# processed_chunk, "usage" +# ): # remove usage from chunk, only send on final chunk +# # Convert the object to a dictionary +# obj_dict = processed_chunk.dict() + +# # Remove an attribute (e.g., 'attr2') +# if "usage" in obj_dict: +# del obj_dict["usage"] + +# # Create a new object without the removed attribute +# processed_chunk = self.model_response_creator(chunk=obj_dict) +# print_verbose(f"final returned processed chunk: {processed_chunk}") +# return processed_chunk +# raise StopAsyncIteration +# else: # temporary patch for non-aiohttp async calls +# # example - boto3 bedrock llms +# while True: +# if isinstance(self.completion_stream, str) or isinstance( +# self.completion_stream, bytes +# ): +# chunk = self.completion_stream +# else: +# chunk = next(self.completion_stream) +# if chunk 
is not None and chunk != b"": +# print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}") +# processed_chunk: Optional[ModelResponse] = self.chunk_creator( +# chunk=chunk +# ) +# print_verbose( +# f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" +# ) +# if processed_chunk is None: +# continue +# ## LOGGING +# threading.Thread( +# target=self.logging_obj.success_handler, +# args=(processed_chunk, None, None, cache_hit), +# ).start() # log processed_chunk +# asyncio.create_task( +# self.logging_obj.async_success_handler( +# processed_chunk, cache_hit=cache_hit +# ) +# ) + +# choice = processed_chunk.choices[0] +# if isinstance(choice, StreamingChoices): +# self.response_uptil_now += ( +# choice.delta.get("content", "") or "" +# ) +# else: +# self.response_uptil_now += "" +# self.rules.post_call_rules( +# input=self.response_uptil_now, model=self.model +# ) +# # RETURN RESULT +# self.chunks.append(processed_chunk) +# return processed_chunk +# except (StopAsyncIteration, StopIteration): +# if self.sent_last_chunk is True: +# # log the final chunk with accurate streaming values +# complete_streaming_response = litellm.stream_chunk_builder( +# chunks=self.chunks, messages=self.messages +# ) +# response = self.model_response_creator() +# if complete_streaming_response is not None: +# setattr( +# response, +# "usage", +# getattr(complete_streaming_response, "usage"), +# ) +# ## LOGGING +# threading.Thread( +# target=self.logging_obj.success_handler, +# args=(response, None, None, cache_hit), +# ).start() # log response +# asyncio.create_task( +# self.logging_obj.async_success_handler( +# response, cache_hit=cache_hit +# ) +# ) +# if self.sent_stream_usage is False and self.send_stream_usage is True: +# self.sent_stream_usage = True +# return response +# raise StopAsyncIteration # Re-raise StopIteration +# else: +# self.sent_last_chunk = True +# processed_chunk = self.finish_reason_handler() +# ## LOGGING +# threading.Thread( +# 
target=self.logging_obj.success_handler, +# args=(processed_chunk, None, None, cache_hit), +# ).start() # log response +# asyncio.create_task( +# self.logging_obj.async_success_handler( +# processed_chunk, cache_hit=cache_hit +# ) +# ) +# return processed_chunk +# except httpx.TimeoutException as e: # if httpx read timeout error occurs +# traceback_exception = traceback.format_exc() +# ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT +# traceback_exception += "\nLiteLLM Default Request Timeout - {}".format( +# litellm.request_timeout +# ) +# if self.logging_obj is not None: +# ## LOGGING +# threading.Thread( +# target=self.logging_obj.failure_handler, +# args=(e, traceback_exception), +# ).start() # log response +# # Handle any exceptions that might occur during streaming +# asyncio.create_task( +# self.logging_obj.async_failure_handler(e, traceback_exception) +# ) +# raise e +# except Exception as e: +# traceback_exception = traceback.format_exc() +# if self.logging_obj is not None: +# ## LOGGING +# threading.Thread( +# target=self.logging_obj.failure_handler, +# args=(e, traceback_exception), +# ).start() # log response +# # Handle any exceptions that might occur during streaming +# asyncio.create_task( +# self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore +# ) +# ## Map to OpenAI Exception +# raise exception_type( +# model=self.model, +# custom_llm_provider=self.custom_llm_provider, +# original_exception=e, +# completion_kwargs={}, +# extra_kwargs={}, +# ) class TextCompletionStreamWrapper: @@ -8267,29 +8189,6 @@ def has_tool_call_blocks(messages: List[AllMessageValues]) -> bool: return False -def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict: - openai_headers = {} - processed_headers = {} - additional_headers = {} - - for k, v in response_headers.items(): - if k in OPENAI_RESPONSE_HEADERS: # return openai-compatible headers - openai_headers[k] = v - if k.startswith( - "llm_provider-" - ): # 
return raw provider headers (incl. openai-compatible ones) - processed_headers[k] = v - else: - additional_headers["{}-{}".format("llm_provider", k)] = v - - additional_headers = { - **openai_headers, - **processed_headers, - **additional_headers, - } - return additional_headers - - def add_dummy_tool(custom_llm_provider: str) -> List[ChatCompletionToolParam]: """ Prevent Anthropic from raising error when tool_use block exists but no tools are provided. diff --git a/tests/local_testing/test_streaming.py b/tests/local_testing/test_streaming.py index 827a2495b0b3..fcdc6b60d4f9 100644 --- a/tests/local_testing/test_streaming.py +++ b/tests/local_testing/test_streaming.py @@ -3470,6 +3470,86 @@ def test_unit_test_custom_stream_wrapper_repeating_chunk( continue +def test_unit_test_gemini_streaming_content_filter(): + chunks = [ + { + "text": "##", + "tool_use": None, + "is_finished": False, + "finish_reason": "stop", + "usage": {"prompt_tokens": 37, "completion_tokens": 1, "total_tokens": 38}, + "index": 0, + }, + { + "text": "", + "is_finished": False, + "finish_reason": "", + "usage": None, + "index": 0, + "tool_use": None, + }, + { + "text": " Downsides of Prompt Hacking in a Customer Portal\n\nWhile prompt engineering can be incredibly", + "tool_use": None, + "is_finished": False, + "finish_reason": "stop", + "usage": {"prompt_tokens": 37, "completion_tokens": 17, "total_tokens": 54}, + "index": 0, + }, + { + "text": "", + "is_finished": False, + "finish_reason": "", + "usage": None, + "index": 0, + "tool_use": None, + }, + { + "text": "", + "tool_use": None, + "is_finished": False, + "finish_reason": "content_filter", + "usage": {"prompt_tokens": 37, "completion_tokens": 17, "total_tokens": 54}, + "index": 0, + }, + { + "text": "", + "is_finished": False, + "finish_reason": "", + "usage": None, + "index": 0, + "tool_use": None, + }, + ] + + completion_stream = ModelResponseListIterator(model_responses=chunks) + + response = litellm.CustomStreamWrapper( + 
completion_stream=completion_stream, + model="gemini/gemini-1.5-pro", + custom_llm_provider="gemini", + logging_obj=litellm.Logging( + model="gemini/gemini-1.5-pro", + messages=[{"role": "user", "content": "Hey"}], + stream=True, + call_type="completion", + start_time=time.time(), + litellm_call_id="12345", + function_id="1245", + ), + ) + + stream_finish_reason: Optional[str] = None + idx = 0 + for chunk in response: + print(f"chunk: {chunk}") + if chunk.choices[0].finish_reason is not None: + stream_finish_reason = chunk.choices[0].finish_reason + idx += 1 + print(f"num chunks: {idx}") + assert stream_finish_reason == "content_filter" + + def test_unit_test_custom_stream_wrapper_openai(): """ Test if last streaming chunk ends with '?', if the message repeats itself.