diff --git a/configs/llm_model_configs/azure-gpt-4o-mini.json b/configs/llm_model_configs/azure-gpt-4o-mini.json new file mode 100644 index 0000000000..58eb3a00dc --- /dev/null +++ b/configs/llm_model_configs/azure-gpt-4o-mini.json @@ -0,0 +1,7 @@ +{ + "context_window": 128000, + "model": "gpt-4o-mini", + "model_endpoint_type": "azure", + "api_version": "2023-03-15-preview", + "model_wrapper": null +} diff --git a/letta/agent.py b/letta/agent.py index 831e0f4a71..164c33690d 100644 --- a/letta/agent.py +++ b/letta/agent.py @@ -18,7 +18,7 @@ MESSAGE_SUMMARY_WARNING_FRAC, ) from letta.interface import AgentInterface -from letta.llm_api.llm_api_tools import create, is_context_overflow_error +from letta.llm_api.llm_api_tools import create from letta.memory import ArchivalMemory, RecallMemory, summarize_messages from letta.metadata import MetadataStore from letta.persistence_manager import LocalStateManager @@ -56,6 +56,7 @@ ) from .errors import LLMError +from .llm_api.helpers import is_context_overflow_error def compile_memory_metadata_block( @@ -207,7 +208,7 @@ def step( recreate_message_timestamp: bool = True, # if True, when input is a Message type, recreated the 'created_at' field stream: bool = False, # TODO move to config? timestamp: Optional[datetime.datetime] = None, - inner_thoughts_in_kwargs: OptionState = OptionState.DEFAULT, + inner_thoughts_in_kwargs_option: OptionState = OptionState.DEFAULT, ms: Optional[MetadataStore] = None, ) -> AgentStepResponse: """ @@ -223,7 +224,7 @@ def update_state(self) -> AgentState: class Agent(BaseAgent): def __init__( self, - interface: AgentInterface, + interface: Optional[AgentInterface], # agents can be created from providing agent_state agent_state: AgentState, tools: List[Tool], @@ -460,7 +461,7 @@ def _get_ai_reply( function_call: str = "auto", first_message: bool = False, # hint stream: bool = False, # TODO move to config? 
- inner_thoughts_in_kwargs: OptionState = OptionState.DEFAULT, + inner_thoughts_in_kwargs_option: OptionState = OptionState.DEFAULT, ) -> ChatCompletionResponse: """Get response from LLM API""" try: @@ -478,7 +479,7 @@ def _get_ai_reply( stream=stream, stream_inferface=self.interface, # putting inner thoughts in func args or not - inner_thoughts_in_kwargs=inner_thoughts_in_kwargs, + inner_thoughts_in_kwargs_option=inner_thoughts_in_kwargs_option, ) if len(response.choices) == 0: @@ -560,6 +561,8 @@ def _handle_ai_response( function_call = ( response_message.function_call if response_message.function_call is not None else response_message.tool_calls[0].function ) + + # Get the name of the function function_name = function_call.name printd(f"Request to call function {function_name} with tool_call_id: {tool_call_id}") @@ -608,6 +611,13 @@ def _handle_ai_response( self.interface.function_message(f"Error: {error_msg}", msg_obj=messages[-1]) return messages, False, True # force a heartbeat to allow agent to handle error + # Check if inner thoughts are in the function call arguments (apparently possible if you are using Azure) + if "inner_thoughts" in function_args: + response_message.content = function_args.pop("inner_thoughts") + # The content is then the internal monologue, not chat + if response_message.content: + self.interface.internal_monologue(response_message.content, msg_obj=messages[-1]) + # (Still parsing function args) # Handle requests for immediate heartbeat heartbeat_request = function_args.pop("request_heartbeat", None) @@ -716,7 +726,7 @@ def step( recreate_message_timestamp: bool = True, # if True, when input is a Message type, recreated the 'created_at' field stream: bool = False, # TODO move to config? timestamp: Optional[datetime.datetime] = None, - inner_thoughts_in_kwargs: OptionState = OptionState.DEFAULT, + inner_thoughts_in_kwargs_option: OptionState = OptionState.DEFAULT, ms: Optional[MetadataStore] = None, ) -> AgentStepResponse: """Top-level event message handler for the Letta agent""" @@ -795,7 +805,7 @@ def step( message_sequence=input_message_sequence, first_message=True, # passed through to the prompt formatter stream=stream, - inner_thoughts_in_kwargs=inner_thoughts_in_kwargs, + inner_thoughts_in_kwargs_option=inner_thoughts_in_kwargs_option, ) if verify_first_message_correctness(response, require_monologue=self.first_message_verify_mono): break @@ -808,7 +818,7 @@ def step( response = self._get_ai_reply( message_sequence=input_message_sequence, stream=stream, - inner_thoughts_in_kwargs=inner_thoughts_in_kwargs, + inner_thoughts_in_kwargs_option=inner_thoughts_in_kwargs_option, ) # Step 3: check if LLM wanted to call a function @@ -892,7 +902,7 @@ def step( recreate_message_timestamp=recreate_message_timestamp, stream=stream, timestamp=timestamp, - inner_thoughts_in_kwargs=inner_thoughts_in_kwargs, + inner_thoughts_in_kwargs_option=inner_thoughts_in_kwargs_option, ms=ms, ) diff --git a/letta/credentials.py b/letta/credentials.py index d662e76e24..05d683ae22 100644 --- a/letta/credentials.py +++ b/letta/credentials.py @@ -30,7 +30,7 @@ class LettaCredentials: # azure config azure_auth_type: str = "api_key" - azure_key: Optional[str] = None + azure_key: Optional[str] = os.getenv("AZURE_OPENAI_API_KEY") # groq config groq_key: Optional[str] = os.getenv("GROQ_API_KEY") diff --git a/letta/errors.py b/letta/errors.py index 852ec874c4..bd1e5421c7 100644 --- a/letta/errors.py +++ b/letta/errors.py @@ -56,7 +56,7 @@ def construct_error_message(messages: List[Union["Message", 
"LettaMessage"]], er error_msg += f" (Explanation: {explanation})" # Pretty print out message JSON - message_json = json.dumps([message.model_dump_json(indent=4) for message in messages], indent=4) + message_json = json.dumps([message.model_dump() for message in messages], indent=4) return f"{error_msg}\n\n{message_json}" diff --git a/letta/llm_api/azure_openai.py b/letta/llm_api/azure_openai.py index bdcc8806b2..57b49f7cfc 100644 --- a/letta/llm_api/azure_openai.py +++ b/letta/llm_api/azure_openai.py @@ -2,8 +2,11 @@ import requests +from letta.schemas.llm_config import LLMConfig from letta.schemas.openai.chat_completion_response import ChatCompletionResponse +from letta.schemas.openai.chat_completions import ChatCompletionRequest from letta.schemas.openai.embedding_response import EmbeddingResponse +from letta.settings import ModelSettings from letta.utils import smart_urljoin MODEL_TO_AZURE_ENGINE = { @@ -13,17 +16,16 @@ "gpt-3.5": "gpt-35-turbo", "gpt-3.5-turbo": "gpt-35-turbo", "gpt-3.5-turbo-16k": "gpt-35-turbo-16k", + "gpt-4o-mini": "gpt-4o-mini", } -def clean_azure_endpoint(raw_endpoint_name: str) -> str: - """Make sure the endpoint is of format 'https://YOUR_RESOURCE_NAME.openai.azure.com'""" - if raw_endpoint_name is None: - raise ValueError(raw_endpoint_name) - endpoint_address = raw_endpoint_name.strip("/").replace(".openai.azure.com", "") - endpoint_address = endpoint_address.replace("http://", "") - endpoint_address = endpoint_address.replace("https://", "") - return endpoint_address +def get_azure_endpoint(llm_config: LLMConfig, model_settings: ModelSettings): + assert llm_config.api_version, "Missing model version! This field must be provided in the LLM config for Azure." + assert llm_config.model in MODEL_TO_AZURE_ENGINE, f"{llm_config.model} not in supported models: {list(MODEL_TO_AZURE_ENGINE.keys())}" + + model = MODEL_TO_AZURE_ENGINE[llm_config.model] + return f"{model_settings.azure_base_url}/openai/deployments/{model}/chat/completions?api-version={llm_config.api_version}" def azure_openai_get_model_list(url: str, api_key: Union[str, None], api_version: str) -> dict: @@ -72,19 +74,15 @@ def azure_openai_get_model_list(url: str, api_key: Union[str, None], api_version def azure_openai_chat_completions_request( - resource_name: str, deployment_id: str, api_version: str, api_key: str, data: dict + model_settings: ModelSettings, llm_config: LLMConfig, api_key: str, chat_completion_request: ChatCompletionRequest ) -> ChatCompletionResponse: """https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#chat-completions""" from letta.utils import printd - assert resource_name is not None, "Missing required field when calling Azure OpenAI" - assert deployment_id is not None, "Missing required field when calling Azure OpenAI" - assert api_version is not None, "Missing required field when calling Azure OpenAI" assert api_key is not None, "Missing required field when calling Azure OpenAI" - resource_name = clean_azure_endpoint(resource_name) - url = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_id}/chat/completions?api-version={api_version}" headers = {"Content-Type": "application/json", "api-key": f"{api_key}"} + data = chat_completion_request.model_dump(exclude_none=True) # If functions == None, strip from the payload if "functions" in data and data["functions"] is None: @@ -95,11 +93,10 @@ def azure_openai_chat_completions_request( data.pop("tools") data.pop("tool_choice", None) # extra safe, should exist always (default="auto") - 
printd(f"Sending request to {url}") + model_endpoint = get_azure_endpoint(llm_config, model_settings) + printd(f"Sending request to {model_endpoint}") try: - data["messages"] = [i.to_openai_dict() for i in data["messages"]] - response = requests.post(url, headers=headers, json=data) - printd(f"response = {response}") + response = requests.post(model_endpoint, headers=headers, json=data) response.raise_for_status() # Raises HTTPError for 4XX/5XX status response = response.json() # convert to dict from string printd(f"response.json = {response}") @@ -128,7 +125,6 @@ def azure_openai_embeddings_request( """https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings""" from letta.utils import printd - resource_name = clean_azure_endpoint(resource_name) url = f"https://{resource_name}.openai.azure.com/openai/deployments/{deployment_id}/embeddings?api-version={api_version}" headers = {"Content-Type": "application/json", "api-key": f"{api_key}"} diff --git a/letta/llm_api/helpers.py b/letta/llm_api/helpers.py new file mode 100644 index 0000000000..3fae442ae1 --- /dev/null +++ b/letta/llm_api/helpers.py @@ -0,0 +1,153 @@ +import copy +import json +import warnings +from typing import List, Union + +import requests + +from letta.constants import OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING +from letta.schemas.enums import OptionState +from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice +from letta.utils import json_dumps + + +# TODO update to use better types +def add_inner_thoughts_to_functions( + functions: List[dict], + inner_thoughts_key: str, + inner_thoughts_description: str, + inner_thoughts_required: bool = True, + # inner_thoughts_to_front: bool = True, TODO support sorting somewhere, probably in the to_dict? 
+) -> List[dict]: + """Add an inner_thoughts kwarg to every function in the provided list""" + # return copies + new_functions = [] + + # functions is a list of dicts in the OpenAI schema (https://platform.openai.com/docs/api-reference/chat/create) + for function_object in functions: + function_params = function_object["parameters"]["properties"] + required_params = list(function_object["parameters"]["required"]) + + # if the inner thoughts arg doesn't exist, add it + if inner_thoughts_key not in function_params: + function_params[inner_thoughts_key] = { + "type": "string", + "description": inner_thoughts_description, + } + + # make sure it's tagged as required + new_function_object = copy.deepcopy(function_object) + if inner_thoughts_required and inner_thoughts_key not in required_params: + required_params.append(inner_thoughts_key) + new_function_object["parameters"]["required"] = required_params + + new_functions.append(new_function_object) + + # return a list of copies + return new_functions + + +def unpack_all_inner_thoughts_from_kwargs( + response: ChatCompletionResponse, + inner_thoughts_key: str, +) -> ChatCompletionResponse: + """Strip the inner thoughts out of the tool call and put it in the message content""" + if len(response.choices) == 0: + raise ValueError(f"Unpacking inner thoughts from empty response not supported") + + new_choices = [] + for choice in response.choices: + new_choices.append(unpack_inner_thoughts_from_kwargs(choice, inner_thoughts_key)) + + # return an updated copy + new_response = response.model_copy(deep=True) + new_response.choices = new_choices + return new_response + + +def unpack_inner_thoughts_from_kwargs(choice: Choice, inner_thoughts_key: str) -> Choice: + message = choice.message + if message.role == "assistant" and message.tool_calls and len(message.tool_calls) >= 1: + if len(message.tool_calls) > 1: + warnings.warn(f"Unpacking inner thoughts from more than one tool call ({len(message.tool_calls)}) is not supported") + # TODO support multiple tool calls + tool_call = message.tool_calls[0] + + try: + # Sadly we need to parse the JSON since args are in string format + func_args = dict(json.loads(tool_call.function.arguments)) + if inner_thoughts_key in func_args: + # extract the inner thoughts + inner_thoughts = func_args.pop(inner_thoughts_key) + + # replace the kwargs + new_choice = choice.model_copy(deep=True) + new_choice.message.tool_calls[0].function.arguments = json_dumps(func_args) + # also replace the message content + if new_choice.message.content is not None: + warnings.warn(f"Overwriting existing inner monologue ({new_choice.message.content}) with kwarg ({inner_thoughts})") + new_choice.message.content = inner_thoughts + + return new_choice + else: + warnings.warn(f"Did not find inner thoughts in tool call: {str(tool_call)}") + + except json.JSONDecodeError as e: + warnings.warn(f"Failed to strip inner thoughts from kwargs: {e}") + raise e + + +def is_context_overflow_error(exception: Union[requests.exceptions.RequestException, Exception]) -> bool: + """Checks if an exception is due to context overflow (based on common OpenAI response messages)""" + from letta.utils import printd + + match_string = OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING + + # Backwards compatibility with openai python package/client v0.28 (pre-v1 client migration) + if match_string in str(exception): + printd(f"Found '{match_string}' in str(exception)={(str(exception))}") + return True + + # Based on python requests + OpenAI REST API (/v1) + elif isinstance(exception, 
requests.exceptions.HTTPError): + if exception.response is not None and "application/json" in exception.response.headers.get("Content-Type", ""): + try: + error_details = exception.response.json() + if "error" not in error_details: + printd(f"HTTPError occurred, but couldn't find error field: {error_details}") + return False + else: + error_details = error_details["error"] + + # Check for the specific error code + if error_details.get("code") == "context_length_exceeded": + printd(f"HTTPError occurred, caught error code {error_details.get('code')}") + return True + # Soft-check for "maximum context length" inside of the message + elif error_details.get("message") and "maximum context length" in error_details.get("message"): + printd(f"HTTPError occurred, found '{match_string}' in error message contents ({error_details})") + return True + else: + printd(f"HTTPError occurred, but unknown error message: {error_details}") + return False + except ValueError: + # JSON decoding failed + printd(f"HTTPError occurred ({exception}), but no JSON error message.") + + # Generic fail + else: + return False + + +def derive_inner_thoughts_in_kwargs(inner_thoughts_in_kwargs_option: OptionState, model: str): + if inner_thoughts_in_kwargs_option == OptionState.DEFAULT: + # model that are known to not use `content` fields on tool calls + inner_thoughts_in_kwargs = "gpt-4o" in model or "gpt-4-turbo" in model or "gpt-3.5-turbo" in model + else: + inner_thoughts_in_kwargs = True if inner_thoughts_in_kwargs_option == OptionState.YES else False + + if not isinstance(inner_thoughts_in_kwargs, bool): + warnings.warn(f"Bad type detected: {type(inner_thoughts_in_kwargs)}") + inner_thoughts_in_kwargs = bool(inner_thoughts_in_kwargs) + + return inner_thoughts_in_kwargs diff --git a/letta/llm_api/llm_api_tools.py b/letta/llm_api/llm_api_tools.py index c95ef7a35e..7ccd23ac3e 100644 --- a/letta/llm_api/llm_api_tools.py +++ b/letta/llm_api/llm_api_tools.py @@ -1,25 +1,25 @@ -import copy -import json import os import random import time -import warnings from typing import List, Optional, Union import requests -from letta.constants import CLI_WARNING_PREFIX, OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING +from letta.constants import CLI_WARNING_PREFIX from letta.llm_api.anthropic import anthropic_chat_completions_request -from letta.llm_api.azure_openai import ( - MODEL_TO_AZURE_ENGINE, - azure_openai_chat_completions_request, -) +from letta.llm_api.azure_openai import azure_openai_chat_completions_request from letta.llm_api.cohere import cohere_chat_completions_request from letta.llm_api.google_ai import ( convert_tools_to_google_ai_format, google_ai_chat_completions_request, ) +from letta.llm_api.helpers import ( + add_inner_thoughts_to_functions, + derive_inner_thoughts_in_kwargs, + unpack_all_inner_thoughts_from_kwargs, +) from letta.llm_api.openai import ( + build_openai_chat_completions_request, openai_chat_completions_process_stream, openai_chat_completions_request, ) @@ -37,144 +37,15 @@ Tool, cast_message_to_subtype, ) -from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice +from letta.schemas.openai.chat_completion_response import ChatCompletionResponse from letta.streaming_interface import ( AgentChunkStreamingInterface, AgentRefreshStreamingInterface, ) -from letta.utils import json_dumps LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "cohere", "local", "groq"] -# TODO update to use better types -def add_inner_thoughts_to_functions( - functions: List[dict], - 
inner_thoughts_key: str, - inner_thoughts_description: str, - inner_thoughts_required: bool = True, - # inner_thoughts_to_front: bool = True, TODO support sorting somewhere, probably in the to_dict? -) -> List[dict]: - """Add an inner_thoughts kwarg to every function in the provided list""" - # return copies - new_functions = [] - - # functions is a list of dicts in the OpenAI schema (https://platform.openai.com/docs/api-reference/chat/create) - for function_object in functions: - function_params = function_object["parameters"]["properties"] - required_params = list(function_object["parameters"]["required"]) - - # if the inner thoughts arg doesn't exist, add it - if inner_thoughts_key not in function_params: - function_params[inner_thoughts_key] = { - "type": "string", - "description": inner_thoughts_description, - } - - # make sure it's tagged as required - new_function_object = copy.deepcopy(function_object) - if inner_thoughts_required and inner_thoughts_key not in required_params: - required_params.append(inner_thoughts_key) - new_function_object["parameters"]["required"] = required_params - - new_functions.append(new_function_object) - - # return a list of copies - return new_functions - - -def unpack_all_inner_thoughts_from_kwargs( - response: ChatCompletionResponse, - inner_thoughts_key: str, -) -> ChatCompletionResponse: - """Strip the inner thoughts out of the tool call and put it in the message content""" - if len(response.choices) == 0: - raise ValueError(f"Unpacking inner thoughts from empty response not supported") - - new_choices = [] - for choice in response.choices: - new_choices.append(unpack_inner_thoughts_from_kwargs(choice, inner_thoughts_key)) - - # return an updated copy - new_response = response.model_copy(deep=True) - new_response.choices = new_choices - return new_response - - -def unpack_inner_thoughts_from_kwargs(choice: Choice, inner_thoughts_key: str) -> Choice: - message = choice.message - if message.role == "assistant" and message.tool_calls and len(message.tool_calls) >= 1: - if len(message.tool_calls) > 1: - warnings.warn(f"Unpacking inner thoughts from more than one tool call ({len(message.tool_calls)}) is not supported") - # TODO support multiple tool calls - tool_call = message.tool_calls[0] - - try: - # Sadly we need to parse the JSON since args are in string format - func_args = dict(json.loads(tool_call.function.arguments)) - if inner_thoughts_key in func_args: - # extract the inner thoughts - inner_thoughts = func_args.pop(inner_thoughts_key) - - # replace the kwargs - new_choice = choice.model_copy(deep=True) - new_choice.message.tool_calls[0].function.arguments = json_dumps(func_args) - # also replace the message content - if new_choice.message.content is not None: - warnings.warn(f"Overwriting existing inner monologue ({new_choice.message.content}) with kwarg ({inner_thoughts})") - new_choice.message.content = inner_thoughts - - return new_choice - else: - warnings.warn(f"Did not find inner thoughts in tool call: {str(tool_call)}") - - except json.JSONDecodeError as e: - warnings.warn(f"Failed to strip inner thoughts from kwargs: {e}") - raise e - - -def is_context_overflow_error(exception: requests.exceptions.RequestException) -> bool: - """Checks if an exception is due to context overflow (based on common OpenAI response messages)""" - from letta.utils import printd - - match_string = OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING - - # Backwards compatibility with openai python package/client v0.28 (pre-v1 client migration) - if match_string in 
str(exception): - printd(f"Found '{match_string}' in str(exception)={(str(exception))}") - return True - - # Based on python requests + OpenAI REST API (/v1) - elif isinstance(exception, requests.exceptions.HTTPError): - if exception.response is not None and "application/json" in exception.response.headers.get("Content-Type", ""): - try: - error_details = exception.response.json() - if "error" not in error_details: - printd(f"HTTPError occurred, but couldn't find error field: {error_details}") - return False - else: - error_details = error_details["error"] - - # Check for the specific error code - if error_details.get("code") == "context_length_exceeded": - printd(f"HTTPError occurred, caught error code {error_details.get('code')}") - return True - # Soft-check for "maximum context length" inside of the message - elif error_details.get("message") and "maximum context length" in error_details.get("message"): - printd(f"HTTPError occurred, found '{match_string}' in error message contents ({error_details})") - return True - else: - printd(f"HTTPError occurred, but unknown error message: {error_details}") - return False - except ValueError: - # JSON decoding failed - printd(f"HTTPError occurred ({exception}), but no JSON error message.") - - # Generic fail - else: - return False - - def retry_with_exponential_backoff( func, initial_delay: float = 1, @@ -248,7 +119,8 @@ def create( stream_inferface: Optional[Union[AgentRefreshStreamingInterface, AgentChunkStreamingInterface]] = None, # TODO move to llm_config? # if unspecified (None), default to something we've tested - inner_thoughts_in_kwargs: OptionState = OptionState.DEFAULT, + inner_thoughts_in_kwargs_option: OptionState = OptionState.DEFAULT, + max_tokens: Optional[int] = None, model_settings: Optional[dict] = None, # TODO: eventually pass from server ) -> ChatCompletionResponse: """Return response to chat completion with backoff""" @@ -267,59 +139,14 @@ def create( # openai if llm_config.model_endpoint_type == "openai": - - if inner_thoughts_in_kwargs == OptionState.DEFAULT: - # model that are known to not use `content` fields on tool calls - inner_thoughts_in_kwargs = ( - "gpt-4o" in llm_config.model or "gpt-4-turbo" in llm_config.model or "gpt-3.5-turbo" in llm_config.model - ) - else: - inner_thoughts_in_kwargs = True if inner_thoughts_in_kwargs == OptionState.YES else False - - if not isinstance(inner_thoughts_in_kwargs, bool): - warnings.warn(f"Bad type detected: {type(inner_thoughts_in_kwargs)}") - inner_thoughts_in_kwargs = bool(inner_thoughts_in_kwargs) - if inner_thoughts_in_kwargs: - functions = add_inner_thoughts_to_functions( - functions=functions, - inner_thoughts_key=INNER_THOUGHTS_KWARG, - inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION, - ) - - openai_message_list = [ - cast_message_to_subtype(m.to_openai_dict(put_inner_thoughts_in_kwargs=inner_thoughts_in_kwargs)) for m in messages - ] - - # TODO do the same for Azure? 
if model_settings.openai_api_key is None and llm_config.model_endpoint == "https://api.openai.com/v1": # only is a problem if we are *not* using an openai proxy raise ValueError(f"OpenAI key is missing from letta config file") - if use_tool_naming: - data = ChatCompletionRequest( - model=llm_config.model, - messages=openai_message_list, - tools=[{"type": "function", "function": f} for f in functions] if functions else None, - tool_choice=function_call, - user=str(user_id), - ) - else: - data = ChatCompletionRequest( - model=llm_config.model, - messages=openai_message_list, - functions=functions, - function_call=function_call, - user=str(user_id), - ) - # https://platform.openai.com/docs/guides/text-generation/json-mode - # only supported by gpt-4o, gpt-4-turbo, or gpt-3.5-turbo - if "gpt-4o" in llm_config.model or "gpt-4-turbo" in llm_config.model or "gpt-3.5-turbo" in llm_config.model: - data.response_format = {"type": "json_object"} - if "inference.memgpt.ai" in llm_config.model_endpoint: - # override user id for inference.memgpt.ai - import uuid - - data.user = str(uuid.UUID(int=0)) + inner_thoughts_in_kwargs = derive_inner_thoughts_in_kwargs(inner_thoughts_in_kwargs_option, model=llm_config.model) + data = build_openai_chat_completions_request( + llm_config, messages, user_id, functions, function_call, use_tool_naming, inner_thoughts_in_kwargs, max_tokens + ) if stream: # Client requested token streaming data.stream = True @@ -356,35 +183,32 @@ def create( if stream: raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}") - azure_deployment = ( - model_settings.azure_deployment if model_settings.azure_deployment is not None else MODEL_TO_AZURE_ENGINE[llm_config.model] + if model_settings.azure_api_key is None: + raise ValueError(f"Azure API key is missing. Did you set AZURE_API_KEY in your env?") + + if model_settings.azure_base_url is None: + raise ValueError(f"Azure base url is missing. 
Did you set AZURE_BASE_URL in your env?") + + # Set the llm config model_endpoint from model_settings + # For Azure, this model_endpoint is required to be configured via env variable, so users don't need to provide it in the LLM config + llm_config.model_endpoint = model_settings.azure_base_url + inner_thoughts_in_kwargs = derive_inner_thoughts_in_kwargs(inner_thoughts_in_kwargs_option, llm_config.model) + chat_completion_request = build_openai_chat_completions_request( + llm_config, messages, user_id, functions, function_call, use_tool_naming, inner_thoughts_in_kwargs, max_tokens ) - if use_tool_naming: - data = dict( - # NOTE: don't pass model to Azure calls, that is the deployment_id - # model=agent_config.model, - messages=[m.to_openai_dict() for m in messages], - tools=[{"type": "function", "function": f} for f in functions] if functions else None, - tool_choice=function_call, - user=str(user_id), - ) - else: - data = dict( - # NOTE: don't pass model to Azure calls, that is the deployment_id - # model=agent_config.model, - messages=[m.to_openai_dict() for m in messages], - functions=functions, - function_call=function_call, - user=str(user_id), - ) - return azure_openai_chat_completions_request( - resource_name=model_settings.azure_endpoint, - deployment_id=azure_deployment, - api_version=model_settings.azure_version, - api_key=model_settings.azure_key, - data=data, + + response = azure_openai_chat_completions_request( + model_settings=model_settings, + llm_config=llm_config, + api_key=model_settings.azure_api_key, + chat_completion_request=chat_completion_request, ) + if inner_thoughts_in_kwargs: + response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG) + + return response + elif llm_config.model_endpoint_type == "google_ai": if stream: raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}") @@ -517,7 +341,7 @@ def create( stream_inferface.stream_end() if inner_thoughts_in_kwargs: - response = unpack_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG) + response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG) return response diff --git a/letta/llm_api/openai.py b/letta/llm_api/openai.py index 49d00f373f..18f923723e 100644 --- a/letta/llm_api/openai.py +++ b/letta/llm_api/openai.py @@ -1,5 +1,6 @@ import json -from typing import Generator, Optional, Union +import warnings +from typing import Generator, List, Optional, Union import httpx import requests @@ -8,10 +9,19 @@ from letta.constants import OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING from letta.errors import LLMError +from letta.llm_api.helpers import add_inner_thoughts_to_functions +from letta.local_llm.constants import ( + INNER_THOUGHTS_KWARG, + INNER_THOUGHTS_KWARG_DESCRIPTION, +) from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages +from letta.schemas.llm_config import LLMConfig from letta.schemas.message import Message as _Message from letta.schemas.message import MessageRole as _MessageRole -from letta.schemas.openai.chat_completion_request import ChatCompletionRequest +from letta.schemas.openai.chat_completion_request import ( + ChatCompletionRequest, + cast_message_to_subtype, +) from letta.schemas.openai.chat_completion_response import ( ChatCompletionChunkResponse, ChatCompletionResponse, @@ -81,6 +91,64 @@ def openai_get_model_list(url: str, api_key: Union[str, None], fix_url: Optional raise e +def 
build_openai_chat_completions_request( + llm_config: LLMConfig, + messages: List[Message], + user_id: Optional[str], + functions: Optional[list], + function_call: str, + use_tool_naming: bool, + inner_thoughts_in_kwargs: bool, + max_tokens: Optional[int], +) -> ChatCompletionRequest: + if inner_thoughts_in_kwargs: + functions = add_inner_thoughts_to_functions( + functions=functions, + inner_thoughts_key=INNER_THOUGHTS_KWARG, + inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION, + ) + + openai_message_list = [ + cast_message_to_subtype(m.to_openai_dict(put_inner_thoughts_in_kwargs=inner_thoughts_in_kwargs)) for m in messages + ] + if llm_config.model: + model = llm_config.model + else: + warnings.warn(f"Model type not set in llm_config: {llm_config.model_dump_json(indent=4)}") + model = None + + if use_tool_naming: + data = ChatCompletionRequest( + model=model, + messages=openai_message_list, + tools=[{"type": "function", "function": f} for f in functions] if functions else None, + tool_choice=function_call, + user=str(user_id), + max_tokens=max_tokens, + ) + else: + data = ChatCompletionRequest( + model=model, + messages=openai_message_list, + functions=functions, + function_call=function_call, + user=str(user_id), + max_tokens=max_tokens, + ) + # https://platform.openai.com/docs/guides/text-generation/json-mode + # only supported by gpt-4o, gpt-4-turbo, or gpt-3.5-turbo + if "gpt-4o" in llm_config.model or "gpt-4-turbo" in llm_config.model or "gpt-3.5-turbo" in llm_config.model: + data.response_format = {"type": "json_object"} + + if "inference.memgpt.ai" in llm_config.model_endpoint: + # override user id for inference.memgpt.ai + import uuid + + data.user = str(uuid.UUID(int=0)) + + return data + + def openai_chat_completions_process_stream( url: str, api_key: str, diff --git a/letta/providers.py b/letta/providers.py index c0fab23002..ccb6c97c3a 100644 --- a/letta/providers.py +++ b/letta/providers.py @@ -220,7 +220,11 @@ def get_model_context_window(self, model_name: str): class AzureProvider(Provider): - pass + name: str = "azure" + base_url: str = Field( + ..., description="Base URL for the Azure API endpoint. This should be specific to your org, e.g. `https://letta.openai.azure.com`." + ) + api_key: str = Field(..., description="API key for the Azure API.") class VLLMProvider(OpenAIProvider): diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py index d951c2dd08..493adabf85 100644 --- a/letta/schemas/llm_config.py +++ b/letta/schemas/llm_config.py @@ -11,7 +11,7 @@ class LLMConfig(BaseModel): model (str): The name of the LLM model. model_endpoint_type (str): The endpoint type for the model. model_endpoint (str): The endpoint for the model. - model_wrapper (str): The wrapper for the model. + model_wrapper (str): The wrapper for the model. This is used to wrap additional text around the input/output of the model. This is useful for text-to-text completions, such as the Completions API in OpenAI. context_window (int): The context window size for the model. """ @@ -34,7 +34,10 @@ class LLMConfig(BaseModel): "vllm", "hugging-face", ] = Field(..., description="The endpoint type for the model.") - model_endpoint: str = Field(..., description="The endpoint for the model.") + model_endpoint: Optional[str] = Field(None, description="The endpoint for the model.") + api_version: Optional[str] = Field( + None, description="The version for the model API. Used by the Azure provider backend, e.g. 2023-03-15-preview." 
+ ) model_wrapper: Optional[str] = Field(None, description="The wrapper for the model.") context_window: int = Field(..., description="The context window size for the model.") diff --git a/letta/server/server.py b/letta/server/server.py index ff1e6c6e72..2e08bcba93 100644 --- a/letta/server/server.py +++ b/letta/server/server.py @@ -45,6 +45,7 @@ from letta.prompts import gpt_system from letta.providers import ( AnthropicProvider, + AzureProvider, GoogleAIProvider, OllamaProvider, OpenAIProvider, @@ -270,6 +271,8 @@ def __init__( self._enabled_providers.append(VLLMProvider(base_url=model_settings.vllm_base_url)) if model_settings.gemini_api_key: self._enabled_providers.append(GoogleAIProvider(api_key=model_settings.gemini_api_key)) + if model_settings.azure_api_key and model_settings.azure_base_url: + self._enabled_providers.append(AzureProvider(api_key=model_settings.azure_api_key, base_url=model_settings.azure_base_url)) def save_agents(self): """Saves all the agents that are in the in-memory object store""" diff --git a/letta/settings.py b/letta/settings.py index 84066c411b..4f8bb2de2f 100644 --- a/letta/settings.py +++ b/letta/settings.py @@ -23,7 +23,8 @@ class ModelSettings(BaseSettings): ollama_base_url: Optional[str] = None # azure - azure_deployment: Optional[str] = None + azure_api_key: Optional[str] = None + azure_base_url: Optional[str] = None # google ai gemini_api_key: Optional[str] = None diff --git a/tests/configs/embedding_model_configs/letta-hosted.json b/tests/configs/embedding_model_configs/letta-hosted.json new file mode 100644 index 0000000000..42478ed8d2 --- /dev/null +++ b/tests/configs/embedding_model_configs/letta-hosted.json @@ -0,0 +1,7 @@ +{ + "embedding_endpoint": "https://embeddings.memgpt.ai", + "embedding_model": "BAAI/bge-large-en-v1.5", + "embedding_dim": 1024, + "embedding_chunk_size": 300, + "embedding_endpoint_type": "hugging-face" +} diff --git a/tests/configs/embedding_model_configs/local.json b/tests/configs/embedding_model_configs/local.json new file mode 100644 index 0000000000..aaac3621a5 --- /dev/null +++ b/tests/configs/embedding_model_configs/local.json @@ -0,0 +1,7 @@ +{ + "embedding_endpoint": null, + "embedding_model": "BAAI/bge-small-en-v1.5", + "embedding_dim": 384, + "embedding_chunk_size": 300, + "embedding_endpoint_type": "local" +} diff --git a/tests/configs/embedding_model_configs/ollama.json b/tests/configs/embedding_model_configs/ollama.json new file mode 100644 index 0000000000..84ad72f650 --- /dev/null +++ b/tests/configs/embedding_model_configs/ollama.json @@ -0,0 +1,7 @@ +{ + "embedding_endpoint_type": "ollama", + "embedding_endpoint": "http://127.0.0.1:11434", + "embedding_model": "mxbai-embed-large", + "embedding_dim": 512, + "embedding_chunk_size": 200 +} diff --git a/tests/configs/embedding_model_configs/text-embedding-ada-002.json b/tests/configs/embedding_model_configs/text-embedding-ada-002.json new file mode 100644 index 0000000000..8791ad67e4 --- /dev/null +++ b/tests/configs/embedding_model_configs/text-embedding-ada-002.json @@ -0,0 +1,7 @@ +{ + "embedding_endpoint_type": "openai", + "embedding_endpoint": "https://api.openai.com/v1", + "embedding_model": "text-embedding-ada-002", + "embedding_dim": 1536, + "embedding_chunk_size": 300 +} diff --git a/tests/configs/letta_hosted.json b/tests/configs/letta_hosted.json new file mode 100644 index 0000000000..3fd85a4c18 --- /dev/null +++ b/tests/configs/letta_hosted.json @@ -0,0 +1,11 @@ +{ + "context_window": 8192, + "model_endpoint_type": "openai", + "model_endpoint": 
"https://inference.memgpt.ai", + "model": "memgpt-openai", + "embedding_endpoint_type": "hugging-face", + "embedding_endpoint": "https://embeddings.memgpt.ai", + "embedding_model": "BAAI/bge-large-en-v1.5", + "embedding_dim": 1024, + "embedding_chunk_size": 300 +} diff --git a/tests/configs/llm_model_configs/azure-gpt-4o-mini.json b/tests/configs/llm_model_configs/azure-gpt-4o-mini.json new file mode 100644 index 0000000000..58eb3a00dc --- /dev/null +++ b/tests/configs/llm_model_configs/azure-gpt-4o-mini.json @@ -0,0 +1,7 @@ +{ + "context_window": 128000, + "model": "gpt-4o-mini", + "model_endpoint_type": "azure", + "api_version": "2023-03-15-preview", + "model_wrapper": null +} diff --git a/tests/configs/llm_model_configs/claude-3-opus.json b/tests/configs/llm_model_configs/claude-3-opus.json new file mode 100644 index 0000000000..6281aa9644 --- /dev/null +++ b/tests/configs/llm_model_configs/claude-3-opus.json @@ -0,0 +1,7 @@ +{ + "context_window": 200000, + "model": "claude-3-opus-20240229", + "model_endpoint_type": "anthropic", + "model_endpoint": "https://api.anthropic.com/v1", + "model_wrapper": null +} diff --git a/tests/configs/llm_model_configs/gpt-4.json b/tests/configs/llm_model_configs/gpt-4.json new file mode 100644 index 0000000000..c572428e49 --- /dev/null +++ b/tests/configs/llm_model_configs/gpt-4.json @@ -0,0 +1,7 @@ +{ + "context_window": 8192, + "model": "gpt-4", + "model_endpoint_type": "openai", + "model_endpoint": "https://api.openai.com/v1", + "model_wrapper": null +} diff --git a/tests/configs/llm_model_configs/groq.json b/tests/configs/llm_model_configs/groq.json new file mode 100644 index 0000000000..a63acbf06c --- /dev/null +++ b/tests/configs/llm_model_configs/groq.json @@ -0,0 +1,7 @@ +{ + "context_window": 8192, + "model": "llama3-groq-70b-8192-tool-use-preview", + "model_endpoint_type": "groq", + "model_endpoint": "https://api.groq.com/openai/v1", + "model_wrapper": null +} diff --git a/tests/configs/llm_model_configs/letta-hosted.json b/tests/configs/llm_model_configs/letta-hosted.json new file mode 100644 index 0000000000..3ba968226b --- /dev/null +++ b/tests/configs/llm_model_configs/letta-hosted.json @@ -0,0 +1,6 @@ +{ + "context_window": 16384, + "model_endpoint_type": "openai", + "model_endpoint": "https://inference.memgpt.ai", + "model": "memgpt-openai" +} diff --git a/tests/configs/llm_model_configs/ollama.json b/tests/configs/llm_model_configs/ollama.json new file mode 100644 index 0000000000..d18a4e7724 --- /dev/null +++ b/tests/configs/llm_model_configs/ollama.json @@ -0,0 +1,6 @@ +{ + "context_window": 8192, + "model_endpoint_type": "ollama", + "model_endpoint": "http://127.0.0.1:11434", + "model": "dolphin2.2-mistral:7b-q6_K" +} diff --git a/tests/configs/openai.json b/tests/configs/openai.json new file mode 100644 index 0000000000..82ed0d72e1 --- /dev/null +++ b/tests/configs/openai.json @@ -0,0 +1,12 @@ +{ + "context_window": 8192, + "model": "gpt-4", + "model_endpoint_type": "openai", + "model_endpoint": "https://api.openai.com/v1", + "model_wrapper": null, + "embedding_endpoint_type": "openai", + "embedding_endpoint": "https://api.openai.com/v1", + "embedding_model": "text-embedding-ada-002", + "embedding_dim": 1536, + "embedding_chunk_size": 300 +} diff --git a/tests/helpers/endpoints_helper.py b/tests/helpers/endpoints_helper.py index d32503a3db..7aac1f03d9 100644 --- a/tests/helpers/endpoints_helper.py +++ b/tests/helpers/endpoints_helper.py @@ -3,6 +3,12 @@ import uuid from typing import Callable, List, Optional, Union +from 
letta.llm_api.helpers import ( + derive_inner_thoughts_in_kwargs, + unpack_inner_thoughts_from_kwargs, +) +from letta.schemas.enums import OptionState + logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -18,7 +24,7 @@ MissingFunctionCallError, MissingInnerMonologueError, ) -from letta.llm_api.llm_api_tools import create, unpack_inner_thoughts_from_kwargs +from letta.llm_api.llm_api_tools import create from letta.local_llm.constants import INNER_THOUGHTS_KWARG from letta.schemas.agent import AgentState from letta.schemas.embedding_config import EmbeddingConfig @@ -83,7 +89,7 @@ def setup_agent( # ====================================================================================================================== -def check_first_response_is_valid_for_llm_endpoint(filename: str, inner_thoughts_in_kwargs: bool = False) -> ChatCompletionResponse: +def check_first_response_is_valid_for_llm_endpoint(filename: str) -> ChatCompletionResponse: """ Checks that the first response is valid: @@ -113,7 +119,9 @@ def check_first_response_is_valid_for_llm_endpoint(filename: str, inner_thoughts ) # Basic check - assert response is not None + assert response is not None, response + assert response.choices is not None, response + assert len(response.choices) > 0, response # Select first choice choice = response.choices[0] @@ -122,6 +130,9 @@ def check_first_response_is_valid_for_llm_endpoint(filename: str, inner_thoughts validator_func = lambda function_call: function_call.name == "send_message" or function_call.name == "archival_memory_search" assert_contains_valid_function_call(choice.message, validator_func) + # Get inner_thoughts_in_kwargs + inner_thoughts_in_kwargs = derive_inner_thoughts_in_kwargs(OptionState.DEFAULT, agent_state.llm_config.model) + # Assert that the message has an inner monologue assert_contains_correct_inner_monologue(choice, inner_thoughts_in_kwargs) @@ -302,9 +313,9 @@ def run_embedding_endpoint(filename): def assert_sanity_checks(response: LettaResponse): - assert response is not None - assert response.messages is not None - assert len(response.messages) > 0 + assert response is not None, response + assert response.messages is not None, response + assert len(response.messages) > 0, response def assert_invoked_send_message_with_keyword(messages: List[LettaMessage], keyword: str, case_sensitive: bool = False) -> None: diff --git a/tests/test_endpoints.py b/tests/test_endpoints.py index a6cd16a300..5bf9d326d3 100644 --- a/tests/test_endpoints.py +++ b/tests/test_endpoints.py @@ -12,7 +12,7 @@ # directories embedding_config_dir = "configs/embedding_model_configs" -llm_config_dir = "configs/llm_model_configs" +llm_config_dir = "tests/configs/llm_model_configs" # ====================================================================================================================== @@ -66,6 +66,52 @@ def test_embedding_endpoint_openai(): run_embedding_endpoint(filename) +# ====================================================================================================================== +# AZURE TESTS +# ====================================================================================================================== +def test_azure_gpt_4o_mini_returns_valid_first_message(): + filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json") + response = check_first_response_is_valid_for_llm_endpoint(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def 
test_azure_gpt_4o_mini_returns_keyword(): + keyword = "banana" + filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json") + response = check_response_contains_keyword(filename, keyword=keyword) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_azure_gpt_4o_mini_uses_external_tool(): + filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json") + response = check_agent_uses_external_tool(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_azure_gpt_4o_mini_recall_chat_memory(): + filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json") + response = check_agent_recall_chat_memory(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_azure_gpt_4o_mini_archival_memory_retrieval(): + filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json") + response = check_agent_archival_memory_retrieval(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_azure_gpt_4o_mini_edit_core_memory(): + filename = os.path.join(llm_config_dir, "azure-gpt-4o-mini.json") + response = check_agent_edit_core_memory(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + # ====================================================================================================================== # LETTA HOSTED # ======================================================================================================================
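
The Azure tests above depend on environment-driven settings rather than per-field credentials: ModelSettings picks up AZURE_API_KEY and AZURE_BASE_URL (the env var names referenced by the new error messages in llm_api_tools.py), while api_version and model come from azure-gpt-4o-mini.json. A minimal sketch of how get_azure_endpoint() assembles the request URL from those pieces, assuming the package is importable and ModelSettings can be instantiated directly; the key and resource URL below are placeholders:

import os

from letta.llm_api.azure_openai import get_azure_endpoint
from letta.schemas.llm_config import LLMConfig
from letta.settings import ModelSettings

# AZURE_API_KEY / AZURE_BASE_URL are the env vars named in the new error messages;
# the values here are placeholders for your Azure OpenAI resource.
os.environ.setdefault("AZURE_API_KEY", "<your-azure-openai-key>")
os.environ.setdefault("AZURE_BASE_URL", "https://YOUR-RESOURCE.openai.azure.com")

model_settings = ModelSettings()  # pydantic BaseSettings: reads the env vars above

# Mirrors configs/llm_model_configs/azure-gpt-4o-mini.json
llm_config = LLMConfig(
    model="gpt-4o-mini",
    model_endpoint_type="azure",
    api_version="2023-03-15-preview",
    context_window=128000,
)

# -> {AZURE_BASE_URL}/openai/deployments/gpt-4o-mini/chat/completions?api-version=2023-03-15-preview
print(get_azure_endpoint(llm_config, model_settings))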
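
The inner-thoughts-in-kwargs machinery that moved into letta/llm_api/helpers.py is easiest to see on a concrete schema. A minimal sketch of the intended call pattern, using a simplified, hypothetical send_message schema (not the repo's real tool definition) and assuming the constants keep their names in letta.local_llm.constants:

from letta.llm_api.helpers import (
    add_inner_thoughts_to_functions,
    derive_inner_thoughts_in_kwargs,
)
from letta.local_llm.constants import (
    INNER_THOUGHTS_KWARG,
    INNER_THOUGHTS_KWARG_DESCRIPTION,
)
from letta.schemas.enums import OptionState

# Simplified, hypothetical tool schema (OpenAI function-calling format)
send_message = {
    "name": "send_message",
    "description": "Send a message to the user.",
    "parameters": {
        "type": "object",
        "properties": {"message": {"type": "string", "description": "Message contents."}},
        "required": ["message"],
    },
}

# With OptionState.DEFAULT the model name decides: gpt-4o* / gpt-4-turbo / gpt-3.5-turbo -> True
if derive_inner_thoughts_in_kwargs(OptionState.DEFAULT, model="gpt-4o-mini"):
    functions = add_inner_thoughts_to_functions(
        functions=[send_message],
        inner_thoughts_key=INNER_THOUGHTS_KWARG,
        inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION,
    )
    # The returned copy lists INNER_THOUGHTS_KWARG as a required string parameter;
    # after the completion call, unpack_all_inner_thoughts_from_kwargs() strips that
    # kwarg from the tool call and moves it into message.content (the agent's monologue).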