diff --git a/agents/examples/default/property.json b/agents/examples/default/property.json index 75196981..1b0ab506 100644 --- a/agents/examples/default/property.json +++ b/agents/examples/default/property.json @@ -698,7 +698,7 @@ "max_memory_length": 10, "max_tokens": 512, "model": "${env:OPENAI_MODEL}", - "prompt": "You are an ai agent bot producing child picture books. Each response should be short and no more than 50 words as it's for child. \nFor each response, you will use the 'image_generate' tool to create an image based on the description or key moment in that part of the story. The story should be set in a fantasy world. Try asking questions relevant to the story to decide how the story should proceed. Each response should include rich, vivid descriptions that will guide the 'image_generate' tool to produce an image that aligns with the scene or mood.\n Whether it’s the setting, a character’s expression, or a dramatic moment, the paragraph should give enough detail for a meaningful visual representation.", + "prompt": "You are an ai agent bot producing child picture books. Each response should be short and no more than 50 words as it's for child. \nFor every response relevant to the story-telling, you will use the 'image_generate' tool to create an image based on the description or key moment in that part of the story. \n The story should be set in a fantasy world. Try asking questions relevant to the story to decide how the story should proceed. Every response should include rich, vivid descriptions that will guide the 'image_generate' tool to produce an image that aligns with the scene or mood.\n Whether it’s the setting, a character’s expression, or a dramatic moment, the paragraph should give enough detail for a meaningful visual representation.", "proxy_url": "${env:OPENAI_PROXY_URL}" } }, @@ -845,14 +845,6 @@ "extension": "message_collector" } ] - }, - { - "name": "raw_text_data", - "dest": [ - { - "extension": "message_collector2" - } - ] } ] }, @@ -952,6 +944,16 @@ } ] } + ], + "data": [ + { + "name": "raw_text_data", + "dest": [ + { + "extension": "message_collector2" + } + ] + } ] } ] diff --git a/agents/ten_packages/extension/message_collector/src/extension.py b/agents/ten_packages/extension/message_collector/src/extension.py index 450b0856..f90638e1 100644 --- a/agents/ten_packages/extension/message_collector/src/extension.py +++ b/agents/ten_packages/extension/message_collector/src/extension.py @@ -32,8 +32,6 @@ TEXT_DATA_STREAM_ID_FIELD = "stream_id" TEXT_DATA_END_OF_SEGMENT_FIELD = "end_of_segment" -# record the cached text data for each stream id -cached_text_map = {} MAX_CHUNK_SIZE_BYTES = 1024 @@ -104,6 +102,7 @@ def __init__(self, name: str): super().__init__(name) self.queue = asyncio.Queue() self.loop = None + self.cached_text_map = {} def on_init(self, ten_env: TenEnv) -> None: ten_env.log_info("on_init") @@ -191,15 +190,15 @@ def on_data(self, ten_env: TenEnv, data: Data) -> None: # We cache all final text data and append the non-final text data to the cached data # until the end of the segment. if end_of_segment: - if stream_id in cached_text_map: - text = cached_text_map[stream_id] + text - del cached_text_map[stream_id] + if stream_id in self.cached_text_map: + text = self.cached_text_map[stream_id] + text + del self.cached_text_map[stream_id] else: if final: - if stream_id in cached_text_map: - text = cached_text_map[stream_id] + text + if stream_id in self.cached_text_map: + text = self.cached_text_map[stream_id] + text - cached_text_map[stream_id] = text + self.cached_text_map[stream_id] = text # Generate a unique message ID for this batch of parts message_id = str(uuid.uuid4())[:8] diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 520c0b22..79c4e424 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -378,5 +378,8 @@ def message_to_dict(self, message: LLMChatCompletionMessageParam): def _append_memory(self, message: str): if len(self.memory) > self.config.max_memory_length: - self.memory.pop(0) + removed_item = self.memory.pop(0) + # Remove tool calls from memory + if removed_item.get("tool_calls") and self.memory[0].get("role") == "tool": + self.memory.pop(0) self.memory.append(message) diff --git a/agents/ten_packages/extension/openai_chatgpt_python/manifest.json b/agents/ten_packages/extension/openai_chatgpt_python/manifest.json index b955f5b8..f71d0d76 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/manifest.json +++ b/agents/ten_packages/extension/openai_chatgpt_python/manifest.json @@ -85,14 +85,6 @@ "type": "string" } } - }, - { - "name": "raw_text_data", - "property": { - "text": { - "type": "string" - } - } } ], "cmd_in": [ diff --git a/agents/ten_packages/extension/openai_image_generate_tool/extension.py b/agents/ten_packages/extension/openai_image_generate_tool/extension.py index a39629be..e5635c36 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/extension.py +++ b/agents/ten_packages/extension/openai_image_generate_tool/extension.py @@ -3,14 +3,17 @@ # Licensed under the Apache License, Version 2.0. # See the LICENSE file for more information. # +import asyncio import json from ten import ( + Data, TenEnv, AsyncTenEnv, ) from ten_ai_base import ( AsyncLLMToolBaseExtension, LLMToolMetadata, LLMToolResult ) +from ten_ai_base.const import DATA_OUT_PROPERTY_END_OF_SEGMENT, DATA_OUT_PROPERTY_TEXT, RAW_DATA_OUT_NAME from ten_ai_base.types import LLMChatCompletionContentPartImageParam, LLMToolMetadataParameter, LLMToolResultNormal from .openai import OpenAIImageGenerateClient, OpenAIImageGenerateToolConfig @@ -52,6 +55,27 @@ def get_tool_metadata(self, ten_env: TenEnv) -> list[LLMToolMetadata]: ) ] + async def send_image(self, async_ten_env: AsyncTenEnv, image_url: str) -> None: + # Implement this method to send the image to the chat. + async_ten_env.log_info(f"Sending image: {image_url}") + try: + sentence = json.dumps({"data":{"image_url": image_url}, "type": "image_url"}) + output_data = Data.create(RAW_DATA_OUT_NAME) + output_data.set_property_string( + DATA_OUT_PROPERTY_TEXT, + sentence + ) + output_data.set_property_bool( + DATA_OUT_PROPERTY_END_OF_SEGMENT, True + ) + asyncio.create_task(async_ten_env.send_data(output_data)) + async_ten_env.log_info( + f"sent sentence [{sentence}]" + ) + except Exception as err: + async_ten_env.log_warn(f"send sentence [{sentence}] failed, err: {err}") + + async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMToolResult | None: ten_env.log_info(f"run_tool {name} {args}") if name == "image_generate": @@ -62,8 +86,9 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool # call OpenAIImageGenerateClient to generate images response_url = await self.client.generate_images(prompt) ten_env.log_info(f"Generated image: {response_url}") + await self.send_image(ten_env, response_url) result = LLMToolResultNormal( type="normal", - content={"data":{"image_url": response_url}, "type": "image_url"}, + content=json.dumps({"success": True}), ) return result diff --git a/agents/ten_packages/extension/openai_image_generate_tool/manifest.json b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json index 6f5c5342..b04f49be 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/manifest.json +++ b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json @@ -102,6 +102,16 @@ } } } + ], + "data_out": [ + { + "name": "raw_text_data", + "property": { + "text": { + "type": "string" + } + } + } ] } } \ No newline at end of file diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py index 44af592a..b90d0914 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py @@ -117,22 +117,6 @@ async def flush_input_items(self, async_ten_env: AsyncTenEnv): async_ten_env.log_info("Cancelling the current task during flush.") self.current_task.cancel() - def send_raw_text_output( - self, async_ten_env: AsyncTenEnv, sentence: str, end_of_segment: bool - ): - try: - output_data = Data.create(RAW_DATA_OUT_NAME) - output_data.set_property_string(DATA_OUT_PROPERTY_TEXT, sentence) - output_data.set_property_bool( - DATA_OUT_PROPERTY_END_OF_SEGMENT, end_of_segment - ) - asyncio.create_task(async_ten_env.send_data(output_data)) - async_ten_env.log_info( - f"{'end of segment ' if end_of_segment else ''}sent raw sentence [{sentence}]" - ) - except Exception as err: - async_ten_env.log_warn(f"send sentence [{sentence}] failed, err: {err}") - def send_text_output( self, async_ten_env: AsyncTenEnv, sentence: str, end_of_segment: bool ): diff --git a/playground/src/manager/rtc/rtc.ts b/playground/src/manager/rtc/rtc.ts index 65242322..ed6dcd9f 100644 --- a/playground/src/manager/rtc/rtc.ts +++ b/playground/src/manager/rtc/rtc.ts @@ -233,6 +233,7 @@ export class RtcManager extends AGEventEmitter { const { stream_id, is_final, text, text_ts, data_type } = JSON.parse( atob(completeMessage) ); + console.log(`[test] message_id: ${message_id} stream_id: ${stream_id}, text: ${text}, data_type: ${data_type}`); const isAgent = Number(stream_id) != Number(this.userId) let textItem: IChatItem = { type: isAgent ? EMessageType.AGENT : EMessageType.USER,