feat: Implement WebSocket support (#156)

- Add WebSocket support for audio speech, transcriptions, and chat interactions - Introduce asynchronous WebSocket clients for real-time communication - Expand event handling for various WebSocket events - Enhance error handling and logging for WebSocket connections
coze-dev · Jan 10, 2025 · da63a6d · da63a6d
1 parent a3042be
commit da63a6d
Show file tree

Hide file tree

Showing 22 changed files with 2,560 additions and 63 deletions.
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@ dist/
 .env_private
 scripts/
 .cache/
+output.wav
diff --git a/cozepy/__init__.py b/cozepy/__init__.py
@@ -91,6 +91,52 @@
 from .request import AsyncHTTPClient, SyncHTTPClient
 from .templates import TemplateDuplicateResp, TemplateEntityType
 from .version import VERSION
+from .websockets.audio.speech import (
+    AsyncWebsocketsAudioSpeechClient,
+    AsyncWebsocketsAudioSpeechEventHandler,
+    InputTextBufferAppendEvent,
+    InputTextBufferCompletedEvent,
+    InputTextBufferCompleteEvent,
+    SpeechAudioCompletedEvent,
+    SpeechAudioUpdateEvent,
+    SpeechUpdateEvent,
+    WebsocketsAudioSpeechClient,
+    WebsocketsAudioSpeechEventHandler,
+)
+from .websockets.audio.transcriptions import (
+    AsyncWebsocketsAudioTranscriptionsClient,
+    AsyncWebsocketsAudioTranscriptionsEventHandler,
+    InputAudioBufferAppendEvent,
+    InputAudioBufferCompletedEvent,
+    InputAudioBufferCompleteEvent,
+    TranscriptionsMessageCompletedEvent,
+    TranscriptionsMessageUpdateEvent,
+    TranscriptionsUpdateEvent,
+    WebsocketsAudioTranscriptionsClient,
+    WebsocketsAudioTranscriptionsEventHandler,
+)
+from .websockets.chat import (
+    AsyncWebsocketsChatClient,
+    AsyncWebsocketsChatEventHandler,
+    ChatUpdateEvent,
+    ConversationAudioDeltaEvent,
+    ConversationChatCompletedEvent,
+    ConversationChatCreatedEvent,
+    ConversationChatRequiresActionEvent,
+    ConversationChatSubmitToolOutputsEvent,
+    ConversationMessageDeltaEvent,
+    WebsocketsChatClient,
+    WebsocketsChatEventHandler,
+)
+from .websockets.ws import (
+    InputAudio,
+    OpusConfig,
+    OutputAudio,
+    PCMConfig,
+    WebsocketsErrorEvent,
+    WebsocketsEvent,
+    WebsocketsEventType,
+)
 from .workflows.runs import (
     WorkflowEvent,
     WorkflowEventError,
@@ -180,6 +226,48 @@
     "DocumentSourceInfo",
     "DocumentUpdateRule",
     "DocumentBase",
+    # websockets.audio.speech
+    "InputTextBufferAppendEvent",
+    "InputTextBufferCompleteEvent",
+    "SpeechUpdateEvent",
+    "InputTextBufferCompletedEvent",
+    "SpeechAudioUpdateEvent",
+    "SpeechAudioCompletedEvent",
+    "WebsocketsAudioSpeechEventHandler",
+    "WebsocketsAudioSpeechClient",
+    "AsyncWebsocketsAudioSpeechEventHandler",
+    "AsyncWebsocketsAudioSpeechClient",
+    # websockets.audio.transcriptions
+    "InputAudioBufferAppendEvent",
+    "InputAudioBufferCompleteEvent",
+    "TranscriptionsUpdateEvent",
+    "InputAudioBufferCompletedEvent",
+    "TranscriptionsMessageUpdateEvent",
+    "TranscriptionsMessageCompletedEvent",
+    "WebsocketsAudioTranscriptionsEventHandler",
+    "WebsocketsAudioTranscriptionsClient",
+    "AsyncWebsocketsAudioTranscriptionsEventHandler",
+    "AsyncWebsocketsAudioTranscriptionsClient",
+    # websockets.chat
+    "ChatUpdateEvent",
+    "ConversationChatSubmitToolOutputsEvent",
+    "ConversationChatCreatedEvent",
+    "ConversationMessageDeltaEvent",
+    "ConversationChatRequiresActionEvent",
+    "ConversationAudioDeltaEvent",
+    "ConversationChatCompletedEvent",
+    "WebsocketsChatEventHandler",
+    "WebsocketsChatClient",
+    "AsyncWebsocketsChatEventHandler",
+    "AsyncWebsocketsChatClient",
+    # websockets
+    "WebsocketsEventType",
+    "WebsocketsEvent",
+    "WebsocketsErrorEvent",
+    "InputAudio",
+    "OpusConfig",
+    "PCMConfig",
+    "OutputAudio",
     # workflows.runs
     "WorkflowRunResult",
     "WorkflowEventType",

diff --git a/cozepy/audio/voices/__init__.py b/cozepy/audio/voices/__init__.py
@@ -210,11 +210,7 @@ async def clone(
         return await self._requester.arequest("post", url, False, Voice, headers=headers, body=body, files=files)
 
     async def list(
-        self,
-        *,
-        filter_system_voice: bool = False,
-        page_num: int = 1,
-        page_size: int = 100,
+        self, *, filter_system_voice: bool = False, page_num: int = 1, page_size: int = 100, **kwargs
     ) -> AsyncNumberPaged[Voice]:
         """
         Get available voices, including system voices + user cloned voices
@@ -227,6 +223,7 @@ async def list(
         :return: list of Voice
         """
         url = f"{self._base_url}/v1/audio/voices"
+        headers: Optional[dict] = kwargs.get("headers")
 
         def request_maker(i_page_num: int, i_page_size: int) -> HTTPRequest:
             return self._requester.make_request(
@@ -237,6 +234,7 @@ def request_maker(i_page_num: int, i_page_size: int) -> HTTPRequest:
                     "page_num": i_page_num,
                     "page_size": i_page_size,
                 },
+                headers=headers,
                 cast=_PrivateListVoiceData,
                 is_async=False,
                 stream=False,

diff --git a/cozepy/chat/__init__.py b/cozepy/chat/__init__.py
@@ -1,3 +1,4 @@
+import base64
 import json
 import time
 from enum import Enum
@@ -24,6 +25,8 @@ class MessageRole(str, Enum):
 
 
 class MessageType(str, Enum):
+    UNKNOWN = ""
+
     # User input content.
     # 用户输入内容。
     QUESTION = "question"
@@ -52,8 +55,6 @@ class MessageType(str, Enum):
     # 多 answer 场景下，服务端会返回一个 verbose 包，对应的 content 为 JSON 格式，content.msg_type =generate_answer_finish 代表全部 answer 回复完成。不支持在请求中作为入参。
     VERBOSE = "verbose"
 
-    UNKNOWN = ""
-
 
 class MessageContentType(str, Enum):
     # Text.
@@ -187,12 +188,19 @@ def build_assistant_answer(content: str, meta_data: Optional[Dict[str, str]] = N
             meta_data=meta_data,
         )
 
+    def get_audio(self) -> Optional[bytes]:
+        if self.content_type == MessageContentType.AUDIO:
+            return base64.b64decode(self.content)
+        return None
+
 
 class ChatStatus(str, Enum):
     """
     The running status of the session
     """
 
+    UNKNOWN = ""
+
     # The session has been created.
     CREATED = "created"
 
@@ -214,9 +222,9 @@ class ChatStatus(str, Enum):
 
 class ChatError(CozeModel):
     # The error code. An integer type. 0 indicates success, other values indicate failure.
-    code: int
+    code: int = 0
     # The error message. A string type.
-    msg: str
+    msg: str = ""
 
 
 class ChatRequiredActionType(str, Enum):
@@ -266,13 +274,13 @@ class ChatRequiredAction(CozeModel):
 class ChatUsage(CozeModel):
     # The total number of Tokens consumed in this chat, including the consumption for both the input
     # and output parts.
-    token_count: int
+    token_count: int = 0
 
     # The total number of Tokens consumed for the output part.
-    output_count: int
+    output_count: int = 0
 
     # The total number of Tokens consumed for the input part.
-    input_count: int
+    input_count: int = 0
 
 
 class Chat(CozeModel):
@@ -301,7 +309,7 @@ class Chat(CozeModel):
     # completed: The Bot has finished processing, and the session has ended.
     # failed: The session has failed.
     # requires_action: The session is interrupted and requires further processing.
-    status: ChatStatus
+    status: ChatStatus = ChatStatus.UNKNOWN
 
     # Details of the information needed for execution.
     required_action: Optional[ChatRequiredAction] = None

diff --git a/cozepy/coze.py b/cozepy/coze.py
@@ -15,6 +15,7 @@
     from .files import AsyncFilesClient, FilesClient
     from .knowledge import AsyncKnowledgeClient, KnowledgeClient  # deprecated
     from .templates import AsyncTemplatesClient, TemplatesClient
+    from .websockets import AsyncWebsocketsClient
     from .workflows import AsyncWorkflowsClient, WorkflowsClient
     from .workspaces import AsyncWorkspacesClient, WorkspacesClient
 
@@ -151,6 +152,7 @@ def __init__(
         self._workspaces: Optional[AsyncWorkspacesClient] = None
         self._audio: Optional[AsyncAudioClient] = None
         self._templates: Optional[AsyncTemplatesClient] = None
+        self._websockets: Optional[AsyncWebsocketsClient] = None
 
     @property
     def bots(self) -> "AsyncBotsClient":
@@ -237,3 +239,11 @@ def templates(self) -> "AsyncTemplatesClient":
 
             self._templates = AsyncTemplatesClient(self._base_url, self._auth, self._requester)
         return self._templates
+
+    @property
+    def websockets(self) -> "AsyncWebsocketsClient":
+        if not self._websockets:
+            from .websockets import AsyncWebsocketsClient
+
+            self._websockets = AsyncWebsocketsClient(self._base_url, self._auth, self._requester)
+        return self._websockets
diff --git a/cozepy/util.py b/cozepy/util.py
@@ -44,6 +44,18 @@ def remove_url_trailing_slash(base_url: str) -> str:
     return base_url
 
 
+def http_base_url_to_ws(base_url: str) -> str:
+    if not base_url:
+        raise ValueError("base_url cannot be empty")
+    if not base_url.startswith("https://"):
+        raise ValueError("base_url must start with 'https://'")
+    base_url = base_url.replace("https://", "wss://")
+
+    if "api-" in base_url:
+        return base_url.replace("api-", "ws-")
+    return base_url.replace("api.", "ws.")
+
+
 def remove_none_values(d: dict) -> dict:
     return {k: v for k, v in d.items() if v is not None}
 
@@ -55,7 +67,7 @@ def write_pcm_to_wav_file(
     Save PCM binary data to WAV file
 
     :param pcm_data: PCM binary data (24kHz, 16-bit, 1 channel, little-endian)
-    :param output_filename: Output WAV filename
+    :param filepath: Output WAV filename
     """
 
     with wave.open(filepath, "wb") as wav_file:

diff --git a/cozepy/websockets/__init__.py b/cozepy/websockets/__init__.py
@@ -0,0 +1,52 @@
+from cozepy import Auth
+from cozepy.request import Requester
+from cozepy.util import http_base_url_to_ws, remove_url_trailing_slash
+
+from .audio import AsyncWebsocketsAudioClient, WebsocketsAudioClient
+from .chat import AsyncWebsocketsChatBuildClient, WebsocketsChatBuildClient
+
+
+class WebsocketsClient(object):
+    def __init__(self, base_url: str, auth: Auth, requester: Requester):
+        self._base_url = http_base_url_to_ws(remove_url_trailing_slash(base_url))
+        self._auth = auth
+        self._requester = requester
+
+    @property
+    def audio(self) -> WebsocketsAudioClient:
+        return WebsocketsAudioClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
+
+    @property
+    def chat(self) -> WebsocketsChatBuildClient:
+        return WebsocketsChatBuildClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
+
+
+class AsyncWebsocketsClient(object):
+    def __init__(self, base_url: str, auth: Auth, requester: Requester):
+        self._base_url = http_base_url_to_ws(remove_url_trailing_slash(base_url))
+        self._auth = auth
+        self._requester = requester
+
+    @property
+    def audio(self) -> AsyncWebsocketsAudioClient:
+        return AsyncWebsocketsAudioClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
+
+    @property
+    def chat(self) -> AsyncWebsocketsChatBuildClient:
+        return AsyncWebsocketsChatBuildClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
diff --git a/cozepy/websockets/audio/__init__.py b/cozepy/websockets/audio/__init__.py
@@ -0,0 +1,51 @@
+from cozepy.auth import Auth
+from cozepy.request import Requester
+
+from .speech import AsyncWebsocketsAudioSpeechBuildClient, WebsocketsAudioSpeechBuildClient
+from .transcriptions import AsyncWebsocketsAudioTranscriptionsBuildClient, WebsocketsAudioTranscriptionsBuildClient
+
+
+class WebsocketsAudioClient(object):
+    def __init__(self, base_url: str, auth: Auth, requester: Requester):
+        self._base_url = base_url
+        self._auth = auth
+        self._requester = requester
+
+    @property
+    def transcriptions(self) -> "WebsocketsAudioTranscriptionsBuildClient":
+        return WebsocketsAudioTranscriptionsBuildClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
+
+    @property
+    def speech(self) -> "WebsocketsAudioSpeechBuildClient":
+        return WebsocketsAudioSpeechBuildClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
+
+
+class AsyncWebsocketsAudioClient(object):
+    def __init__(self, base_url: str, auth: Auth, requester: Requester):
+        self._base_url = base_url
+        self._auth = auth
+        self._requester = requester
+
+    @property
+    def transcriptions(self) -> "AsyncWebsocketsAudioTranscriptionsBuildClient":
+        return AsyncWebsocketsAudioTranscriptionsBuildClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )
+
+    @property
+    def speech(self) -> "AsyncWebsocketsAudioSpeechBuildClient":
+        return AsyncWebsocketsAudioSpeechBuildClient(
+            base_url=self._base_url,
+            auth=self._auth,
+            requester=self._requester,
+        )