diff --git a/codex-cli/package.json b/codex-cli/package.json
index 369e0d95f..b0be18179 100644
--- a/codex-cli/package.json
+++ b/codex-cli/package.json
@@ -30,6 +30,7 @@
   ],
   "dependencies": {
     "@inkjs/ui": "^2.0.0",
+    "@picovoice/pvrecorder-node": "^1.2.8",
     "chalk": "^5.2.0",
     "diff": "^7.0.0",
     "dotenv": "^16.1.4",
diff --git a/codex-cli/src/components/chat/terminal-chat-input.tsx b/codex-cli/src/components/chat/terminal-chat-input.tsx
index 88a89039d..f1e638bb1 100644
--- a/codex-cli/src/components/chat/terminal-chat-input.tsx
+++ b/codex-cli/src/components/chat/terminal-chat-input.tsx
@@ -1,6 +1,7 @@
 import type { MultilineTextEditorHandle } from "./multiline-editor";
 import type { ReviewDecision } from "../../utils/agent/review.js";
 import type { HistoryEntry } from "../../utils/storage/command-history.js";
+import type { TranscriptionEvent } from "../../utils/transcriber.js";
 import type {
   ResponseInputItem,
   ResponseItem,
@@ -20,6 +21,7 @@ import {
   addToHistory,
 } from "../../utils/storage/command-history.js";
 import { clearTerminal, onExit } from "../../utils/terminal.js";
+import { RealtimeTranscriber } from "../../utils/transcriber.js";
 import { Box, Text, useApp, useInput, useStdin } from "ink";
 import { fileURLToPath } from "node:url";
 import React, {
@@ -101,6 +103,11 @@ export default function TerminalChatInput({
   // Track the caret row across keystrokes
   const prevCursorRow = useRef<number | null>(null);
   const prevCursorWasAtLastRow = useRef(false);
+  // Track recording state
+  const [isRecording, setIsRecording] = useState(false);
+  // Track if recording should resume after thinking
+  const [shouldResumeRecording, setShouldResumeRecording] = useState(false);
+  const transcriber = useRef<RealtimeTranscriber | null>(null);

   // Load command history on component mount
   useEffect(() => {
@@ -111,6 +118,25 @@
     loadHistory();
   }, []);

+  // Handle pausing/resuming recording when the loading state changes
+  useEffect(() => {
+    if (loading && isRecording) {
+      // Pause recording while the agent is thinking
+      setIsRecording(false);
+      setShouldResumeRecording(true);
+
+      if (transcriber.current) {
+        transcriber.current.cleanup();
+        transcriber.current = null;
+      }
+    } else if (!loading && shouldResumeRecording) {
+      // Resume recording when thinking is done
+      setIsRecording(true);
+      setShouldResumeRecording(false);
+    }
+  }, [loading, isRecording, shouldResumeRecording, setItems]);
+
   // Reset slash suggestion index when input prefix changes
   useEffect(() => {
     if (input.trim().startsWith("/")) {
@@ -118,8 +144,125 @@
     }
   }, [input]);

+  // Start/stop speech transcription when isRecording changes
+  useEffect(() => {
+    if (isRecording) {
+      // Initialize and start transcriber
+      const startTranscription = async () => {
+        try {
+          transcriber.current = new RealtimeTranscriber();
+
+          transcriber.current.on(
+            "transcription",
+            (event: TranscriptionEvent) => {
+              if (event.type === "transcription.delta" && event.delta) {
+                // Transcription doesn't start until the VAD detects that the speech has finished
+                setInput((prev) => prev + event.delta);
+                // Force re-render of the editor with the latest text
+                setEditorKey((k) => k + 1);
+                setTimeout(() => {
+                  editorRef.current?.moveCursorToEnd?.();
+                }, 0);
+              } else if (
+                event.type === "transcription.done" &&
+                event.transcript
+              ) {
+                // Occurs when the speech has finished being transcribed
+                setInput((prev) => prev + "\n"); // new line to indicate that this utterance is fully transcribed
+                setEditorKey((k) => k + 1);
+                setTimeout(() => {
+                  editorRef.current?.moveCursorToEnd?.();
+                }, 0);
+              }
+            },
+          );
+
+          transcriber.current.on("error", (error) => {
+            setIsRecording(false);
+            setShouldResumeRecording(false);
+            setItems((prev) => [
+              ...prev,
+              {
+                id: `speak-transcription-error-${Date.now()}`,
+                type: "message",
+                role: "system",
+                content: [
+                  {
+                    type: "input_text",
+                    text: `Transcription ${error}`,
+                  },
+                ],
+              },
+            ]);
+
+            // Clean up the transcriber
+            if (transcriber.current) {
+              transcriber.current.cleanup();
+              transcriber.current = null;
+            }
+          });
+
+          await transcriber.current.start();
+        } catch (error) {
+          setIsRecording(false);
+          setShouldResumeRecording(false);
+          setItems((prev) => [
+            ...prev,
+            {
+              id: `speak-start-error-${Date.now()}`,
+              type: "message",
+              role: "system",
+              content: [
+                {
+                  type: "input_text",
+                  text: `Error starting transcription: ${error}`,
+                },
+              ],
+            },
+          ]);
+        }
+      };
+
+      startTranscription();
+    } else if (transcriber.current) {
+      // Clean up transcriber when recording stops
+      transcriber.current.cleanup();
+      transcriber.current = null;
+
+      // Input will be submitted by the user manually after reviewing
+    }
+
+    return () => {
+      if (transcriber.current) {
+        transcriber.current.cleanup();
+        transcriber.current = null;
+      }
+    };
+  }, [isRecording, setItems, setEditorKey]);
+
   useInput(
-    (_input, _key) => {
+    (char_input, key) => {
+      // Stop recording if any key except Enter is pressed while recording
+      if (isRecording && !key.return) {
+        setIsRecording(false);
+        setShouldResumeRecording(false);
+        setItems((prev) => [
+          ...prev,
+          {
+            id: `speak-stop-${Date.now()}`,
+            type: "message",
+            role: "system",
+            content: [
+              {
+                type: "input_text",
+                text: "Recording stopped.",
+              },
+            ],
+          },
+        ]);
+        return;
+      }
+
       // Slash command navigation: up/down to select, enter to fill
       if (!confirmationPrompt && !loading && input.trim().startsWith("/")) {
         const prefix = input.trim();
@@ -127,11 +270,11 @@ export default function TerminalChatInput({
           cmd.command.startsWith(prefix),
         );
         if (matches.length > 0) {
-          if (_key.tab) {
+          if (key.tab) {
             // Cycle and fill slash command suggestions on Tab
             const len = matches.length;
             // Determine new index based on shift state
-            const nextIdx = _key.shift
+            const nextIdx = key.shift
               ? selectedSlashSuggestion <= 0
                 ? len - 1
                 : selectedSlashSuggestion - 1
@@ -149,19 +292,19 @@
             setDraftInput(cmd);
             return;
           }
-          if (_key.upArrow) {
+          if (key.upArrow) {
             setSelectedSlashSuggestion((prev) =>
               prev <= 0 ? matches.length - 1 : prev - 1,
             );
             return;
           }
-          if (_key.downArrow) {
+          if (key.downArrow) {
             setSelectedSlashSuggestion((prev) =>
               prev < 0 || prev >= matches.length - 1 ? 0 : prev + 1,
             );
             return;
           }
-          if (_key.return) {
+          if (key.return) {
             // Execute the currently selected slash command
             const selIdx = selectedSlashSuggestion;
             const cmdObj = matches[selIdx];
@@ -189,6 +332,9 @@
               case "/diff":
                 openDiffOverlay();
                 break;
+              case "/speak":
+                onSubmit(cmd);
+                break;
               case "/bug":
                 onSubmit(cmd);
                 break;
@@ -208,21 +354,21 @@
       }
       if (!confirmationPrompt && !loading) {
         if (fsSuggestions.length > 0) {
-          if (_key.upArrow) {
+          if (key.upArrow) {
             setSelectedCompletion((prev) =>
               prev <= 0 ? fsSuggestions.length - 1 : prev - 1,
             );
             return;
           }
-          if (_key.downArrow) {
+          if (key.downArrow) {
             setSelectedCompletion((prev) =>
               prev >= fsSuggestions.length - 1 ? 0 : prev + 1,
             );
             return;
           }
-          if (_key.tab && selectedCompletion >= 0) {
+          if (key.tab && selectedCompletion >= 0) {
             const words = input.trim().split(/\s+/);
             const selected = fsSuggestions[selectedCompletion];
@@ -245,7 +391,7 @@
         }
       }

-      if (_key.upArrow) {
+      if (key.upArrow) {
         let moveThroughHistory = true;

        // Only use history when the caret was *already* on the very first
@@ -284,7 +430,7 @@
         // Otherwise let it propagate.
       }

-      if (_key.downArrow) {
+      if (key.downArrow) {
         // Only move forward in history when we're already *in* history mode
         // AND the caret sits on the last line of the buffer.
         const wasAtLastRow =
@@ -307,7 +453,7 @@
         // Otherwise let it propagate
       }

-      if (_key.tab) {
+      if (key.tab) {
         const words = input.split(/\s+/);
         const mostRecentWord = words[words.length - 1];
         if (mostRecentWord === undefined || mostRecentWord === "") {
@@ -343,11 +489,11 @@
       }, 1);

       if (input.trim() === "" && isNew) {
-        if (_key.tab) {
+        if (key.tab) {
           setSelectedSuggestion(
-            (s) => (s + (_key.shift ? -1 : 1)) % (suggestions.length + 1),
+            (s) => (s + (key.shift ? -1 : 1)) % (suggestions.length + 1),
           );
-        } else if (selectedSuggestion && _key.return) {
+        } else if (selectedSuggestion && key.return) {
           const suggestion = suggestions[selectedSuggestion - 1] || "";
           setInput("");
           setSelectedSuggestion(0);
@@ -359,7 +505,7 @@
             },
           ]);
         }
-      } else if (_input === "\u0003" || (_input === "c" && _key.ctrl)) {
+      } else if (char_input === "\u0003" || (char_input === "c" && key.ctrl)) {
         setTimeout(() => {
           app.exit();
           onExit();
@@ -538,6 +684,26 @@
         ]);
       }

+      return;
+    } else if (inputValue.startsWith("/speak")) {
+      // Handle /speak command
+      setIsRecording(true);
+      setShouldResumeRecording(false);
+      setInput("");
+      setItems((prev) => [
+        ...prev,
+        {
+          id: `speak-start-${Date.now()}`,
+          type: "message",
+          role: "system",
+          content: [
+            {
+              type: "input_text",
+              text: "Recording started. Press any key (except enter) to stop recording.",
+            },
+          ],
+        },
+      ]);
       return;
     } else if (inputValue.startsWith("/")) {
       // Handle invalid/unrecognized commands. Only single-word inputs starting with '/'
@@ -665,6 +831,11 @@
           />
         ) : (
           <Box flexDirection="column">
+            {isRecording && (
+              <Box paddingLeft={1}>
+                <Text color="red">●</Text>
+              </Box>
+            )}
             <MultilineTextEditor
               ref={editorRef}
               onChange={(txt: string) => {
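Note: the pause/resume wiring above reduces to a small state machine over `loading`, `isRecording`, and `shouldResumeRecording`. A minimal TypeScript sketch of the same transitions outside React (names mirror the component state; this is illustrative only, not part of the patch):

// Illustrative reduction of the recording state machine in TerminalChatInput.
type RecState = { isRecording: boolean; shouldResume: boolean };

function nextState(loading: boolean, s: RecState): RecState {
  if (loading && s.isRecording) {
    // The agent started thinking: pause capture and remember to resume.
    return { isRecording: false, shouldResume: true };
  }
  if (!loading && s.shouldResume) {
    // The agent finished: resume capture.
    return { isRecording: true, shouldResume: false };
  }
  return s; // No transition.
}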
diff --git a/codex-cli/src/components/help-overlay.tsx b/codex-cli/src/components/help-overlay.tsx
index d302f7551..f36dd385a 100644
--- a/codex-cli/src/components/help-overlay.tsx
+++ b/codex-cli/src/components/help-overlay.tsx
@@ -62,6 +62,9 @@ export default function HelpOverlay({
         <Text>
           <Text color="cyan">/compact</Text> – condense context into a summary
         </Text>
+        <Text>
+          <Text color="cyan">/speak</Text> – activate speech-to-text input mode
+        </Text>
       </Box>

       <Box flexDirection="column" marginTop={1}>
diff --git a/codex-cli/src/utils/config.ts b/codex-cli/src/utils/config.ts
index 1bf053032..137a6d1dd 100644
--- a/codex-cli/src/utils/config.ts
+++ b/codex-cli/src/utils/config.ts
@@ -7,6 +7,7 @@
 // compiled `dist/` output used by the published CLI.

 import type { FullAutoErrorMode } from "./auto-approval-mode.js";
+import type { TranscriptionSessionCreateParams } from "openai/resources/beta/realtime/transcription-sessions.mjs";
 import { AutoApprovalMode } from "./auto-approval-mode.js";
 import { log } from "./logger/log.js";
@@ -136,6 +137,7 @@ export type StoredConfig = {
     saveHistory?: boolean;
     sensitivePatterns?: Array<string>;
   };
+  transcription?: TranscriptionConfig;
 };

 // Minimal config written on first run. An *empty* model string ensures that
@@ -150,6 +152,13 @@ export type MemoryConfig = {
   enabled: boolean;
 };

+/** Settings for speech-to-text transcription, taken directly from the OpenAI Realtime API. */
+export type TranscriptionConfig = {
+  input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
+  turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
+  input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
+};
+
 // Represents full runtime config, including loaded instructions.
 export type AppConfig = {
   apiKey?: string;
@@ -159,6 +168,7 @@ export type AppConfig = {
   approvalMode?: AutoApprovalMode;
   fullAutoErrorMode?: FullAutoErrorMode;
   memory?: MemoryConfig;
+  transcription?: TranscriptionConfig;

   /** Whether to enable desktop notifications for responses */
   notify?: boolean;
@@ -440,6 +450,11 @@ export const loadConfig = (
   // Merge default providers with user configured providers in the config.
   config.providers = { ...providers, ...storedConfig.providers };

+  // Load transcription config if it exists
+  if (storedConfig.transcription !== undefined) {
+    config.transcription = storedConfig.transcription;
+  }
+
   return config;
 };
@@ -474,6 +489,7 @@ export const saveConfig = (
     provider: config.provider,
     providers: config.providers,
     approvalMode: config.approvalMode,
+    transcription: config.transcription,
   };

   // Add history settings if they exist
diff --git a/codex-cli/src/utils/slash-commands.ts b/codex-cli/src/utils/slash-commands.ts
index 4ccc3a9fc..046fe504e 100644
--- a/codex-cli/src/utils/slash-commands.ts
+++ b/codex-cli/src/utils/slash-commands.ts
@@ -32,4 +32,8 @@ export const SLASH_COMMANDS: Array<SlashCommand> = [
     description:
       "Show git diff of the working directory (or applied patches if not in git)",
   },
+  {
+    command: "/speak",
+    description: "Activate speech-to-text input mode",
+  },
 ];
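Note: with the `TranscriptionConfig` plumbing above, a user-supplied `transcription` block flows from the stored config straight into the realtime session. A minimal sketch of a `StoredConfig` value exercising every field (all values here are illustrative, not part of the patch; the actual defaults live in `RealtimeTranscriber`'s constructor below):

// Hypothetical example of a stored config fragment.
import type { StoredConfig } from "./config";

const stored: StoredConfig = {
  transcription: {
    input_audio_transcription: {
      model: "gpt-4o-transcribe", // or "whisper-1"
      language: "en",
    },
    turn_detection: {
      type: "server_vad", // server-side voice activity detection
      threshold: 0.6,
      prefix_padding_ms: 400,
      silence_duration_ms: 500,
    },
    input_audio_noise_reduction: {
      type: "near_field", // or "far_field"
    },
  },
};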
diff --git a/codex-cli/src/utils/transcriber.ts b/codex-cli/src/utils/transcriber.ts
new file mode 100644
index 000000000..cce33f2c1
--- /dev/null
+++ b/codex-cli/src/utils/transcriber.ts
@@ -0,0 +1,249 @@
+import type { TranscriptionConfig } from "./config";
+
+import { getApiKey, getBaseUrl, loadConfig } from "./config";
+import { CLI_VERSION, ORIGIN, getSessionId } from "./session";
+import { EventEmitter } from "events";
+import { createRequire } from "node:module";
+import OpenAI from "openai";
+// workaround since pvrecorder-node is a commonjs module
+import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
+
+const require = createRequire(import.meta.url);
+const { PvRecorder } = require("@picovoice/pvrecorder-node");
+
+export interface TranscriptionEvent {
+  type: string;
+  delta?: string;
+  transcript?: string;
+}
+
+export class RealtimeTranscriber extends EventEmitter {
+  private rt: OpenAIRealtimeWS | null = null;
+  private recorder: typeof PvRecorder | null = null;
+  private isConnected = false;
+  private isRecording = false;
+  private transcriptionConfig: TranscriptionConfig;
+
+  constructor() {
+    super();
+    // Load config and use it for defaults
+    const config = loadConfig();
+
+    // Load values from config with sensible defaults
+    this.transcriptionConfig = {
+      input_audio_transcription: config.transcription
+        ?.input_audio_transcription || {
+        model: "gpt-4o-transcribe",
+        prompt: "",
+        language: "en",
+      },
+      turn_detection: config.transcription?.turn_detection || {
+        type: "server_vad",
+        threshold: 0.6,
+        prefix_padding_ms: 400,
+        silence_duration_ms: 500,
+      },
+      input_audio_noise_reduction: config.transcription
+        ?.input_audio_noise_reduction || {
+        type: "near_field",
+      },
+    };
+
+    this.setupSignalHandlers();
+  }
+
+  private setupSignalHandlers() {
+    process.on("SIGINT", () => this.cleanup());
+    process.on("SIGTERM", () => this.cleanup());
+  }
+
+  public async start(): Promise<void> {
+    try {
+      // Check API key
+      const apiKey = getApiKey("openai");
+      if (!apiKey) {
+        throw new Error("OPENAI_API_KEY not found in environment variables");
+      }
+
+      // Initialize OpenAI client
+      const client = new OpenAI({
+        apiKey: apiKey,
+        baseURL: getBaseUrl("openai"),
+        defaultHeaders: {
+          originator: ORIGIN,
+          version: CLI_VERSION,
+          session_id: getSessionId() || "",
+        },
+      });
+
+      const model =
+        this.transcriptionConfig.input_audio_transcription?.model ||
+        "gpt-4o-transcribe";
+
+      // Initialize the realtime client
+      this.rt = new OpenAIRealtimeWS({ model }, client);
+
+      // Set up event handlers
+      this.rt.on("error", (error) => {
+        this.emit("error", error);
+      });
+
+      this.rt.on(
+        "conversation.item.input_audio_transcription.delta",
+        (event) => {
+          this.emit("transcription", {
+            type: "transcription.delta",
+            delta: event.delta,
+          });
+        },
+      );
+
+      this.rt.on(
+        "conversation.item.input_audio_transcription.completed",
+        (event) => {
+          this.emit("transcription", {
+            type: "transcription.done",
+            transcript: event.transcript,
+          });
+        },
+      );
+
+      // Set up WebSocket connection
+      this.rt.socket.on("open", () => {
+        this.isConnected = true;
+        this.emit("connected");
+
+        // Configure the session
+        this.rt?.send({
+          type: "session.update",
+          session: {
+            input_audio_format: "pcm16",
+            input_audio_transcription:
+              this.transcriptionConfig.input_audio_transcription,
+            turn_detection: this.transcriptionConfig.turn_detection,
+            input_audio_noise_reduction:
+              this.transcriptionConfig.input_audio_noise_reduction,
+          },
+        });
+
+        // Start audio capture once WebSocket is connected
+        this.startAudioCapture();
+      });
+
+      this.rt.socket.on("close", (code: number, reason: string) => {
+        if (code !== 1000) {
+          // 1000 is a normal close
+          this.emit(
+            "error",
+            new Error(`WebSocket closed: code=${code}, reason=${reason}`),
+          );
+        }
+        this.isConnected = false;
+        this.emit("disconnected");
+      });
+
+      this.rt.socket.on("error", (error: Error) => {
+        this.emit("error", error);
+      });
+    } catch (error) {
+      this.emit("error", error);
+      this.cleanup();
+      throw error;
+    }
+  }
+
+  private startAudioCapture() {
+    try {
+      // Get available audio devices
+      const devices = PvRecorder.getAvailableDevices();
+      if (devices.length === 0) {
+        throw new Error("No audio input device found");
+      }
+
+      // Create recorder with the first available device
+      const frameLength = 512;
+      this.recorder = new PvRecorder(frameLength, 0);
+
+      // Start recording
+      this.recorder.start();
+      this.isRecording = true;
+
+      // Process audio frames
+      this.processAudioFrames();
+    } catch (error) {
+      this.emit("error", error);
+      this.stopAudioCapture();
+      throw error;
+    }
+  }
+
+  private async processAudioFrames() {
+    if (!this.recorder || !this.isRecording) {
+      return;
+    }
+
+    try {
+      while (this.isRecording && this.isConnected) {
+        try {
+          // We need to await each audio frame sequentially in the loop
+          // to maintain proper audio stream ordering
+          // eslint-disable-next-line no-await-in-loop
+          const frame = await this.recorder.read();
+
+          // Convert Int16Array to Buffer and send to OpenAI
+          if (this.rt && this.isConnected) {
+            const buffer = Buffer.from(frame.buffer);
+            this.rt.send({
+              type: "input_audio_buffer.append",
+              audio: buffer.toString("base64"),
+            });
+          }
+        } catch (error) {
+          // Silently break out if it's an InvalidState error (happens when stopping)
+          if (
+            error instanceof Error &&
+            (error.constructor.name === "PvRecorderStatusInvalidStateError" ||
+              // require import doesn't preserve the class prototype
+              error.message?.includes("failed to read audio data frame"))
+          ) {
+            break;
+          }
+          // Re-throw other errors
+          throw error;
+        }
+      }
+    } catch (error) {
+      this.emit("error", error);
+    } finally {
+      this.stopAudioCapture();
+    }
+  }
+
+  private stopAudioCapture() {
+    if (this.recorder) {
+      try {
+        this.recorder.stop();
+        this.recorder.release();
+      } catch (error) {
+        this.emit("error", error);
+      }
+      this.recorder = null;
+      this.isRecording = false;
+    }
+  }
+
+  public cleanup(): void {
+    this.stopAudioCapture();
+
+    if (this.rt) {
+      try {
+        this.rt.close();
+      } catch (error) {
+        this.emit("error", error);
+      }
+      this.rt = null;
+    }
+
+    this.isConnected = false;
+  }
+}
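Note: a minimal usage sketch of `RealtimeTranscriber` outside the Ink component, mirroring the event wiring the chat input performs (the surrounding script is assumed; only `start`, `cleanup`, and the `transcription`/`error` events come from the class above):

// Hypothetical standalone driver for RealtimeTranscriber; not part of the patch.
import { RealtimeTranscriber } from "./utils/transcriber.js";
import type { TranscriptionEvent } from "./utils/transcriber.js";

async function main(): Promise<void> {
  const transcriber = new RealtimeTranscriber();

  transcriber.on("transcription", (event: TranscriptionEvent) => {
    if (event.type === "transcription.delta" && event.delta) {
      // Partial text arrives once the server-side VAD closes a speech segment.
      process.stdout.write(event.delta);
    } else if (event.type === "transcription.done") {
      // One utterance is fully transcribed.
      process.stdout.write("\n");
    }
  });

  transcriber.on("error", (error) => {
    console.error("Transcription error:", error);
    transcriber.cleanup();
  });

  // Opens the realtime WebSocket, sends session.update, then starts mic capture.
  await transcriber.start();
}

main();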
diff --git a/codex-cli/tests/config.test.tsx b/codex-cli/tests/config.test.tsx
index 831208a12..7a0a3b89d 100644
--- a/codex-cli/tests/config.test.tsx
+++ b/codex-cli/tests/config.test.tsx
@@ -275,3 +275,55 @@ test("handles empty user instructions when saving with project doc separator", () => {
   });
   expect(loadedConfig.instructions).toBe("");
 });
+
+test("loads and saves transcription config correctly", () => {
+  const customTranscriptionConfig = {
+    // Purposely not using the default values
+    input_audio_transcription: {
+      model: "whisper-1" as const,
+      language: "zh",
+    },
+    turn_detection: {
+      type: "server_vad" as const,
+      threshold: 0.9,
+      silence_duration_ms: 500,
+    },
+    input_audio_noise_reduction: {
+      type: "far_field" as const,
+    },
+  };
+
+  const testConfig = {
+    model: "test-model",
+    instructions: "test instructions",
+    notify: false,
+    transcription: customTranscriptionConfig,
+  };
+
+  // Test saving the config
+  saveConfig(testConfig, testConfigPath, testInstructionsPath);
+
+  // Test loading the config
+  const loadedConfig = loadConfig(testConfigPath, testInstructionsPath, {
+    disableProjectDoc: true,
+  });
+
+  // Test that the transcription config was loaded correctly
+  expect(loadedConfig.transcription).toEqual(customTranscriptionConfig);
+  expect(loadedConfig.transcription?.input_audio_transcription?.model).toEqual(
+    "whisper-1",
+  );
+  expect(
+    loadedConfig.transcription?.input_audio_transcription?.language,
+  ).toEqual("zh");
+  expect(loadedConfig.transcription?.turn_detection?.type).toEqual(
+    "server_vad",
+  );
+  expect(loadedConfig.transcription?.turn_detection?.threshold).toEqual(0.9);
+  expect(
+    loadedConfig.transcription?.turn_detection?.silence_duration_ms,
+  ).toEqual(500);
+  expect(
+    loadedConfig.transcription?.input_audio_noise_reduction?.type,
+  ).toEqual("far_field");
+});
"/approval", + "/diff", + "/speak", + ]), ); const emptyPrefixFiltered = SLASH_COMMANDS.filter((c: SlashCommand) => diff --git a/codex-cli/tests/terminal-chat-input-transcription.test.tsx b/codex-cli/tests/terminal-chat-input-transcription.test.tsx new file mode 100644 index 000000000..6e48f25b8 --- /dev/null +++ b/codex-cli/tests/terminal-chat-input-transcription.test.tsx @@ -0,0 +1,180 @@ +import React from "react"; +import type { ComponentProps } from "react"; +import { renderTui } from "./ui-test-helpers.js"; +import TerminalChatInput from "../src/components/chat/terminal-chat-input.js"; +import { describe, it, expect, vi } from "vitest"; + +// Helper that lets us type and then immediately flush ink's async timers +async function type( + stdin: NodeJS.WritableStream, + text: string, + flush: () => Promise, +) { + stdin.write(text); + await flush(); +} + +// Mock the createInputItem function to avoid filesystem operations +vi.mock("../src/utils/input-utils.js", () => ({ + createInputItem: vi.fn(async (text: string) => ({ + role: "user", + type: "message", + content: [{ type: "input_text", text }], + })), +})); + +const mocks = { + transcriber: null as any, +}; + +// Mock the RealtimeTranscriber to avoid real WebSocket connections +vi.mock("../src/utils/transcriber.js", () => ({ + RealtimeTranscriber: vi.fn().mockImplementation(() => { + const mock = { + on: vi.fn(), + start: vi.fn().mockResolvedValue(undefined), + cleanup: vi.fn(), + }; + mocks.transcriber = mock; + return mock; + }), +})); + +describe("TerminalChatInput transcription functionality", () => { + it("/speak command starts recording and shows the recording indicator", async () => { + const props: ComponentProps = { + isNew: false, + loading: false, + submitInput: () => {}, + confirmationPrompt: null, + explanation: undefined, + submitConfirmation: () => {}, + setLastResponseId: () => {}, + setItems: () => {}, + contextLeftPercent: 50, + openOverlay: () => {}, + openDiffOverlay: () => {}, + openModelOverlay: () => {}, + openApprovalOverlay: () => {}, + openHelpOverlay: () => {}, + onCompact: () => {}, + interruptAgent: () => {}, + active: true, + thinkingSeconds: 0, + }; + + const { stdin, flush, lastFrameStripped } = renderTui( + , + ); + // Wait for initial render to settle + await flush(); + + // Simulate the /speak command and press Enter to start recording + await type(stdin, "/speak", flush); + await type(stdin, "\r", flush); + // Allow UI to update after pressing Enter + await flush(); + + // Check that the recording indicator is shown (isRecording = true) + expect(lastFrameStripped()).toContain("●"); + // Allow transcription effect to run + await flush(); + // Verify RealtimeTranscriber.start was called on the instance + const transcriber = mocks.transcriber; + expect(transcriber.start).toHaveBeenCalled(); + }); + + it("pressing any key while recording stops the recording", async () => { + const props: ComponentProps = { + isNew: false, + loading: false, + submitInput: () => {}, + confirmationPrompt: null, + explanation: undefined, + submitConfirmation: () => {}, + setLastResponseId: () => {}, + setItems: () => {}, + contextLeftPercent: 50, + openOverlay: () => {}, + openDiffOverlay: () => {}, + openModelOverlay: () => {}, + openApprovalOverlay: () => {}, + openHelpOverlay: () => {}, + onCompact: () => {}, + interruptAgent: () => {}, + active: true, + thinkingSeconds: 0, + }; + + const { stdin, flush, lastFrameStripped } = renderTui( + , + ); + await flush(); + + // Simulate the /speak command and press Enter to 
diff --git a/codex-cli/tests/terminal-chat-input-transcription.test.tsx b/codex-cli/tests/terminal-chat-input-transcription.test.tsx
new file mode 100644
index 000000000..6e48f25b8
--- /dev/null
+++ b/codex-cli/tests/terminal-chat-input-transcription.test.tsx
@@ -0,0 +1,180 @@
+import React from "react";
+import type { ComponentProps } from "react";
+import { renderTui } from "./ui-test-helpers.js";
+import TerminalChatInput from "../src/components/chat/terminal-chat-input.js";
+import { describe, it, expect, vi } from "vitest";
+
+// Helper that lets us type and then immediately flush ink's async timers
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+) {
+  stdin.write(text);
+  await flush();
+}
+
+// Mock the createInputItem function to avoid filesystem operations
+vi.mock("../src/utils/input-utils.js", () => ({
+  createInputItem: vi.fn(async (text: string) => ({
+    role: "user",
+    type: "message",
+    content: [{ type: "input_text", text }],
+  })),
+}));
+
+const mocks = {
+  transcriber: null as any,
+};
+
+// Mock the RealtimeTranscriber to avoid real WebSocket connections
+vi.mock("../src/utils/transcriber.js", () => ({
+  RealtimeTranscriber: vi.fn().mockImplementation(() => {
+    const mock = {
+      on: vi.fn(),
+      start: vi.fn().mockResolvedValue(undefined),
+      cleanup: vi.fn(),
+    };
+    mocks.transcriber = mock;
+    return mock;
+  }),
+}));
+
+describe("TerminalChatInput transcription functionality", () => {
+  it("/speak command starts recording and shows the recording indicator", async () => {
+    const props: ComponentProps<typeof TerminalChatInput> = {
+      isNew: false,
+      loading: false,
+      submitInput: () => {},
+      confirmationPrompt: null,
+      explanation: undefined,
+      submitConfirmation: () => {},
+      setLastResponseId: () => {},
+      setItems: () => {},
+      contextLeftPercent: 50,
+      openOverlay: () => {},
+      openDiffOverlay: () => {},
+      openModelOverlay: () => {},
+      openApprovalOverlay: () => {},
+      openHelpOverlay: () => {},
+      onCompact: () => {},
+      interruptAgent: () => {},
+      active: true,
+      thinkingSeconds: 0,
+    };
+
+    const { stdin, flush, lastFrameStripped } = renderTui(
+      <TerminalChatInput {...props} />,
+    );
+    // Wait for initial render to settle
+    await flush();
+
+    // Simulate the /speak command and press Enter to start recording
+    await type(stdin, "/speak", flush);
+    await type(stdin, "\r", flush);
+    // Allow UI to update after pressing Enter
+    await flush();
+
+    // Check that the recording indicator is shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+    // Allow transcription effect to run
+    await flush();
+    // Verify RealtimeTranscriber.start was called on the instance
+    const transcriber = mocks.transcriber;
+    expect(transcriber.start).toHaveBeenCalled();
+  });
+
+  it("pressing any key while recording stops the recording", async () => {
+    const props: ComponentProps<typeof TerminalChatInput> = {
+      isNew: false,
+      loading: false,
+      submitInput: () => {},
+      confirmationPrompt: null,
+      explanation: undefined,
+      submitConfirmation: () => {},
+      setLastResponseId: () => {},
+      setItems: () => {},
+      contextLeftPercent: 50,
+      openOverlay: () => {},
+      openDiffOverlay: () => {},
+      openModelOverlay: () => {},
+      openApprovalOverlay: () => {},
+      openHelpOverlay: () => {},
+      onCompact: () => {},
+      interruptAgent: () => {},
+      active: true,
+      thinkingSeconds: 0,
+    };
+
+    const { stdin, flush, lastFrameStripped } = renderTui(
+      <TerminalChatInput {...props} />,
+    );
+    await flush();
+
+    // Simulate the /speak command and press Enter to start recording
+    await type(stdin, "/speak", flush);
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Check that the recording indicator is shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+
+    // Simulate pressing any key while recording
+    await type(stdin, "a", flush);
+    await flush();
+
+    // Check that the recording indicator is no longer shown (isRecording = false)
+    expect(lastFrameStripped()).not.toContain("●");
+
+    // Verify RealtimeTranscriber.cleanup was called on the instance
+    const transcriber = mocks.transcriber;
+    expect(transcriber.cleanup).toHaveBeenCalled();
+  });
+
+  it("pressing enter while recording submits but doesn't stop recording", async () => {
+    const props: ComponentProps<typeof TerminalChatInput> = {
+      isNew: false,
+      loading: false,
+      submitInput: () => {},
+      confirmationPrompt: null,
+      explanation: undefined,
+      submitConfirmation: () => {},
+      setLastResponseId: () => {},
+      setItems: () => {},
+      contextLeftPercent: 50,
+      openOverlay: () => {},
+      openDiffOverlay: () => {},
+      openModelOverlay: () => {},
+      openApprovalOverlay: () => {},
+      openHelpOverlay: () => {},
+      onCompact: () => {},
+      interruptAgent: () => {},
+      active: true,
+      thinkingSeconds: 0,
+    };
+
+    const { stdin, flush, lastFrameStripped } = renderTui(
+      <TerminalChatInput {...props} />,
+    );
+    await flush();
+
+    // Simulate the /speak command and press Enter to start recording
+    await type(stdin, "/speak", flush);
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Check that the recording indicator is shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+
+    // Simulate pressing Enter while recording
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Check that the recording indicator is still shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+
+    // Verify RealtimeTranscriber.cleanup has not been called
+    const transcriber = mocks.transcriber;
+    expect(transcriber.cleanup).not.toHaveBeenCalled();
+  });
+});
diff --git a/codex-cli/tests/transcriber.test.ts b/codex-cli/tests/transcriber.test.ts
new file mode 100644
index 000000000..419a8a912
--- /dev/null
+++ b/codex-cli/tests/transcriber.test.ts
@@ -0,0 +1,218 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { EventEmitter } from "events";
+
+// Define types for our mocks
+type MockSocket = {
+  socket: EventEmitter;
+  send: ReturnType<typeof vi.fn>;
+  close: ReturnType<typeof vi.fn>;
+  on: ReturnType<typeof vi.fn>;
+  emit: ReturnType<typeof vi.fn>;
+};
+
+type MockRecorder = {
+  start: ReturnType<typeof vi.fn>;
+  read: ReturnType<typeof vi.fn>;
+  stop: ReturnType<typeof vi.fn>;
+  release: ReturnType<typeof vi.fn>;
+};
+
+// Store mocks in this object so we can access them in tests
+const mocks: {
+  webSocket: MockSocket | null;
+  PvRecorder: MockRecorder | null;
+} = {
+  webSocket: null,
+  PvRecorder: null,
+};
+
+// Mock dependencies before importing the module under test
+vi.mock("openai/beta/realtime/ws", () => {
+  return {
+    OpenAIRealtimeWS: vi.fn().mockImplementation(() => {
+      const eventHandlers: Record<string, (data?: unknown) => void> = {};
+      const socketMock: MockSocket = {
+        socket: new EventEmitter(),
+        send: vi.fn(),
+        close: vi.fn(),
+        on: vi.fn().mockImplementation((event, handler) => {
+          eventHandlers[event] = handler;
+        }),
+        emit: vi.fn().mockImplementation((event, data) => {
+          if (eventHandlers[event]) {
+            eventHandlers[event](data);
+          }
+        }),
+      };
+      // Store reference in our mocks object for test access
+      mocks.webSocket = socketMock;
+      return socketMock;
+    }),
+  };
+});
+
+vi.mock("@picovoice/pvrecorder-node", () => {
+  return {
+    PvRecorder: vi.fn().mockImplementation(() => {
+      const recorderMock: MockRecorder = {
+        start: vi.fn(),
+        read: vi.fn().mockReturnValue(new Int16Array([1, 2, 3])),
+        stop: vi.fn(),
+        release: vi.fn(),
+      };
+      // Store reference in our mocks object for test access
+      mocks.PvRecorder = recorderMock;
+      return recorderMock;
+    }),
+  };
+});
+
+vi.mock("../src/utils/config.js", () => ({
+  getApiKey: vi.fn().mockReturnValue("fake-api-key"),
+  getBaseUrl: vi.fn().mockReturnValue("https://api.openai.com/v1"),
+  loadConfig: vi.fn().mockReturnValue({
+    transcription: {
+      input_audio_transcription: {
+        model: "whisper-1" as const,
+        prompt: "",
+        language: "fr",
+      },
+      turn_detection: {
+        type: "server_vad" as const,
+        threshold: 0.8,
+        prefix_padding_ms: 300,
+        silence_duration_ms: 500,
+      },
+    },
+  }),
+}));
+
+vi.mock("../src/utils/session.js", () => ({
+  CLI_VERSION: "test-version",
+  ORIGIN: "test-origin",
+  getSessionId: vi.fn().mockReturnValue("test-session-id"),
+}));
+
+// Import the module under test after mocks are defined
+import { RealtimeTranscriber } from "../src/utils/transcriber.js";
+
+describe("RealtimeTranscriber", () => {
+  let transcriber;
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    // Reset our references
+    mocks.webSocket = null;
+    mocks.PvRecorder = null;
+  });
+
+  it("creates instance without error", () => {
+    transcriber = new RealtimeTranscriber();
+    expect(transcriber).toBeDefined();
+  });
+
+  it("can start and stop", async () => {
+    transcriber = new RealtimeTranscriber();
+    await transcriber.start();
+    transcriber.cleanup();
+    expect(true).toBe(true); // Simple assertion just to verify no errors
+  });
+
+  it("emits transcription events when receiving data", async () => {
+    const transcriber = new RealtimeTranscriber();
+    const transcriptionHandler = vi.fn();
+    transcriber.on("transcription", transcriptionHandler);
+
+    await transcriber.start();
+
+    // References should be populated after start() is called
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // TypeScript needs non-null assertion since we've checked with expect
+    // Simulate WebSocket open event
+    ws!.socket.emit("open");
+
+    // Simulate receiving transcription data
+    ws!.emit("conversation.item.input_audio_transcription.delta", {
+      delta: "Hello world",
+    });
+
+    expect(transcriptionHandler).toHaveBeenCalledWith({
+      type: "transcription.delta",
+      delta: "Hello world",
+    });
+  });
+
+  it("emits errors when WebSocket encounters problems", async () => {
+    const transcriber = new RealtimeTranscriber();
+    const errorHandler = vi.fn();
+    transcriber.on("error", errorHandler);
+
+    await transcriber.start();
+
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // Simulate WebSocket error
+    const testError = new Error("WebSocket error");
+    ws!.socket.emit("error", testError);
+
+    expect(errorHandler).toHaveBeenCalledWith(testError);
+  });
+
+  it("starts audio recording when WebSocket connection opens", async () => {
+    // Spy on the private startAudioCapture method
+    const startAudioCaptureSpy = vi.spyOn(
+      RealtimeTranscriber.prototype as any,
+      "startAudioCapture",
+    );
+
+    // Create transcriber and start it
+    const transcriber = new RealtimeTranscriber();
+    await transcriber.start();
+
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // Simulate WebSocket open event
+    ws!.socket.emit("open");
+
+    // Verify that startAudioCapture was called
+    expect(startAudioCaptureSpy).toHaveBeenCalled();
+  });
+
+  it("sends session.update event when WebSocket connection opens", async () => {
+    const transcriber = new RealtimeTranscriber();
+
+    await transcriber.start();
+
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // Simulate WebSocket open event
+    ws!.socket.emit("open");
+
+    // Verify that send was called with the session.update event
+    expect(ws!.send).toHaveBeenCalledWith({
+      type: "session.update",
+      session: {
+        input_audio_format: "pcm16",
+        input_audio_transcription: {
+          model: "whisper-1",
+          prompt: "",
+          language: "fr",
+        },
+        turn_detection: {
+          type: "server_vad",
+          threshold: 0.8,
+          prefix_padding_ms: 300,
+          silence_duration_ms: 500,
+        },
+        input_audio_noise_reduction: {
+          type: "near_field",
+        },
+      },
+    });
+  });
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 6155dfc3a..04783616d 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -31,6 +31,9 @@ importers:
       '@inkjs/ui':
         specifier: ^2.0.0
        version: 2.0.0(ink@5.2.0(@types/react@18.3.20)(react@18.3.1))
+      '@picovoice/pvrecorder-node':
+        specifier: ^1.2.8
+        version: 1.2.8
       chalk:
         specifier: ^5.2.0
         version: 5.4.1
@@ -401,6 +404,10 @@ packages:
     resolution: {integrity: sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==}
     engines: {node: '>= 8'}

+  '@picovoice/pvrecorder-node@1.2.8':
+    resolution: {integrity: sha512-dbLJlplQQNRkM2ja/hP4sRADGDILuJ54dEf8cU5eULeNrddxXvOtE8IiJ5F2VhhbXmIv3Qmn79DqhttCOxjH8Q==}
+    engines: {node: '>=18.0.0'}
+
   '@rollup/rollup-android-arm-eabi@4.40.0':
     resolution: {integrity: sha512-+Fbls/diZ0RDerhE8kyC6hjADCXA1K4yVNlH0EYfd2XjyH0UGgzaQ8MlT0pCXAThfxv3QUAczHaL+qSv1E4/Cg==}
     cpu: [arm]
@@ -2655,6 +2662,8 @@ snapshots:
       '@nodelib/fs.scandir': 2.1.5
       fastq: 1.19.1

+  '@picovoice/pvrecorder-node@1.2.8': {}
+
   '@rollup/rollup-android-arm-eabi@4.40.0':
     optional: true