diff --git a/codex-cli/package.json b/codex-cli/package.json
index 369e0d95f..b0be18179 100644
--- a/codex-cli/package.json
+++ b/codex-cli/package.json
@@ -30,6 +30,7 @@
   ],
   "dependencies": {
     "@inkjs/ui": "^2.0.0",
+    "@picovoice/pvrecorder-node": "^1.2.8",
     "chalk": "^5.2.0",
     "diff": "^7.0.0",
     "dotenv": "^16.1.4",
diff --git a/codex-cli/src/components/chat/terminal-chat-input.tsx b/codex-cli/src/components/chat/terminal-chat-input.tsx
index 88a89039d..f1e638bb1 100644
--- a/codex-cli/src/components/chat/terminal-chat-input.tsx
+++ b/codex-cli/src/components/chat/terminal-chat-input.tsx
@@ -1,6 +1,7 @@
 import type { MultilineTextEditorHandle } from "./multiline-editor";
 import type { ReviewDecision } from "../../utils/agent/review.js";
 import type { HistoryEntry } from "../../utils/storage/command-history.js";
+import type { TranscriptionEvent } from "../../utils/transcriber.js";
 import type {
   ResponseInputItem,
   ResponseItem,
@@ -20,6 +21,7 @@ import {
   addToHistory,
 } from "../../utils/storage/command-history.js";
 import { clearTerminal, onExit } from "../../utils/terminal.js";
+import { RealtimeTranscriber } from "../../utils/transcriber.js";
 import { Box, Text, useApp, useInput, useStdin } from "ink";
 import { fileURLToPath } from "node:url";
 import React, {
@@ -101,6 +103,11 @@ export default function TerminalChatInput({
   // Track the caret row across keystrokes
   const prevCursorRow = useRef<number | null>(null);
   const prevCursorWasAtLastRow = useRef(false);
+  // Track recording state
+  const [isRecording, setIsRecording] = useState(false);
+  // Track if recording should resume after thinking
+  const [shouldResumeRecording, setShouldResumeRecording] = useState(false);
+  const transcriber = useRef<RealtimeTranscriber | null>(null);

   // Load command history on component mount
   useEffect(() => {
@@ -111,6 +118,25 @@
     loadHistory();
   }, []);

+  // Handle pausing/resuming recording when the loading state changes
+  useEffect(() => {
+    if (loading && isRecording) {
+      // Pause recording while the agent is thinking
+      setIsRecording(false);
+      setShouldResumeRecording(true);
+
+      if (transcriber.current) {
+        transcriber.current.cleanup();
+        transcriber.current = null;
+      }
+    } else if (!loading && shouldResumeRecording) {
+      // Resume recording when thinking is done
+      setIsRecording(true);
+      setShouldResumeRecording(false);
+    }
+  }, [loading, isRecording, shouldResumeRecording, setItems]);
+
   // Reset slash suggestion index when input prefix changes
   useEffect(() => {
     if (input.trim().startsWith("/")) {
@@ -118,8 +144,125 @@
     }
   }, [input]);

+  // Start/stop speech transcription when isRecording changes
+  useEffect(() => {
+    if (isRecording) {
+      // Initialize and start transcriber
+      const startTranscription = async () => {
+        try {
+          transcriber.current = new RealtimeTranscriber();
+
+          transcriber.current.on(
+            "transcription",
+            (event: TranscriptionEvent) => {
+              if (event.type === "transcription.delta" && event.delta) {
+                // Transcription doesn't start until the VAD detects that the speech has finished
+                setInput((prev) => prev + event.delta);
+                // Force re-render of the editor with the latest text
+                setEditorKey((k) => k + 1);
+                setTimeout(() => {
+                  editorRef.current?.moveCursorToEnd?.();
+                }, 0);
+              } else if (
+                event.type === "transcription.done" &&
+                event.transcript
+              ) {
+                // Occurs when the speech has finished being transcribed
+                setInput((prev) => prev + "\n"); // new line to indicate that this utterance is fully transcribed
+                setEditorKey((k) => k + 1);
+                setTimeout(() => {
+                  editorRef.current?.moveCursorToEnd?.();
+                }, 0);
+              }
+            },
+          );
+
+          transcriber.current.on("error", (error) => {
+            setIsRecording(false);
+            setShouldResumeRecording(false);
+            setItems((prev) => [
+              ...prev,
+              {
+                id: `speak-transcription-error-${Date.now()}`,
+                type: "message",
+                role: "system",
+                content: [
+                  {
+                    type: "input_text",
+                    text: `Transcription ${error}`,
+                  },
+                ],
+              },
+            ]);
+
+            // Clean up the transcriber
+            if (transcriber.current) {
+              transcriber.current.cleanup();
+              transcriber.current = null;
+            }
+          });
+
+          await transcriber.current.start();
+        } catch (error) {
+          setIsRecording(false);
+          setShouldResumeRecording(false);
+          setItems((prev) => [
+            ...prev,
+            {
+              id: `speak-start-error-${Date.now()}`,
+              type: "message",
+              role: "system",
+              content: [
+                {
+                  type: "input_text",
+                  text: `Error starting transcription: ${error}`,
+                },
+              ],
+            },
+          ]);
+        }
+      };
+
+      startTranscription();
+    } else if (transcriber.current) {
+      // Clean up transcriber when recording stops
+      transcriber.current.cleanup();
+      transcriber.current = null;
+
+      // Input will be submitted by the user manually after reviewing
+    }
+
+    return () => {
+      if (transcriber.current) {
+        transcriber.current.cleanup();
+        transcriber.current = null;
+      }
+    };
+  }, [isRecording, setItems, setEditorKey]);
+
   useInput(
-    (_input, _key) => {
+    (char_input, key) => {
+      // Stop recording if any key except Enter is pressed while recording
+      if (isRecording && !key.return) {
+        setIsRecording(false);
+        setShouldResumeRecording(false);
+        setItems((prev) => [
+          ...prev,
+          {
+            id: `speak-stop-${Date.now()}`,
+            type: "message",
+            role: "system",
+            content: [
+              {
+                type: "input_text",
+                text: "Recording stopped.",
+              },
+            ],
+          },
+        ]);
+        return;
+      }
+
       // Slash command navigation: up/down to select, enter to fill
       if (!confirmationPrompt && !loading && input.trim().startsWith("/")) {
         const prefix = input.trim();
@@ -127,11 +270,11 @@ export default function TerminalChatInput({
           cmd.command.startsWith(prefix),
         );
         if (matches.length > 0) {
-          if (_key.tab) {
+          if (key.tab) {
             // Cycle and fill slash command suggestions on Tab
             const len = matches.length;
             // Determine new index based on shift state
-            const nextIdx = _key.shift
+            const nextIdx = key.shift
               ? selectedSlashSuggestion <= 0
                 ? len - 1
                 : selectedSlashSuggestion - 1
@@ -149,19 +292,19 @@
             setDraftInput(cmd);
             return;
           }
-          if (_key.upArrow) {
+          if (key.upArrow) {
             setSelectedSlashSuggestion((prev) =>
               prev <= 0 ? matches.length - 1 : prev - 1,
             );
             return;
           }
-          if (_key.downArrow) {
+          if (key.downArrow) {
             setSelectedSlashSuggestion((prev) =>
               prev < 0 || prev >= matches.length - 1 ? 0 : prev + 1,
             );
             return;
           }
-          if (_key.return) {
+          if (key.return) {
             // Execute the currently selected slash command
             const selIdx = selectedSlashSuggestion;
             const cmdObj = matches[selIdx];
@@ -189,6 +332,9 @@
               case "/diff":
                 openDiffOverlay();
                 break;
+              case "/speak":
+                onSubmit(cmd);
+                break;
               case "/bug":
                 onSubmit(cmd);
                 break;
@@ -208,21 +354,21 @@
       }
       if (!confirmationPrompt && !loading) {
         if (fsSuggestions.length > 0) {
-          if (_key.upArrow) {
+          if (key.upArrow) {
             setSelectedCompletion((prev) =>
               prev <= 0 ? fsSuggestions.length - 1 : prev - 1,
             );
             return;
           }
-          if (_key.downArrow) {
+          if (key.downArrow) {
             setSelectedCompletion((prev) =>
               prev >= fsSuggestions.length - 1 ? 0 : prev + 1,
             );
             return;
           }
-          if (_key.tab && selectedCompletion >= 0) {
+          if (key.tab && selectedCompletion >= 0) {
             const words = input.trim().split(/\s+/);
             const selected = fsSuggestions[selectedCompletion];
@@ -245,7 +391,7 @@
         }
       }

-      if (_key.upArrow) {
+      if (key.upArrow) {
         let moveThroughHistory = true;

        // Only use history when the caret was *already* on the very first
@@ -284,7 +430,7 @@
         // Otherwise let it propagate.
       }

-      if (_key.downArrow) {
+      if (key.downArrow) {
         // Only move forward in history when we're already *in* history mode
         // AND the caret sits on the last line of the buffer.
         const wasAtLastRow =
@@ -307,7 +453,7 @@
         // Otherwise let it propagate
       }

-      if (_key.tab) {
+      if (key.tab) {
         const words = input.split(/\s+/);
         const mostRecentWord = words[words.length - 1];
         if (mostRecentWord === undefined || mostRecentWord === "") {
@@ -343,11 +489,11 @@
       }, 1);

       if (input.trim() === "" && isNew) {
-        if (_key.tab) {
+        if (key.tab) {
           setSelectedSuggestion(
-            (s) => (s + (_key.shift ? -1 : 1)) % (suggestions.length + 1),
+            (s) => (s + (key.shift ? -1 : 1)) % (suggestions.length + 1),
           );
-        } else if (selectedSuggestion && _key.return) {
+        } else if (selectedSuggestion && key.return) {
           const suggestion = suggestions[selectedSuggestion - 1] || "";
           setInput("");
           setSelectedSuggestion(0);
@@ -359,7 +505,7 @@
             },
           ]);
         }
-      } else if (_input === "\u0003" || (_input === "c" && _key.ctrl)) {
+      } else if (char_input === "\u0003" || (char_input === "c" && key.ctrl)) {
         setTimeout(() => {
           app.exit();
           onExit();
@@ -538,6 +684,26 @@
         ]);
       }

+      return;
+    } else if (inputValue.startsWith("/speak")) {
+      // Handle /speak command
+      setIsRecording(true);
+      setShouldResumeRecording(false);
+      setInput("");
+      setItems((prev) => [
+        ...prev,
+        {
+          id: `speak-start-${Date.now()}`,
+          type: "message",
+          role: "system",
+          content: [
+            {
+              type: "input_text",
+              text: "Recording started. Press any key (except enter) to stop recording.",
+            },
+          ],
+        },
+      ]);
       return;
     } else if (inputValue.startsWith("/")) {
       // Handle invalid/unrecognized commands. Only single-word inputs starting with '/'
@@ -665,6 +831,11 @@
           />
         ) : (
           <Box flexDirection="column">
+            {isRecording && (
+              <Box paddingLeft={1}>
+                <Text color="red">●</Text>
+              </Box>
+            )}
             <MultilineTextEditor
               ref={editorRef}
               onChange={(txt: string) => {
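Note: the pause/resume wiring above reduces to a small state machine over `loading`, `isRecording`, and `shouldResumeRecording`. A minimal TypeScript sketch of the same transitions outside React (names mirror the component state; this is illustrative only, not part of the patch):

// Illustrative reduction of the recording state machine in TerminalChatInput.
type RecState = { isRecording: boolean; shouldResume: boolean };

function nextState(loading: boolean, s: RecState): RecState {
  if (loading && s.isRecording) {
    // The agent started thinking: pause capture and remember to resume.
    return { isRecording: false, shouldResume: true };
  }
  if (!loading && s.shouldResume) {
    // The agent finished: resume capture.
    return { isRecording: true, shouldResume: false };
  }
  return s; // No transition.
}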
diff --git a/codex-cli/src/components/help-overlay.tsx b/codex-cli/src/components/help-overlay.tsx
index d302f7551..f36dd385a 100644
--- a/codex-cli/src/components/help-overlay.tsx
+++ b/codex-cli/src/components/help-overlay.tsx
@@ -62,6 +62,9 @@ export default function HelpOverlay({
         <Text>
           <Text color="cyan">/compact</Text> – condense context into a summary
         </Text>
+        <Text>
+          <Text color="cyan">/speak</Text> – activate speech-to-text input mode
+        </Text>
       </Box>

       <Box flexDirection="column" marginTop={1}>
diff --git a/codex-cli/src/utils/config.ts b/codex-cli/src/utils/config.ts
index 1bf053032..137a6d1dd 100644
--- a/codex-cli/src/utils/config.ts
+++ b/codex-cli/src/utils/config.ts
@@ -7,6 +7,7 @@
 // compiled `dist/` output used by the published CLI.

 import type { FullAutoErrorMode } from "./auto-approval-mode.js";
+import type { TranscriptionSessionCreateParams } from "openai/resources/beta/realtime/transcription-sessions.mjs";
 import { AutoApprovalMode } from "./auto-approval-mode.js";
 import { log } from "./logger/log.js";
@@ -136,6 +137,7 @@ export type StoredConfig = {
     saveHistory?: boolean;
     sensitivePatterns?: Array<string>;
   };
+  transcription?: TranscriptionConfig;
 };

 // Minimal config written on first run. An *empty* model string ensures that
@@ -150,6 +152,13 @@ export type MemoryConfig = {
   enabled: boolean;
 };

+/** Settings for speech-to-text transcription, taken directly from the OpenAI Realtime API. */
+export type TranscriptionConfig = {
+  input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
+  turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
+  input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
+};
+
 // Represents full runtime config, including loaded instructions.
 export type AppConfig = {
   apiKey?: string;
@@ -159,6 +168,7 @@ export type AppConfig = {
   approvalMode?: AutoApprovalMode;
   fullAutoErrorMode?: FullAutoErrorMode;
   memory?: MemoryConfig;
+  transcription?: TranscriptionConfig;

   /** Whether to enable desktop notifications for responses */
   notify?: boolean;
@@ -440,6 +450,11 @@ export const loadConfig = (
   // Merge default providers with user configured providers in the config.
   config.providers = { ...providers, ...storedConfig.providers };

+  // Load transcription config if it exists
+  if (storedConfig.transcription !== undefined) {
+    config.transcription = storedConfig.transcription;
+  }
+
   return config;
 };
@@ -474,6 +489,7 @@ export const saveConfig = (
     provider: config.provider,
     providers: config.providers,
     approvalMode: config.approvalMode,
+    transcription: config.transcription,
   };

   // Add history settings if they exist
diff --git a/codex-cli/src/utils/slash-commands.ts b/codex-cli/src/utils/slash-commands.ts
index 4ccc3a9fc..046fe504e 100644
--- a/codex-cli/src/utils/slash-commands.ts
+++ b/codex-cli/src/utils/slash-commands.ts
@@ -32,4 +32,8 @@ export const SLASH_COMMANDS: Array<SlashCommand> = [
     description:
       "Show git diff of the working directory (or applied patches if not in git)",
   },
+  {
+    command: "/speak",
+    description: "Activate speech-to-text input mode",
+  },
 ];
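Note: with the `TranscriptionConfig` plumbing above, a user-supplied `transcription` block flows from the stored config straight into the realtime session. A minimal sketch of a `StoredConfig` value exercising every field (all values here are illustrative, not part of the patch; the actual defaults live in `RealtimeTranscriber`'s constructor below):

// Hypothetical example of a stored config fragment.
import type { StoredConfig } from "./config";

const stored: StoredConfig = {
  transcription: {
    input_audio_transcription: {
      model: "gpt-4o-transcribe", // or "whisper-1"
      language: "en",
    },
    turn_detection: {
      type: "server_vad", // server-side voice activity detection
      threshold: 0.6,
      prefix_padding_ms: 400,
      silence_duration_ms: 500,
    },
    input_audio_noise_reduction: {
      type: "near_field", // or "far_field"
    },
  },
};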
diff --git a/codex-cli/src/utils/transcriber.ts b/codex-cli/src/utils/transcriber.ts
new file mode 100644
index 000000000..cce33f2c1
--- /dev/null
+++ b/codex-cli/src/utils/transcriber.ts
@@ -0,0 +1,249 @@
+import type { TranscriptionConfig } from "./config";
+
+import { getApiKey, getBaseUrl, loadConfig } from "./config";
+import { CLI_VERSION, ORIGIN, getSessionId } from "./session";
+import { EventEmitter } from "events";
+import { createRequire } from "node:module";
+import OpenAI from "openai";
+// workaround since pvrecorder-node is a commonjs module
+import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
+
+const require = createRequire(import.meta.url);
+const { PvRecorder } = require("@picovoice/pvrecorder-node");
+
+export interface TranscriptionEvent {
+  type: string;
+  delta?: string;
+  transcript?: string;
+}
+
+export class RealtimeTranscriber extends EventEmitter {
+  private rt: OpenAIRealtimeWS | null = null;
+  private recorder: typeof PvRecorder | null = null;
+  private isConnected = false;
+  private isRecording = false;
+  private transcriptionConfig: TranscriptionConfig;
+
+  constructor() {
+    super();
+    // Load config and use it for defaults
+    const config = loadConfig();
+
+    // Load values from config with sensible defaults
+    this.transcriptionConfig = {
+      input_audio_transcription: config.transcription
+        ?.input_audio_transcription || {
+        model: "gpt-4o-transcribe",
+        prompt: "",
+        language: "en",
+      },
+      turn_detection: config.transcription?.turn_detection || {
+        type: "server_vad",
+        threshold: 0.6,
+        prefix_padding_ms: 400,
+        silence_duration_ms: 500,
+      },
+      input_audio_noise_reduction: config.transcription
+        ?.input_audio_noise_reduction || {
+        type: "near_field",
+      },
+    };
+
+    this.setupSignalHandlers();
+  }
+
+  private setupSignalHandlers() {
+    process.on("SIGINT", () => this.cleanup());
+    process.on("SIGTERM", () => this.cleanup());
+  }
+
+  public async start(): Promise<void> {
+    try {
+      // Check API key
+      const apiKey = getApiKey("openai");
+      if (!apiKey) {
+        throw new Error("OPENAI_API_KEY not found in environment variables");
+      }
+
+      // Initialize OpenAI client
+      const client = new OpenAI({
+        apiKey: apiKey,
+        baseURL: getBaseUrl("openai"),
+        defaultHeaders: {
+          originator: ORIGIN,
+          version: CLI_VERSION,
+          session_id: getSessionId() || "",
+        },
+      });
+
+      const model =
+        this.transcriptionConfig.input_audio_transcription?.model ||
+        "gpt-4o-transcribe";
+
+      // Initialize the realtime client
+      this.rt = new OpenAIRealtimeWS({ model }, client);
+
+      // Set up event handlers
+      this.rt.on("error", (error) => {
+        this.emit("error", error);
+      });
+
+      this.rt.on(
+        "conversation.item.input_audio_transcription.delta",
+        (event) => {
+          this.emit("transcription", {
+            type: "transcription.delta",
+            delta: event.delta,
+          });
+        },
+      );
+
+      this.rt.on(
+        "conversation.item.input_audio_transcription.completed",
+        (event) => {
+          this.emit("transcription", {
+            type: "transcription.done",
+            transcript: event.transcript,
+          });
+        },
+      );
+
+      // Set up WebSocket connection
+      this.rt.socket.on("open", () => {
+        this.isConnected = true;
+        this.emit("connected");
+
+        // Configure the session
+        this.rt?.send({
+          type: "session.update",
+          session: {
+            input_audio_format: "pcm16",
+            input_audio_transcription:
+              this.transcriptionConfig.input_audio_transcription,
+            turn_detection: this.transcriptionConfig.turn_detection,
+            input_audio_noise_reduction:
+              this.transcriptionConfig.input_audio_noise_reduction,
+          },
+        });
+
+        // Start audio capture once WebSocket is connected
+        this.startAudioCapture();
+      });
+
+      this.rt.socket.on("close", (code: number, reason: string) => {
+        if (code !== 1000) {
+          // 1000 is a normal close
+          this.emit(
+            "error",
+            new Error(`WebSocket closed: code=${code}, reason=${reason}`),
+          );
+        }
+        this.isConnected = false;
+        this.emit("disconnected");
+      });
+
+      this.rt.socket.on("error", (error: Error) => {
+        this.emit("error", error);
+      });
+    } catch (error) {
+      this.emit("error", error);
+      this.cleanup();
+      throw error;
+    }
+  }
+
+  private startAudioCapture() {
+    try {
+      // Get available audio devices
+      const devices = PvRecorder.getAvailableDevices();
+      if (devices.length === 0) {
+        throw new Error("No audio input device found");
+      }
+
+      // Create recorder with the first available device
+      const frameLength = 512;
+      this.recorder = new PvRecorder(frameLength, 0);
+
+      // Start recording
+      this.recorder.start();
+      this.isRecording = true;
+
+      // Process audio frames
+      this.processAudioFrames();
+    } catch (error) {
+      this.emit("error", error);
+      this.stopAudioCapture();
+      throw error;
+    }
+  }
+
+  private async processAudioFrames() {
+    if (!this.recorder || !this.isRecording) {
+      return;
+    }
+
+    try {
+      while (this.isRecording && this.isConnected) {
+        try {
+          // We need to await each audio frame sequentially in the loop
+          // to maintain proper audio stream ordering
+          // eslint-disable-next-line no-await-in-loop
+          const frame = await this.recorder.read();
+
+          // Convert Int16Array to Buffer and send to OpenAI
+          if (this.rt && this.isConnected) {
+            const buffer = Buffer.from(frame.buffer);
+            this.rt.send({
+              type: "input_audio_buffer.append",
+              audio: buffer.toString("base64"),
+            });
+          }
+        } catch (error) {
+          // Silently break out if it's an InvalidState error (happens when stopping)
+          if (
+            error instanceof Error &&
+            (error.constructor.name === "PvRecorderStatusInvalidStateError" ||
+              // require import doesn't preserve the class prototype
+              error.message?.includes("failed to read audio data frame"))
+          ) {
+            break;
+          }
+          // Re-throw other errors
+          throw error;
+        }
+      }
+    } catch (error) {
+      this.emit("error", error);
+    } finally {
+      this.stopAudioCapture();
+    }
+  }
+
+  private stopAudioCapture() {
+    if (this.recorder) {
+      try {
+        this.recorder.stop();
+        this.recorder.release();
+      } catch (error) {
+        this.emit("error", error);
+      }
+      this.recorder = null;
+      this.isRecording = false;
+    }
+  }
+
+  public cleanup(): void {
+    this.stopAudioCapture();
+
+    if (this.rt) {
+      try {
+        this.rt.close();
+      } catch (error) {
+        this.emit("error", error);
+      }
+      this.rt = null;
+    }
+
+    this.isConnected = false;
+  }
+}
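Note: a minimal usage sketch of `RealtimeTranscriber` outside the Ink component, mirroring the event wiring the chat input performs (the surrounding script is assumed; only `start`, `cleanup`, and the `transcription`/`error` events come from the class above):

// Hypothetical standalone driver for RealtimeTranscriber; not part of the patch.
import { RealtimeTranscriber } from "./utils/transcriber.js";
import type { TranscriptionEvent } from "./utils/transcriber.js";

async function main(): Promise<void> {
  const transcriber = new RealtimeTranscriber();

  transcriber.on("transcription", (event: TranscriptionEvent) => {
    if (event.type === "transcription.delta" && event.delta) {
      // Partial text arrives once the server-side VAD closes a speech segment.
      process.stdout.write(event.delta);
    } else if (event.type === "transcription.done") {
      // One utterance is fully transcribed.
      process.stdout.write("\n");
    }
  });

  transcriber.on("error", (error) => {
    console.error("Transcription error:", error);
    transcriber.cleanup();
  });

  // Opens the realtime WebSocket, sends session.update, then starts mic capture.
  await transcriber.start();
}

main();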
diff --git a/codex-cli/tests/config.test.tsx b/codex-cli/tests/config.test.tsx
index 831208a12..7a0a3b89d 100644
--- a/codex-cli/tests/config.test.tsx
+++ b/codex-cli/tests/config.test.tsx
@@ -275,3 +275,55 @@ test("handles empty user instructions when saving with project doc separator", () => {
   });
   expect(loadedConfig.instructions).toBe("");
 });
+
+test("loads and saves transcription config correctly", () => {
+  const customTranscriptionConfig = {
+    // Purposely not using the default values
+    input_audio_transcription: {
+      model: "whisper-1" as const,
+      language: "zh",
+    },
+    turn_detection: {
+      type: "server_vad" as const,
+      threshold: 0.9,
+      silence_duration_ms: 500,
+    },
+    input_audio_noise_reduction: {
+      type: "far_field" as const,
+    },
+  };
+
+  const testConfig = {
+    model: "test-model",
+    instructions: "test instructions",
+    notify: false,
+    transcription: customTranscriptionConfig,
+  };
+
+  // Test saving the config
+  saveConfig(testConfig, testConfigPath, testInstructionsPath);
+
+  // Test loading the config
+  const loadedConfig = loadConfig(testConfigPath, testInstructionsPath, {
+    disableProjectDoc: true,
+  });
+
+  // Test that the transcription config was loaded correctly
+  expect(loadedConfig.transcription).toEqual(customTranscriptionConfig);
+  expect(loadedConfig.transcription?.input_audio_transcription?.model).toEqual(
+    "whisper-1",
+  );
+  expect(
+    loadedConfig.transcription?.input_audio_transcription?.language,
+  ).toEqual("zh");
+  expect(loadedConfig.transcription?.turn_detection?.type).toEqual(
+    "server_vad",
+  );
+  expect(loadedConfig.transcription?.turn_detection?.threshold).toEqual(0.9);
+  expect(
+    loadedConfig.transcription?.turn_detection?.silence_duration_ms,
+  ).toEqual(500);
+  expect(
+    loadedConfig.transcription?.input_audio_noise_reduction?.type,
+  ).toEqual("far_field");
+});
"/approval", + "/diff", + "/speak", + ]), ); const emptyPrefixFiltered = SLASH_COMMANDS.filter((c: SlashCommand) => diff --git a/codex-cli/tests/terminal-chat-input-transcription.test.tsx b/codex-cli/tests/terminal-chat-input-transcription.test.tsx new file mode 100644 index 000000000..6e48f25b8 --- /dev/null +++ b/codex-cli/tests/terminal-chat-input-transcription.test.tsx @@ -0,0 +1,180 @@ +import React from "react"; +import type { ComponentProps } from "react"; +import { renderTui } from "./ui-test-helpers.js"; +import TerminalChatInput from "../src/components/chat/terminal-chat-input.js"; +import { describe, it, expect, vi } from "vitest"; + +// Helper that lets us type and then immediately flush ink's async timers +async function type( + stdin: NodeJS.WritableStream, + text: string, + flush: () => Promise, +) { + stdin.write(text); + await flush(); +} + +// Mock the createInputItem function to avoid filesystem operations +vi.mock("../src/utils/input-utils.js", () => ({ + createInputItem: vi.fn(async (text: string) => ({ + role: "user", + type: "message", + content: [{ type: "input_text", text }], + })), +})); + +const mocks = { + transcriber: null as any, +}; + +// Mock the RealtimeTranscriber to avoid real WebSocket connections +vi.mock("../src/utils/transcriber.js", () => ({ + RealtimeTranscriber: vi.fn().mockImplementation(() => { + const mock = { + on: vi.fn(), + start: vi.fn().mockResolvedValue(undefined), + cleanup: vi.fn(), + }; + mocks.transcriber = mock; + return mock; + }), +})); + +describe("TerminalChatInput transcription functionality", () => { + it("/speak command starts recording and shows the recording indicator", async () => { + const props: ComponentProps = { + isNew: false, + loading: false, + submitInput: () => {}, + confirmationPrompt: null, + explanation: undefined, + submitConfirmation: () => {}, + setLastResponseId: () => {}, + setItems: () => {}, + contextLeftPercent: 50, + openOverlay: () => {}, + openDiffOverlay: () => {}, + openModelOverlay: () => {}, + openApprovalOverlay: () => {}, + openHelpOverlay: () => {}, + onCompact: () => {}, + interruptAgent: () => {}, + active: true, + thinkingSeconds: 0, + }; + + const { stdin, flush, lastFrameStripped } = renderTui( + , + ); + // Wait for initial render to settle + await flush(); + + // Simulate the /speak command and press Enter to start recording + await type(stdin, "/speak", flush); + await type(stdin, "\r", flush); + // Allow UI to update after pressing Enter + await flush(); + + // Check that the recording indicator is shown (isRecording = true) + expect(lastFrameStripped()).toContain("●"); + // Allow transcription effect to run + await flush(); + // Verify RealtimeTranscriber.start was called on the instance + const transcriber = mocks.transcriber; + expect(transcriber.start).toHaveBeenCalled(); + }); + + it("pressing any key while recording stops the recording", async () => { + const props: ComponentProps = { + isNew: false, + loading: false, + submitInput: () => {}, + confirmationPrompt: null, + explanation: undefined, + submitConfirmation: () => {}, + setLastResponseId: () => {}, + setItems: () => {}, + contextLeftPercent: 50, + openOverlay: () => {}, + openDiffOverlay: () => {}, + openModelOverlay: () => {}, + openApprovalOverlay: () => {}, + openHelpOverlay: () => {}, + onCompact: () => {}, + interruptAgent: () => {}, + active: true, + thinkingSeconds: 0, + }; + + const { stdin, flush, lastFrameStripped } = renderTui( + , + ); + await flush(); + + // Simulate the /speak command and press Enter to 
diff --git a/codex-cli/tests/terminal-chat-input-transcription.test.tsx b/codex-cli/tests/terminal-chat-input-transcription.test.tsx
new file mode 100644
index 000000000..6e48f25b8
--- /dev/null
+++ b/codex-cli/tests/terminal-chat-input-transcription.test.tsx
@@ -0,0 +1,180 @@
+import React from "react";
+import type { ComponentProps } from "react";
+import { renderTui } from "./ui-test-helpers.js";
+import TerminalChatInput from "../src/components/chat/terminal-chat-input.js";
+import { describe, it, expect, vi } from "vitest";
+
+// Helper that lets us type and then immediately flush ink's async timers
+async function type(
+  stdin: NodeJS.WritableStream,
+  text: string,
+  flush: () => Promise<void>,
+) {
+  stdin.write(text);
+  await flush();
+}
+
+// Mock the createInputItem function to avoid filesystem operations
+vi.mock("../src/utils/input-utils.js", () => ({
+  createInputItem: vi.fn(async (text: string) => ({
+    role: "user",
+    type: "message",
+    content: [{ type: "input_text", text }],
+  })),
+}));
+
+const mocks = {
+  transcriber: null as any,
+};
+
+// Mock the RealtimeTranscriber to avoid real WebSocket connections
+vi.mock("../src/utils/transcriber.js", () => ({
+  RealtimeTranscriber: vi.fn().mockImplementation(() => {
+    const mock = {
+      on: vi.fn(),
+      start: vi.fn().mockResolvedValue(undefined),
+      cleanup: vi.fn(),
+    };
+    mocks.transcriber = mock;
+    return mock;
+  }),
+}));
+
+describe("TerminalChatInput transcription functionality", () => {
+  it("/speak command starts recording and shows the recording indicator", async () => {
+    const props: ComponentProps<typeof TerminalChatInput> = {
+      isNew: false,
+      loading: false,
+      submitInput: () => {},
+      confirmationPrompt: null,
+      explanation: undefined,
+      submitConfirmation: () => {},
+      setLastResponseId: () => {},
+      setItems: () => {},
+      contextLeftPercent: 50,
+      openOverlay: () => {},
+      openDiffOverlay: () => {},
+      openModelOverlay: () => {},
+      openApprovalOverlay: () => {},
+      openHelpOverlay: () => {},
+      onCompact: () => {},
+      interruptAgent: () => {},
+      active: true,
+      thinkingSeconds: 0,
+    };
+
+    const { stdin, flush, lastFrameStripped } = renderTui(
+      <TerminalChatInput {...props} />,
+    );
+    // Wait for initial render to settle
+    await flush();
+
+    // Simulate the /speak command and press Enter to start recording
+    await type(stdin, "/speak", flush);
+    await type(stdin, "\r", flush);
+    // Allow UI to update after pressing Enter
+    await flush();
+
+    // Check that the recording indicator is shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+    // Allow transcription effect to run
+    await flush();
+    // Verify RealtimeTranscriber.start was called on the instance
+    const transcriber = mocks.transcriber;
+    expect(transcriber.start).toHaveBeenCalled();
+  });
+
+  it("pressing any key while recording stops the recording", async () => {
+    const props: ComponentProps<typeof TerminalChatInput> = {
+      isNew: false,
+      loading: false,
+      submitInput: () => {},
+      confirmationPrompt: null,
+      explanation: undefined,
+      submitConfirmation: () => {},
+      setLastResponseId: () => {},
+      setItems: () => {},
+      contextLeftPercent: 50,
+      openOverlay: () => {},
+      openDiffOverlay: () => {},
+      openModelOverlay: () => {},
+      openApprovalOverlay: () => {},
+      openHelpOverlay: () => {},
+      onCompact: () => {},
+      interruptAgent: () => {},
+      active: true,
+      thinkingSeconds: 0,
+    };
+
+    const { stdin, flush, lastFrameStripped } = renderTui(
+      <TerminalChatInput {...props} />,
+    );
+    await flush();
+
+    // Simulate the /speak command and press Enter to start recording
+    await type(stdin, "/speak", flush);
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Check that the recording indicator is shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+
+    // Simulate pressing any key while recording
+    await type(stdin, "a", flush);
+    await flush();
+
+    // Check that the recording indicator is no longer shown (isRecording = false)
+    expect(lastFrameStripped()).not.toContain("●");
+
+    // Verify RealtimeTranscriber.cleanup was called on the instance
+    const transcriber = mocks.transcriber;
+    expect(transcriber.cleanup).toHaveBeenCalled();
+  });
+
+  it("pressing enter while recording submits but doesn't stop recording", async () => {
+    const props: ComponentProps<typeof TerminalChatInput> = {
+      isNew: false,
+      loading: false,
+      submitInput: () => {},
+      confirmationPrompt: null,
+      explanation: undefined,
+      submitConfirmation: () => {},
+      setLastResponseId: () => {},
+      setItems: () => {},
+      contextLeftPercent: 50,
+      openOverlay: () => {},
+      openDiffOverlay: () => {},
+      openModelOverlay: () => {},
+      openApprovalOverlay: () => {},
+      openHelpOverlay: () => {},
+      onCompact: () => {},
+      interruptAgent: () => {},
+      active: true,
+      thinkingSeconds: 0,
+    };
+
+    const { stdin, flush, lastFrameStripped } = renderTui(
+      <TerminalChatInput {...props} />,
+    );
+    await flush();
+
+    // Simulate the /speak command and press Enter to start recording
+    await type(stdin, "/speak", flush);
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Check that the recording indicator is shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+
+    // Simulate pressing Enter while recording
+    await type(stdin, "\r", flush);
+    await flush();
+
+    // Check that the recording indicator is still shown (isRecording = true)
+    expect(lastFrameStripped()).toContain("●");
+
+    // Verify RealtimeTranscriber.cleanup has not been called
+    const transcriber = mocks.transcriber;
+    expect(transcriber.cleanup).not.toHaveBeenCalled();
+  });
+});
diff --git a/codex-cli/tests/transcriber.test.ts b/codex-cli/tests/transcriber.test.ts
new file mode 100644
index 000000000..419a8a912
--- /dev/null
+++ b/codex-cli/tests/transcriber.test.ts
@@ -0,0 +1,218 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { EventEmitter } from "events";
+
+// Define types for our mocks
+type MockSocket = {
+  socket: EventEmitter;
+  send: ReturnType<typeof vi.fn>;
+  close: ReturnType<typeof vi.fn>;
+  on: ReturnType<typeof vi.fn>;
+  emit: ReturnType<typeof vi.fn>;
+};
+
+type MockRecorder = {
+  start: ReturnType<typeof vi.fn>;
+  read: ReturnType<typeof vi.fn>;
+  stop: ReturnType<typeof vi.fn>;
+  release: ReturnType<typeof vi.fn>;
+};
+
+// Store mocks in this object so we can access them in tests
+const mocks: {
+  webSocket: MockSocket | null;
+  PvRecorder: MockRecorder | null;
+} = {
+  webSocket: null,
+  PvRecorder: null,
+};
+
+// Mock dependencies before importing the module under test
+vi.mock("openai/beta/realtime/ws", () => {
+  return {
+    OpenAIRealtimeWS: vi.fn().mockImplementation(() => {
+      const eventHandlers: Record<string, (data?: unknown) => void> = {};
+      const socketMock: MockSocket = {
+        socket: new EventEmitter(),
+        send: vi.fn(),
+        close: vi.fn(),
+        on: vi.fn().mockImplementation((event, handler) => {
+          eventHandlers[event] = handler;
+        }),
+        emit: vi.fn().mockImplementation((event, data) => {
+          if (eventHandlers[event]) {
+            eventHandlers[event](data);
+          }
+        }),
+      };
+      // Store reference in our mocks object for test access
+      mocks.webSocket = socketMock;
+      return socketMock;
+    }),
+  };
+});
+
+vi.mock("@picovoice/pvrecorder-node", () => {
+  return {
+    PvRecorder: vi.fn().mockImplementation(() => {
+      const recorderMock: MockRecorder = {
+        start: vi.fn(),
+        read: vi.fn().mockReturnValue(new Int16Array([1, 2, 3])),
+        stop: vi.fn(),
+        release: vi.fn(),
+      };
+      // Store reference in our mocks object for test access
+      mocks.PvRecorder = recorderMock;
+      return recorderMock;
+    }),
+  };
+});
+
+vi.mock("../src/utils/config.js", () => ({
+  getApiKey: vi.fn().mockReturnValue("fake-api-key"),
+  getBaseUrl: vi.fn().mockReturnValue("https://api.openai.com/v1"),
+  loadConfig: vi.fn().mockReturnValue({
+    transcription: {
+      input_audio_transcription: {
+        model: "whisper-1" as const,
+        prompt: "",
+        language: "fr",
+      },
+      turn_detection: {
+        type: "server_vad" as const,
+        threshold: 0.8,
+        prefix_padding_ms: 300,
+        silence_duration_ms: 500,
+      },
+    },
+  }),
+}));
+
+vi.mock("../src/utils/session.js", () => ({
+  CLI_VERSION: "test-version",
+  ORIGIN: "test-origin",
+  getSessionId: vi.fn().mockReturnValue("test-session-id"),
+}));
+
+// Import the module under test after mocks are defined
+import { RealtimeTranscriber } from "../src/utils/transcriber.js";
+
+describe("RealtimeTranscriber", () => {
+  let transcriber;
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    // Reset our references
+    mocks.webSocket = null;
+    mocks.PvRecorder = null;
+  });
+
+  it("creates instance without error", () => {
+    transcriber = new RealtimeTranscriber();
+    expect(transcriber).toBeDefined();
+  });
+
+  it("can start and stop", async () => {
+    transcriber = new RealtimeTranscriber();
+    await transcriber.start();
+    transcriber.cleanup();
+    expect(true).toBe(true); // Simple assertion just to verify no errors
+  });
+
+  it("emits transcription events when receiving data", async () => {
+    const transcriber = new RealtimeTranscriber();
+    const transcriptionHandler = vi.fn();
+    transcriber.on("transcription", transcriptionHandler);
+
+    await transcriber.start();
+
+    // References should be populated after start() is called
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // TypeScript needs non-null assertion since we've checked with expect
+    // Simulate WebSocket open event
+    ws!.socket.emit("open");
+
+    // Simulate receiving transcription data
+    ws!.emit("conversation.item.input_audio_transcription.delta", {
+      delta: "Hello world",
+    });
+
+    expect(transcriptionHandler).toHaveBeenCalledWith({
+      type: "transcription.delta",
+      delta: "Hello world",
+    });
+  });
+
+  it("emits errors when WebSocket encounters problems", async () => {
+    const transcriber = new RealtimeTranscriber();
+    const errorHandler = vi.fn();
+    transcriber.on("error", errorHandler);
+
+    await transcriber.start();
+
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // Simulate WebSocket error
+    const testError = new Error("WebSocket error");
+    ws!.socket.emit("error", testError);
+
+    expect(errorHandler).toHaveBeenCalledWith(testError);
+  });
+
+  it("starts audio recording when WebSocket connection opens", async () => {
+    // Spy on the private startAudioCapture method
+    const startAudioCaptureSpy = vi.spyOn(
+      RealtimeTranscriber.prototype as any,
+      "startAudioCapture",
+    );
+
+    // Create transcriber and start it
+    const transcriber = new RealtimeTranscriber();
+    await transcriber.start();
+
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // Simulate WebSocket open event
+    ws!.socket.emit("open");
+
+    // Verify that startAudioCapture was called
+    expect(startAudioCaptureSpy).toHaveBeenCalled();
+  });
+
+  it("sends session.update event when WebSocket connection opens", async () => {
+    const transcriber = new RealtimeTranscriber();
+
+    await transcriber.start();
+
+    const ws = mocks.webSocket;
+    expect(ws).toBeTruthy();
+
+    // Simulate WebSocket open event
+    ws!.socket.emit("open");
+
+    // Verify that send was called with the session.update event
+    expect(ws!.send).toHaveBeenCalledWith({
+      type: "session.update",
+      session: {
+        input_audio_format: "pcm16",
+        input_audio_transcription: {
+          model: "whisper-1",
+          prompt: "",
+          language: "fr",
+        },
+        turn_detection: {
+          type: "server_vad",
+          threshold: 0.8,
+          prefix_padding_ms: 300,
+          silence_duration_ms: 500,
+        },
+        input_audio_noise_reduction: {
+          type: "near_field",
+        },
+      },
+    });
+  });
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 6155dfc3a..04783616d 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -31,6 +31,9 @@ importers:
       '@inkjs/ui':
         specifier: ^2.0.0
        version: 2.0.0(ink@5.2.0(@types/react@18.3.20)(react@18.3.1))
+      '@picovoice/pvrecorder-node':
+        specifier: ^1.2.8
+        version: 1.2.8
       chalk:
         specifier: ^5.2.0
         version: 5.4.1
@@ -401,6 +404,10 @@ packages:
     resolution: {integrity: sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==}
     engines: {node: '>= 8'}

+  '@picovoice/pvrecorder-node@1.2.8':
+    resolution: {integrity: sha512-dbLJlplQQNRkM2ja/hP4sRADGDILuJ54dEf8cU5eULeNrddxXvOtE8IiJ5F2VhhbXmIv3Qmn79DqhttCOxjH8Q==}
+    engines: {node: '>=18.0.0'}
+
   '@rollup/rollup-android-arm-eabi@4.40.0':
     resolution: {integrity: sha512-+Fbls/diZ0RDerhE8kyC6hjADCXA1K4yVNlH0EYfd2XjyH0UGgzaQ8MlT0pCXAThfxv3QUAczHaL+qSv1E4/Cg==}
     cpu: [arm]
@@ -2655,6 +2662,8 @@ snapshots:
       '@nodelib/fs.scandir': 2.1.5
       fastq: 1.19.1

+  '@picovoice/pvrecorder-node@1.2.8': {}
+
   '@rollup/rollup-android-arm-eabi@4.40.0':
     optional: true