diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
index a7851e2bc..5124e9067 100644
--- a/enjoy/src/i18n/en.json
+++ b/enjoy/src/i18n/en.json
@@ -927,5 +927,7 @@
   "openaiTtsServiceDescription": "Use OpenAI TTS service from your own key.",
   "enjoyTtsServiceDescription": "Use TTS service provided by Enjoy. OpenAI or Azure is supported.",
   "compressMediaBeforeAdding": "Compress media before adding",
-  "keepOriginalMedia": "Keep original media"
+  "keepOriginalMedia": "Keep original media",
+  "myPronunciation": "My pronunciation",
+  "originalPronunciation": "Original pronunciation"
 }
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
index d83ef469d..4fbb8f7e6 100644
--- a/enjoy/src/i18n/zh-CN.json
+++ b/enjoy/src/i18n/zh-CN.json
@@ -927,5 +927,7 @@
   "openaiTtsServiceDescription": "使用您自己的 API key 来使用 OpenAI TTS 服务。",
   "enjoyTtsServiceDescription": "使用 Enjoy 提供的 TTS 服务，支持 OpenAI 或 Azure。",
   "compressMediaBeforeAdding": "添加前压缩媒体",
-  "keepOriginalMedia": "保存原始媒体"
+  "keepOriginalMedia": "保存原始媒体",
+  "myPronunciation": "我的发音",
+  "originalPronunciation": "原始发音"
 }
diff --git a/enjoy/src/renderer/components/medias/media-bottom-panel/media-current-recording.tsx b/enjoy/src/renderer/components/medias/media-bottom-panel/media-current-recording.tsx
index 89b437bb5..e4595ff51 100644
--- a/enjoy/src/renderer/components/medias/media-bottom-panel/media-current-recording.tsx
+++ b/enjoy/src/renderer/components/medias/media-bottom-panel/media-current-recording.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useContext, useRef, useState } from "react";
+import { useEffect, useContext, useRef, useState, useMemo } from "react";
 import {
   AppSettingsProviderContext,
   HotKeysSettingsProviderContext,
@@ -50,17 +50,13 @@ import { formatDuration } from "@renderer/lib/utils";
 import { useHotkeys } from "react-hotkeys-hook";
 import { LiveAudioVisualizer } from "react-audio-visualize";
 import debounce from "lodash/debounce";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 
 const ACTION_BUTTON_HEIGHT = 35;
 export const MediaCurrentRecording = () => {
   const {
     isRecording,
     isPaused,
-    cancelRecording,
-    togglePauseResume,
-    stopRecording,
-    recordingTime,
-    mediaRecorder,
     currentRecording,
     renderPitchContour: renderMediaPitchContour,
     regions: mediaRegions,
@@ -71,6 +67,8 @@ export const MediaCurrentRecording = () => {
     currentSegment,
     createSegment,
     currentTime: mediaCurrentTime,
+    caption,
+    toggleRegion,
   } = useContext(MediaShadowProviderContext);
   const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
   const { currentHotkeys } = useContext(HotKeysSettingsProviderContext);
@@ -263,6 +261,23 @@ export const MediaCurrentRecording = () => {
       });
   };
 
+  // Select the index-th occurrence of `word` in the caption, then toggle playback.
+  const playWord = (word: string, index: number) => {
+    // The caption, or its word timeline, may not be available yet.
+    const entries: TimelineEntry[] = caption?.timeline ?? [];
+    const wanted = word.toLowerCase();
+
+    let seen = 0;
+    const wordIndex = entries.findIndex(
+      (w) => w.text.toLowerCase() === wanted && seen++ === index
+    );
+    if (wordIndex < 0) return;
+
+    toggleRegion([wordIndex]);
+    // NOTE(review): presumably waits for the new region to become active.
+    setTimeout(() => wavesurfer?.playPause(), 250);
+  };
+
   const calContainerSize = () => {
     const size = ref?.current
       ?.closest(".media-recording-wrapper")
@@ -685,7 +700,12 @@ export const MediaCurrentRecording = () => {
             </SheetClose>
           </SheetHeader>
 
-          <RecordingDetail recording={currentRecording} />
+          <RecordingDetail
+            recording={currentRecording}
+            onPlayOrigin={(word: string, index: number = 0) =>
+              playWord(word, index)
+            }
+          />
         </SheetContent>
       </Sheet>
     </div>
@@ -745,7 +765,6 @@ const MediaRecorder = () => {
   const {
     mediaRecorder,
     recordingTime,
-    isRecording,
     isPaused,
     cancelRecording,
     togglePauseResume,
diff --git a/enjoy/src/renderer/components/medias/media-bottom-panel/media-player-controls.tsx b/enjoy/src/renderer/components/medias/media-bottom-panel/media-player-controls.tsx
index db388fd98..f83f0da4d 100644
--- a/enjoy/src/renderer/components/medias/media-bottom-panel/media-player-controls.tsx
+++ b/enjoy/src/renderer/components/medias/media-bottom-panel/media-player-controls.tsx
@@ -318,7 +318,7 @@ export const MediaPlayerControls = () => {
           wavesurfer.pause();
           setTimeout(() => {
             activeRegionDebouncePlay();
-          }, 500);
+          }, 250);
         } else if (playMode === "single") {
           wavesurfer.pause();
         }
diff --git a/enjoy/src/renderer/components/medias/media-right-panel/media-caption.tsx b/enjoy/src/renderer/components/medias/media-right-panel/media-caption.tsx
index 558468259..731c28ee3 100644
--- a/enjoy/src/renderer/components/medias/media-right-panel/media-caption.tsx
+++ b/enjoy/src/renderer/components/medias/media-right-panel/media-caption.tsx
@@ -34,7 +34,13 @@ export const MediaCaption = (props: {
 
   const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);
 
-  let words = caption.text.split(" ");
+  let words = caption.text
+    .replace(/ ([.,!?:;])/g, "$1")
+    .replace(/ (['"")])/g, "$1")
+    .replace(/ \.\.\./g, "...")
+    .split(/([—]|\s+)/g)
+    .filter((word) => word.trim() !== "" && word !== "—");
+
   const ipas = caption.timeline.map((w) =>
     w.timeline?.map((t) =>
       t.timeline && language.startsWith("en")
diff --git a/enjoy/src/renderer/components/medias/media-right-panel/media-right-panel.tsx b/enjoy/src/renderer/components/medias/media-right-panel/media-right-panel.tsx
index 04035f48d..d5a1547be 100644
--- a/enjoy/src/renderer/components/medias/media-right-panel/media-right-panel.tsx
+++ b/enjoy/src/renderer/components/medias/media-right-panel/media-right-panel.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useState, useContext, useRef } from "react";
+import { useEffect, useState, useContext, useRef, useMemo } from "react";
 import { MediaShadowProviderContext } from "@renderer/context";
 import cloneDeep from "lodash/cloneDeep";
 import {
@@ -11,10 +11,6 @@ import {
 } from "@renderer/components/ui";
 import { MediaCaption, MediaCaptionActions } from "@renderer/components";
 import { t } from "i18next";
-import {
-  Timeline,
-  TimelineEntry,
-} from "echogarden/dist/utilities/Timeline.d.js";
 import {
   MediaCaptionAnalysis,
   MediaCaptionNote,
@@ -29,12 +25,14 @@ export const MediaRightPanel = (props: {
 }) => {
   const { className, setDisplayPanel } = props;
   const {
+    caption,
     currentSegmentIndex,
     currentTime,
     transcription,
     regions,
     activeRegion,
     setActiveRegion,
+    toggleRegion,
     editingRegion,
     setEditingRegion,
     setTranscriptionDraft,
@@ -47,7 +45,6 @@ export const MediaRightPanel = (props: {
   const [displayIpa, setDisplayIpa] = useState<boolean>(true);
   const [displayNotes, setDisplayNotes] = useState<boolean>(true);
 
-  const [caption, setCaption] = useState<TimelineEntry | null>(null);
   const [tab, setTab] = useState<string>("translation");
 
   const toggleMultiSelect = (event: KeyboardEvent) => {
@@ -79,67 +76,6 @@ export const MediaRightPanel = (props: {
     }
   };
 
-  const toggleRegion = (params: number[]) => {
-    if (!activeRegion) return;
-    if (editingRegion) {
-      toast.warning(t("currentRegionIsBeingEdited"));
-      return;
-    }
-    if (params.length === 0) {
-      if (activeRegion.id.startsWith("word-region")) {
-        activeRegion.remove();
-        setActiveRegion(
-          regions.getRegions().find((r) => r.id.startsWith("segment-region"))
-        );
-      }
-      return;
-    }
-
-    const startIndex = Math.min(...params);
-    const endIndex = Math.max(...params);
-
-    const startWord = caption.timeline[startIndex];
-    if (!startWord) return;
-
-    const endWord = caption.timeline[endIndex] || startWord;
-
-    const start = startWord.startTime;
-    const end = endWord.endTime;
-
-    // If the active region is a word region, then merge the selected words into a single region.
-    if (activeRegion.id.startsWith("word-region")) {
-      activeRegion.remove();
-
-      const region = regions.addRegion({
-        id: `word-region-${startIndex}`,
-        start,
-        end,
-        color: "#fb6f9233",
-        drag: false,
-        resize: editingRegion,
-      });
-
-      setActiveRegion(region);
-      // If the active region is a meaning group region, then active the segment region.
-    } else if (activeRegion.id.startsWith("meaning-group-region")) {
-      setActiveRegion(
-        regions.getRegions().find((r) => r.id.startsWith("segment-region"))
-      );
-      // If the active region is a segment region, then create a new word region.
-    } else {
-      const region = regions.addRegion({
-        id: `word-region-${startIndex}`,
-        start,
-        end,
-        color: "#fb6f9233",
-        drag: false,
-        resize: false,
-      });
-
-      setActiveRegion(region);
-    }
-  };
-
   useEffect(() => {
     if (!caption) return;
 
@@ -160,6 +96,7 @@ export const MediaRightPanel = (props: {
     toggleRegion(selectedIndices);
   }, [caption, selectedIndices]);
 
+  // Edit region to update transcription draft
   useEffect(() => {
     if (!activeRegion) return;
     if (!activeRegion.id.startsWith("word-region")) return;
@@ -234,12 +171,6 @@ export const MediaRightPanel = (props: {
     };
   }, [editingRegion]);
 
-  useEffect(() => {
-    setCaption(
-      (transcription?.result?.timeline as Timeline)?.[currentSegmentIndex]
-    );
-  }, [currentSegmentIndex, transcription]);
-
   useEffect(() => {
     return () => setSelectedIndices([]);
   }, [caption]);
diff --git a/enjoy/src/renderer/components/medias/media-shadow-player.tsx b/enjoy/src/renderer/components/medias/media-shadow-player.tsx
index 4ddb6235c..937786321 100644
--- a/enjoy/src/renderer/components/medias/media-shadow-player.tsx
+++ b/enjoy/src/renderer/components/medias/media-shadow-player.tsx
@@ -4,16 +4,13 @@ import {
   MediaRightPanel,
   MediaLeftPanel,
   MediaBottomPanel,
-  MediaProvider,
 } from "@renderer/components";
 import {
-  Button,
   ResizableHandle,
   ResizablePanel,
   ResizablePanelGroup,
 } from "@renderer/components/ui";
 import { useContext, useState } from "react";
-import { RefreshCcwDotIcon } from "lucide-react";
 
 export const MediaShadowPlayer = () => {
   return (
diff --git a/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-fulltext-result.tsx b/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-fulltext-result.tsx
index b6c5451c9..857963b1b 100644
--- a/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-fulltext-result.tsx
+++ b/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-fulltext-result.tsx
@@ -8,8 +8,9 @@ export const PronunciationAssessmentFulltextResult = (props: {
   words: PronunciationAssessmentWordResultType[];
   currentTime?: number;
   src?: string;
+  onPlayOrigin?: (word: string, index: number) => void;
 }) => {
-  const { words, currentTime, src } = props;
+  const { words, currentTime, src, onPlayOrigin } = props;
   const [errorStats, setErrorStats] = useState({
     mispronunciation: 0,
     omission: 0,
@@ -65,6 +66,16 @@ export const PronunciationAssessmentFulltextResult = (props: {
               errorDisplay={errorDisplay}
               currentTime={currentTime}
               src={src}
+              onPlayOrigin={() => {
+                if (!onPlayOrigin) return;
+                // Occurrence number among equal words (case-insensitive),
+                // counted by position so words without an offset work too.
+                const key = words[index].word.toLowerCase();
+                const occurrence = words
+                  .slice(0, index)
+                  .filter((w) => w.word.toLowerCase() === key).length;
+                onPlayOrigin(words[index].word, occurrence);
+              }}
             />
           ))}
         </div>
diff --git a/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-word-result.tsx b/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-word-result.tsx
index 025b59283..fa58c92ef 100644
--- a/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-word-result.tsx
+++ b/enjoy/src/renderer/components/pronunciation-assessments/pronunciation-assessment-word-result.tsx
@@ -20,6 +20,7 @@ export const PronunciationAssessmentWordResult = (props: {
     monotone: boolean;
   };
   currentTime?: number;
+  onPlayOrigin?: () => void;
 }) => {
   const {
     result,
@@ -32,6 +33,7 @@ export const PronunciationAssessmentWordResult = (props: {
       monotone: true,
     },
     currentTime = 0,
+    onPlayOrigin,
   } = props;
 
   const audio = useRef<HTMLAudioElement>(null);
@@ -71,25 +73,41 @@ export const PronunciationAssessmentWordResult = (props: {
   }[result.pronunciationAssessment.errorType];
 
   const play = () => {
+    // Replay only this word's slice of the recording.
+    const player = audio.current;
+    if (!player || !props.src) return;
+
     const { offset, duration } = result;
+    // 0 is a valid offset (first word), so only reject a missing one.
+    if (offset == null || !duration) return;
+
+    // Dividing by 1e7 yields seconds, the unit `currentTime` uses.
+    const endTime = (offset + duration) / 1e7;
 
-    // create a new audio element and play the segment
-    audio.current.src = `${props.src}#t=${(offset * 1.0) / 1e7},${
-      ((offset + duration) * 1.0) / 1e7
-    }`;
+    // Assigning the property replaces any handler left by an earlier play().
+    player.ontimeupdate = () => {
+      if (player.currentTime < endTime) return;
+      player.pause();
+      player.ontimeupdate = null;
+    };
+    player.currentTime = offset / 1e7;
     audio.current.play();
   };
 
   useEffect(() => {
     if (!audio.current) {
-      audio.current = new Audio();
+      audio.current = new Audio(props.src);
     }
 
     return () => {
-      audio.current?.pause();
-      delete audio.current;
+      // Stop playback and detach the stop handler before dropping the element.
+      if (audio.current) {
+        audio.current.pause();
+        audio.current.ontimeupdate = null;
+        audio.current = null;
+      }
     };
-  }, []);
+  }, [props.src]);
 
   return (
     <Popover>
@@ -152,11 +170,20 @@ export const PronunciationAssessmentWordResult = (props: {
           </div>
         )}
 
-        <div className="">
+        <div className="flex items-center space-x-2">
+          <span className="text-sm">{t("myPronunciation")}:</span>
           <Button onClick={play} variant="ghost" size="icon">
             <Volume2Icon className="w-5 h-5" />
           </Button>
         </div>
+        {onPlayOrigin && (
+          <div className="flex items-center space-x-2">
+            <span className="text-sm">{t("originalPronunciation")}:</span>
+            <Button onClick={onPlayOrigin} variant="ghost" size="icon">
+              <Volume2Icon className="w-5 h-5" />
+            </Button>
+          </div>
+        )}
       </PopoverContent>
     </Popover>
   );
diff --git a/enjoy/src/renderer/components/recordings/recording-detail.tsx b/enjoy/src/renderer/components/recordings/recording-detail.tsx
index 741e9d2fc..84bf54cfb 100644
--- a/enjoy/src/renderer/components/recordings/recording-detail.tsx
+++ b/enjoy/src/renderer/components/recordings/recording-detail.tsx
@@ -14,8 +14,9 @@ export const RecordingDetail = (props: {
   recording: RecordingType;
   pronunciationAssessment?: PronunciationAssessmentType;
   onAssess?: (assessment: PronunciationAssessmentType) => void;
+  onPlayOrigin?: (word: string, index?: number) => void;
 }) => {
-  const { recording, onAssess } = props;
+  const { recording, onAssess, onPlayOrigin } = props;
   if (!recording) return;
 
   const [pronunciationAssessment, setPronunciationAssessment] =
@@ -40,7 +41,7 @@ export const RecordingDetail = (props: {
     setAssessing(true);
     createAssessment({
       recording,
-      reference: recording.referenceText || "",
+      reference: recording.referenceText?.replace(/[—]/g, ", ") || "",
       language: recording.language || learningLanguage,
     })
       .then((assessment) => {
@@ -76,6 +77,7 @@ export const RecordingDetail = (props: {
           words={result.words}
           currentTime={currentTime}
           src={recording.src}
+          onPlayOrigin={onPlayOrigin}
         />
       ) : (
         <ScrollArea className="min-h-72 py-4 px-8 select-text">
diff --git a/enjoy/src/renderer/context/media-shadow-provider.tsx b/enjoy/src/renderer/context/media-shadow-provider.tsx
index ac9bc2c2f..6e6494ca5 100644
--- a/enjoy/src/renderer/context/media-shadow-provider.tsx
+++ b/enjoy/src/renderer/context/media-shadow-provider.tsx
@@ -1,4 +1,4 @@
-import { createContext, useEffect, useState, useContext } from "react";
+import { createContext, useEffect, useState, useContext, useMemo } from "react";
 import { convertIpaToNormal, extractFrequencies } from "@/utils";
 import { AppSettingsProviderContext } from "@renderer/context";
 import {
@@ -12,7 +12,10 @@ import Regions, {
   type Region as RegionType,
 } from "wavesurfer.js/dist/plugins/regions";
 import Chart from "chart.js/auto";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
+import {
+  Timeline,
+  TimelineEntry,
+} from "echogarden/dist/utilities/Timeline.d.js";
 import { toast } from "@renderer/components/ui";
 import { Tooltip } from "react-tooltip";
 import { useAudioRecorder } from "react-audio-voice-recorder";
@@ -48,6 +51,7 @@ type MediaShadowContextType = {
   regions: Regions | null;
   activeRegion: RegionType;
   setActiveRegion: (region: RegionType) => void;
+  toggleRegion: (params: number[]) => void;
   renderPitchContour: (
     region: RegionType,
     options?: {
@@ -74,6 +78,7 @@ type MediaShadowContextType = {
   transcribingOutput: string;
   transcriptionDraft: TranscriptionType["result"];
   setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
+  caption: TimelineEntry;
   // Recordings
   startRecording: () => void;
   stopRecording: () => void;
@@ -180,6 +185,10 @@ export const MediaShadowProvider = ({
     toast.error(exception.message);
   });
 
+  const caption = useMemo(() => {
+    return (transcription?.result?.timeline as Timeline)?.[currentSegmentIndex];
+  }, [currentSegmentIndex, transcription]);
+
   const { segment, createSegment } = useSegments({
     targetId: media?.id,
     targetType: media?.mediaType,
@@ -466,6 +475,67 @@ export const MediaShadowProvider = ({
       );
   };
 
+  /*
+   * Select the caption words at the given timeline indices as a word region,
+   * or restore the segment region when `params` is empty.
+   *
+   * Does nothing while no region is active, and only warns while a region is
+   * being edited.
+   */
+  const toggleRegion = (params: number[]) => {
+    if (!activeRegion) return;
+    if (editingRegion) {
+      toast.warning(t("currentRegionIsBeingEdited"));
+      return;
+    }
+
+    const isWordRegion = activeRegion.id.startsWith("word-region");
+    const activateSegmentRegion = () =>
+      setActiveRegion(
+        regions.getRegions().find((r) => r.id.startsWith("segment-region"))
+      );
+
+    // Empty selection: drop the word region, if any, and use the segment.
+    if (params.length === 0) {
+      if (isWordRegion) {
+        activeRegion.remove();
+        activateSegmentRegion();
+      }
+      return;
+    }
+
+    // The caption is undefined until a transcription segment is available,
+    // and any consumer of the context can call this function.
+    const words = caption?.timeline;
+    if (!words) return;
+
+    const startIndex = Math.min(...params);
+    const endIndex = Math.max(...params);
+    const startWord = words[startIndex];
+    if (!startWord) return;
+    const endWord = words[endIndex] || startWord;
+
+    // From a meaning group region, just activate the segment region.
+    if (activeRegion.id.startsWith("meaning-group-region")) {
+      activateSegmentRegion();
+      return;
+    }
+
+    // A word region is replaced, so the selected words merge into one region;
+    // from a segment region a new word region is simply created.
+    if (isWordRegion) activeRegion.remove();
+    const region = regions.addRegion({
+      id: `word-region-${startIndex}`,
+      start: startWord.startTime,
+      end: endWord.endTime,
+      color: "#fb6f9233",
+      drag: false,
+      // editingRegion is necessarily falsy past the early return above.
+      resize: isWordRegion ? editingRegion : false,
+    });
+    setActiveRegion(region);
+  };
+
   /*
    * When wavesurfer is decoded,
    * set up event listeners for wavesurfer
@@ -667,6 +737,7 @@ export const MediaShadowProvider = ({
           pitchChart,
           activeRegion,
           setActiveRegion,
+          toggleRegion,
           renderPitchContour,
           editingRegion,
           setEditingRegion,
@@ -676,6 +747,7 @@ export const MediaShadowProvider = ({
           transcribingOutput,
           transcriptionDraft,
           setTranscriptionDraft,
+          caption,
           startRecording,
           stopRecording,
           cancelRecording,