Improve transcription (#1182)

* handle special characters in transcription when assessing * may play original pronunciation in assessment result
ZuodaoTech · Nov 15, 2024 · b8167a9 · b8167a9
1 parent 1f531b0
commit b8167a9
Show file tree

Hide file tree

Showing 11 changed files with 171 additions and 102 deletions.
diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
@@ -927,5 +927,7 @@
   "openaiTtsServiceDescription": "Use OpenAI TTS service from your own key.",
   "enjoyTtsServiceDescription": "Use TTS service provided by Enjoy. OpenAI or Azure is supported.",
   "compressMediaBeforeAdding": "Compress media before adding",
-  "keepOriginalMedia": "Keep original media"
+  "keepOriginalMedia": "Keep original media",
+  "myPronunciation": "My pronunciation",
+  "originalPronunciation": "Original pronunciation"
 }
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
@@ -927,5 +927,7 @@
   "openaiTtsServiceDescription": "使用您自己的 API key 来使用 OpenAI TTS 服务。",
   "enjoyTtsServiceDescription": "使用 Enjoy 提供的 TTS 服务，支持 OpenAI 或 Azure。",
   "compressMediaBeforeAdding": "添加前压缩媒体",
-  "keepOriginalMedia": "保存原始媒体"
+  "keepOriginalMedia": "保存原始媒体",
+  "myPronunciation": "我的发音",
+  "originalPronunciation": "原始发音"
 }
diff --git a/enjoy/src/renderer/components/medias/media-bottom-panel/media-current-recording.tsx b/enjoy/src/renderer/components/medias/media-bottom-panel/media-current-recording.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useContext, useRef, useState } from "react";
+import { useEffect, useContext, useRef, useState, useMemo } from "react";
 import {
   AppSettingsProviderContext,
   HotKeysSettingsProviderContext,
@@ -50,17 +50,13 @@ import { formatDuration } from "@renderer/lib/utils";
 import { useHotkeys } from "react-hotkeys-hook";
 import { LiveAudioVisualizer } from "react-audio-visualize";
 import debounce from "lodash/debounce";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 
 const ACTION_BUTTON_HEIGHT = 35;
 export const MediaCurrentRecording = () => {
   const {
     isRecording,
     isPaused,
-    cancelRecording,
-    togglePauseResume,
-    stopRecording,
-    recordingTime,
-    mediaRecorder,
     currentRecording,
     renderPitchContour: renderMediaPitchContour,
     regions: mediaRegions,
@@ -71,6 +67,8 @@ export const MediaCurrentRecording = () => {
     currentSegment,
     createSegment,
     currentTime: mediaCurrentTime,
+    caption,
+    toggleRegion,
   } = useContext(MediaShadowProviderContext);
   const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
   const { currentHotkeys } = useContext(HotKeysSettingsProviderContext);
@@ -263,6 +261,23 @@ export const MediaCurrentRecording = () => {
       });
   };
 
+  const playWord = (word: string, index: number) => {
+    const candidates = caption.timeline.filter(
+      (w: TimelineEntry) => w.text.toLowerCase() === word.toLowerCase()
+    );
+    const target = candidates[index];
+    if (!target) return;
+
+    const wordIndex = caption.timeline.findIndex(
+      (w) => w.startTime === target.startTime
+    );
+
+    toggleRegion([wordIndex]);
+    setTimeout(() => {
+      wavesurfer?.playPause();
+    }, 250);
+  };
+
   const calContainerSize = () => {
     const size = ref?.current
       ?.closest(".media-recording-wrapper")
@@ -685,7 +700,12 @@ export const MediaCurrentRecording = () => {
             </SheetClose>
           </SheetHeader>
 
-          <RecordingDetail recording={currentRecording} />
+          <RecordingDetail
+            recording={currentRecording}
+            onPlayOrigin={(word: string, index: number = 0) =>
+              playWord(word, index)
+            }
+          />
         </SheetContent>
       </Sheet>
     </div>
@@ -745,7 +765,6 @@ const MediaRecorder = () => {
   const {
     mediaRecorder,
     recordingTime,
-    isRecording,
     isPaused,
     cancelRecording,
     togglePauseResume,

diff --git a/enjoy/src/renderer/components/medias/media-bottom-panel/media-player-controls.tsx b/enjoy/src/renderer/components/medias/media-bottom-panel/media-player-controls.tsx
@@ -318,7 +318,7 @@ export const MediaPlayerControls = () => {
           wavesurfer.pause();
           setTimeout(() => {
             activeRegionDebouncePlay();
-          }, 500);
+          }, 250);
         } else if (playMode === "single") {
           wavesurfer.pause();
         }

diff --git a/enjoy/src/renderer/components/medias/media-right-panel/media-caption.tsx b/enjoy/src/renderer/components/medias/media-right-panel/media-caption.tsx
@@ -34,7 +34,13 @@ export const MediaCaption = (props: {
 
   const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);
 
-  let words = caption.text.split(" ");
+  let words = caption.text
+    .replace(/ ([.,!?:;])/g, "$1")
+    .replace(/ (['"")])/g, "$1")
+    .replace(/ \.\.\./g, "...")
+    .split(/([—]|\s+)/g)
+    .filter((word) => word.trim() !== "" && word !== "—");
+
   const ipas = caption.timeline.map((w) =>
     w.timeline?.map((t) =>
       t.timeline && language.startsWith("en")

diff --git a/enjoy/src/renderer/components/medias/media-right-panel/media-right-panel.tsx b/enjoy/src/renderer/components/medias/media-right-panel/media-right-panel.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useState, useContext, useRef } from "react";
+import { useEffect, useState, useContext, useRef, useMemo } from "react";
 import { MediaShadowProviderContext } from "@renderer/context";
 import cloneDeep from "lodash/cloneDeep";
 import {
@@ -11,10 +11,6 @@ import {
 } from "@renderer/components/ui";
 import { MediaCaption, MediaCaptionActions } from "@renderer/components";
 import { t } from "i18next";
-import {
-  Timeline,
-  TimelineEntry,
-} from "echogarden/dist/utilities/Timeline.d.js";
 import {
   MediaCaptionAnalysis,
   MediaCaptionNote,
@@ -29,12 +25,14 @@ export const MediaRightPanel = (props: {
 }) => {
   const { className, setDisplayPanel } = props;
   const {
+    caption,
     currentSegmentIndex,
     currentTime,
     transcription,
     regions,
     activeRegion,
     setActiveRegion,
+    toggleRegion,
     editingRegion,
     setEditingRegion,
     setTranscriptionDraft,
@@ -47,7 +45,6 @@ export const MediaRightPanel = (props: {
   const [displayIpa, setDisplayIpa] = useState<boolean>(true);
   const [displayNotes, setDisplayNotes] = useState<boolean>(true);
 
-  const [caption, setCaption] = useState<TimelineEntry | null>(null);
   const [tab, setTab] = useState<string>("translation");
 
   const toggleMultiSelect = (event: KeyboardEvent) => {
@@ -79,67 +76,6 @@ export const MediaRightPanel = (props: {
     }
   };
 
-  const toggleRegion = (params: number[]) => {
-    if (!activeRegion) return;
-    if (editingRegion) {
-      toast.warning(t("currentRegionIsBeingEdited"));
-      return;
-    }
-    if (params.length === 0) {
-      if (activeRegion.id.startsWith("word-region")) {
-        activeRegion.remove();
-        setActiveRegion(
-          regions.getRegions().find((r) => r.id.startsWith("segment-region"))
-        );
-      }
-      return;
-    }
-
-    const startIndex = Math.min(...params);
-    const endIndex = Math.max(...params);
-
-    const startWord = caption.timeline[startIndex];
-    if (!startWord) return;
-
-    const endWord = caption.timeline[endIndex] || startWord;
-
-    const start = startWord.startTime;
-    const end = endWord.endTime;
-
-    // If the active region is a word region, then merge the selected words into a single region.
-    if (activeRegion.id.startsWith("word-region")) {
-      activeRegion.remove();
-
-      const region = regions.addRegion({
-        id: `word-region-${startIndex}`,
-        start,
-        end,
-        color: "#fb6f9233",
-        drag: false,
-        resize: editingRegion,
-      });
-
-      setActiveRegion(region);
-      // If the active region is a meaning group region, then active the segment region.
-    } else if (activeRegion.id.startsWith("meaning-group-region")) {
-      setActiveRegion(
-        regions.getRegions().find((r) => r.id.startsWith("segment-region"))
-      );
-      // If the active region is a segment region, then create a new word region.
-    } else {
-      const region = regions.addRegion({
-        id: `word-region-${startIndex}`,
-        start,
-        end,
-        color: "#fb6f9233",
-        drag: false,
-        resize: false,
-      });
-
-      setActiveRegion(region);
-    }
-  };
-
   useEffect(() => {
     if (!caption) return;
 
@@ -160,6 +96,7 @@ export const MediaRightPanel = (props: {
     toggleRegion(selectedIndices);
   }, [caption, selectedIndices]);
 
+  // Edit region to update transcription draft
   useEffect(() => {
     if (!activeRegion) return;
     if (!activeRegion.id.startsWith("word-region")) return;
@@ -234,12 +171,6 @@ export const MediaRightPanel = (props: {
     };
   }, [editingRegion]);
 
-  useEffect(() => {
-    setCaption(
-      (transcription?.result?.timeline as Timeline)?.[currentSegmentIndex]
-    );
-  }, [currentSegmentIndex, transcription]);
-
   useEffect(() => {
     return () => setSelectedIndices([]);
   }, [caption]);

diff --git a/enjoy/src/renderer/components/medias/media-shadow-player.tsx b/enjoy/src/renderer/components/medias/media-shadow-player.tsx
@@ -4,16 +4,13 @@ import {
   MediaRightPanel,
   MediaLeftPanel,
   MediaBottomPanel,
-  MediaProvider,
 } from "@renderer/components";
 import {
-  Button,
   ResizableHandle,
   ResizablePanel,
   ResizablePanelGroup,
 } from "@renderer/components/ui";
 import { useContext, useState } from "react";
-import { RefreshCcwDotIcon } from "lucide-react";
 
 export const MediaShadowPlayer = () => {
   return (

diff --git a/...enderer/components/pronunciation-assessments/pronunciation-assessment-fulltext-result.tsx b/...enderer/components/pronunciation-assessments/pronunciation-assessment-fulltext-result.tsx
@@ -8,8 +8,9 @@ export const PronunciationAssessmentFulltextResult = (props: {
   words: PronunciationAssessmentWordResultType[];
   currentTime?: number;
   src?: string;
+  onPlayOrigin?: (word: string, index: number) => void;
 }) => {
-  const { words, currentTime, src } = props;
+  const { words, currentTime, src, onPlayOrigin } = props;
   const [errorStats, setErrorStats] = useState({
     mispronunciation: 0,
     omission: 0,
@@ -65,6 +66,16 @@ export const PronunciationAssessmentFulltextResult = (props: {
               errorDisplay={errorDisplay}
               currentTime={currentTime}
               src={src}
+              onPlayOrigin={() => {
+                if (!onPlayOrigin) return;
+
+                const word = words[index];
+                const candidates = words.filter((w) => w.word === word.word);
+                const wordIndex = candidates.findIndex(
+                  (w) => w.offset === word.offset
+                );
+                onPlayOrigin(word.word, wordIndex);
+              }}
             />
           ))}
         </div>

diff --git a/...rc/renderer/components/pronunciation-assessments/pronunciation-assessment-word-result.tsx b/...rc/renderer/components/pronunciation-assessments/pronunciation-assessment-word-result.tsx
@@ -20,6 +20,7 @@ export const PronunciationAssessmentWordResult = (props: {
     monotone: boolean;
   };
   currentTime?: number;
+  onPlayOrigin?: () => void;
 }) => {
   const {
     result,
@@ -32,6 +33,7 @@ export const PronunciationAssessmentWordResult = (props: {
       monotone: true,
     },
     currentTime = 0,
+    onPlayOrigin,
   } = props;
 
   const audio = useRef<HTMLAudioElement>(null);
@@ -71,25 +73,41 @@ export const PronunciationAssessmentWordResult = (props: {
   }[result.pronunciationAssessment.errorType];
 
   const play = () => {
+    if (!audio.current || !props.src) return;
+
     const { offset, duration } = result;
+    if (!offset || !duration) return;
+
+    const startTime = (offset * 1.0) / 1e7;
+    const endTime = ((offset + duration) * 1.0) / 1e7;
 
-    // create a new audio element and play the segment
-    audio.current.src = `${props.src}#t=${(offset * 1.0) / 1e7},${
-      ((offset + duration) * 1.0) / 1e7
-    }`;
+    audio.current.currentTime = startTime;
+
+    // Add timeupdate listener to stop at the end of the segment
+    const handleTimeUpdate = () => {
+      if (audio.current.currentTime >= endTime) {
+        audio.current.pause();
+        audio.current.removeEventListener("timeupdate", handleTimeUpdate);
+      }
+    };
+
+    audio.current.addEventListener("timeupdate", handleTimeUpdate);
     audio.current.play();
   };
 
   useEffect(() => {
     if (!audio.current) {
-      audio.current = new Audio();
+      audio.current = new Audio(props.src);
     }
 
     return () => {
-      audio.current?.pause();
-      delete audio.current;
+      if (audio.current) {
+        audio.current.pause();
+        audio.current.removeEventListener("timeupdate", () => {});
+        audio.current = null;
+      }
     };
-  }, []);
+  }, [props.src]);
 
   return (
     <Popover>
@@ -152,11 +170,20 @@ export const PronunciationAssessmentWordResult = (props: {
           </div>
         )}
 
-        <div className="">
+        <div className="flex items-center space-x-2">
+          <span className="text-sm">{t("myPronunciation")}:</span>
           <Button onClick={play} variant="ghost" size="icon">
             <Volume2Icon className="w-5 h-5" />
           </Button>
         </div>
+        {onPlayOrigin && (
+          <div className="flex items-center space-x-2">
+            <span className="text-sm">{t("originalPronunciation")}:</span>
+            <Button onClick={onPlayOrigin} variant="ghost" size="icon">
+              <Volume2Icon className="w-5 h-5" />
+            </Button>
+          </div>
+        )}
       </PopoverContent>
     </Popover>
   );