Skip to content

Commit f1578d8

Browse files
authored Dec 4, 2023
Merge pull request #209 from hyperaudio/208-client-whisper-improvements
208 client whisper improvements
2 parents 3deb95b + b4882ab commit f1578d8

5 files changed

+73
-36365
lines changed
 

‎hyperaudio-client-whisper-template.html

+9-11
Original file line numberDiff line numberDiff line change
@@ -8,32 +8,30 @@
88
<span style="display:block; padding:16px" class="label-text">or</span>
99
<input id="file-input" name="file" type="file" class="file-input w-full max-w-xs" />
1010
<hr class="my-2 h-0 border border-t-0 border-solid border-neutral-700 opacity-50 dark:border-neutral-200" />
11-
12-
<!--<label for="file-input" class="form-label">Which video/audio file should be transcribed?</label>
13-
<input class="form-control" type="file" id="file-input" accept=".mp3,.wav,.mp4,.mov,.avi,.flv,.wmv,.mpeg,.mpg,.webm,.opus">
14-
<div class="form-text">We only support audio and video files.</div>-->
1511
</div>
1612
<div class="mb-3">
1713
<label for="model-name-input" class="form-label label-text">Which model should be used?</label>
1814
<div>
1915
<select class="form-select select select-bordered w-full max-w-xs" aria-label="Default select example" id="model-name-input">
20-
<option selected="" value="whisper-tiny.en">Whisper (Tiny) English</option>
21-
<option value="whisper-tiny">Whisper (Tiny)</option>
22-
<option value="whisper-base">Whisper (Base) English</option>
23-
<option value="whisper-base">Whisper (Base)</option>
24-
<option value="whisper-small.en">Whisper (Small) English</option>
25-
<option value="whisper-small">Whisper (Small)</option>
16+
<option selected="" value="Xenova/whisper-tiny.en">Whisper (Tiny) English</option>
17+
<option value="Xenova/whisper-tiny">Whisper (Tiny)</option>
18+
<option value="Xenova/whisper-base">Whisper (Base) English</option>
19+
<option value="Xenova/whisper-base">Whisper (Base)</option>
20+
<option value="Xenova/whisper-small.en">Whisper (Small) English</option>
21+
<option value="Xenova/whisper-small">Whisper (Small)</option>
2622
</select>
2723
</div>
2824
<div class="form-text" style="font-size: 90%;">
2925
<p style="padding-top:16px">The models are listed in order of size. The larger the model, the more accurate it is – and slower to process.</p>
3026
<p>The English models are slightly more accurate (for the English language only).</p>
27+
<p>* Whisper running in the browser is currently in beta.</p>
3128
</div>
29+
3230
</div>
31+
3332
<div class="modal-action">
3433
<label id="form-submit-btn" for="transcribe-modal" class="btn btn-primary">TRANSCRIBE</label>
3534
</div>
36-
<!--<button id="form-submit-btn" class="btn btn-primary" disabled="">Submit</button>-->
3735
</form>
3836
</div>
3937
</body>

‎hyperaudio-deepgram-modal.html

-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
<div id="deepgram-modal-template">
55
<form id="deepgram-form" name="deepgram-form">
66
<div class="flex flex-col gap-4 w-full">
7-
<!--<label id="close-modal" for="transcribe-modal" class="btn btn-sm btn-circle absolute right-2 top-2">✕</label>
8-
<h3 class="font-bold text-lg">Transcribe</h3>-->
97
<input id="token" type="text" placeholder="Deepgram token" class="input input-bordered w-full max-w-xs" />
108
<hr class="my-2 h-0 border border-t-0 border-solid border-neutral-700 opacity-50 dark:border-neutral-200" />
119
<input id="media" type="text" placeholder="Link to media" class="input input-bordered w-full max-w-xs" />

‎index.html

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<!-- (C) The Hyperaudio Project. AGPL 3.0 @license: https://www.gnu.org/licenses/agpl-3.0.en.html -->
2-
<!-- Hyperaudio Lite Editor - Version 0.3 -->
2+
<!-- Hyperaudio Lite Editor - Version 0.4 -->
33

44
<!-- Hyperaudio Lite Editor's source code is provided under a dual license model.
55
@@ -219,7 +219,7 @@ <h3 class="text-lg font-bold">Topics</h3>
219219
<h3 class="font-bold text-lg" style="margin-bottom:16px">Transcribe</h3>
220220
<div role="tablist" class="tabs tabs-lifted">
221221

222-
<input type="radio" name="my_tabs_2" role="tab" class="tab" style="width:160px" aria-label="Whisper (Local)" checked />
222+
<input type="radio" name="my_tabs_2" role="tab" class="tab" style="width:160px" aria-label="Whisper (Local) *" checked />
223223
<div role="tabpanel" class="tab-content bg-base-100 border-base-300 rounded-box p-10">
224224
<client-whisper-service></client-whisper-service>
225225
</div>

‎js/hyperaudio-lite-editor-whisper.js

+42-106
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*! (C) The Hyperaudio Project. MIT @license: en.wikipedia.org/wiki/MIT_License. */
2-
/*! Version 0.0.4 */
2+
/*! Version 0.0.5 */
3+
34

45
class WhisperService extends HTMLElement {
56

@@ -50,33 +51,6 @@ function loadWhisperClient(modal) {
5051

5152
const whisperWorkerPath = "./js/whisper.worker.js";
5253

53-
// leave the following three consts as is as they are shared by
54-
// web.worker.js
55-
56-
const MessageTypes = {
57-
DOWNLOADING: "DOWNLOADING",
58-
LOADING: "LOADING",
59-
RESULT: "RESULT",
60-
RESULT_PARTIAL: "RESULT_PARTIAL",
61-
INFERENCE_REQUEST: "INFERENCE_REQUEST",
62-
INFERENCE_DONE: "INFERENCE_DONE"
63-
};
64-
65-
const LoadingStatus = {
66-
SUCCESS: "success",
67-
ERROR: "error",
68-
LOADING: "loading"
69-
};
70-
71-
const ModelNames = {
72-
WHISPER_TINY_EN: "openai/whisper-tiny.en",
73-
WHISPER_TINY: "openai/whisper-tiny",
74-
WHISPER_BASE: "openai/whisper-base",
75-
WHISPER_BASE_EN: "openai/whisper-base.en",
76-
WHISPER_SMALL: "openai/whisper-small",
77-
WHISPER_SMALL_EN: "openai/whisper-small.en"
78-
};
79-
8054
let webWorker = createWorker();
8155

8256
formSubmitBtn.disabled = true;
@@ -85,77 +59,54 @@ function loadWhisperClient(modal) {
8559
});
8660

8761
function createWorker() {
88-
const worker = new Worker(whisperWorkerPath);
62+
const worker = new Worker(whisperWorkerPath, { type: "module" });
63+
8964
let results = [];
90-
worker.onmessage = (event2) => {
91-
const { type } = event2.data;
92-
if (type === MessageTypes.LOADING) {
93-
handleLoadingMessage(event2.data);
94-
}
95-
if (type === MessageTypes.DOWNLOADING) {
96-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Downloading model...</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
97-
}
98-
if (type === MessageTypes.RESULT) {
99-
handleResultMessage(event2.data);
100-
results = event2.data.results;
101-
}
102-
if (type === MessageTypes.RESULT_PARTIAL) {
103-
104-
}
105-
if (type === MessageTypes.INFERENCE_DONE) {
106-
handleInferenceDone(results);
107-
}
65+
worker.onmessage = (event) => {
66+
handleInferenceDone(event.data);
10867
};
10968

11069
return worker;
11170
}
11271

113-
function handleLoadingMessage(data) {
114-
const { status } = data;
115-
116-
if (status === LoadingStatus.SUCCESS) {
117-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Transcribing.... <span id="transcription-progress">0</span>%</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
118-
}
119-
if (status === LoadingStatus.ERROR) {
120-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Oops! Something went wrong. Please refresh the page and try again.</center><br/><img src="'+errorSvg+'" width="50" alt="error" style="margin: auto; display: block;"></div>';
121-
}
122-
if (status === LoadingStatus.LOADING) {
123-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Loading model into memory...</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
124-
}
125-
}
126-
127-
function handleResultMessage(data) {
128-
const { results, completedUntilTimestamp } = data;
129-
const totalDuration = videoPlayer.duration;
130-
const progress = completedUntilTimestamp / totalDuration * 100;
131-
document.querySelector("#transcription-progress").innerHTML = Math.round(progress);
132-
}
133-
13472
function handleInferenceDone(results) {
13573

13674
console.log(results);
13775

13876
videoPlayer.currentTime = 0;
13977

14078
let hypertranscript = "";
141-
results.forEach((result) => {
142-
let words = result.text.split(' ');
143-
let interval = (result.end - result.start) / words.length;
144-
let timecode = result.start * 1000;
145-
let duration = Math.floor((interval*1000)-1);
146-
words.forEach((word) => {
147-
let start = Math.floor(timecode);
148-
hypertranscript += `<span data-m='${start}' data-d='${duration}'>${word} </span>\n`;
149-
timecode += interval*1000;
150-
});
151-
152-
// new para every 5 sentences
153-
if (result.index % 5 === 0 && result.index !== 0) {
154-
hypertranscript += "\n </p>\n <p>\n";
155-
}
79+
let sentences = 0;
80+
let lastWord = "";
15681

157-
console.log(hypertranscript);
82+
results.output.chunks.forEach((word) => {
83+
84+
// ignore text with square brackets - usually contains things like [BLANK _AUDIO]
85+
if (word.text.indexOf("[") < 0 && word.text.indexOf("]") < 0) {
86+
let start = Math.floor(word.timestamp[0]*1000);
87+
let duration = Math.floor((word.timestamp[1]*1000)-1) - start;
88+
let wordCapitalised = false;
89+
90+
if (Array.from(word.text)[0].toUpperCase() === Array.from(word.text)[0]){
91+
wordCapitalised = true;
92+
}
93+
94+
if (wordCapitalised === true && lastWord.endsWith(".") ){
95+
sentences += 1;
96+
}
97+
98+
lastWord = word.text;
99+
100+
// new para every 5 sentences
101+
if (sentences % 5 === 0 && sentences !== 0) {
102+
hypertranscript += "\n </p>\n <p>\n";
103+
sentences = 0;
104+
}
105+
106+
hypertranscript += `<span data-m='${start}' data-d='${duration}'>${word.text} </span>\n`;
107+
}
158108
});
109+
159110
resultsContainer.innerHTML = "<article>\n <section>\n <p>\n" + hypertranscript + " </p>\n </section>\n</article>\n";
160111

161112
const initEvent = new CustomEvent('hyperaudioInit');
@@ -166,20 +117,21 @@ function loadWhisperClient(modal) {
166117

167118
async function handleFormSubmission() {
168119

169-
if (!isFileUploaded() || !isModelNameSelected()) {
170-
return;
171-
}
172-
173-
const model_name = `openai/${modelNameSelectionInput.value}`;
120+
const model_name = modelNameSelectionInput.value;
174121
const file = fileUploadBtn.files[0];
175122
const audio = await readAudioFrom(file);
176123

177124
webWorker.postMessage({
178-
type: MessageTypes.INFERENCE_REQUEST,
125+
type: "INFERENCE_REQUEST",
179126
audio,
180127
model_name
181128
});
129+
130+
console.log("web worker");
131+
console.log(webWorker);
182132
videoPlayer.src = URL.createObjectURL(file);
133+
134+
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Transcribing.... </center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
183135
}
184136

185137
async function readAudioFrom(file) {
@@ -190,20 +142,4 @@ function loadWhisperClient(modal) {
190142
const audio = decoded.getChannelData(0);
191143
return audio;
192144
}
193-
194-
function isFileUploaded() {
195-
if (fileUploadBtn.files.length === 0) {
196-
return false;
197-
}
198-
return true;
199-
}
200-
201-
function isModelNameSelected() {
202-
const selectedValue = modelNameSelectionInput.value;
203-
if (modelNameSelectionInput.value === "") {
204-
return false;
205-
}
206-
const modelName = `openai/${selectedValue}`;
207-
return Object.values(ModelNames).indexOf(modelName) !== -1;
208-
}
209145
}

‎js/whisper.worker.js

+20-36,244
Large diffs are not rendered by default.

0 commit comments

Comments (0)
Please sign in to comment.