Skip to content

Commit f1578d8

Browse files
authored Dec 4, 2023
Merge pull request #209 from hyperaudio/208-client-whisper-improvements
208 client whisper improvements
2 parents 3deb95b + b4882ab commit f1578d8

5 files changed

+73
-36365
lines changed
 

‎hyperaudio-client-whisper-template.html

+9-11
Original file line numberDiff line numberDiff line change
@@ -8,32 +8,30 @@
88
<span style="display:block; padding:16px" class="label-text">or</span>
99
<input id="file-input" name="file" type="file" class="file-input w-full max-w-xs" />
1010
<hr class="my-2 h-0 border border-t-0 border-solid border-neutral-700 opacity-50 dark:border-neutral-200" />
11-
12-
<!--<label for="file-input" class="form-label">Which video/audio file should be transcribed?</label>
13-
<input class="form-control" type="file" id="file-input" accept=".mp3,.wav,.mp4,.mov,.avi,.flv,.wmv,.mpeg,.mpg,.webm,.opus">
14-
<div class="form-text">We only support audio and video files.</div>-->
1511
</div>
1612
<div class="mb-3">
1713
<label for="model-name-input" class="form-label label-text">Which model should be used?</label>
1814
<div>
1915
<select class="form-select select select-bordered w-full max-w-xs" aria-label="Default select example" id="model-name-input">
20-
<option selected="" value="whisper-tiny.en">Whisper (Tiny) English</option>
21-
<option value="whisper-tiny">Whisper (Tiny)</option>
22-
<option value="whisper-base">Whisper (Base) English</option>
23-
<option value="whisper-base">Whisper (Base)</option>
24-
<option value="whisper-small.en">Whisper (Small) English</option>
25-
<option value="whisper-small">Whisper (Small)</option>
16+
<option selected="" value="Xenova/whisper-tiny.en">Whisper (Tiny) English</option>
17+
<option value="Xenova/whisper-tiny">Whisper (Tiny)</option>
18+
<option value="Xenova/whisper-base">Whisper (Base) English</option>
19+
<option value="Xenova/whisper-base">Whisper (Base)</option>
20+
<option value="Xenova/whisper-small.en">Whisper (Small) English</option>
21+
<option value="Xenova/whisper-small">Whisper (Small)</option>
2622
</select>
2723
</div>
2824
<div class="form-text" style="font-size: 90%;">
2925
<p style="padding-top:16px">The models are listed in order of size. The larger the model, the more accurate it is – and slower to process.</p>
3026
<p>The English models are slightly more accurate (for the English language only).</p>
27+
<p>* Whisper running in the browser is currently in beta.</p>
3128
</div>
29+
3230
</div>
31+
3332
<div class="modal-action">
3433
<label id="form-submit-btn" for="transcribe-modal" class="btn btn-primary">TRANSCRIBE</label>
3534
</div>
36-
<!--<button id="form-submit-btn" class="btn btn-primary" disabled="">Submit</button>-->
3735
</form>
3836
</div>
3937
</body>

‎hyperaudio-deepgram-modal.html

-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
<div id="deepgram-modal-template">
55
<form id="deepgram-form" name="deepgram-form">
66
<div class="flex flex-col gap-4 w-full">
7-
<!--<label id="close-modal" for="transcribe-modal" class="btn btn-sm btn-circle absolute right-2 top-2">✕</label>
8-
<h3 class="font-bold text-lg">Transcribe</h3>-->
97
<input id="token" type="text" placeholder="Deepgram token" class="input input-bordered w-full max-w-xs" />
108
<hr class="my-2 h-0 border border-t-0 border-solid border-neutral-700 opacity-50 dark:border-neutral-200" />
119
<input id="media" type="text" placeholder="Link to media" class="input input-bordered w-full max-w-xs" />

‎index.html

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<!-- (C) The Hyperaudio Project. AGPL 3.0 @license: https://www.gnu.org/licenses/agpl-3.0.en.html -->
2-
<!-- Hyperaudio Lite Editor - Version 0.3 -->
2+
<!-- Hyperaudio Lite Editor - Version 0.4 -->
33

44
<!-- Hyperaudio Lite Editor's source code is provided under a dual license model.
55
@@ -219,7 +219,7 @@ <h3 class="text-lg font-bold">Topics</h3>
219219
<h3 class="font-bold text-lg" style="margin-bottom:16px">Transcribe</h3>
220220
<div role="tablist" class="tabs tabs-lifted">
221221

222-
<input type="radio" name="my_tabs_2" role="tab" class="tab" style="width:160px" aria-label="Whisper (Local)" checked />
222+
<input type="radio" name="my_tabs_2" role="tab" class="tab" style="width:160px" aria-label="Whisper (Local) *" checked />
223223
<div role="tabpanel" class="tab-content bg-base-100 border-base-300 rounded-box p-10">
224224
<client-whisper-service></client-whisper-service>
225225
</div>

‎js/hyperaudio-lite-editor-whisper.js

+42-106
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*! (C) The Hyperaudio Project. MIT @license: en.wikipedia.org/wiki/MIT_License. */
2-
/*! Version 0.0.4 */
2+
/*! Version 0.0.5 */
3+
34

45
class WhisperService extends HTMLElement {
56

@@ -50,33 +51,6 @@ function loadWhisperClient(modal) {
5051

5152
const whisperWorkerPath = "./js/whisper.worker.js";
5253

53-
// leave the following three consts as is as they are shared by
54-
// web.worker.js
55-
56-
const MessageTypes = {
57-
DOWNLOADING: "DOWNLOADING",
58-
LOADING: "LOADING",
59-
RESULT: "RESULT",
60-
RESULT_PARTIAL: "RESULT_PARTIAL",
61-
INFERENCE_REQUEST: "INFERENCE_REQUEST",
62-
INFERENCE_DONE: "INFERENCE_DONE"
63-
};
64-
65-
const LoadingStatus = {
66-
SUCCESS: "success",
67-
ERROR: "error",
68-
LOADING: "loading"
69-
};
70-
71-
const ModelNames = {
72-
WHISPER_TINY_EN: "openai/whisper-tiny.en",
73-
WHISPER_TINY: "openai/whisper-tiny",
74-
WHISPER_BASE: "openai/whisper-base",
75-
WHISPER_BASE_EN: "openai/whisper-base.en",
76-
WHISPER_SMALL: "openai/whisper-small",
77-
WHISPER_SMALL_EN: "openai/whisper-small.en"
78-
};
79-
8054
let webWorker = createWorker();
8155

8256
formSubmitBtn.disabled = true;
@@ -85,77 +59,54 @@ function loadWhisperClient(modal) {
8559
});
8660

8761
function createWorker() {
88-
const worker = new Worker(whisperWorkerPath);
62+
const worker = new Worker(whisperWorkerPath, { type: "module" });
63+
8964
let results = [];
90-
worker.onmessage = (event2) => {
91-
const { type } = event2.data;
92-
if (type === MessageTypes.LOADING) {
93-
handleLoadingMessage(event2.data);
94-
}
95-
if (type === MessageTypes.DOWNLOADING) {
96-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Downloading model...</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
97-
}
98-
if (type === MessageTypes.RESULT) {
99-
handleResultMessage(event2.data);
100-
results = event2.data.results;
101-
}
102-
if (type === MessageTypes.RESULT_PARTIAL) {
103-
104-
}
105-
if (type === MessageTypes.INFERENCE_DONE) {
106-
handleInferenceDone(results);
107-
}
65+
worker.onmessage = (event) => {
66+
handleInferenceDone(event.data);
10867
};
10968

11069
return worker;
11170
}
11271

113-
function handleLoadingMessage(data) {
114-
const { status } = data;
115-
116-
if (status === LoadingStatus.SUCCESS) {
117-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Transcribing.... <span id="transcription-progress">0</span>%</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
118-
}
119-
if (status === LoadingStatus.ERROR) {
120-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Oops! Something went wrong. Please refresh the page and try again.</center><br/><img src="'+errorSvg+'" width="50" alt="error" style="margin: auto; display: block;"></div>';
121-
}
122-
if (status === LoadingStatus.LOADING) {
123-
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Loading model into memory...</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
124-
}
125-
}
126-
127-
function handleResultMessage(data) {
128-
const { results, completedUntilTimestamp } = data;
129-
const totalDuration = videoPlayer.duration;
130-
const progress = completedUntilTimestamp / totalDuration * 100;
131-
document.querySelector("#transcription-progress").innerHTML = Math.round(progress);
132-
}
133-
13472
function handleInferenceDone(results) {
13573

13674
console.log(results);
13775

13876
videoPlayer.currentTime = 0;
13977

14078
let hypertranscript = "";
141-
results.forEach((result) => {
142-
let words = result.text.split(' ');
143-
let interval = (result.end - result.start) / words.length;
144-
let timecode = result.start * 1000;
145-
let duration = Math.floor((interval*1000)-1);
146-
words.forEach((word) => {
147-
let start = Math.floor(timecode);
148-
hypertranscript += `<span data-m='${start}' data-d='${duration}'>${word} </span>\n`;
149-
timecode += interval*1000;
150-
});
151-
152-
// new para every 5 sentences
153-
if (result.index % 5 === 0 && result.index !== 0) {
154-
hypertranscript += "\n </p>\n <p>\n";
155-
}
79+
let sentences = 0;
80+
let lastWord = "";
15681

157-
console.log(hypertranscript);
82+
results.output.chunks.forEach((word) => {
83+
84+
// ignore text with square brackets - usually contains things like [BLANK _AUDIO]
85+
if (word.text.indexOf("[") < 0 && word.text.indexOf("]") < 0) {
86+
let start = Math.floor(word.timestamp[0]*1000);
87+
let duration = Math.floor((word.timestamp[1]*1000)-1) - start;
88+
let wordCapitalised = false;
89+
90+
if (Array.from(word.text)[0].toUpperCase() === Array.from(word.text)[0]){
91+
wordCapitalised = true;
92+
}
93+
94+
if (wordCapitalised === true && lastWord.endsWith(".") ){
95+
sentences += 1;
96+
}
97+
98+
lastWord = word.text;
99+
100+
// new para every 5 sentences
101+
if (sentences % 5 === 0 && sentences !== 0) {
102+
hypertranscript += "\n </p>\n <p>\n";
103+
sentences = 0;
104+
}
105+
106+
hypertranscript += `<span data-m='${start}' data-d='${duration}'>${word.text} </span>\n`;
107+
}
158108
});
109+
159110
resultsContainer.innerHTML = "<article>\n <section>\n <p>\n" + hypertranscript + " </p>\n </section>\n</article>\n";
160111

161112
const initEvent = new CustomEvent('hyperaudioInit');
@@ -166,20 +117,21 @@ function loadWhisperClient(modal) {
166117

167118
async function handleFormSubmission() {
168119

169-
if (!isFileUploaded() || !isModelNameSelected()) {
170-
return;
171-
}
172-
173-
const model_name = `openai/${modelNameSelectionInput.value}`;
120+
const model_name = modelNameSelectionInput.value;
174121
const file = fileUploadBtn.files[0];
175122
const audio = await readAudioFrom(file);
176123

177124
webWorker.postMessage({
178-
type: MessageTypes.INFERENCE_REQUEST,
125+
type: "INFERENCE_REQUEST",
179126
audio,
180127
model_name
181128
});
129+
130+
console.log("web worker");
131+
console.log(webWorker);
182132
videoPlayer.src = URL.createObjectURL(file);
133+
134+
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Transcribing.... </center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
183135
}
184136

185137
async function readAudioFrom(file) {
@@ -190,20 +142,4 @@ function loadWhisperClient(modal) {
190142
const audio = decoded.getChannelData(0);
191143
return audio;
192144
}
193-
194-
function isFileUploaded() {
195-
if (fileUploadBtn.files.length === 0) {
196-
return false;
197-
}
198-
return true;
199-
}
200-
201-
function isModelNameSelected() {
202-
const selectedValue = modelNameSelectionInput.value;
203-
if (modelNameSelectionInput.value === "") {
204-
return false;
205-
}
206-
const modelName = `openai/${selectedValue}`;
207-
return Object.values(ModelNames).indexOf(modelName) !== -1;
208-
}
209145
}

‎js/whisper.worker.js

+20-36,244
Large diffs are not rendered by default.

0 commit comments

Comments (0)
Please sign in to comment.