Merge pull request #262 from makaveli10/discard_no_speech_segments

Discard no-speech segments.
collabora · Aug 19, 2024 · bdaed45 · bdaed45
2 parents aade677 + 4870e9f
commit bdaed45
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 15 deletions.
diff --git a/requirements/server.txt b/requirements/server.txt
@@ -1,5 +1,5 @@
 faster-whisper==1.0.1
-torch
+torch==2.3.0
 websockets
 onnxruntime==1.16.0
 numba

diff --git a/whisper_live/client.py b/whisper_live/client.py
@@ -29,7 +29,8 @@ def __init__(
         translate=False,
         model="small",
         srt_file_path="output.srt",
-        use_vad=True
+        use_vad=True,
+        log_transcription=True
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.
@@ -57,11 +58,11 @@ def __init__(
         self.use_vad = use_vad
         self.last_segment = None
         self.last_received_segment = None
+        self.log_transcription = log_transcription
 
         if translate:
             self.task = "translate"
 
-        self.timestamp_offset = 0.0
         self.audio_bytes = None
 
         if host is not None and port is not None:
@@ -118,10 +119,11 @@ def process_segments(self, segments):
             self.last_response_received = time.time()
             self.last_received_segment = segments[-1]["text"]
 
-        # Truncate to last 3 entries for brevity.
-        text = text[-3:]
-        utils.clear_screen()
-        utils.print_transcript(text)
+        if self.log_transcription:
+            # Truncate to last 3 entries for brevity.
+            text = text[-3:]
+            utils.clear_screen()
+            utils.print_transcript(text)
 
     def on_message(self, ws, message):
         """
@@ -677,9 +679,10 @@ def __init__(
         use_vad=True,
         save_output_recording=False,
         output_recording_filename="./output_recording.wav",
-        output_transcription_path="./output.srt"
+        output_transcription_path="./output.srt",
+        log_transcription=True,
     ):
-        self.client = Client(host, port, lang, translate, model, srt_file_path=output_transcription_path, use_vad=use_vad)
+        self.client = Client(host, port, lang, translate, model, srt_file_path=output_transcription_path, use_vad=use_vad, log_transcription=log_transcription)
         if save_output_recording and not output_recording_filename.endswith(".wav"):
             raise ValueError(f"Please provide a valid `output_recording_filename`: {output_recording_filename}")
         if not output_transcription_path.endswith(".srt"):

diff --git a/whisper_live/server.py b/whisper_live/server.py
@@ -973,6 +973,7 @@ def speech_to_text(self):
 
             input_bytes, duration = self.get_audio_chunk_for_processing()
             if duration < 1.0:
+                time.sleep(0.1)     # wait for audio chunks to arrive
                 continue
             try:
                 input_sample = input_bytes.copy()
@@ -1046,12 +1047,14 @@ def update_segments(self, segments, duration):
                 self.transcript.append(self.format_segment(start, end, text_))
                 offset = min(duration, s.end)
 
-        self.current_out += segments[-1].text
-        last_segment = self.format_segment(
-            self.timestamp_offset + segments[-1].start,
-            self.timestamp_offset + min(duration, segments[-1].end),
-            self.current_out
-        )
+        # only process the segments if it satisfies the no_speech_thresh
+        if segments[-1].no_speech_prob <= self.no_speech_thresh:
+            self.current_out += segments[-1].text
+            last_segment = self.format_segment(
+                self.timestamp_offset + segments[-1].start,
+                self.timestamp_offset + min(duration, segments[-1].end),
+                self.current_out
+            )
 
         # if same incomplete segment is seen multiple times then update the offset
         # and append the segment to the list