collabora · zoq · Feb 28, 2024 · Feb 28, 2024
diff --git a/whisper_live/server.py b/whisper_live/server.py
@@ -408,6 +408,11 @@ def add_frames(self, frame_np):
         if self.frames_np is not None and self.frames_np.shape[0] > 45*self.RATE:
             self.frames_offset += 30.0
             self.frames_np = self.frames_np[int(30*self.RATE):]
+            # check timestamp offset (should be >= self.frames_offset)
+            # this basically means that there is no speech, as timestamp offset hasn't been updated
+            # and is less than frames_offset
+            if self.timestamp_offset < self.frames_offset:
+                self.timestamp_offset = self.frames_offset
         if self.frames_np is None:
             self.frames_np = frame_np.copy()
         else:
@@ -796,7 +801,8 @@ def transcribe_audio(self, input_sample):
             task=self.task,
             vad_filter=self.use_vad,
             vad_parameters=self.vad_parameters if self.use_vad else None)
-        if self.language is None:
+
+        if self.language is None and info is not None:
             self.set_language(info)
         return result
 
@@ -881,7 +887,9 @@ def speech_to_text(self):
                 input_sample = input_bytes.copy()
                 result = self.transcribe_audio(input_sample)
 
-                if self.language is None:
+                if result is None or self.language is None:
+                    self.timestamp_offset += duration
+                    time.sleep(0.25)    # wait for voice activity, result is None when no voice activity
                     continue
                 self.handle_transcription_output(result, duration)
 
@@ -932,7 +940,6 @@ def update_segments(self, segments, duration):
         """
         offset = None
         self.current_out = ''
-        last_segment = None
         # process complete segments
         if len(segments) > 1:
             for i, s in enumerate(segments[:-1]):

diff --git a/whisper_live/transcriber.py b/whisper_live/transcriber.py
@@ -180,7 +180,7 @@ def _get_feature_kwargs(self, model_path) -> dict:
 
         return config
 
-    def transcribe(
+    def transcribe(                                                         # noqa: C901
         self,
         audio: Union[str, BinaryIO, np.ndarray],
         language: Optional[str] = None,
@@ -315,6 +315,9 @@ def transcribe(
         else:
             speech_chunks = None
 
+        if audio.shape[0] == 0:
+            return None, None
+
         features = self.feature_extractor(audio)
 
         encoder_output = None