From 18b6168807625ba46ea69c0b675dda0c2e563b9b Mon Sep 17 00:00:00 2001
From: makaveli10
Date: Wed, 28 Feb 2024 13:55:37 +0530
Subject: [PATCH] fix: cpu usage issue

---
 whisper_live/server.py      | 13 ++++++++++---
 whisper_live/transcriber.py |  5 ++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/whisper_live/server.py b/whisper_live/server.py
index 9cc2bb53..e6e2b4fd 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -408,6 +408,11 @@ def add_frames(self, frame_np):
         if self.frames_np is not None and self.frames_np.shape[0] > 45*self.RATE:
             self.frames_offset += 30.0
             self.frames_np = self.frames_np[int(30*self.RATE):]
+            # check the timestamp offset (it should be >= self.frames_offset);
+            # if it is still behind frames_offset, no speech was transcribed and
+            # the timestamp offset was never updated, so catch it up here
+            if self.timestamp_offset < self.frames_offset:
+                self.timestamp_offset = self.frames_offset
         if self.frames_np is None:
             self.frames_np = frame_np.copy()
         else:
@@ -796,7 +801,8 @@ def transcribe_audio(self, input_sample):
             task=self.task,
             vad_filter=self.use_vad,
             vad_parameters=self.vad_parameters if self.use_vad else None)
-        if self.language is None:
+
+        if self.language is None and info is not None:
             self.set_language(info)
         return result
 
@@ -881,7 +887,9 @@ def speech_to_text(self):
             input_sample = input_bytes.copy()
             result = self.transcribe_audio(input_sample)
 
-            if self.language is None:
+            if result is None or self.language is None:
+                self.timestamp_offset += duration
+                time.sleep(0.25)  # result is None when there is no voice activity; wait before retrying
                 continue
 
             self.handle_transcription_output(result, duration)
@@ -932,7 +940,6 @@ def update_segments(self, segments, duration):
         """
         offset = None
         self.current_out = ''
-        last_segment = None
         # process complete segments
         if len(segments) > 1:
             for i, s in enumerate(segments[:-1]):
diff --git a/whisper_live/transcriber.py b/whisper_live/transcriber.py
index bd2e0823..33d90073 100644
--- a/whisper_live/transcriber.py
+++ b/whisper_live/transcriber.py
@@ -180,7 +180,7 @@ def _get_feature_kwargs(self, model_path) -> dict:
 
         return config
 
-    def transcribe(
+    def transcribe(  # noqa: C901
         self,
         audio: Union[str, BinaryIO, np.ndarray],
         language: Optional[str] = None,
@@ -315,6 +315,9 @@ def transcribe(
         else:
             speech_chunks = None
 
+        if audio.shape[0] == 0:
+            return None, None
+
         features = self.feature_extractor(audio)
 
         encoder_output = None
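
For context, the server-side change above keeps timestamp_offset from lagging behind frames_offset when old audio is trimmed from the buffer; together with the sleep in speech_to_text(), this is what stops the transcription loop from spinning on stale audio. Below is a minimal standalone sketch of that bookkeeping, assuming a 16 kHz sample rate and the same 45 s buffer / 30 s trim thresholds as the patch; the module-level variables stand in for the ServeClient attributes and are illustrative only.

import numpy as np

RATE = 16000            # assumed sample rate, mirroring self.RATE
frames = None           # rolling audio buffer of float32 samples
frames_offset = 0.0     # seconds of audio already trimmed from the buffer
timestamp_offset = 0.0  # seconds of audio already transcribed


def add_frames(frame_np: np.ndarray) -> None:
    """Append a chunk and trim the buffer, mirroring the patched add_frames()."""
    global frames, frames_offset, timestamp_offset
    if frames is not None and frames.shape[0] > 45 * RATE:
        # drop the oldest 30 seconds of audio
        frames_offset += 30.0
        frames = frames[int(30 * RATE):]
        # if nothing was transcribed from the dropped audio, timestamp_offset
        # lags behind frames_offset; catch it up so later reads of the buffer
        # never index before its start
        if timestamp_offset < frames_offset:
            timestamp_offset = frames_offset
    frames = frame_np.copy() if frames is None else np.concatenate((frames, frame_np))

On the transcriber side, transcribe() now returns (None, None) for empty audio, which is why speech_to_text() checks result is None before handling the output.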