diff --git a/requirements/server.txt b/requirements/server.txt
index 552bda5..6f5132f 100644
--- a/requirements/server.txt
+++ b/requirements/server.txt
@@ -1,5 +1,5 @@
 faster-whisper==1.0.1
-torch
+torch==2.3.0
 websockets
 onnxruntime==1.16.0
 numba
diff --git a/whisper_live/client.py b/whisper_live/client.py
index 8e42e2d..c252607 100644
--- a/whisper_live/client.py
+++ b/whisper_live/client.py
@@ -29,7 +29,8 @@ def __init__(
         translate=False,
         model="small",
         srt_file_path="output.srt",
-        use_vad=True
+        use_vad=True,
+        log_transcription=True
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.
@@ -57,11 +58,11 @@ def __init__(
         self.use_vad = use_vad
         self.last_segment = None
         self.last_received_segment = None
+        self.log_transcription = log_transcription
 
         if translate:
             self.task = "translate"
 
-        self.timestamp_offset = 0.0
         self.audio_bytes = None
 
         if host is not None and port is not None:
@@ -118,10 +119,11 @@ def process_segments(self, segments):
             self.last_response_received = time.time()
             self.last_received_segment = segments[-1]["text"]
 
-        # Truncate to last 3 entries for brevity.
-        text = text[-3:]
-        utils.clear_screen()
-        utils.print_transcript(text)
+        if self.log_transcription:
+            # Truncate to last 3 entries for brevity.
+            text = text[-3:]
+            utils.clear_screen()
+            utils.print_transcript(text)
 
     def on_message(self, ws, message):
         """
@@ -677,9 +679,10 @@ def __init__(
         use_vad=True,
         save_output_recording=False,
         output_recording_filename="./output_recording.wav",
-        output_transcription_path="./output.srt"
+        output_transcription_path="./output.srt",
+        log_transcription=True,
     ):
-        self.client = Client(host, port, lang, translate, model, srt_file_path=output_transcription_path, use_vad=use_vad)
+        self.client = Client(host, port, lang, translate, model, srt_file_path=output_transcription_path, use_vad=use_vad, log_transcription=log_transcription)
         if save_output_recording and not output_recording_filename.endswith(".wav"):
             raise ValueError(f"Please provide a valid `output_recording_filename`: {output_recording_filename}")
         if not output_transcription_path.endswith(".srt"):
diff --git a/whisper_live/server.py b/whisper_live/server.py
index b6efc77..b668a6d 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -973,6 +973,7 @@ def speech_to_text(self):
 
             input_bytes, duration = self.get_audio_chunk_for_processing()
             if duration < 1.0:
+                time.sleep(0.1)  # wait for audio chunks to arrive
                 continue
             try:
                 input_sample = input_bytes.copy()
@@ -1046,12 +1047,14 @@ def update_segments(self, segments, duration):
                 self.transcript.append(self.format_segment(start, end, text_))
                 offset = min(duration, s.end)
 
-        self.current_out += segments[-1].text
-        last_segment = self.format_segment(
-            self.timestamp_offset + segments[-1].start,
-            self.timestamp_offset + min(duration, segments[-1].end),
-            self.current_out
-        )
+        # only process the last segment if it satisfies the no_speech_thresh
+        if segments[-1].no_speech_prob <= self.no_speech_thresh:
+            self.current_out += segments[-1].text
+            last_segment = self.format_segment(
+                self.timestamp_offset + segments[-1].start,
+                self.timestamp_offset + min(duration, segments[-1].end),
+                self.current_out
+            )
 
         # if same incomplete segment is seen multiple times then update the offset
         # and append the segment to the list
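
For reference, a minimal usage sketch of the new `log_transcription` flag exposed by `TranscriptionClient` above. This is not part of the diff: the host, port, language, and audio path are placeholders, and invoking the client as a callable follows the project's documented usage rather than anything changed here.

```python
from whisper_live.client import TranscriptionClient

# Placeholder host/port. log_transcription=False skips the
# clear_screen()/print_transcript() calls now gated inside
# Client.process_segments(); the SRT file is still written.
client = TranscriptionClient(
    "localhost",
    9090,
    lang="en",
    use_vad=True,
    log_transcription=False,
    output_transcription_path="./output.srt",
)

# Transcribe a local file (callable interface per the project README).
client("path/to/audio.wav")
```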