Skip to content

Commit

Permalink
Merge pull request #262 from makaveli10/discard_no_speech_segments
Browse files Browse the repository at this point in the history
Discard no speech segments.
  • Loading branch information
zoq authored Aug 19, 2024
2 parents aade677 + 4870e9f commit bdaed45
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 15 deletions.
2 changes: 1 addition & 1 deletion requirements/server.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
faster-whisper==1.0.1
-torch
+torch==2.3.0
websockets
onnxruntime==1.16.0
numba
Expand Down
19 changes: 11 additions & 8 deletions whisper_live/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def __init__(
translate=False,
model="small",
srt_file_path="output.srt",
-use_vad=True
+use_vad=True,
+log_transcription=True
):
"""
Initializes a Client instance for audio recording and streaming to a server.
Expand Down Expand Up @@ -57,11 +58,11 @@ def __init__(
self.use_vad = use_vad
self.last_segment = None
self.last_received_segment = None
+self.log_transcription = log_transcription

if translate:
self.task = "translate"

-self.timestamp_offset = 0.0
self.audio_bytes = None

if host is not None and port is not None:
Expand Down Expand Up @@ -118,10 +119,11 @@ def process_segments(self, segments):
self.last_response_received = time.time()
self.last_received_segment = segments[-1]["text"]

-# Truncate to last 3 entries for brevity.
-text = text[-3:]
-utils.clear_screen()
-utils.print_transcript(text)
+if self.log_transcription:
+    # Truncate to last 3 entries for brevity.
+    text = text[-3:]
+    utils.clear_screen()
+    utils.print_transcript(text)

def on_message(self, ws, message):
"""
Expand Down Expand Up @@ -677,9 +679,10 @@ def __init__(
use_vad=True,
save_output_recording=False,
output_recording_filename="./output_recording.wav",
-output_transcription_path="./output.srt"
+output_transcription_path="./output.srt",
+log_transcription=True,
):
-self.client = Client(host, port, lang, translate, model, srt_file_path=output_transcription_path, use_vad=use_vad)
+self.client = Client(host, port, lang, translate, model, srt_file_path=output_transcription_path, use_vad=use_vad, log_transcription=log_transcription)
if save_output_recording and not output_recording_filename.endswith(".wav"):
raise ValueError(f"Please provide a valid `output_recording_filename`: {output_recording_filename}")
if not output_transcription_path.endswith(".srt"):
Expand Down
15 changes: 9 additions & 6 deletions whisper_live/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,7 @@ def speech_to_text(self):

input_bytes, duration = self.get_audio_chunk_for_processing()
if duration < 1.0:
+time.sleep(0.1) # wait for audio chunks to arrive
continue
try:
input_sample = input_bytes.copy()
Expand Down Expand Up @@ -1046,12 +1047,14 @@ def update_segments(self, segments, duration):
self.transcript.append(self.format_segment(start, end, text_))
offset = min(duration, s.end)

-self.current_out += segments[-1].text
-last_segment = self.format_segment(
-    self.timestamp_offset + segments[-1].start,
-    self.timestamp_offset + min(duration, segments[-1].end),
-    self.current_out
-)
+# only process the segments if it satisfies the no_speech_thresh
+if segments[-1].no_speech_prob <= self.no_speech_thresh:
+    self.current_out += segments[-1].text
+    last_segment = self.format_segment(
+        self.timestamp_offset + segments[-1].start,
+        self.timestamp_offset + min(duration, segments[-1].end),
+        self.current_out
+    )

# if same incomplete segment is seen multiple times then update the offset
# and append the segment to the list
Expand Down

0 comments on commit bdaed45

Please sign in to comment.