From 0aab9f8331c26eb10395913e382561029dea4a47 Mon Sep 17 00:00:00 2001
From: kadirnar
Date: Fri, 24 Nov 2023 17:50:36 +0300
Subject: [PATCH] Update ASR model and add batch size and return timestamps options

---
 README.md                                | 2 +-
 whisperplus/pipelines/whisper_diarize.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index ccdab65..a07369d 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ pipeline = ASRDiarizationPipeline.from_pretrained(
     device=device,
 )
 
-output_text = pipeline(audio_path)
+output_text = pipeline(audio_path, num_speakers=2, min_speaker=1, max_speaker=2)
 dialogue = format_speech_to_dialogue(output_text)
 print(dialogue)
 ```
diff --git a/whisperplus/pipelines/whisper_diarize.py b/whisperplus/pipelines/whisper_diarize.py
index 4244059..56390d0 100644
--- a/whisperplus/pipelines/whisper_diarize.py
+++ b/whisperplus/pipelines/whisper_diarize.py
@@ -24,7 +24,7 @@ def __init__(
     @classmethod
     def from_pretrained(
         cls,
-        asr_model: Optional[str] = "openai/whisper-medium",
+        asr_model: Optional[str] = "openai/whisper-large-v3",
         *,
         diarizer_model: Optional[str] = "pyannote/speaker-diarization",
         chunk_length_s: Optional[int] = 30,
@@ -35,7 +35,9 @@ def from_pretrained(
             "automatic-speech-recognition",
             model=asr_model,
             chunk_length_s=chunk_length_s,
-            token=use_auth_token,  # 08/25/2023: Changed argument from use_auth_token to token
+            token=use_auth_token,
+            batch_size=24,
+            return_timestamps=True,
             **kwargs,
         )
         diarization_pipeline = Pipeline.from_pretrained(diarizer_model, use_auth_token=use_auth_token)