diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index d3d2bdf7..5110e0fd 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -94,7 +94,6 @@ class TranscriptionOptions: prepend_punctuations: str append_punctuations: str multilingual: bool - output_language: Optional[str] max_new_tokens: Optional[int] clip_timestamps: Union[str, List[float]] hallucination_silence_threshold: Optional[float] @@ -298,10 +297,7 @@ def transcribe( Static params: (Fixed for batched version) max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0. - multilingual: If True, perform transcription on multilingual videos. Set as False. - output_language: Valid only if multilingual is set to True. - Specifies the string representing the output language. One of - 'en' (English) or 'hybrid' (code-switched transcription). set as None. + multilingual: Perform language detection on every segment. Set as False. condition_on_previous_text: If True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, @@ -450,7 +446,6 @@ def transcribe( clip_timestamps="0", prompt_reset_on_temperature=0.5, multilingual=False, - output_language=None, without_timestamps=without_timestamps, max_initial_timestamp=0.0, ) @@ -669,7 +664,6 @@ def transcribe( prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", multilingual: bool = False, - output_language: Optional[str] = None, vad_filter: bool = False, vad_parameters: Optional[Union[dict, VadOptions]] = None, max_new_tokens: Optional[int] = None, @@ -729,12 +723,7 @@ def transcribe( with the next word append_punctuations: If word_timestamps is True, merge these punctuation symbols with the previous word - multilingual: If True, perform transcription on multilingual videos - and return the transcript based - on the 'output_language' flag. - output_language: Valid only if multilingual is set to True. - Specifies the string representing the output language. One of - 'en' (English) or 'hybrid' (code-switched transcription). + multilingual: Perform language detection on every segment. vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio without speech. This step is using the Silero VAD model https://github.com/snakers4/silero-vad. @@ -811,13 +800,6 @@ def transcribe( encoder_output = None all_language_probs = None - # setting output_language for multilingual videos - if multilingual: - if output_language is None: - output_language = "en" - elif output_language not in ["en", "hybrid"]: - raise ValueError("Output language needs to be one of 'en'/'hybrid'.") - # detecting the language if not provided if language is None: if not self.model.is_multilingual: @@ -897,7 +879,6 @@ def transcribe( prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, multilingual=multilingual, - output_language=output_language, max_new_tokens=max_new_tokens, clip_timestamps=clip_timestamps, hallucination_silence_threshold=hallucination_silence_threshold, @@ -1087,27 +1068,18 @@ def generate_segments( previous_tokens = all_tokens[prompt_reset_since:] - if encoder_output is None: + if seek > 0 or encoder_output is None: encoder_output = self.encode(segment) - # Perform language detection at every segment to update task based on output language, - # if the language is english, task is transcribe, - # else the task is translate to english (default) - # or transcribe if 'output_language' is 'hybrid'. + # Perform language detection at every segment, if options.multilingual: results = self.model.detect_language(encoder_output) language_token, language_probability = results[0][0] language = language_token[2:-2] - if options.output_language == "en" and language != "en": - task = "translate" - else: - task = "transcribe" - # Update tokenizer based on task and language - tokenizer.task = tokenizer.tokenizer.token_to_id(f"<|{task}|>") tokenizer.language = tokenizer.tokenizer.token_to_id(language_token) tokenizer.language_code = language - # Update prompt based on task and language + prompt = self.get_prompt( tokenizer, previous_tokens, @@ -1116,9 +1088,6 @@ def generate_segments( hotwords=options.hotwords, ) - if seek > 0 or encoder_output is None: - encoder_output = self.encode(segment) - ( result, avg_logprob, diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 63fcdd2a..7a27d5f9 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -167,7 +167,6 @@ def test_multilingual_transcription(data_dir): segments, info = model.transcribe( audio, multilingual=True, - output_language="hyprid", without_timestamps=True, condition_on_previous_text=False, )