initial commit
MahmoudAshraf97 committed Nov 16, 2024
1 parent 39bec4a commit 8002d62
Showing 2 changed files with 5 additions and 37 deletions.
41 changes: 5 additions & 36 deletions faster_whisper/transcribe.py
@@ -94,7 +94,6 @@ class TranscriptionOptions:
prepend_punctuations: str
append_punctuations: str
multilingual: bool
output_language: Optional[str]
max_new_tokens: Optional[int]
clip_timestamps: Union[str, List[float]]
hallucination_silence_threshold: Optional[float]
@@ -298,10 +297,7 @@ def transcribe(
Static params: (Fixed for batched version)
max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
multilingual: If True, perform transcription on multilingual videos. Set as False.
output_language: Valid only if multilingual is set to True.
Specifies the string representing the output language. One of
'en' (English) or 'hybrid' (code-switched transcription). set as None.
multilingual: Perform language detection on every segment. Set as False.
condition_on_previous_text: If True, the previous output of the model is provided
as a prompt for the next window; disabling may make the text inconsistent across
windows, but the model becomes less prone to getting stuck in a failure loop,
@@ -450,7 +446,6 @@ def transcribe(
clip_timestamps="0",
prompt_reset_on_temperature=0.5,
multilingual=False,
output_language=None,
without_timestamps=without_timestamps,
max_initial_timestamp=0.0,
)
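
For orientation, a brief usage sketch of the batched path this hunk configures; the class name, the batch_size parameter, the model size, and the audio path follow the upstream faster-whisper API and are assumptions, not part of this diff:

from faster_whisper import BatchedInferencePipeline, WhisperModel

model = WhisperModel("tiny")  # model size chosen only for illustration
batched_model = BatchedInferencePipeline(model=model)

# The options above are fixed for the batched version: multilingual stays
# False and, after this commit, there is no output_language to set at all.
segments, info = batched_model.transcribe("audio.wav", batch_size=8)
for segment in segments:
    print(segment.text)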
@@ -669,7 +664,6 @@ def transcribe(
prepend_punctuations: str = "\"'“¿([{-",
append_punctuations: str = "\"'.。,,!!??::”)]}、",
multilingual: bool = False,
output_language: Optional[str] = None,
vad_filter: bool = False,
vad_parameters: Optional[Union[dict, VadOptions]] = None,
max_new_tokens: Optional[int] = None,
@@ -729,12 +723,7 @@ def transcribe(
with the next word
append_punctuations: If word_timestamps is True, merge these punctuation symbols
with the previous word
multilingual: If True, perform transcription on multilingual videos
and return the transcript based
on the 'output_language' flag.
output_language: Valid only if multilingual is set to True.
Specifies the string representing the output language. One of
'en' (English) or 'hybrid' (code-switched transcription).
multilingual: Perform language detection on every segment.
vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
without speech. This step is using the Silero VAD model
https://github.com/snakers4/silero-vad.
@@ -811,13 +800,6 @@ def transcribe(
encoder_output = None
all_language_probs = None

# setting output_language for multilingual videos
if multilingual:
if output_language is None:
output_language = "en"
elif output_language not in ["en", "hybrid"]:
raise ValueError("Output language needs to be one of 'en'/'hybrid'.")

# detecting the language if not provided
if language is None:
if not self.model.is_multilingual:
@@ -897,7 +879,6 @@ def transcribe(
prepend_punctuations=prepend_punctuations,
append_punctuations=append_punctuations,
multilingual=multilingual,
output_language=output_language,
max_new_tokens=max_new_tokens,
clip_timestamps=clip_timestamps,
hallucination_silence_threshold=hallucination_silence_threshold,
@@ -1087,27 +1068,18 @@ def generate_segments(

previous_tokens = all_tokens[prompt_reset_since:]

if encoder_output is None:
if seek > 0 or encoder_output is None:
encoder_output = self.encode(segment)

# Perform language detection at every segment to update task based on output language,
# if the language is english, task is transcribe,
# else the task is translate to english (default)
# or transcribe if 'output_language' is 'hybrid'.
# Perform language detection at every segment,
if options.multilingual:
results = self.model.detect_language(encoder_output)
language_token, language_probability = results[0][0]
language = language_token[2:-2]
if options.output_language == "en" and language != "en":
task = "translate"
else:
task = "transcribe"

# Update tokenizer based on task and language
tokenizer.task = tokenizer.tokenizer.token_to_id(f"<|{task}|>")
tokenizer.language = tokenizer.tokenizer.token_to_id(language_token)
tokenizer.language_code = language
# Update prompt based on task and language

prompt = self.get_prompt(
tokenizer,
previous_tokens,
@@ -1116,9 +1088,6 @@ def generate_segments(
hotwords=options.hotwords,
)

if seek > 0 or encoder_output is None:
encoder_output = self.encode(segment)

(
result,
avg_logprob,
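Taken together, the hunks above remove output_language end to end: the options field, the keyword argument, its validation, and the per-segment translate-vs-transcribe switch, leaving multilingual=True to mean only that language detection reruns on every segment. A minimal migration sketch, assuming the standard faster-whisper entry point (the model size and audio path are placeholders):

from faster_whisper import WhisperModel

model = WhisperModel("tiny")

# After this commit, multilingual=True only enables per-segment language
# detection; the detected language drives the tokenizer, not the task.
segments, info = model.transcribe("audio.wav", multilingual=True)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")

# Call sites that still pass output_language should now fail at argument
# binding (assuming the signature has no **kwargs catch-all):
try:
    model.transcribe("audio.wav", multilingual=True, output_language="en")
except TypeError as error:
    print(error)

# Anyone relying on output_language="en" to force English output would now
# request translation explicitly, e.g. task="translate" (an assumption based
# on the pre-existing task parameter, which this diff does not touch).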
1 change: 0 additions & 1 deletion tests/test_transcribe.py
@@ -167,7 +167,6 @@ def test_multilingual_transcription(data_dir):
segments, info = model.transcribe(
audio,
multilingual=True,
output_language="hyprid",
without_timestamps=True,
condition_on_previous_text=False,
)
