initial commit
MahmoudAshraf97 committed Nov 16, 2024
1 parent 39bec4a commit 8002d62
Showing 2 changed files with 5 additions and 37 deletions.
41 changes: 5 additions & 36 deletions faster_whisper/transcribe.py
@@ -94,7 +94,6 @@ class TranscriptionOptions:
prepend_punctuations: str
append_punctuations: str
multilingual: bool
output_language: Optional[str]
max_new_tokens: Optional[int]
clip_timestamps: Union[str, List[float]]
hallucination_silence_threshold: Optional[float]
@@ -298,10 +297,7 @@ def transcribe(
Static params: (Fixed for batched version)
max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
multilingual: If True, perform transcription on multilingual videos. Set as False.
output_language: Valid only if multilingual is set to True.
Specifies the string representing the output language. One of
'en' (English) or 'hybrid' (code-switched transcription). set as None.
multilingual: Perform language detection on every segment. Set as False.
condition_on_previous_text: If True, the previous output of the model is provided
as a prompt for the next window; disabling may make the text inconsistent across
windows, but the model becomes less prone to getting stuck in a failure loop,
@@ -450,7 +446,6 @@ def transcribe(
clip_timestamps="0",
prompt_reset_on_temperature=0.5,
multilingual=False,
output_language=None,
without_timestamps=without_timestamps,
max_initial_timestamp=0.0,
)
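
For orientation, a brief usage sketch of the batched path this hunk configures; the class name, the batch_size parameter, the model size, and the audio path follow the upstream faster-whisper API and are assumptions, not part of this diff:

from faster_whisper import BatchedInferencePipeline, WhisperModel

model = WhisperModel("tiny")  # model size chosen only for illustration
batched_model = BatchedInferencePipeline(model=model)

# The options above are fixed for the batched version: multilingual stays
# False and, after this commit, there is no output_language to set at all.
segments, info = batched_model.transcribe("audio.wav", batch_size=8)
for segment in segments:
    print(segment.text)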
@@ -669,7 +664,6 @@ def transcribe(
prepend_punctuations: str = "\"'“¿([{-",
append_punctuations: str = "\"'.。,,!!??::”)]}、",
multilingual: bool = False,
output_language: Optional[str] = None,
vad_filter: bool = False,
vad_parameters: Optional[Union[dict, VadOptions]] = None,
max_new_tokens: Optional[int] = None,
@@ -729,12 +723,7 @@ def transcribe(
with the next word
append_punctuations: If word_timestamps is True, merge these punctuation symbols
with the previous word
multilingual: If True, perform transcription on multilingual videos
and return the transcript based
on the 'output_language' flag.
output_language: Valid only if multilingual is set to True.
Specifies the string representing the output language. One of
'en' (English) or 'hybrid' (code-switched transcription).
multilingual: Perform language detection on every segment.
vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
without speech. This step is using the Silero VAD model
https://github.com/snakers4/silero-vad.
@@ -811,13 +800,6 @@ def transcribe(
encoder_output = None
all_language_probs = None

# setting output_language for multilingual videos
if multilingual:
if output_language is None:
output_language = "en"
elif output_language not in ["en", "hybrid"]:
raise ValueError("Output language needs to be one of 'en'/'hybrid'.")

# detecting the language if not provided
if language is None:
if not self.model.is_multilingual:
@@ -897,7 +879,6 @@ def transcribe(
prepend_punctuations=prepend_punctuations,
append_punctuations=append_punctuations,
multilingual=multilingual,
output_language=output_language,
max_new_tokens=max_new_tokens,
clip_timestamps=clip_timestamps,
hallucination_silence_threshold=hallucination_silence_threshold,
@@ -1087,27 +1068,18 @@ def generate_segments(

previous_tokens = all_tokens[prompt_reset_since:]

if encoder_output is None:
if seek > 0 or encoder_output is None:
encoder_output = self.encode(segment)

# Perform language detection at every segment to update task based on output language,
# if the language is english, task is transcribe,
# else the task is translate to english (default)
# or transcribe if 'output_language' is 'hybrid'.
# Perform language detection at every segment,
if options.multilingual:
results = self.model.detect_language(encoder_output)
language_token, language_probability = results[0][0]
language = language_token[2:-2]
if options.output_language == "en" and language != "en":
task = "translate"
else:
task = "transcribe"

# Update tokenizer based on task and language
tokenizer.task = tokenizer.tokenizer.token_to_id(f"<|{task}|>")
tokenizer.language = tokenizer.tokenizer.token_to_id(language_token)
tokenizer.language_code = language
# Update prompt based on task and language

prompt = self.get_prompt(
tokenizer,
previous_tokens,
@@ -1116,9 +1088,6 @@ def generate_segments(
hotwords=options.hotwords,
)

if seek > 0 or encoder_output is None:
encoder_output = self.encode(segment)

(
result,
avg_logprob,
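Taken together, the hunks above remove output_language end to end: the options field, the keyword argument, its validation, and the per-segment translate-vs-transcribe switch, leaving multilingual=True to mean only that language detection reruns on every segment. A minimal migration sketch, assuming the standard faster-whisper entry point (the model size and audio path are placeholders):

from faster_whisper import WhisperModel

model = WhisperModel("tiny")

# After this commit, multilingual=True only enables per-segment language
# detection; the detected language drives the tokenizer, not the task.
segments, info = model.transcribe("audio.wav", multilingual=True)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")

# Call sites that still pass output_language should now fail at argument
# binding (assuming the signature has no **kwargs catch-all):
try:
    model.transcribe("audio.wav", multilingual=True, output_language="en")
except TypeError as error:
    print(error)

# Anyone relying on output_language="en" to force English output would now
# request translation explicitly, e.g. task="translate" (an assumption based
# on the pre-existing task parameter, which this diff does not touch).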
1 change: 0 additions & 1 deletion tests/test_transcribe.py
@@ -167,7 +167,6 @@ def test_multilingual_transcription(data_dir):
segments, info = model.transcribe(
audio,
multilingual=True,
output_language="hyprid",
without_timestamps=True,
condition_on_previous_text=False,
)
