Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Load audio from buffer or filename path #563

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions whisperx/audio.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from functools import lru_cache
from typing import Optional, Union

from io import BytesIO
import ffmpeg
import numpy as np
import torch
Expand All @@ -23,7 +23,7 @@
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token


def load_audio(file: str, sr: int = SAMPLE_RATE):
def load_audio(file: Union[str, BytesIO], sr: int = SAMPLE_RATE):
"""
Open an audio file and read as mono waveform, resampling as necessary

Expand All @@ -42,11 +42,22 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
try:
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
if isinstance(file, BytesIO):
process = (
ffmpeg
.input('pipe:') \
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) \
.overwrite_output() \
.run_async(cmd=["ffmpeg", "-nostdin"],pipe_stdin=True, pipe_stdout=True, pipe_stderr=True) \
)

out, _ = process.communicate(input=file.getbuffer())
elif isinstance(file, str):
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

Expand Down