Skip to content

Commit

Permalink
better readability, impl pcm to wavfile
Browse files Browse the repository at this point in the history
  • Loading branch information
Yosshi999 committed Oct 16, 2024
1 parent 5ae184f commit 43cee57
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 72 deletions.
5 changes: 3 additions & 2 deletions crates/voicevox_core/src/blocking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
pub use crate::{
engine::open_jtalk::blocking::OpenJtalk, infer::runtimes::onnxruntime::blocking::Onnxruntime,
synthesizer::blocking::Audio, synthesizer::blocking::Synthesizer,
user_dict::dict::blocking::UserDict, voice_model::blocking::VoiceModelFile,
synthesizer::blocking::wav_from_s16le, synthesizer::blocking::Audio,
synthesizer::blocking::Synthesizer, user_dict::dict::blocking::UserDict,
voice_model::blocking::VoiceModelFile,
};

pub mod onnxruntime {
Expand Down
96 changes: 47 additions & 49 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,39 @@ pub struct InitializeOptions {
pub(crate) mod blocking {
use std::io::{Cursor, Write as _};

/// Builds a WAV (RIFF) binary by prepending a 44-byte header to 16-bit PCM data.
///
/// The header declares linear PCM (format tag 1) at 16 bits per sample, with two channels
/// when `output_stereo` is `true` and one otherwise. `pcm` is copied verbatim after the
/// header; it is neither validated nor resampled.
///
/// # Panics
///
/// Panics if `pcm.len() + 44` does not fit in the 32-bit size fields of the header, or if
/// the byte rate (`output_sampling_rate` multiplied by the block size) overflows `u32`.
pub fn wav_from_s16le(pcm: &[u8], output_sampling_rate: u32, output_stereo: bool) -> Vec<u8> {
    // TODO: support other sampling rates such as 44.1 kHz

    // Total size of the RIFF/fmt/data headers that precede the samples.
    const HEADER_SIZE: u32 = 44;

    let num_channels: u16 = if output_stereo { 2 } else { 1 };
    let bit_depth: u16 = 16;
    let block_size: u16 = bit_depth * num_channels / 8;

    // The size fields are 32 bits wide; refuse input that would silently truncate or wrap
    // and thereby produce a header that lies about the payload length.
    assert!(
        pcm.len() <= (u32::MAX - HEADER_SIZE) as usize,
        "PCM data is too large to be stored in a WAV file",
    );
    let bytes_size = pcm.len() as u32; // lossless: bounded by the assertion above
    let wave_size = bytes_size + HEADER_SIZE;

    let block_rate = output_sampling_rate
        .checked_mul(u32::from(block_size))
        .expect("byte rate does not fit in the 32-bit WAV header field");

    let mut cur = Cursor::new(Vec::with_capacity(wave_size as usize));
    {
        let mut put = |bytes: &[u8]| {
            cur.write_all(bytes)
                .expect("writing to an in-memory buffer should not fail")
        };

        put(b"RIFF");
        put(&(wave_size - 8).to_le_bytes()); // RIFF chunk size excludes "RIFF" and itself
        put(b"WAVEfmt ");
        put(&16_u32.to_le_bytes()); // fmt header length
        put(&1_u16.to_le_bytes()); // linear PCM
        put(&num_channels.to_le_bytes());
        put(&output_sampling_rate.to_le_bytes());
        put(&block_rate.to_le_bytes());
        put(&block_size.to_le_bytes());
        put(&bit_depth.to_le_bytes());
        put(b"data");
        put(&bytes_size.to_le_bytes());
        put(pcm);
    }
    cur.into_inner()
}

use enum_map::enum_map;
use tracing::info;

Expand Down Expand Up @@ -111,17 +144,17 @@ pub(crate) mod blocking {
/// Intermediate product handed to the user.
pub struct Audio {
/// Audio features with shape (number of frames, number of features).
pub internal_state: ndarray::Array2<f32>,
internal_state: ndarray::Array2<f32>,
/// Style ID specified at generation time.
pub style_id: crate::StyleId,
style_id: crate::StyleId,
/// Number of frames of audio features, excluding the workaround padding.
pub length: usize,
/// Sampling rate. The total duration in seconds can be expressed as `length / sampling_rate`.
pub sampling_rate: f32,
/// Length of the padding appended as a workaround.
pub padding_length: usize,
padding_length: usize,
/// The query used at generation time.
pub audio_query: AudioQuery,
audio_query: AudioQuery,
}

/// 音声シンセサイザ。
Expand Down Expand Up @@ -500,21 +533,21 @@ pub(crate) mod blocking {
}

/// 中間表現から16bit PCMで音声波形を生成する。
pub fn render(&self, audio: &Audio, begin: usize, end: usize) -> Result<Vec<u8>> {
pub fn render(&self, audio: &Audio, start: usize, end: usize) -> Result<Vec<u8>> {
const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン
use std::cmp::min;
// 実態(workaround paddingを含まない)上での区間
let clipped_begin = min(begin, audio.length);
let clipped_start = min(start, audio.length);
let clipped_end = min(end, audio.length);
// データからはみ出さない安全マージン
let left_margin = min(MARGIN, audio.padding_length + clipped_begin);
let left_margin = min(MARGIN, audio.padding_length + clipped_start);
let right_margin = min(MARGIN, audio.padding_length + (audio.length - clipped_end));
// 安全マージンを追加したデータ上での区間
let slice_begin = audio.padding_length + clipped_begin - left_margin;
let slice_start = audio.padding_length + clipped_start - left_margin;
let slice_end = audio.padding_length + clipped_end + right_margin;
let window = audio
.internal_state
.slice(ndarray::s![slice_begin..slice_end, ..]);
.slice(ndarray::s![slice_start..slice_end, ..]);
let wave_with_margin =
self.render_audio_segment(window.into_owned(), audio.style_id)?;
let wave = wave_with_margin
Expand Down Expand Up @@ -562,46 +595,11 @@ pub(crate) mod blocking {
) -> Result<Vec<u8>> {
let audio = self.seekable_synthesis(audio_query, style_id, options)?;
let pcm = self.render(&audio, 0, audio.length)?;
return Ok(to_wav(&pcm, &audio_query));

fn to_wav(
pcm: &[u8],
&AudioQuery {
output_sampling_rate,
output_stereo,
..
}: &AudioQuery,
) -> Vec<u8> {
// TODO: 44.1kHzなどの対応

let num_channels: u16 = if output_stereo { 2 } else { 1 };
let bit_depth: u16 = 16;
let block_size: u16 = bit_depth * num_channels / 8;

let bytes_size = pcm.len() as u32;
let wave_size = bytes_size + 44;

let buf: Vec<u8> = Vec::with_capacity(wave_size as usize);
let mut cur = Cursor::new(buf);

cur.write_all("RIFF".as_bytes()).unwrap();
cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap();
cur.write_all("WAVEfmt ".as_bytes()).unwrap();
cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length
cur.write_all(&1_u16.to_le_bytes()).unwrap(); //linear PCM
cur.write_all(&num_channels.to_le_bytes()).unwrap();
cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap();

let block_rate = output_sampling_rate * block_size as u32;

cur.write_all(&block_rate.to_le_bytes()).unwrap();
cur.write_all(&block_size.to_le_bytes()).unwrap();
cur.write_all(&bit_depth.to_le_bytes()).unwrap();
cur.write_all("data".as_bytes()).unwrap();
cur.write_all(&bytes_size.to_le_bytes()).unwrap();
cur.write_all(&pcm).unwrap();
cur.into_inner()
}
return Ok(wav_from_s16le(
&pcm,
audio_query.output_sampling_rate,
audio_query.output_stereo,
));
}

/// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。
Expand Down
32 changes: 31 additions & 1 deletion crates/voicevox_core_python_api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use pyo3::{
create_exception,
exceptions::{PyException, PyKeyError, PyValueError},
pyfunction, pymodule,
types::{PyList, PyModule},
types::{PyBytes, PyList, PyModule},
wrap_pyfunction, Py, PyObject, PyResult, PyTypeInfo, Python,
};
use voicevox_core::__internal::interop::raii::MaybeClosed;
Expand All @@ -25,6 +25,7 @@ fn rust(py: Python<'_>, module: &PyModule) -> PyResult<()> {
module.add("__version__", env!("CARGO_PKG_VERSION"))?;
module.add_wrapped(wrap_pyfunction!(_validate_pronunciation))?;
module.add_wrapped(wrap_pyfunction!(_to_zenkaku))?;
module.add_wrapped(wrap_pyfunction!(wav_from_s16le))?;

add_exceptions(module)?;

Expand Down Expand Up @@ -263,6 +264,35 @@ fn _to_zenkaku(text: &str) -> PyResult<String> {
Ok(voicevox_core::__internal::to_zenkaku(text))
}

/// Prepends a header to 16-bit PCM data and returns the resulting WAV-format binary.
///
/// Parameters
/// ----------
/// pcm : bytes
///     Audio data expressed as 16-bit PCM
/// output_sampling_rate: int
///     Sampling rate of pcm
/// output_stereo: bool
///     Whether pcm is stereo
///
/// Returns
/// -------
/// bytes
///     Audio data expressed in WAV format
#[pyfunction]
fn wav_from_s16le(
    pcm: &[u8],
    output_sampling_rate: u32,
    output_stereo: bool,
    py: Python<'_>,
) -> PyObject {
    // Build the WAV image in Rust first, then copy it into a Python `bytes` object.
    let wav = voicevox_core::blocking::wav_from_s16le(pcm, output_sampling_rate, output_stereo);
    let py_bytes = PyBytes::new(py, &wav);
    py_bytes.into()
}

mod blocking {
use std::{ffi::OsString, path::PathBuf, sync::Arc};

Expand Down
29 changes: 9 additions & 20 deletions example/python/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from voicevox_core import AccelerationMode, AudioQuery
from voicevox_core.blocking import Onnxruntime, OpenJtalk, Synthesizer, VoiceModelFile
from voicevox_core._rust import wav_from_s16le


def main() -> None:
Expand Down Expand Up @@ -51,34 +52,22 @@ def main() -> None:
audio_query = synthesizer.audio_query(text, style_id)

mode_name = "streaming" if streaming else "normal"
logger.info("%s", f"Synthesizing with {display_as_json(audio_query)} in {mode_name} mode")
logger.info(
"%s", f"Synthesizing with {display_as_json(audio_query)} in {mode_name} mode"
)
if streaming:
chunk_sec = 1.0
intermediate = synthesizer.seekable_synthesis(audio_query, style_id)
chunk_frames = int(intermediate.sampling_rate * chunk_sec)
pcm = b""
for i in range(0, intermediate.length, chunk_frames):
logger.info("%s", f"synthesis {i/intermediate.length:.2%}")
pcm += synthesizer.render(intermediate, i, i+chunk_frames)
pcm += synthesizer.render(intermediate, i, i + chunk_frames)
logger.info("%s", f"synthesis 100%")
num_channels = 2 if audio_query.output_stereo else 1
block_size = 16 * num_channels // 8
sr = audio_query.output_sampling_rate
wav = (
b"RIFF" +
struct.pack("<I", len(pcm) + 44 - 8) +
b"WAVEfmt " +
struct.pack("<I", 16) +
struct.pack("<H", 1) +
struct.pack("<H", num_channels) +
struct.pack("<I", sr) +
struct.pack("<I", sr * block_size) +
struct.pack("<H", block_size) +
struct.pack("<H", 16) +
b"data" +
struct.pack("<I", len(pcm)) +
pcm)

wav = wav_from_s16le(
pcm, audio_query.output_sampling_rate, audio_query.output_stereo
)

else:
wav = synthesizer.synthesis(audio_query, style_id)

Expand Down

0 comments on commit 43cee57

Please sign in to comment.