diff --git a/crates/voicevox_core/src/blocking.rs b/crates/voicevox_core/src/blocking.rs
index 3443e3085..d45dcfe49 100644
--- a/crates/voicevox_core/src/blocking.rs
+++ b/crates/voicevox_core/src/blocking.rs
@@ -2,8 +2,8 @@
 pub use crate::{
     engine::open_jtalk::blocking::OpenJtalk,
     infer::runtimes::onnxruntime::blocking::Onnxruntime,
-    synthesizer::blocking::Synthesizer, user_dict::dict::blocking::UserDict,
-    voice_model::blocking::VoiceModelFile,
+    synthesizer::blocking::AudioFeature, synthesizer::blocking::Synthesizer,
+    user_dict::dict::blocking::UserDict, voice_model::blocking::VoiceModelFile,
 };
 
 pub mod onnxruntime {
diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs
new file mode 100644
index 000000000..470f5fc84
--- /dev/null
+++ b/crates/voicevox_core/src/engine/audio_file.rs
@@ -0,0 +1,32 @@
+use std::io::{Cursor, Write as _};
+
+/// Generates a WAV-format binary by prepending a header to 16-bit PCM data.
+pub fn wav_from_s16le(pcm: &[u8], sampling_rate: u32, is_stereo: bool) -> Vec<u8> {
+    let num_channels: u16 = if is_stereo { 2 } else { 1 };
+    let bit_depth: u16 = 16;
+    let block_size: u16 = bit_depth * num_channels / 8;
+
+    let bytes_size = pcm.len() as u32;
+    let wave_size = bytes_size + 44;
+
+    let buf: Vec<u8> = Vec::with_capacity(wave_size as usize);
+    let mut cur = Cursor::new(buf);
+
+    cur.write_all("RIFF".as_bytes()).unwrap();
+    cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap();
+    cur.write_all("WAVEfmt ".as_bytes()).unwrap();
+    cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length
+    cur.write_all(&1_u16.to_le_bytes()).unwrap(); // linear PCM
+    cur.write_all(&num_channels.to_le_bytes()).unwrap();
+    cur.write_all(&sampling_rate.to_le_bytes()).unwrap();
+
+    let block_rate = sampling_rate * block_size as u32;
+
+    cur.write_all(&block_rate.to_le_bytes()).unwrap();
+    cur.write_all(&block_size.to_le_bytes()).unwrap();
+    cur.write_all(&bit_depth.to_le_bytes()).unwrap();
+    cur.write_all("data".as_bytes()).unwrap();
+    cur.write_all(&bytes_size.to_le_bytes()).unwrap();
+    cur.write_all(pcm).unwrap();
+    cur.into_inner()
+}
diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs
index d446a7253..32e304114 100644
--- a/crates/voicevox_core/src/engine/mod.rs
+++ b/crates/voicevox_core/src/engine/mod.rs
@@ -1,4 +1,5 @@
 mod acoustic_feature_extractor;
+mod audio_file;
 mod full_context_label;
 mod kana_parser;
 mod model;
@@ -6,6 +7,7 @@ mod mora_list;
 pub(crate) mod open_jtalk;
 
 pub(crate) use self::acoustic_feature_extractor::OjtPhoneme;
+pub use self::audio_file::wav_from_s16le;
 pub(crate) use self::full_context_label::{
     extract_full_context_label, mora_to_text, FullContextLabelError,
 };
diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs
index c5ab200d7..629c15963 100644
--- a/crates/voicevox_core/src/lib.rs
+++ b/crates/voicevox_core/src/lib.rs
@@ -83,7 +83,7 @@ use rstest_reuse;
 
 pub use self::{
     devices::SupportedDevices,
-    engine::{AccentPhrase, AudioQuery, FullcontextExtractor, Mora},
+    engine::{wav_from_s16le, AccentPhrase, AudioQuery, FullcontextExtractor, Mora},
     error::{Error, ErrorKind},
     metas::{
         RawStyleId, RawStyleVersion, SpeakerMeta, StyleId, StyleMeta, StyleType, StyleVersion,
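Note on `wav_from_s16le`: the 44-byte RIFF/"fmt "/"data" preamble is derived entirely from `sampling_rate` and `is_stereo`. A standalone sketch of the size arithmetic (illustrative values only, not code from this patch):

    // 1 second of 24 kHz mono s16le PCM, as wav_from_s16le sees it.
    fn main() {
        let (sampling_rate, is_stereo) = (24_000_u32, false);
        let num_channels: u16 = if is_stereo { 2 } else { 1 };
        let bit_depth: u16 = 16;
        let block_size: u16 = bit_depth * num_channels / 8; // 2 bytes per sample frame
        let block_rate = sampling_rate * block_size as u32; // 48_000 bytes per second
        let pcm_len = block_rate as usize; // exactly 1 s worth of PCM bytes
        assert_eq!(block_rate, 48_000);
        assert_eq!(pcm_len + 44, 48_044); // the header always adds 44 bytes
    }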
diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
index adeb010a0..adf73cb68 100644
--- a/crates/voicevox_core/src/synthesizer.rs
+++ b/crates/voicevox_core/src/synthesizer.rs
@@ -80,14 +80,13 @@ pub struct InitializeOptions {
 }
 
 pub(crate) mod blocking {
-    use std::io::{Cursor, Write as _};
-
     use enum_map::enum_map;
+    use std::io::{Cursor, Write as _};
     use tracing::info;
 
     use crate::{
         devices::{DeviceSpec, GpuSpec},
-        engine::{create_kana, mora_to_text, Mora, OjtPhoneme},
+        engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme},
         error::ErrorRepr,
         infer::{
             domains::{
@@ -108,6 +107,22 @@ pub(crate) mod blocking {
 
     const DEFAULT_SAMPLING_RATE: u32 = 24000;
 
+    /// Intermediate representation of the audio.
+    pub struct AudioFeature {
+        /// Audio feature array of shape (frame count, feature count).
+        internal_state: ndarray::Array2<f32>,
+        /// The style ID specified at generation time.
+        style_id: crate::StyleId,
+        /// Number of frames in the audio feature, excluding the workaround padding.
+        pub frame_length: usize,
+        /// Frame rate. The total duration in seconds is `frame_length / frame_rate`.
+        pub frame_rate: f64,
+        /// Length of the padding appended as a workaround.
+        padding_frame_length: usize,
+        /// The query used at generation time.
+        audio_query: AudioQuery,
+    }
+
     /// Speech synthesizer.
     pub struct Synthesizer {
         pub(super) status: Status,
@@ -257,13 +272,13 @@
             self.status.metas()
         }
 
-        /// Performs speech synthesis from an AudioQuery.
-        pub fn synthesis(
+        /// Generates an intermediate representation for speech synthesis from an AudioQuery.
+        pub fn precompute_render(
             &self,
             audio_query: &AudioQuery,
             style_id: StyleId,
             options: &SynthesisOptions,
-        ) -> Result<Vec<u8>> {
+        ) -> Result<AudioFeature> {
             let AudioQuery {
                 accent_phrases,
                 speed_scale,
@@ -362,14 +377,37 @@
                 }
             }
 
-            let wave = &self.decode(
-                f0.len(),
-                OjtPhoneme::num_phoneme(),
-                &f0,
+            // Workaround to keep the audio from being cut off
+            // NOTE: padding_frame_length records the padding length so that `render()` can strip it again.
+            // TODO: remove this padding logic once the underlying issue is fixed
+            const PADDING_SIZE: f64 = 0.4;
+            let padding_size =
+                ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
+            let start_and_end_padding_size = 2 * padding_size;
+            let length_with_padding = f0.len() + start_and_end_padding_size;
+            let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size);
+            let phoneme_with_padding = make_phoneme_with_padding(
                 phoneme.as_flattened(),
+                OjtPhoneme::num_phoneme(),
+                length_with_padding,
+                padding_size,
+            );
+
+            let spec = self.generate_full_intermediate(
+                f0_with_padding.len(),
+                OjtPhoneme::num_phoneme(),
+                &f0_with_padding,
+                &phoneme_with_padding,
                 style_id,
             )?;
-            return Ok(to_wav(wave, audio_query));
+            return Ok(AudioFeature {
+                internal_state: spec,
+                style_id,
+                frame_length: f0.len(),
+                frame_rate: (DEFAULT_SAMPLING_RATE as f64) / 256.0,
+                padding_frame_length: padding_size,
+                audio_query: audio_query.clone(),
+            });
 
             fn adjust_interrogative_accent_phrases(
                 accent_phrases: &[AccentPhrase],
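With the default 24 kHz model, the workaround constants above resolve to concrete numbers. A standalone sketch of the arithmetic (not part of the patch):

    // Padding and frame-rate math behind precompute_render.
    fn main() {
        const DEFAULT_SAMPLING_RATE: u32 = 24_000;
        const PADDING_SIZE: f64 = 0.4; // seconds of padding per side
        // One frame corresponds to 256 output samples: 24_000 / 256 = 93.75 fps.
        let frame_rate = DEFAULT_SAMPLING_RATE as f64 / 256.0;
        // 0.4 s * 93.75 fps = 37.5 frames, which rounds to 38 per side.
        let padding_size =
            ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
        assert_eq!(frame_rate, 93.75);
        assert_eq!(padding_size, 38);
        // A 300-frame feature is therefore stored as 300 + 2 * 38 = 376 frames.
        assert_eq!(300 + 2 * padding_size, 376);
    }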
@@ -420,7 +458,86 @@
             }
         }
 
-            fn to_wav(
+            fn make_f0_with_padding(
+                f0_slice: &[f32],
+                length_with_padding: usize,
+                padding_size: usize,
+            ) -> Vec<f32> {
+                // Workaround to keep the audio from being cut off
+                // Delete this function once the underlying issue is fixed
+                let mut f0_with_padding = Vec::with_capacity(length_with_padding);
+                let padding = vec![0.0; padding_size];
+                f0_with_padding.extend_from_slice(&padding);
+                f0_with_padding.extend_from_slice(f0_slice);
+                f0_with_padding.extend_from_slice(&padding);
+                f0_with_padding
+            }
+
+            fn make_phoneme_with_padding(
+                phoneme_slice: &[f32],
+                phoneme_size: usize,
+                length_with_padding: usize,
+                padding_size: usize,
+            ) -> Vec<f32> {
+                // Workaround to keep the audio from being cut off
+                // Delete this function once the underlying issue is fixed
+                let mut padding_phoneme = vec![0.0; phoneme_size];
+                padding_phoneme[0] = 1.0;
+                let padding_phoneme_len = padding_phoneme.len();
+                let padding_phonemes: Vec<f32> = padding_phoneme
+                    .into_iter()
+                    .cycle()
+                    .take(padding_phoneme_len * padding_size)
+                    .collect();
+                let mut phoneme_with_padding =
+                    Vec::with_capacity(phoneme_size * length_with_padding);
+                phoneme_with_padding.extend_from_slice(&padding_phonemes);
+                phoneme_with_padding.extend_from_slice(phoneme_slice);
+                phoneme_with_padding.extend_from_slice(&padding_phonemes);
+
+                phoneme_with_padding
+            }
+        }
+
+        /// Generates an audio waveform as 16-bit PCM from the intermediate representation.
+        pub fn render(&self, audio: &AudioFeature, start: usize, end: usize) -> Result<Vec<u8>> {
+            // TODO: support 44.1 kHz and other sampling rates
+            const MARGIN: usize = 14; // safety margin derived from the receptive field of the HifiGAN in use
+            use std::cmp::min;
+            // The range in the actual data (excluding the workaround padding)
+            let clipped_start = min(start, audio.frame_length);
+            let clipped_end = min(end, audio.frame_length);
+            // If the requested range is empty, return early instead of calling ONNX Runtime
+            if (clipped_start..clipped_end).is_empty() {
+                return Ok(vec![]);
+            }
+            // Guarantee that the margin does not run past the data
+            // cf. https://github.com/VOICEVOX/voicevox_core/pull/854#discussion_r1803691291
+            if MARGIN > audio.padding_frame_length + clipped_start
+                || MARGIN > audio.padding_frame_length + (audio.frame_length - clipped_end)
+            {
+                unreachable!("Validation error: Too short padding for input, please report this issue on GitHub.");
+            }
+            let left_margin = MARGIN;
+            let right_margin = MARGIN;
+            // The range in the padded data, widened by the safety margin
+            let slice_start = audio.padding_frame_length + clipped_start - left_margin;
+            let slice_end = audio.padding_frame_length + clipped_end + right_margin;
+            let segment = audio
+                .internal_state
+                .slice(ndarray::s![slice_start..slice_end, ..]);
+            let wave_with_margin =
+                self.render_audio_segment(segment.into_owned(), audio.style_id)?;
+            // Strip the safety margin added above from the generated audio
+            let wave = wave_with_margin
+                .slice(ndarray::s![
+                    left_margin * 256..wave_with_margin.len() - right_margin * 256
+                ])
+                .into_owned()
+                .into_raw_vec();
+            return Ok(to_s16le_pcm(&wave, &audio.audio_query));
+
+            fn to_s16le_pcm(
                 wave: &[f32],
                 &AudioQuery {
                     volume_scale,
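The index arithmetic in `render` above is easiest to check with concrete numbers. A standalone sketch using assumed values (300 content frames, the 38-frame padding computed earlier):

    // Slice arithmetic in render() for one requested chunk.
    fn main() {
        const MARGIN: usize = 14; // vocoder context frames kept on each side
        let frame_length = 300; // frames, excluding the workaround padding
        let padding_frame_length = 38; // workaround padding per side
        let (start, end) = (94, 188); // requested chunk, in unpadded frame indices
        let clipped_start = start.min(frame_length);
        let clipped_end = end.min(frame_length);
        // Shift into the padded array, then widen by the margin on both sides.
        let slice_start = padding_frame_length + clipped_start - MARGIN;
        let slice_end = padding_frame_length + clipped_end + MARGIN;
        assert_eq!((slice_start, slice_end), (118, 240));
        // Each frame decodes to 256 samples; trimming the margin samples leaves
        // exactly the requested frames.
        let kept = (slice_end - slice_start) * 256 - 2 * MARGIN * 256;
        assert_eq!(kept, (clipped_end - clipped_start) * 256);
    }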
@@ -429,36 +546,13 @@
                     ..
                 }: &AudioQuery,
             ) -> Vec<u8> {
-                // TODO: support 44.1 kHz and other sampling rates
                 let num_channels: u16 = if output_stereo { 2 } else { 1 };
-                let bit_depth: u16 = 16;
                 let repeat_count: u32 =
                     (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32;
-                let block_size: u16 = bit_depth * num_channels / 8;
                 let bytes_size = wave.len() as u32 * repeat_count * 2;
-                let wave_size = bytes_size + 44;
-
-                let buf: Vec<u8> = Vec::with_capacity(wave_size as usize);
+                let buf: Vec<u8> = Vec::with_capacity(bytes_size as usize);
                 let mut cur = Cursor::new(buf);
 
-                cur.write_all("RIFF".as_bytes()).unwrap();
-                cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap();
-                cur.write_all("WAVEfmt ".as_bytes()).unwrap();
-                cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt header length
-                cur.write_all(&1_u16.to_le_bytes()).unwrap(); // linear PCM
-                cur.write_all(&num_channels.to_le_bytes()).unwrap();
-                cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap();
-
-                let block_rate = output_sampling_rate * block_size as u32;
-
-                cur.write_all(&block_rate.to_le_bytes()).unwrap();
-                cur.write_all(&block_size.to_le_bytes()).unwrap();
-                cur.write_all(&bit_depth.to_le_bytes()).unwrap();
-                cur.write_all("data".as_bytes()).unwrap();
-                cur.write_all(&bytes_size.to_le_bytes()).unwrap();
-
                 for value in wave {
                     let v = (value * volume_scale).clamp(-1., 1.);
                     let data = (v * 0x7fff as f32) as i16;
@@ -471,6 +565,22 @@
             }
         }
 
+        /// Generates an audio waveform directly in WAV format from an AudioQuery.
+        pub fn synthesis(
+            &self,
+            audio_query: &AudioQuery,
+            style_id: StyleId,
+            options: &SynthesisOptions,
+        ) -> Result<Vec<u8>> {
+            let audio = self.precompute_render(audio_query, style_id, options)?;
+            let pcm = self.render(&audio, 0, audio.frame_length)?;
+            Ok(wav_from_s16le(
+                &pcm,
+                audio_query.output_sampling_rate,
+                audio_query.output_stereo,
+            ))
+        }
+
         /// Generates an array of AccentPhrase (accent phrases) from AquesTalk-style notation.
         ///
         /// # Example
@@ -840,6 +950,21 @@
             style_id: StyleId,
         ) -> Result<Vec<f32>>;
 
+        fn generate_full_intermediate(
+            &self,
+            length: usize,
+            phoneme_size: usize,
+            f0: &[f32],
+            phoneme_vector: &[f32],
+            style_id: StyleId,
+        ) -> Result<ndarray::Array2<f32>>;
+
+        fn render_audio_segment(
+            &self,
+            spec: ndarray::Array2<f32>,
+            style_id: StyleId,
+        ) -> Result<ndarray::Array1<f32>>;
+
         /// Runs `decode`.
         ///
         /// # Performance
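Taken together, `precompute_render` runs the acoustic model once and `render` can then vocode arbitrary sub-ranges of the result. A hypothetical caller streaming roughly one-second chunks could look like the sketch below; the import paths and the non-generic `Synthesizer` spelling are assumptions made for brevity, not signatures confirmed by this patch:

    use voicevox_core::{wav_from_s16le, AudioQuery, Error, StyleId, SynthesisOptions};

    fn render_streaming(
        synthesizer: &voicevox_core::blocking::Synthesizer,
        query: &AudioQuery,
        style_id: StyleId,
        options: &SynthesisOptions,
    ) -> Result<Vec<u8>, Error> {
        let audio = synthesizer.precompute_render(query, style_id, options)?;
        let chunk = audio.frame_rate as usize; // about one second of frames
        let mut pcm = Vec::new();
        for start in (0..audio.frame_length).step_by(chunk) {
            // render() clips `end` to frame_length, so the last chunk may be short.
            pcm.extend(synthesizer.render(&audio, start, start + chunk)?);
        }
        Ok(wav_from_s16le(&pcm, query.output_sampling_rate, query.output_stereo))
    }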
@@ -911,102 +1036,58 @@
             Ok(output.into_raw_vec())
         }
 
-        fn decode(
+        fn generate_full_intermediate(
             &self,
             length: usize,
             phoneme_size: usize,
             f0: &[f32],
             phoneme_vector: &[f32],
             style_id: StyleId,
-        ) -> Result<Vec<f32>> {
+        ) -> Result<ndarray::Array2<f32>> {
             let (model_id, inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;
 
-            // Workaround to keep the audio from being cut off
-            // TODO: remove this padding logic once the underlying issue is fixed
-            const PADDING_SIZE: f64 = 0.4;
-            let padding_size =
-                ((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
-            let start_and_end_padding_size = 2 * padding_size;
-            let length_with_padding = length + start_and_end_padding_size;
-            let f0_with_padding = make_f0_with_padding(f0, length_with_padding, padding_size);
-
-            let phoneme_with_padding = make_phoneme_with_padding(
-                phoneme_vector,
-                phoneme_size,
-                length_with_padding,
-                padding_size,
-            );
-
             let GenerateFullIntermediateOutput { spec } = self.status.run_session(
                 model_id,
                 GenerateFullIntermediateInput {
-                    f0: ndarray::arr1(&f0_with_padding)
-                        .into_shape([length_with_padding, 1])
-                        .unwrap(),
-                    phoneme: ndarray::arr1(&phoneme_with_padding)
-                        .into_shape([length_with_padding, phoneme_size])
+                    f0: ndarray::arr1(f0).into_shape([length, 1]).unwrap(),
+                    phoneme: ndarray::arr1(phoneme_vector)
+                        .into_shape([length, phoneme_size])
                         .unwrap(),
                     speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
                 },
             )?;
+            Ok(spec)
+        }
 
-            let RenderAudioSegmentOutput { wave: output } = self
+        fn render_audio_segment(
+            &self,
+            spec: ndarray::Array2<f32>,
+            style_id: StyleId,
+        ) -> Result<ndarray::Array1<f32>> {
+            let (model_id, _inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;
+            let RenderAudioSegmentOutput { wave } = self
                 .status
                 .run_session(model_id, RenderAudioSegmentInput { spec })?;
+            Ok(wave)
+        }
 
-            return Ok(trim_padding_from_output(
-                output.into_raw_vec(),
-                padding_size,
-            ));
-
-            fn make_f0_with_padding(
-                f0_slice: &[f32],
-                length_with_padding: usize,
-                padding_size: usize,
-            ) -> Vec<f32> {
-                // Workaround to keep the audio from being cut off
-                // Delete this function once the underlying issue is fixed
-                let mut f0_with_padding = Vec::with_capacity(length_with_padding);
-                let padding = vec![0.0; padding_size];
-                f0_with_padding.extend_from_slice(&padding);
-                f0_with_padding.extend_from_slice(f0_slice);
-                f0_with_padding.extend_from_slice(&padding);
-                f0_with_padding
-            }
-
-            fn make_phoneme_with_padding(
-                phoneme_slice: &[f32],
-                phoneme_size: usize,
-                length_with_padding: usize,
-                padding_size: usize,
-            ) -> Vec<f32> {
-                // Workaround to keep the audio from being cut off
-                // Delete this function once the underlying issue is fixed
-                let mut padding_phoneme = vec![0.0; phoneme_size];
-                padding_phoneme[0] = 1.0;
-                let padding_phoneme_len = padding_phoneme.len();
-                let padding_phonemes: Vec<f32> = padding_phoneme
-                    .into_iter()
-                    .cycle()
-                    .take(padding_phoneme_len * padding_size)
-                    .collect();
-                let mut phoneme_with_padding =
-                    Vec::with_capacity(phoneme_size * length_with_padding);
-                phoneme_with_padding.extend_from_slice(&padding_phonemes);
-                phoneme_with_padding.extend_from_slice(phoneme_slice);
-                phoneme_with_padding.extend_from_slice(&padding_phonemes);
-
-                phoneme_with_padding
-            }
-
-            fn trim_padding_from_output(mut output: Vec<f32>, padding_f0_size: usize) -> Vec<f32> {
-                // Workaround to keep the audio from being cut off
-                // Delete this function once the underlying issue is fixed
-                let padding_sampling_size = padding_f0_size * 256;
-                output
-                    .drain(padding_sampling_size..output.len() - padding_sampling_size)
-                    .collect()
-            }
+        fn decode(
+            &self,
+            length: usize,
+            phoneme_size: usize,
+            f0: &[f32],
+            phoneme_vector: &[f32],
+            style_id: StyleId,
+        ) -> Result<Vec<f32>> {
+            let intermediate = self.generate_full_intermediate(
+                length,
+                phoneme_size,
+                f0,
+                phoneme_vector,
+                style_id,
+            )?;
+            let output = self.render_audio_segment(intermediate, style_id)?;
+            Ok(output.into_raw_vec())
         }
     }
diff --git a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py
index ea1f246b9..3e68bf455 100644
--- a/crates/voicevox_core_python_api/python/voicevox_core/__init__.py
+++ b/crates/voicevox_core_python_api/python/voicevox_core/__init__.py
@@ -35,12 +35,14 @@
     UseUserDictError,
     WordNotFoundError,
     __version__,
+    wav_from_s16le,
 )
 
 from . import asyncio, blocking  # noqa: F401 isort: skip
 
 __all__ = [
     "__version__",
+    "wav_from_s16le",
     "AccelerationMode",
     "AccentPhrase",
     "AudioQuery",
diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi b/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi
index a456b1162..8099d9a9d 100644
--- a/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi
+++ b/crates/voicevox_core_python_api/python/voicevox_core/_rust/__init__.pyi
@@ -102,3 +102,22 @@ class InvalidWordError(ValueError):
 
 def _validate_pronunciation(pronunciation: str) -> None: ...
 def _to_zenkaku(text: str) -> str: ...
+def wav_from_s16le(pcm: bytes, sampling_rate: int, is_stereo: bool) -> bytes:
+    """
+    Generates a WAV-format binary by prepending a header to 16-bit PCM data.
+
+    Parameters
+    ----------
+    pcm : bytes
+        Audio data represented as 16-bit PCM.
+    sampling_rate : int
+        Sampling rate of the input ``pcm``.
+    is_stereo : bool
+        Whether the input ``pcm`` is stereo.
+
+    Returns
+    -------
+    bytes
+        Audio data represented in WAV format.
+    """
+    ...
diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs
index 24d28261d..58791ead1 100644
--- a/crates/voicevox_core_python_api/src/lib.rs
+++ b/crates/voicevox_core_python_api/src/lib.rs
@@ -12,7 +12,7 @@ use pyo3::{
     create_exception,
     exceptions::{PyException, PyKeyError, PyValueError},
     pyfunction, pymodule,
-    types::{PyList, PyModule},
+    types::{PyBytes, PyList, PyModule},
     wrap_pyfunction, Py, PyObject, PyResult, PyTypeInfo, Python,
 };
 use voicevox_core::__internal::interop::raii::MaybeClosed;
@@ -25,6 +25,7 @@ fn rust(py: Python<'_>, module: &PyModule) -> PyResult<()> {
     module.add("__version__", env!("CARGO_PKG_VERSION"))?;
     module.add_wrapped(wrap_pyfunction!(_validate_pronunciation))?;
    module.add_wrapped(wrap_pyfunction!(_to_zenkaku))?;
+    module.add_wrapped(wrap_pyfunction!(wav_from_s16le))?;
 
     add_exceptions(module)?;
 
@@ -34,6 +35,7 @@ fn rust(py: Python<'_>, module: &PyModule) -> PyResult<()> {
     blocking_module.add_class::<self::blocking::OpenJtalk>()?;
     blocking_module.add_class::<self::blocking::VoiceModelFile>()?;
     blocking_module.add_class::<self::blocking::UserDict>()?;
+    blocking_module.add_class::<self::blocking::AudioFeature>()?;
     module.add_and_register_submodule(blocking_module)?;
 
     let asyncio_module = PyModule::new(py, "voicevox_core._rust.asyncio")?;
@@ -262,6 +264,19 @@ fn _to_zenkaku(text: &str) -> PyResult<String> {
     Ok(voicevox_core::__internal::to_zenkaku(text))
 }
 
+#[pyfunction]
+fn wav_from_s16le<'py>(
+    pcm: &[u8],
+    sampling_rate: u32,
+    is_stereo: bool,
+    py: Python<'py>,
+) -> &'py PyBytes {
+    PyBytes::new(
+        py,
+        &voicevox_core::wav_from_s16le(pcm, sampling_rate, is_stereo),
+    )
+}
+
 mod blocking {
     use std::{ffi::OsString, path::PathBuf, sync::Arc};
 
@@ -424,6 +439,24 @@ mod blocking {
         }
     }
 
+    #[pyclass]
+    pub(crate) struct AudioFeature {
+        audio: voicevox_core::blocking::AudioFeature,
+    }
+
+    #[pymethods]
+    impl AudioFeature {
+        #[getter]
+        fn frame_length(&self) -> usize {
+            self.audio.frame_length
+        }
+
+        #[getter]
+        fn frame_rate(&self) -> f64 {
+            self.audio.frame_rate
+        }
+    }
+
     #[pyclass]
     pub(crate) struct Synthesizer {
         synthesizer: Closable<
@@ -642,6 +675,48 @@
             )
         }
 
+        #[pyo3(signature=(
+            audio_query,
+            style_id,
+            enable_interrogative_upspeak = TtsOptions::default().enable_interrogative_upspeak
+        ))]
+        fn precompute_render(
+            &self,
+            #[pyo3(from_py_with = "crate::convert::from_dataclass")] audio_query: AudioQuery,
+            style_id: u32,
+            enable_interrogative_upspeak: bool,
+            py: Python<'_>,
+        ) -> PyResult<AudioFeature> {
+            let audio = self
+                .synthesizer
+                .read()?
+                .precompute_render(
+                    &audio_query,
+                    StyleId::new(style_id),
+                    &SynthesisOptions {
+                        enable_interrogative_upspeak,
+                    },
+                )
+                .into_py_result(py)?;
+            Ok(AudioFeature { audio })
+        }
+
+        #[pyo3(signature=(audio, start, end))]
+        fn render<'py>(
+            &self,
+            audio: &AudioFeature,
+            start: usize,
+            end: usize,
+            py: Python<'py>,
+        ) -> PyResult<&'py PyBytes> {
+            let wav = &self
+                .synthesizer
+                .read()?
+                .render(&audio.audio, start, end)
+                .into_py_result(py)?;
+            Ok(PyBytes::new(py, wav))
+        }
+
         #[pyo3(signature=(
             audio_query,
             style_id,
diff --git a/example/python/run.py b/example/python/run.py
index 5f11a1a62..64a871bc2 100644
--- a/example/python/run.py
+++ b/example/python/run.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import Tuple
 
-from voicevox_core import AccelerationMode, AudioQuery
+from voicevox_core import AccelerationMode, AudioQuery, wav_from_s16le
 from voicevox_core.blocking import Onnxruntime, OpenJtalk, Synthesizer, VoiceModelFile
 
 
@@ -24,6 +24,7 @@ def main() -> None:
         text,
         out,
         style_id,
+        streaming,
     ) = parse_args()
 
     logger.info("%s", f"Loading ONNX Runtime ({onnxruntime_filename=})")
@@ -49,7 +50,22 @@ def main() -> None:
     audio_query = synthesizer.audio_query(text, style_id)
 
     logger.info("%s", f"Synthesizing with {display_as_json(audio_query)}")
-    wav = synthesizer.synthesis(audio_query, style_id)
+    if streaming:
+        logger.info("%s", "In streaming mode")
+        chunk_sec = 1.0
+        intermediate = synthesizer.precompute_render(audio_query, style_id)
+        chunk_frames = int(intermediate.frame_rate * chunk_sec)
+        pcm = b""
+        for i in range(0, intermediate.frame_length, chunk_frames):
+            logger.info("%s", f"{i / intermediate.frame_length:.2%}")
+            pcm += synthesizer.render(intermediate, i, i + chunk_frames)
+        logger.info("%s", "100%")
+        wav = wav_from_s16le(
+            pcm, audio_query.output_sampling_rate, audio_query.output_stereo
+        )
+
+    else:
+        wav = synthesizer.synthesis(audio_query, style_id)
 
     out.write_bytes(wav)
     logger.info("%s", f"Wrote `{out}`")
@@ -96,6 +112,11 @@ def parse_args() -> Tuple[AccelerationMode, Path, str, Path, str, Path, int]:
         type=int,
         help="Specify the speaker ID",
     )
+    argparser.add_argument(
+        "--streaming",
+        action="store_true",
+        help="Generate the audio in streaming mode",
+    )
     args = argparser.parse_args()
     # FIXME: This list has grown long enough; convert it to a `dataclass`
     return (
@@ -106,6 +127,7 @@ def parse_args() -> Tuple[AccelerationMode, Path, str, Path, str, Path, int]:
         args.text,
         args.out,
         args.style_id,
+        args.streaming,
     )
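For scale, the example's `chunk_sec = 1.0` works out as follows with the default 24 kHz model (a standalone sketch of the numbers, not part of the patch):

    // What one chunk of the streaming example amounts to (24 kHz mono).
    fn main() {
        let frame_rate = 24_000.0_f64 / 256.0; // 93.75 frames per second
        let chunk_frames = (frame_rate * 1.0) as usize; // int() truncates: 93 frames
        let samples = chunk_frames * 256; // 23_808 samples ≈ 0.992 s of audio
        let bytes = samples * 2; // s16le mono: PCM bytes per render() call
        assert_eq!((chunk_frames, samples, bytes), (93, 23_808, 47_616));
        // The final render() call is clipped to frame_length and may return less.
    }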