VOICEVOX · qryxip · Oct 29, 2024 · Oct 9, 2024 · Oct 10, 2024 · Oct 11, 2024
diff --git a/crates/voicevox_core/src/blocking.rs b/crates/voicevox_core/src/blocking.rs
@@ -2,8 +2,8 @@
 
 pub use crate::{
     engine::open_jtalk::blocking::OpenJtalk, infer::runtimes::onnxruntime::blocking::Onnxruntime,
-    synthesizer::blocking::Synthesizer, user_dict::dict::blocking::UserDict,
-    voice_model::blocking::VoiceModelFile,
+    synthesizer::blocking::Audio, synthesizer::blocking::Synthesizer,
+    user_dict::dict::blocking::UserDict, voice_model::blocking::VoiceModelFile,
 };
 
 pub mod onnxruntime {

diff --git a/crates/voicevox_core/src/engine/audio_file.rs b/crates/voicevox_core/src/engine/audio_file.rs
@@ -0,0 +1,34 @@
+use std::io::{Cursor, Write as _};
+
+/// Prepends a 44-byte RIFF/WAVE header to 16-bit PCM data and returns the full WAV binary.
+pub fn wav_from_s16le(pcm: &[u8], output_sampling_rate: u32, output_stereo: bool) -> Vec<u8> {
+    // TODO: support 44.1 kHz and other sampling rates
+    // `pcm` is copied verbatim into the data chunk; presumably s16le samples (per the name).
+    let num_channels: u16 = if output_stereo { 2 } else { 1 };
+    let bit_depth: u16 = 16;
+    let block_size: u16 = bit_depth * num_channels / 8; // bytes per sample frame (all channels)
+    // NOTE(review): `as u32` truncates past u32::MAX bytes, and `+ 44` may overflow near it.
+    let bytes_size = pcm.len() as u32;
+    let wave_size = bytes_size + 44; // total output size: data plus the 44-byte header
+
+    let buf: Vec<u8> = Vec::with_capacity(wave_size as usize); // reserve the whole file up front
+    let mut cur = Cursor::new(buf);
+    // Writes target an in-memory `Vec<u8>`, so the `unwrap()`s below are not expected to fire.
+    cur.write_all("RIFF".as_bytes()).unwrap(); // RIFF chunk id
+    cur.write_all(&(wave_size - 8).to_le_bytes()).unwrap(); // size after this 8-byte id+size
+    cur.write_all("WAVEfmt ".as_bytes()).unwrap(); // form type, then the "fmt " chunk id
+    cur.write_all(&16_u32.to_le_bytes()).unwrap(); // fmt chunk body length in bytes
+    cur.write_all(&1_u16.to_le_bytes()).unwrap(); // format tag 1: linear PCM
+    cur.write_all(&num_channels.to_le_bytes()).unwrap(); // channel count
+    cur.write_all(&output_sampling_rate.to_le_bytes()).unwrap(); // sampling rate
+
+    let block_rate = output_sampling_rate * block_size as u32; // bytes per second of audio
+
+    cur.write_all(&block_rate.to_le_bytes()).unwrap();
+    cur.write_all(&block_size.to_le_bytes()).unwrap();
+    cur.write_all(&bit_depth.to_le_bytes()).unwrap(); // bits per sample
+    cur.write_all("data".as_bytes()).unwrap(); // data chunk id
+    cur.write_all(&bytes_size.to_le_bytes()).unwrap(); // data chunk size in bytes
+    cur.write_all(pcm).unwrap(); // sample payload, unmodified
+    cur.into_inner() // hand back the underlying Vec<u8>
+}
diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs
@@ -1,11 +1,13 @@
 mod acoustic_feature_extractor;
+mod audio_file;
 mod full_context_label;
 mod kana_parser;
 mod model;
 mod mora_list;
 pub(crate) mod open_jtalk;
 
 pub(crate) use self::acoustic_feature_extractor::OjtPhoneme;
+pub use self::audio_file::wav_from_s16le;
 pub(crate) use self::full_context_label::{
     extract_full_context_label, mora_to_text, FullContextLabelError,
 };

diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs
@@ -83,7 +83,7 @@ use rstest_reuse;
 
 pub use self::{
     devices::SupportedDevices,
-    engine::{AccentPhrase, AudioQuery, FullcontextExtractor, Mora},
+    engine::{wav_from_s16le, AccentPhrase, AudioQuery, FullcontextExtractor, Mora},
     error::{Error, ErrorKind},
     metas::{
         RawStyleId, RawStyleVersion, SpeakerMeta, StyleId, StyleMeta, StyleType, StyleVersion,