Skip to content

Commit

Permalink
split decoder into spectrogram and vocoder without changing API (#851)
Browse files Browse the repository at this point in the history
この本文は @qryxip が記述している。

ストリーミング処理を見据え、decoderからvocoderを切り離す。ただしこのPRではAPIは変えない。

モデルとしては、`decode`を`generate_full_intermediate`と`render_audio_segment`に分離する。"audio"ではなく"wave"の方が適切かもしれないが、リリースするまでに考えることとする。

#851 (review)

Refs: Hiroshiba/vv_core_inference#28
  • Loading branch information
Yosshi999 authored Oct 12, 2024
1 parent 991fbc8 commit 4547925
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 30 deletions.
5 changes: 3 additions & 2 deletions crates/voicevox_core/src/infer/domains.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ use educe::Educe;
use serde::{Deserialize, Deserializer};

pub(crate) use self::talk::{
DecodeInput, DecodeOutput, PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
PredictIntonationOutput, TalkDomain, TalkOperation,
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
};

#[derive(Educe)]
Expand Down
31 changes: 25 additions & 6 deletions crates/voicevox_core/src/infer/domains/talk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,16 @@ pub(crate) enum TalkOperation {
PredictIntonation,

#[inference_operation(
type Input = DecodeInput;
type Output = DecodeOutput;
type Input = GenerateFullIntermediateInput;
type Output = GenerateFullIntermediateOutput;
)]
Decode,
GenerateFullIntermediate,

#[inference_operation(
type Input = RenderAudioSegmentInput;
type Output = RenderAudioSegmentOutput;
)]
RenderAudioSegment,
}

#[derive(InferenceInputSignature)]
Expand Down Expand Up @@ -83,15 +89,28 @@ pub(crate) struct PredictIntonationOutput {

#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = Decode;
type Signature = GenerateFullIntermediate;
)]
pub(crate) struct DecodeInput {
pub(crate) struct GenerateFullIntermediateInput {
pub(crate) f0: Array2<f32>,
pub(crate) phoneme: Array2<f32>,
pub(crate) speaker_id: Array1<i64>,
}

#[derive(InferenceOutputSignature)]
pub(crate) struct DecodeOutput {
pub(crate) struct GenerateFullIntermediateOutput {
pub(crate) spec: Array2<f32>,
}

/// Input of the `RenderAudioSegment` inference operation.
///
/// The synthesizer builds this value directly from the `spec` field returned by the
/// `GenerateFullIntermediate` operation, so the two operations are run back to back.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = RenderAudioSegment;
)]
pub(crate) struct RenderAudioSegmentInput {
/// Two-dimensional `f32` array handed to the model.
/// NOTE(review): presumably the intermediate spectrogram; axis meaning is not visible
/// here — confirm against the model definition.
pub(crate) spec: Array2<f32>,
}

/// Output of the `RenderAudioSegment` inference operation.
#[derive(InferenceOutputSignature)]
pub(crate) struct RenderAudioSegmentOutput {
/// One-dimensional `f32` array produced by the model.
/// NOTE(review): presumably waveform samples; sample rate and value range are not
/// visible here — confirm against the model definition.
pub(crate) wave: Array1<f32>,
}
7 changes: 5 additions & 2 deletions crates/voicevox_core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,11 @@ pub(crate) struct TalkManifest {
#[index_for_fields(TalkOperation::PredictIntonation)]
pub(crate) predict_intonation_filename: Arc<str>,

#[index_for_fields(TalkOperation::Decode)]
pub(crate) decode_filename: Arc<str>,
#[index_for_fields(TalkOperation::GenerateFullIntermediate)]
pub(crate) generate_full_intermediate_filename: Arc<str>,

#[index_for_fields(TalkOperation::RenderAudioSegment)]
pub(crate) render_audio_segment_filename: Arc<str>,

#[serde(default)]
pub(crate) style_id_to_inner_voice_id: StyleIdToInnerVoiceId,
Expand Down
11 changes: 8 additions & 3 deletions crates/voicevox_core/src/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,9 @@ mod tests {
let session_options = InferenceDomainMap {
talk: enum_map! {
TalkOperation::PredictDuration
| TalkOperation::PredictIntonation => light_session_options,
TalkOperation::Decode => heavy_session_options,
| TalkOperation::PredictIntonation
| TalkOperation::GenerateFullIntermediate => light_session_options,
TalkOperation::RenderAudioSegment => heavy_session_options,
},
};
let status = Status::new(
Expand All @@ -392,9 +393,13 @@ mod tests {
light_session_options,
status.session_options.talk[TalkOperation::PredictIntonation],
);
assert_eq!(
light_session_options,
status.session_options.talk[TalkOperation::GenerateFullIntermediate],
);
assert_eq!(
heavy_session_options,
status.session_options.talk[TalkOperation::Decode],
status.session_options.talk[TalkOperation::RenderAudioSegment],
);

assert!(status.loaded_models.lock().unwrap().0.is_empty());
Expand Down
20 changes: 13 additions & 7 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,10 @@ pub(crate) mod blocking {
error::ErrorRepr,
infer::{
domains::{
DecodeInput, DecodeOutput, InferenceDomainMap, PredictDurationInput,
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput, TalkDomain,
TalkOperation,
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, InferenceDomainMap,
PredictDurationInput, PredictDurationOutput, PredictIntonationInput,
PredictIntonationOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
TalkDomain, TalkOperation,
},
InferenceRuntime as _, InferenceSessionOptions,
},
Expand Down Expand Up @@ -204,8 +205,9 @@ pub(crate) mod blocking {
InferenceDomainMap {
talk: enum_map! {
TalkOperation::PredictDuration
| TalkOperation::PredictIntonation => light_session_options,
TalkOperation::Decode => heavy_session_options,
| TalkOperation::PredictIntonation
| TalkOperation::GenerateFullIntermediate => light_session_options,
TalkOperation::RenderAudioSegment => heavy_session_options,
},
},
);
Expand Down Expand Up @@ -935,9 +937,9 @@ pub(crate) mod blocking {
padding_size,
);

let DecodeOutput { wave: output } = self.status.run_session(
let GenerateFullIntermediateOutput { spec } = self.status.run_session(
model_id,
DecodeInput {
GenerateFullIntermediateInput {
f0: ndarray::arr1(&f0_with_padding)
.into_shape([length_with_padding, 1])
.unwrap(),
Expand All @@ -948,6 +950,10 @@ pub(crate) mod blocking {
},
)?;

let RenderAudioSegmentOutput { wave: output } = self
.status
.run_session(model_id, RenderAudioSegmentInput { spec })?;

return Ok(trim_padding_from_output(
output.into_raw_vec(),
padding_size,
Expand Down
23 changes: 16 additions & 7 deletions crates/voicevox_core/src/voice_model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,11 @@ impl<A: Async> Inner<A> {
TalkOperation::PredictIntonation => {
find_entry_index(&manifest.predict_intonation_filename)?
}
TalkOperation::Decode => {
find_entry_index(&manifest.decode_filename)?
TalkOperation::GenerateFullIntermediate => {
find_entry_index(&manifest.generate_full_intermediate_filename)?
}
TalkOperation::RenderAudioSegment => {
find_entry_index(&manifest.render_audio_segment_filename)?
}
};

Expand Down Expand Up @@ -232,14 +235,20 @@ impl<A: Async> Inner<A> {

let talk = OptionFuture::from(talk.map(
|(entries, style_id_to_inner_voice_id)| async move {
let [predict_duration, predict_intonation, decode] = entries.into_array();
let [predict_duration, predict_intonation, predict_spectrogram, run_vocoder] =
entries.into_array();

let predict_duration = read_file!(predict_duration);
let predict_intonation = read_file!(predict_intonation);
let decode = read_file!(decode);

let model_bytes =
EnumMap::from_array([predict_duration, predict_intonation, decode]);
let predict_spectrogram = read_file!(predict_spectrogram);
let run_vocoder = read_file!(run_vocoder);

let model_bytes = EnumMap::from_array([
predict_duration,
predict_intonation,
predict_spectrogram,
run_vocoder,
]);

Ok((style_id_to_inner_voice_id, model_bytes))
},
Expand Down
7 changes: 5 additions & 2 deletions crates/voicevox_core_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,11 @@ pub fn derive_inference_output_signature(
/// #[index_for_fields(TalkOperation::PredictIntonation)]
/// pub(crate) predict_intonation_filename: Arc<str>,
///
/// #[index_for_fields(TalkOperation::Decode)]
/// pub(crate) decode_filename: Arc<str>,
/// #[index_for_fields(TalkOperation::GenerateFullIntermediate)]
/// pub(crate) generate_full_intermediate_filename: Arc<str>,
///
/// #[index_for_fields(TalkOperation::RenderAudioSegment)]
/// pub(crate) render_audio_segment_filename: Arc<str>,
///
/// // …
/// }
Expand Down
3 changes: 2 additions & 1 deletion model/sample.vvm/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
"talk": {
"predict_duration_filename": "predict_duration.onnx",
"predict_intonation_filename": "predict_intonation.onnx",
"decode_filename": "decode.onnx",
"generate_full_intermediate_filename": "predict_spectrogram.onnx",
"render_audio_segment_filename": "vocoder.onnx",
"style_id_to_inner_voice_id": {
"302": 2,
"303": 3
Expand Down
Binary file added model/sample.vvm/predict_spectrogram.onnx
Binary file not shown.
Binary file not shown.

0 comments on commit 4547925

Please sign in to comment.