diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8d84196..ff821af 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -16,6 +16,7 @@
         "golint",
         "govet",
         "unstaged",
-        "untracked"
+        "untracked",
+        "webm"
     ]
 }
\ No newline at end of file
diff --git a/audio/audio_config.go b/audio/audio_config.go
index bd4538c..47f05ca 100644
--- a/audio/audio_config.go
+++ b/audio/audio_config.go
@@ -98,6 +98,19 @@ func NewAudioConfigFromDefaultSpeakerOutput() (*AudioConfig, error) {
     return newAudioConfigFromHandle(handle)
 }
 
+// NewAudioConfigFromSpeakerOutput creates an AudioConfig object representing the specified audio output device
+// (speaker) on the system.
+func NewAudioConfigFromSpeakerOutput(deviceName string) (*AudioConfig, error) {
+    var handle C.SPXHANDLE
+    dn := C.CString(deviceName)
+    defer C.free(unsafe.Pointer(dn))
+    ret := uintptr(C.audio_config_create_audio_output_from_a_speaker(&handle, dn))
+    if ret != C.SPX_NOERROR {
+        return nil, common.NewCarbonError(ret)
+    }
+    return newAudioConfigFromHandle(handle)
+}
+
 // NewAudioConfigFromWavFileOutput creates an AudioConfig object representing the specified file for audio output.
 func NewAudioConfigFromWavFileOutput(filename string) (*AudioConfig, error) {
     var handle C.SPXHANDLE
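A minimal usage sketch of the new speaker-output API above. The accepted device-name format is platform-dependent and not part of this diff; "default" is a hypothetical placeholder.

```go
package main

import (
    "fmt"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/audio"
)

func main() {
    // "default" is a hypothetical device name; the accepted format depends
    // on the platform's audio stack.
    audioConfig, err := audio.NewAudioConfigFromSpeakerOutput("default")
    if err != nil {
        fmt.Println("Got an error: ", err)
        return
    }
    defer audioConfig.Close()
    // Pass audioConfig to speech.NewSpeechSynthesizerFromConfig to route
    // synthesized audio to that device.
}
```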
diff --git a/common/cancellation_reason.go b/common/cancellation_reason.go
index 621d9f2..850c350 100644
--- a/common/cancellation_reason.go
+++ b/common/cancellation_reason.go
@@ -7,9 +7,13 @@ package common
 type CancellationReason int
 
 const (
-    // Indicates that an error occurred during speech recognition.
+    // Error indicates that an error occurred during speech recognition.
     Error CancellationReason = 1
-    // Indicates that the end of the audio stream was reached.
+    // EndOfStream indicates that the end of the audio stream was reached.
     EndOfStream CancellationReason = 2
+
+    // CancelledByUser indicates that the request was cancelled by the user.
+    // Added in version 1.17.0
+    CancelledByUser CancellationReason = 3
 )
diff --git a/common/property_id.go b/common/property_id.go
index 23b705a..04af717 100644
--- a/common/property_id.go
+++ b/common/property_id.go
@@ -102,7 +102,7 @@ const (
     // SpeechSessionID is the session id. This id is a universally unique identifier (aka UUID) representing a specific
     // binding of an audio input stream and the underlying speech recognition instance to which it is bound. Under normal
     // circumstances, you shouldn't have to use this property directly.
-    /// Instead use SessionEventArgs.SessionId.
+    // Instead use SessionEventArgs.SessionId.
     SpeechSessionID PropertyID = 3002
 
     // SpeechServiceConnectionUserDefinedQueryParameters are the query parameters provided by users. They will be passed
@@ -118,6 +118,14 @@ const (
     // SpeechServiceConnectionSynthOutputFormat is the string to specify TTS output audio format.
     SpeechServiceConnectionSynthOutputFormat PropertyID = 3102
 
+    // SpeechServiceConnectionSynthEnableCompressedAudioTransmission indicates whether to use compressed audio format
+    // for speech synthesis audio transmission.
+    // This property only takes effect when SpeechServiceConnectionSynthOutputFormat is set to a PCM format.
+    // If this property is not set and GStreamer is available, the SDK will use a compressed format for synthesized
+    // audio transmission and decode it. You can set this property to "false" to use raw PCM format on the wire.
+    // Added in version 1.17.0
+    SpeechServiceConnectionSynthEnableCompressedAudioTransmission PropertyID = 3103
+
     // SpeechServiceConnectionInitialSilenceTimeoutMs is the initial silence timeout value (in milliseconds) used by the
     // service.
     SpeechServiceConnectionInitialSilenceTimeoutMs PropertyID = 3200
@@ -186,6 +194,29 @@ const (
     // the speech service.
     SpeechServiceResponseRecognitionLatencyMs PropertyID = 5002
 
+    // SpeechServiceResponseSynthesisFirstByteLatencyMs is the speech synthesis first byte latency in milliseconds.
+    // Read-only, available on final speech synthesis results.
+    // This measures the latency between when synthesis processing starts and the moment the first byte of audio is available.
+    // Added in version 1.17.0.
+    SpeechServiceResponseSynthesisFirstByteLatencyMs PropertyID = 5010
+
+    // SpeechServiceResponseSynthesisFinishLatencyMs is the speech synthesis all-bytes latency in milliseconds.
+    // Read-only, available on final speech synthesis results.
+    // This measures the latency between when synthesis processing starts and the moment the whole audio is synthesized.
+    // Added in version 1.17.0.
+    SpeechServiceResponseSynthesisFinishLatencyMs PropertyID = 5011
+
+    // SpeechServiceResponseSynthesisUnderrunTimeMs is the underrun time for speech synthesis in milliseconds.
+    // Read-only, available on results in SynthesisCompleted events.
+    // This measures the total underrun time from when the playback buffer (see AudioConfigPlaybackBufferLengthInMs)
+    // is filled to when synthesis completes.
+    // Added in version 1.17.0.
+    SpeechServiceResponseSynthesisUnderrunTimeMs PropertyID = 5012
+
+    // SpeechServiceResponseSynthesisBackend indicates which backend finished the synthesis.
+    // Read-only, available on speech synthesis results, except for the result in the SynthesisStarted event.
+    // Added in version 1.17.0.
+    SpeechServiceResponseSynthesisBackend PropertyID = 5020
+
     // CancellationDetailsReason is the cancellation reason. Currently unused.
     CancellationDetailsReason PropertyID = 6000
 
@@ -216,6 +247,14 @@ const (
     // AudioConfigAudioSource is the audio source. Allowed values are "Microphones", "File", and "Stream".
     AudioConfigAudioSource PropertyID = 8004
 
+    // AudioConfigDeviceNameForRender indicates the device name for audio render. Under normal circumstances,
+    // you shouldn't have to use this property directly. Instead, use NewAudioConfigFromDefaultSpeakerOutput.
+    // Added in version 1.17.0
+    AudioConfigDeviceNameForRender PropertyID = 8005
+
+    // AudioConfigPlaybackBufferLengthInMs indicates the playback buffer length in milliseconds; the default is 50 milliseconds.
+    AudioConfigPlaybackBufferLengthInMs PropertyID = 8006
+
     // SpeechLogFilename is the file name to write logs.
     SpeechLogFilename PropertyID = 9001
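A sketch of setting the new synthesis-transport and playback-buffer properties, assuming the SetProperty(PropertyID, string) setter that SpeechConfig exposes; the subscription placeholders and the 100 ms value are illustrative.

```go
package main

import (
    "fmt"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/common"
    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
    // Placeholders; use your real subscription key and region.
    config, err := speech.NewSpeechConfigFromSubscription("<subscription>", "<region>")
    if err != nil {
        fmt.Println("Got an error: ", err)
        return
    }
    defer config.Close()
    // Force raw PCM on the wire even when GStreamer is available.
    if err := config.SetProperty(common.SpeechServiceConnectionSynthEnableCompressedAudioTransmission, "false"); err != nil {
        fmt.Println("Got an error: ", err)
    }
    // Double the playback buffer from its 50 ms default (illustrative value).
    if err := config.SetProperty(common.AudioConfigPlaybackBufferLengthInMs, "100"); err != nil {
        fmt.Println("Got an error: ", err)
    }
}
```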
diff --git a/common/speech_synthesis_output_format.go b/common/speech_synthesis_output_format.go
index 1fadeb8..a8988b6 100644
--- a/common/speech_synthesis_output_format.go
+++ b/common/speech_synthesis_output_format.go
@@ -11,9 +11,11 @@ const (
     Raw8Khz8BitMonoMULaw SpeechSynthesisOutputFormat = 1
 
     // Riff16Khz16KbpsMonoSiren stands for riff-16khz-16kbps-mono-siren
+    // Unsupported by the service. Do not use this value.
     Riff16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 2
 
     // Audio16Khz16KbpsMonoSiren stands for audio-16khz-16kbps-mono-siren
+    // Unsupported by the service. Do not use this value.
     Audio16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 3
 
     // Audio16Khz32KBitRateMonoMp3 stands for audio-16khz-32kbitrate-mono-mp3
@@ -57,4 +59,40 @@ const (
 
     // Raw8Khz16BitMonoPcm stands for raw-8khz-16bit-mono-pcm
     Raw8Khz16BitMonoPcm SpeechSynthesisOutputFormat = 17
+
+    // Ogg16Khz16BitMonoOpus stands for ogg-16khz-16bit-mono-opus
+    Ogg16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 18
+
+    // Ogg24Khz16BitMonoOpus stands for ogg-24khz-16bit-mono-opus
+    Ogg24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 19
+
+    // Raw48Khz16BitMonoPcm stands for raw-48khz-16bit-mono-pcm
+    Raw48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 20
+
+    // Riff48Khz16BitMonoPcm stands for riff-48khz-16bit-mono-pcm
+    Riff48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 21
+
+    // Audio48Khz96KBitRateMonoMp3 stands for audio-48khz-96kbitrate-mono-mp3
+    Audio48Khz96KBitRateMonoMp3 SpeechSynthesisOutputFormat = 22
+
+    // Audio48Khz192KBitRateMonoMp3 stands for audio-48khz-192kbitrate-mono-mp3
+    Audio48Khz192KBitRateMonoMp3 SpeechSynthesisOutputFormat = 23
+
+    // Ogg48Khz16BitMonoOpus stands for ogg-48khz-16bit-mono-opus
+    Ogg48Khz16BitMonoOpus SpeechSynthesisOutputFormat = 24
+
+    // Webm16Khz16BitMonoOpus stands for webm-16khz-16bit-mono-opus
+    Webm16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 25
+
+    // Webm24Khz16BitMonoOpus stands for webm-24khz-16bit-mono-opus
+    Webm24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 26
+
+    // Raw24Khz16BitMonoTrueSilk stands for raw-24khz-16bit-mono-truesilk
+    Raw24Khz16BitMonoTrueSilk SpeechSynthesisOutputFormat = 27
+
+    // Raw8Khz8BitMonoALaw stands for raw-8khz-8bit-mono-alaw
+    Raw8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 28
+
+    // Riff8Khz8BitMonoALaw stands for riff-8khz-8bit-mono-alaw
+    Riff8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 29
 )
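A sketch of requesting one of the new formats before creating a synthesizer; it assumes SpeechConfig's SetSpeechSynthesisOutputFormat setter, which is not part of this diff.

```go
package main

import (
    "fmt"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/common"
    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
    config, err := speech.NewSpeechConfigFromSubscription("<subscription>", "<region>")
    if err != nil {
        fmt.Println("Got an error: ", err)
        return
    }
    defer config.Close()
    // Request one of the new Opus-in-Ogg formats added above.
    if err := config.SetSpeechSynthesisOutputFormat(common.Ogg48Khz16BitMonoOpus); err != nil {
        fmt.Println("Got an error: ", err)
    }
}
```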
diff --git a/common/synthesis_voice_gender.go b/common/synthesis_voice_gender.go
new file mode 100644
index 0000000..22bc6a9
--- /dev/null
+++ b/common/synthesis_voice_gender.go
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package common
+
+// SynthesisVoiceGender defines the gender of a synthesis voice.
+type SynthesisVoiceGender int
+
+const (
+    // GenderUnknown means the gender is unknown.
+    GenderUnknown SynthesisVoiceGender = 0
+
+    // Female indicates female.
+    Female SynthesisVoiceGender = 1
+
+    // Male indicates male.
+    Male SynthesisVoiceGender = 2
+)
diff --git a/samples/main.go b/samples/main.go
index 462597b..1b5d8d9 100644
--- a/samples/main.go
+++ b/samples/main.go
@@ -10,6 +10,7 @@ import (
 
     "github.com/Microsoft/cognitive-services-speech-sdk-go/samples/dialog_service_connector"
     "github.com/Microsoft/cognitive-services-speech-sdk-go/samples/recognizer"
+    "github.com/Microsoft/cognitive-services-speech-sdk-go/samples/synthesizer"
 )
 
 type functionMap = map[string]func(string, string, string)
@@ -27,12 +28,14 @@ func printHelp(executableName string, samples functionMap) {
 
 func main() {
     samples := functionMap{
-        "speech_recognizer:RecognizeOnceFromWavFile":        recognizer.RecognizeOnceFromWavFile,
-        "speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
-        "speech_recognizer:ContinuousFromMicrophone":        recognizer.ContinuousFromMicrophone,
-        "dialog_service_connector:ListenOnce":               dialog_service_connector.ListenOnce,
-        "dialog_service_connector:KWS":                      dialog_service_connector.KWS,
-        "dialog_service_connector:ListenOnceFromStream":     dialog_service_connector.ListenOnceFromStream,
+        "speech_recognizer:RecognizeOnceFromWavFile":         recognizer.RecognizeOnceFromWavFile,
+        "speech_recognizer:RecognizeOnceFromCompressedFile":  recognizer.RecognizeOnceFromCompressedFile,
+        "speech_recognizer:ContinuousFromMicrophone":         recognizer.ContinuousFromMicrophone,
+        "dialog_service_connector:ListenOnce":                dialog_service_connector.ListenOnce,
+        "dialog_service_connector:KWS":                       dialog_service_connector.KWS,
+        "dialog_service_connector:ListenOnceFromStream":      dialog_service_connector.ListenOnceFromStream,
+        "speech_synthesizer:SynthesisToSpeaker":              synthesizer.SynthesisToSpeaker,
+        "speech_synthesizer:SynthesisToAudioDataStream":      synthesizer.SynthesisToAudioDataStream,
     }
     args := os.Args[1:]
     if len(args) != 4 {
diff --git a/samples/synthesizer/doc.go b/samples/synthesizer/doc.go
new file mode 100644
index 0000000..dd74b18
--- /dev/null
+++ b/samples/synthesizer/doc.go
@@ -0,0 +1,5 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+// Package synthesizer provides samples of text-to-speech.
+package synthesizer
diff --git a/samples/synthesizer/to_audio_data_stream.go b/samples/synthesizer/to_audio_data_stream.go
new file mode 100644
index 0000000..b440f6c
--- /dev/null
+++ b/samples/synthesizer/to_audio_data_stream.go
@@ -0,0 +1,82 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package synthesizer
+
+import (
+    "bufio"
+    "fmt"
+    "io"
+    "os"
+    "strings"
+    "time"
+
+    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
+)
+
+func SynthesisToAudioDataStream(subscription string, region string, file string) {
+    config, err := speech.NewSpeechConfigFromSubscription(subscription, region)
+    if err != nil {
+        fmt.Println("Got an error: ", err)
+        return
+    }
+    defer config.Close()
+    speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, nil)
+    if err != nil {
+        fmt.Println("Got an error: ", err)
+        return
+    }
+    defer speechSynthesizer.Close()
+
+    speechSynthesizer.SynthesisStarted(synthesizeStartedHandler)
+    speechSynthesizer.Synthesizing(synthesizingHandler)
+    speechSynthesizer.SynthesisCompleted(synthesizedHandler)
+    speechSynthesizer.SynthesisCanceled(cancelledHandler)
+
+    for {
+        fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ")
+        text, _ := bufio.NewReader(os.Stdin).ReadString('\n')
+        text = strings.TrimSuffix(text, "\n")
+        if len(text) == 0 {
+            break
+        }
+
+        // StartSpeakingTextAsync sends the result to the channel when the synthesis starts.
+        task := speechSynthesizer.StartSpeakingTextAsync(text)
+        var outcome speech.SpeechSynthesisOutcome
+        select {
+        case outcome = <-task:
+        case <-time.After(60 * time.Second):
+            fmt.Println("Timed out")
+            return
+        }
+        defer outcome.Close()
+        if outcome.Error != nil {
+            fmt.Println("Got an error: ", outcome.Error)
+            return
+        }
+
+        // In most cases we want to receive the audio as a stream to lower the latency;
+        // we can use AudioDataStream to do so.
+        stream, err := speech.NewAudioDataStreamFromSpeechSynthesisResult(outcome.Result)
+        if err != nil {
+            fmt.Println("Got an error: ", err)
+            return
+        }
+        defer stream.Close()
+
+        var allAudio []byte
+        audioChunk := make([]byte, 2048)
+        for {
+            n, err := stream.Read(audioChunk)
+
+            if err == io.EOF {
+                break
+            }
+
+            allAudio = append(allAudio, audioChunk[:n]...)
+        }
+
+        fmt.Printf("Read [%d] bytes from audio data stream.\n", len(allAudio))
+    }
+}
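The sample above drains the stream by hand. As the tests later in this diff show, an AudioDataStream can also be persisted with SaveToWavFileAsync; a sketch, with an illustrative output path:

```go
package synthesizer

import (
    "fmt"
    "time"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

// saveToWav persists a synthesized AudioDataStream using SaveToWavFileAsync,
// which reports completion on an error channel. The path is illustrative.
func saveToWav(stream *speech.AudioDataStream) {
    saveFuture := stream.SaveToWavFileAsync("output.wav")
    select {
    case err := <-saveFuture:
        if err != nil {
            fmt.Println("Got an error: ", err)
        }
    case <-time.After(60 * time.Second):
        fmt.Println("Timed out")
    }
}
```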
diff --git a/samples/synthesizer/to_speaker.go b/samples/synthesizer/to_speaker.go
new file mode 100644
index 0000000..dab44df
--- /dev/null
+++ b/samples/synthesizer/to_speaker.go
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package synthesizer
+
+import (
+    "bufio"
+    "fmt"
+    "os"
+    "strings"
+    "time"
+
+    "github.com/Microsoft/cognitive-services-speech-sdk-go/audio"
+    "github.com/Microsoft/cognitive-services-speech-sdk-go/common"
+    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
+)
+
+func synthesizeStartedHandler(event speech.SpeechSynthesisEventArgs) {
+    defer event.Close()
+    fmt.Println("Synthesis started.")
+}
+
+func synthesizingHandler(event speech.SpeechSynthesisEventArgs) {
+    defer event.Close()
+    fmt.Printf("Synthesizing, audio chunk size %d.\n", len(event.Result.AudioData))
+}
+
+func synthesizedHandler(event speech.SpeechSynthesisEventArgs) {
+    defer event.Close()
+    fmt.Printf("Synthesized, audio length %d.\n", len(event.Result.AudioData))
+}
+
+func cancelledHandler(event speech.SpeechSynthesisEventArgs) {
+    defer event.Close()
+    fmt.Println("Received a cancellation.")
+}
+
+func SynthesisToSpeaker(subscription string, region string, file string) {
+    audioConfig, err := audio.NewAudioConfigFromDefaultSpeakerOutput()
+    if err != nil {
+        fmt.Println("Got an error: ", err)
+        return
+    }
+    defer audioConfig.Close()
+    config, err := speech.NewSpeechConfigFromSubscription(subscription, region)
+    if err != nil {
+        fmt.Println("Got an error: ", err)
+        return
+    }
+    defer config.Close()
+    speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, audioConfig)
+    if err != nil {
+        fmt.Println("Got an error: ", err)
+        return
+    }
+    defer speechSynthesizer.Close()
+
+    speechSynthesizer.SynthesisStarted(synthesizeStartedHandler)
+    speechSynthesizer.Synthesizing(synthesizingHandler)
+    speechSynthesizer.SynthesisCompleted(synthesizedHandler)
+    speechSynthesizer.SynthesisCanceled(cancelledHandler)
+
+    for {
+        fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ")
+        text, _ := bufio.NewReader(os.Stdin).ReadString('\n')
+        text = strings.TrimSuffix(text, "\n")
+        if len(text) == 0 {
+            break
+        }
+
+        task := speechSynthesizer.SpeakTextAsync(text)
+        var outcome speech.SpeechSynthesisOutcome
+        select {
+        case outcome = <-task:
+        case <-time.After(60 * time.Second):
+            fmt.Println("Timed out")
+            return
+        }
+        defer outcome.Close()
+        if outcome.Error != nil {
+            fmt.Println("Got an error: ", outcome.Error)
+            return
+        }
+
+        if outcome.Result.Reason == common.SynthesizingAudioCompleted {
+            fmt.Printf("Speech synthesized to speaker for text [%s].\n", text)
+        } else {
+            cancellation, _ := speech.NewCancellationDetailsFromSpeechSynthesisResult(outcome.Result)
+            fmt.Printf("CANCELED: Reason=%d.\n", cancellation.Reason)
+
+            if cancellation.Reason == common.Error {
+                fmt.Printf("CANCELED: ErrorCode=%d\nCANCELED: ErrorDetails=[%s]\nCANCELED: Did you update the subscription info?\n",
+                    cancellation.ErrorCode,
+                    cancellation.ErrorDetails)
+            }
+        }
+    }
+}
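The sample above feeds plain text to SpeakTextAsync; SSML input goes through SpeakSsmlAsync, which the tests below exercise. A sketch; the SSML envelope and voice name are assumptions, not part of this diff:

```go
package synthesizer

import (
    "fmt"
    "time"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

// speakSsml drives the synthesizer with SSML instead of plain text.
// The voice name below is an assumption based on the Jenny voice checked
// in the tests, not part of this change.
func speakSsml(speechSynthesizer *speech.SpeechSynthesizer) {
    ssml := "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>" +
        "<voice name='en-US-JennyNeural'>hello world</voice></speak>"
    task := speechSynthesizer.SpeakSsmlAsync(ssml)
    select {
    case outcome := <-task:
        defer outcome.Close()
        if outcome.Error != nil {
            fmt.Println("Got an error: ", outcome.Error)
        }
    case <-time.After(60 * time.Second):
        fmt.Println("Timed out")
    }
}
```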
diff --git a/speech/auto_detect_source_language_config.go b/speech/auto_detect_source_language_config.go
index 007ae81..278709a 100644
--- a/speech/auto_detect_source_language_config.go
+++ b/speech/auto_detect_source_language_config.go
@@ -33,6 +33,16 @@ func newAutoDetectSourceLanguageConfigFromHandle(handle C.SPXHANDLE) (*AutoDetec
     return config, nil
 }
 
+// NewAutoDetectSourceLanguageConfigFromOpenRange creates an instance of the AutoDetectSourceLanguageConfig with an open range of source languages.
+func NewAutoDetectSourceLanguageConfigFromOpenRange() (*AutoDetectSourceLanguageConfig, error) {
+    var handle C.SPXHANDLE
+    ret := uintptr(C.create_auto_detect_source_lang_config_from_open_range(&handle))
+    if ret != C.SPX_NOERROR {
+        return nil, common.NewCarbonError(ret)
+    }
+    return newAutoDetectSourceLanguageConfigFromHandle(handle)
+}
+
 // NewAutoDetectSourceLanguageConfigFromLanguages creates an instance of the AutoDetectSourceLanguageConfig with source languages
 func NewAutoDetectSourceLanguageConfigFromLanguages(languages []string) (*AutoDetectSourceLanguageConfig, error) {
     var handle C.SPXHANDLE
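Usage of the new open-range factory mirrors the test added below; a sketch with placeholder credentials (NewSpeechSynthesizerFomAutoDetectSourceLangConfig is spelled as in the SDK):

```go
package main

import (
    "fmt"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
    config, err := speech.NewSpeechConfigFromSubscription("<subscription>", "<region>")
    if err != nil {
        fmt.Println("Got an error: ", err)
        return
    }
    defer config.Close()
    // Let the service detect the language from an open range of source languages.
    languageConfig, err := speech.NewAutoDetectSourceLanguageConfigFromOpenRange()
    if err != nil {
        fmt.Println("Got an error: ", err)
        return
    }
    defer languageConfig.Close()
    synthesizer, err := speech.NewSpeechSynthesizerFomAutoDetectSourceLangConfig(config, languageConfig, nil)
    if err != nil {
        fmt.Println("Got an error: ", err)
        return
    }
    defer synthesizer.Close()
}
```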
diff --git a/speech/speech_synthesizer_test.go b/speech/speech_synthesizer_test.go
index b9f8a01..d0d7315 100644
--- a/speech/speech_synthesizer_test.go
+++ b/speech/speech_synthesizer_test.go
@@ -226,7 +226,8 @@ func TestSynthesisToAudioDataStream(t *testing.T) {
         t.Error("audio data is not equal.")
     }
 
-    saveOutcome := stream.SaveToWavFileAsync("tmp_synthesis.mp3")
+    // TODO: uncomment the following lines after 1.17 is released.
+    /*saveOutcome := stream.SaveToWavFileAsync("tmp_synthesis.mp3")
     select {
     case err = <-saveOutcome:
         if err != nil {
@@ -242,7 +243,7 @@
     file.Read(audioData4)
     if !bytes.Equal(audioData2, audioData4) {
         t.Error("audio data is not equal.")
-    }
+    }*/
 }
 
 func TestSynthesisWithInvalidVoice(t *testing.T) {
@@ -315,13 +316,43 @@ func TestSynthesisToPullAudioOutputStream(t *testing.T) {
     }
 }
 
-// word boundary, viseme received, and bookmark reached
+// viseme received
+func TestSynthesizerVisemeEvents(t *testing.T) {
+    synthesizer := createSpeechSynthesizerFromAudioConfig(t, nil)
+    defer synthesizer.Close()
+    visemeReceivedFuture := make(chan string)
+    synthesizer.VisemeReceived(func(event SpeechSynthesisVisemeEventArgs) {
+        defer event.Close()
+        t.Logf("viseme received event, audio offset [%d], viseme ID [%d]", event.AudioOffset, event.VisemeID)
+        if event.AudioOffset <= 0 {
+            t.Error("viseme received audio offset")
+        }
+        select {
+        case visemeReceivedFuture <- "visemeReceivedFuture":
+        default:
+        }
+    })
+    resultFuture := synthesizer.SpeakSsmlAsync("yet")
+    select {
+    case <-visemeReceivedFuture:
+    case <-time.After(timeout):
+        t.Error("Timeout waiting for VisemeReceived event.")
+    }
+    select {
+    case result := <-resultFuture:
+        defer result.Close()
+        checkSynthesisResult(t, result.Result, common.SynthesizingAudioCompleted)
+    case <-time.After(timeout):
+        t.Error("Timeout waiting for synthesis result.")
+    }
+}
+
+// word boundary and bookmark reached
 func TestSynthesizerEvents2(t *testing.T) {
     synthesizer := createSpeechSynthesizerFromAudioConfig(t, nil)
     defer synthesizer.Close()
     wordBoundaryFuture := make(chan bool)
     synthesizer.WordBoundary(func(event SpeechSynthesisWordBoundaryEventArgs) {
-        t.Log("9999")
         defer event.Close()
         t.Logf("word boundary event, audio offset [%d], text offset [%d], word length [%d]", event.AudioOffset, event.TextOffset, event.WordLength)
         if event.AudioOffset <= 0 {
@@ -338,22 +369,10 @@
         default:
         }
     })
-    visemeReceivedFuture := make(chan string)
-    synthesizer.VisemeReceived(func(event SpeechSynthesisVisemeEventArgs) {
-        defer event.Close()
-        t.Logf("viseme received event, audio offset [%d], viseme ID [%d]", event.AudioOffset, event.VisemeID)
-        if event.AudioOffset <= 0 {
-            t.Error("viseme received audio offset")
-        }
-        select {
-        case visemeReceivedFuture <- "visemeReceivedFuture":
-        default:
-        }
-    })
     bookmarkReachedFuture := make(chan string)
     synthesizer.BookmarkReached(func(event SpeechSynthesisBookmarkEventArgs) {
         defer event.Close()
-        t.Logf("Bookmark reached event, audio offset [%d], text [%sl", event.AudioOffset, event.Text)
+        t.Logf("Bookmark reached event, audio offset [%d], text [%s]", event.AudioOffset, event.Text)
         if event.AudioOffset <= 0 {
             t.Error("bookmark audio offset error")
         }
@@ -362,12 +381,7 @@
         }
         bookmarkReachedFuture <- "bookmarkReachedFuture"
     })
-    resultFuture := synthesizer.SpeakSsmlAsync("yet")
-    select {
-    case <-visemeReceivedFuture:
-    case <-time.After(timeout):
-        t.Error("Timeout waiting for VisemeReceived event.")
-    }
+    resultFuture := synthesizer.SpeakSsmlAsync("hello")
     select {
     case <-bookmarkReachedFuture:
         t.Logf("Received BookmarkReached event.")
@@ -406,7 +420,52 @@
                 t.Error("voice name error")
             }
         }
+        jenny := outcome.Result.Voices[0]
+        if jenny.LocalName != "Jenny" {
+            t.Errorf("The first en-US voice [%s] is not Jenny.", jenny.LocalName)
+        }
+        if jenny.VoiceType != common.OnlineNeural {
+            t.Error("Jenny's voice type is incorrect.")
+        }
+        if len(jenny.StyleList) < 2 {
+            t.Error("Jenny's style list error.")
+        }
+        if jenny.Gender != common.Female {
+            t.Error("Jenny's gender error.")
+        }
     case <-time.After(timeout):
         t.Error("Timeout waiting for synthesis voices result.")
     }
 }
+
+func TestSynthesisWithLanguageAutoDetection(t *testing.T) {
+    config := createSpeechConfig(t)
+    defer config.Close()
+    languageConfig, err := NewAutoDetectSourceLanguageConfigFromOpenRange()
+    if err != nil {
+        t.Error("Got an error: ", err)
+    }
+    defer languageConfig.Close()
+    synthesizer, err := NewSpeechSynthesizerFomAutoDetectSourceLangConfig(config, languageConfig, nil)
+    if err != nil {
+        t.Error("Got an error: ", err)
+    }
+    if synthesizer == nil {
+        t.Error("synthesizer creation failed")
+        return
+    }
+    defer synthesizer.Close()
+    textResultFuture := synthesizer.SpeakTextAsync("你好,世界。")
+
+    var textResult SpeechSynthesisOutcome
+    select {
+    case textResult = <-textResultFuture:
+        defer textResult.Close()
+        checkSynthesisResult(t, textResult.Result, common.SynthesizingAudioCompleted)
+        if len(textResult.Result.AudioData) < 32000 {
+            t.Error("audio should be longer than 1s.")
+        }
+    case <-time.After(timeout):
+        t.Error("Timeout waiting for synthesis result.")
+    }
+}
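For reference outside the test harness, the timing events exercised above are wired the same way on a regular synthesizer; a sketch:

```go
package example

import (
    "fmt"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

// wireTimingEvents subscribes to the word boundary, viseme, and bookmark
// callbacks that the tests above exercise.
func wireTimingEvents(synthesizer *speech.SpeechSynthesizer) {
    synthesizer.WordBoundary(func(event speech.SpeechSynthesisWordBoundaryEventArgs) {
        defer event.Close()
        fmt.Printf("word boundary: audio offset [%d], text offset [%d], word length [%d]\n",
            event.AudioOffset, event.TextOffset, event.WordLength)
    })
    synthesizer.VisemeReceived(func(event speech.SpeechSynthesisVisemeEventArgs) {
        defer event.Close()
        fmt.Printf("viseme: audio offset [%d], viseme ID [%d]\n", event.AudioOffset, event.VisemeID)
    })
    synthesizer.BookmarkReached(func(event speech.SpeechSynthesisBookmarkEventArgs) {
        defer event.Close()
        fmt.Printf("bookmark: audio offset [%d], text [%s]\n", event.AudioOffset, event.Text)
    })
}
```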
diff --git a/speech/voice_info.go b/speech/voice_info.go
index 44d07f4..a93c87c 100644
--- a/speech/voice_info.go
+++ b/speech/voice_info.go
@@ -33,6 +33,9 @@ type VoiceInfo struct {
     // LocalName specifies the local name of the voice
     LocalName string
 
+    // Gender specifies the gender of the voice.
+    Gender common.SynthesisVoiceGender
+
     // VoiceType specifies the voice type.
     VoiceType common.SynthesisVoiceType
 
@@ -93,5 +96,13 @@ func NewVoiceInfoFromHandle(handle common.SPXHandle) (*VoiceInfo, error) {
         return nil, common.NewCarbonError(ret)
     }
     voiceInfo.Properties = common.NewPropertyCollectionFromHandle(handle2uintptr(propBagHandle))
+    switch voiceInfo.Properties.GetPropertyByString("Gender", "") {
+    case "Female":
+        voiceInfo.Gender = common.Female
+    case "Male":
+        voiceInfo.Gender = common.Male
+    default:
+        voiceInfo.Gender = common.GenderUnknown
+    }
     return voiceInfo, nil
 }
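A sketch of consuming the new Gender field from voice listings; it assumes the synthesizer's GetVoicesAsync accessor used by the voice-list test above, with an illustrative locale:

```go
package example

import (
    "fmt"
    "time"

    "github.com/Microsoft/cognitive-services-speech-sdk-go/common"
    "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

// listFemaleVoices prints the en-US voices reported as female, using the new
// Gender field. GetVoicesAsync and the outcome shape are assumed from the
// voice-list test above.
func listFemaleVoices(synthesizer *speech.SpeechSynthesizer) {
    outcomeFuture := synthesizer.GetVoicesAsync("en-US")
    select {
    case outcome := <-outcomeFuture:
        if outcome.Error != nil {
            fmt.Println("Got an error: ", outcome.Error)
            return
        }
        for _, voice := range outcome.Result.Voices {
            if voice.Gender == common.Female {
                fmt.Println(voice.LocalName)
            }
        }
    case <-time.After(60 * time.Second):
        fmt.Println("Timed out")
    }
}
```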