diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8d84196..ff821af 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -16,6 +16,7 @@
"golint",
"govet",
"unstaged",
- "untracked"
+ "untracked",
+ "webm"
]
}
\ No newline at end of file
diff --git a/audio/audio_config.go b/audio/audio_config.go
index bd4538c..47f05ca 100644
--- a/audio/audio_config.go
+++ b/audio/audio_config.go
@@ -98,6 +98,19 @@ func NewAudioConfigFromDefaultSpeakerOutput() (*AudioConfig, error) {
return newAudioConfigFromHandle(handle)
}
+// NewAudioConfigFromSpeakerOutput creates an AudioConfig object representing the specified audio output device
+// (speaker) on the system.
+func NewAudioConfigFromSpeakerOutput(deviceName string) (*AudioConfig, error) {
+ var handle C.SPXHANDLE
+ dn := C.CString(deviceName)
+ defer C.free(unsafe.Pointer(dn))
+ ret := uintptr(C.audio_config_create_audio_output_from_a_speaker(&handle, dn))
+ if ret != C.SPX_NOERROR {
+ return nil, common.NewCarbonError(ret)
+ }
+ return newAudioConfigFromHandle(handle)
+}
+
// NewAudioConfigFromWavFileOutput creates an AudioConfig object representing the specified file for audio output.
func NewAudioConfigFromWavFileOutput(filename string) (*AudioConfig, error) {
var handle C.SPXHANDLE
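
A quick usage sketch for the new constructor (not part of the diff). The device-name format is platform-specific, so the ALSA-style id below is only a placeholder:

```go
package main

import (
	"fmt"

	"github.com/Microsoft/cognitive-services-speech-sdk-go/audio"
)

func main() {
	// "hw:0,1" is a placeholder; the accepted device-name format depends on
	// the platform's audio stack (e.g. an ALSA device id on Linux).
	audioConfig, err := audio.NewAudioConfigFromSpeakerOutput("hw:0,1")
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer audioConfig.Close()
	fmt.Println("Audio config for the selected speaker created.")
}
```
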
diff --git a/common/cancellation_reason.go b/common/cancellation_reason.go
index 621d9f2..850c350 100644
--- a/common/cancellation_reason.go
+++ b/common/cancellation_reason.go
@@ -7,9 +7,13 @@ package common
type CancellationReason int
const (
- // Indicates that an error occurred during speech recognition.
+ // Error indicates that an error occurred during speech recognition.
Error CancellationReason = 1
- // Indicates that the end of the audio stream was reached.
+ // EndOfStream indicates that the end of the audio stream was reached.
EndOfStream CancellationReason = 2
+
+	// CancelledByUser indicates that the request was cancelled by the user.
+ // Added in version 1.17.0
+ CancelledByUser CancellationReason = 3
)
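
The new value extends the switch space for cancellation handling. A self-contained sketch; describeCancellation is a hypothetical helper for illustration, not an SDK API:

```go
package main

import (
	"fmt"

	"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
)

// describeCancellation is a hypothetical helper mapping each reason to a message.
func describeCancellation(reason common.CancellationReason) string {
	switch reason {
	case common.Error:
		return "an error occurred during recognition or synthesis"
	case common.EndOfStream:
		return "the end of the audio stream was reached"
	case common.CancelledByUser:
		return "the request was cancelled by the user"
	default:
		return "unknown cancellation reason"
	}
}

func main() {
	fmt.Println(describeCancellation(common.CancelledByUser))
}
```
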
diff --git a/common/property_id.go b/common/property_id.go
index 23b705a..04af717 100644
--- a/common/property_id.go
+++ b/common/property_id.go
@@ -102,7 +102,7 @@ const (
// SpeechSessionID is the session id. This id is a universally unique identifier (aka UUID) representing a specific
// binding of an audio input stream and the underlying speech recognition instance to which it is bound. Under normal
// circumstances, you shouldn't have to use this property directly.
- /// Instead use SessionEventArgs.SessionId.
+ // Instead use SessionEventArgs.SessionId.
SpeechSessionID PropertyID = 3002
// SpeechServiceConnectionUserDefinedQueryParameters are the query parameters provided by users. They will be passed
@@ -118,6 +118,14 @@ const (
// SpeechServiceConnectionSynthOutputFormat is the string to specify TTS output audio format.
SpeechServiceConnectionSynthOutputFormat PropertyID = 3102
+	// SpeechServiceConnectionSynthEnableCompressedAudioTransmission indicates whether to use compressed audio format
+	// for speech synthesis audio transmission.
+	// This property only takes effect when SpeechServiceConnectionSynthOutputFormat is set to a PCM format.
+	// If this property is not set and GStreamer is available, the SDK will use compressed format for synthesized audio
+	// transmission and decode it. You can set this property to "false" to use raw PCM format for transmission on the wire.
+ // Added in version 1.17.0
+ SpeechServiceConnectionSynthEnableCompressedAudioTransmission PropertyID = 3103
+
// SpeechServiceConnectionInitialSilenceTimeoutMs is the initial silence timeout value (in milliseconds) used by the
// service.
SpeechServiceConnectionInitialSilenceTimeoutMs PropertyID = 3200
@@ -186,6 +194,29 @@ const (
// the speech service.
SpeechServiceResponseRecognitionLatencyMs PropertyID = 5002
+ // SpeechServiceResponseSynthesisFirstByteLatencyMs is the speech synthesis first byte latency in milliseconds.
+ // Read-only, available on final speech synthesis results.
+	// This measures the latency between the time the synthesis starts to be processed and the moment the first byte of audio is available.
+ // Added in version 1.17.0.
+ SpeechServiceResponseSynthesisFirstByteLatencyMs PropertyID = 5010
+
+ // SpeechServiceResponseSynthesisFinishLatencyMs is the speech synthesis all bytes latency in milliseconds.
+ // Read-only, available on final speech synthesis results.
+	// This measures the latency between the time the synthesis starts to be processed and the moment the whole audio is synthesized.
+ // Added in version 1.17.0.
+ SpeechServiceResponseSynthesisFinishLatencyMs PropertyID = 5011
+
+ // SpeechServiceResponseSynthesisUnderrunTimeMs is the underrun time for speech synthesis in milliseconds.
+ // Read-only, available on results in SynthesisCompleted events.
+	// This measures the total underrun time from the moment the playback buffer (of length AudioConfigPlaybackBufferLengthInMs) is first filled to the moment the synthesis completes.
+ // Added in version 1.17.0.
+ SpeechServiceResponseSynthesisUnderrunTimeMs PropertyID = 5012
+
+	// SpeechServiceResponseSynthesisBackend indicates which backend finished the synthesis.
+	// Read-only, available on speech synthesis results, except for the result in the SynthesisStarted event.
+ // Added in version 1.17.0.
+ SpeechServiceResponseSynthesisBackend PropertyID = 5020
+
// CancellationDetailsReason is the cancellation reason. Currently unused.
CancellationDetailsReason PropertyID = 6000
@@ -216,6 +247,14 @@ const (
// AudioConfigAudioSource is the audio source. Allowed values are "Microphones", "File", and "Stream".
AudioConfigAudioSource PropertyID = 8004
+	// AudioConfigDeviceNameForRender indicates the device name for audio rendering. Under normal circumstances,
+	// you shouldn't have to use this property directly. Instead, use NewAudioConfigFromSpeakerOutput.
+ // Added in version 1.17.0
+ AudioConfigDeviceNameForRender PropertyID = 8005
+
+	// AudioConfigPlaybackBufferLengthInMs indicates the playback buffer length in milliseconds; the default is 50 milliseconds.
+	// Added in version 1.17.0
+ AudioConfigPlaybackBufferLengthInMs PropertyID = 8006
+
// SpeechLogFilename is the file name to write logs.
SpeechLogFilename PropertyID = 9001
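
To exercise the new knobs, a minimal sketch, assuming SpeechConfig and AudioConfig both expose SetProperty(common.PropertyID, string) as in the SDK's other languages; the subscription key and region are placeholders:

```go
package main

import (
	"fmt"

	"github.com/Microsoft/cognitive-services-speech-sdk-go/audio"
	"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
	"github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
	config, err := speech.NewSpeechConfigFromSubscription("YourSubscriptionKey", "YourServiceRegion")
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer config.Close()
	// Force raw PCM on the wire even when GStreamer is available.
	if err := config.SetProperty(common.SpeechServiceConnectionSynthEnableCompressedAudioTransmission, "false"); err != nil {
		fmt.Println("Got an error: ", err)
	}

	audioConfig, err := audio.NewAudioConfigFromDefaultSpeakerOutput()
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer audioConfig.Close()
	// Grow the playback buffer from the 50 ms default to 100 ms.
	if err := audioConfig.SetProperty(common.AudioConfigPlaybackBufferLengthInMs, "100"); err != nil {
		fmt.Println("Got an error: ", err)
	}
}
```
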
diff --git a/common/speech_synthesis_output_format.go b/common/speech_synthesis_output_format.go
index 1fadeb8..a8988b6 100644
--- a/common/speech_synthesis_output_format.go
+++ b/common/speech_synthesis_output_format.go
@@ -11,9 +11,11 @@ const (
Raw8Khz8BitMonoMULaw SpeechSynthesisOutputFormat = 1
// Riff16Khz16KbpsMonoSiren stands for riff-16khz-16kbps-mono-siren
+ // Unsupported by the service. Do not use this value.
Riff16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 2
// Audio16Khz16KbpsMonoSiren stands for audio-16khz-16kbps-mono-siren
+ // Unsupported by the service. Do not use this value.
Audio16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 3
// Audio16Khz32KBitRateMonoMp3 stands for audio-16khz-32kbitrate-mono-mp3
@@ -57,4 +59,40 @@ const (
// Raw8Khz16BitMonoPcm stands for raw-8khz-16bit-mono-pcm
Raw8Khz16BitMonoPcm SpeechSynthesisOutputFormat = 17
+
+ // Ogg16Khz16BitMonoOpus stands for ogg-16khz-16bit-mono-opus
+ Ogg16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 18
+
+ // Ogg24Khz16BitMonoOpus stands for ogg-24khz-16bit-mono-opus
+ Ogg24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 19
+
+ // Raw48Khz16BitMonoPcm stands for raw-48khz-16bit-mono-pcm
+ Raw48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 20
+
+ // Riff48Khz16BitMonoPcm stands for riff-48khz-16bit-mono-pcm
+ Riff48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 21
+
+ // Audio48Khz96KBitRateMonoMp3 stands for audio-48khz-96kbitrate-mono-mp3
+ Audio48Khz96KBitRateMonoMp3 SpeechSynthesisOutputFormat = 22
+
+ // Audio48Khz192KBitRateMonoMp3 stands for audio-48khz-192kbitrate-mono-mp3
+ Audio48Khz192KBitRateMonoMp3 SpeechSynthesisOutputFormat = 23
+
+ // Ogg48Khz16BitMonoOpus stands for ogg-48khz-16bit-mono-opus
+ Ogg48Khz16BitMonoOpus SpeechSynthesisOutputFormat = 24
+
+ // Webm16Khz16BitMonoOpus stands for webm-16khz-16bit-mono-opus
+ Webm16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 25
+
+ // Webm24Khz16BitMonoOpus stands for webm-24khz-16bit-mono-opus
+ Webm24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 26
+
+ // Raw24Khz16BitMonoTrueSilk stands for raw-24khz-16bit-mono-truesilk
+ Raw24Khz16BitMonoTrueSilk SpeechSynthesisOutputFormat = 27
+
+ // Raw8Khz8BitMonoALaw stands for raw-8khz-8bit-mono-alaw
+ Raw8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 28
+
+ // Riff8Khz8BitMonoALaw stands for riff-8khz-8bit-mono-alaw
+ Riff8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 29
)
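
Selecting one of the newly added container formats, a sketch assuming SpeechConfig.SetSpeechSynthesisOutputFormat takes a common.SpeechSynthesisOutputFormat as in the other SDK languages; credentials are placeholders:

```go
package main

import (
	"fmt"

	"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
	"github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
	config, err := speech.NewSpeechConfigFromSubscription("YourSubscriptionKey", "YourServiceRegion")
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer config.Close()
	// Request one of the new Opus-in-Ogg container formats.
	if err := config.SetSpeechSynthesisOutputFormat(common.Ogg48Khz16BitMonoOpus); err != nil {
		fmt.Println("Got an error: ", err)
	}
}
```
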
diff --git a/common/synthesis_voice_gender.go b/common/synthesis_voice_gender.go
new file mode 100644
index 0000000..22bc6a9
--- /dev/null
+++ b/common/synthesis_voice_gender.go
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package common
+
+// SynthesisVoiceGender defines the gender of a synthesis voice.
+type SynthesisVoiceGender int
+
+const (
+ // GenderUnknown means the gender is unknown.
+ GenderUnknown SynthesisVoiceGender = 0
+
+ // Female indicates female.
+ Female SynthesisVoiceGender = 1
+
+ // Male indicates male.
+ Male SynthesisVoiceGender = 2
+)
diff --git a/samples/main.go b/samples/main.go
index 462597b..1b5d8d9 100644
--- a/samples/main.go
+++ b/samples/main.go
@@ -10,6 +10,7 @@ import (
"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/dialog_service_connector"
"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/recognizer"
+ "github.com/Microsoft/cognitive-services-speech-sdk-go/samples/synthesizer"
)
type functionMap = map[string]func(string, string, string)
@@ -27,12 +28,14 @@ func printHelp(executableName string, samples functionMap) {
func main() {
samples := functionMap{
- "speech_recognizer:RecognizeOnceFromWavFile": recognizer.RecognizeOnceFromWavFile,
- "speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
- "speech_recognizer:ContinuousFromMicrophone": recognizer.ContinuousFromMicrophone,
- "dialog_service_connector:ListenOnce": dialog_service_connector.ListenOnce,
- "dialog_service_connector:KWS": dialog_service_connector.KWS,
- "dialog_service_connector:ListenOnceFromStream": dialog_service_connector.ListenOnceFromStream,
+ "speech_recognizer:RecognizeOnceFromWavFile": recognizer.RecognizeOnceFromWavFile,
+ "speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
+ "speech_recognizer:ContinuousFromMicrophone": recognizer.ContinuousFromMicrophone,
+ "dialog_service_connector:ListenOnce": dialog_service_connector.ListenOnce,
+ "dialog_service_connector:KWS": dialog_service_connector.KWS,
+ "dialog_service_connector:ListenOnceFromStream": dialog_service_connector.ListenOnceFromStream,
+ "speech_synthesizer:SynthesisToSpeaker": synthesizer.SynthesisToSpeaker,
+ "speech_synthesizer:SynthesisToAudioDataStream": synthesizer.SynthesisToAudioDataStream,
}
args := os.Args[1:]
if len(args) != 4 {
diff --git a/samples/synthesizer/doc.go b/samples/synthesizer/doc.go
new file mode 100644
index 0000000..dd74b18
--- /dev/null
+++ b/samples/synthesizer/doc.go
@@ -0,0 +1,5 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+// Package synthesizer provides text-to-speech samples.
+package synthesizer
diff --git a/samples/synthesizer/to_audio_data_stream.go b/samples/synthesizer/to_audio_data_stream.go
new file mode 100644
index 0000000..b440f6c
--- /dev/null
+++ b/samples/synthesizer/to_audio_data_stream.go
@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package synthesizer
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+ "time"
+
+ "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
+)
+
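+// SynthesisToAudioDataStream synthesizes text read from the console and streams the resulting audio through an AudioDataStream.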
+func SynthesisToAudioDataStream(subscription string, region string, file string) {
+ config, err := speech.NewSpeechConfigFromSubscription(subscription, region)
+ if err != nil {
+ fmt.Println("Got an error: ", err)
+ return
+ }
+ defer config.Close()
+ speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, nil)
+ if err != nil {
+ fmt.Println("Got an error: ", err)
+ return
+ }
+ defer speechSynthesizer.Close()
+
+ speechSynthesizer.SynthesisStarted(synthesizeStartedHandler)
+ speechSynthesizer.Synthesizing(synthesizingHandler)
+ speechSynthesizer.SynthesisCompleted(synthesizedHandler)
+ speechSynthesizer.SynthesisCanceled(cancelledHandler)
+
+ for {
+ fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ")
+ text, _ := bufio.NewReader(os.Stdin).ReadString('\n')
+ text = strings.TrimSuffix(text, "\n")
+ if len(text) == 0 {
+ break
+ }
+
+		// StartSpeakingTextAsync sends the result to the channel when the synthesis starts.
+ task := speechSynthesizer.StartSpeakingTextAsync(text)
+ var outcome speech.SpeechSynthesisOutcome
+ select {
+ case outcome = <-task:
+ case <-time.After(60 * time.Second):
+ fmt.Println("Timed out")
+ return
+ }
+ defer outcome.Close()
+ if outcome.Error != nil {
+ fmt.Println("Got an error: ", outcome.Error)
+ return
+ }
+
+		// In most cases we want to receive the audio as a stream while it is being synthesized, to lower latency;
+		// we can use AudioDataStream to do so.
+		stream, err := speech.NewAudioDataStreamFromSpeechSynthesisResult(outcome.Result)
+		if err != nil {
+			fmt.Println("Got an error: ", err)
+			return
+		}
+		defer stream.Close()
+
+		var allAudio []byte
+		audioChunk := make([]byte, 2048)
+		for {
+			n, err := stream.Read(audioChunk)
+
+			if err == io.EOF {
+				break
+			}
+
+			allAudio = append(allAudio, audioChunk[:n]...)
+		}
+
+		fmt.Printf("Read [%d] bytes from audio data stream.\n", len(allAudio))
+ }
+}
diff --git a/samples/synthesizer/to_speaker.go b/samples/synthesizer/to_speaker.go
new file mode 100644
index 0000000..dab44df
--- /dev/null
+++ b/samples/synthesizer/to_speaker.go
@@ -0,0 +1,99 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package synthesizer
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "strings"
+ "time"
+
+ "github.com/Microsoft/cognitive-services-speech-sdk-go/audio"
+ "github.com/Microsoft/cognitive-services-speech-sdk-go/common"
+ "github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
+)
+
+func synthesizeStartedHandler(event speech.SpeechSynthesisEventArgs) {
+ defer event.Close()
+ fmt.Println("Synthesis started.")
+}
+
+func synthesizingHandler(event speech.SpeechSynthesisEventArgs) {
+ defer event.Close()
+ fmt.Printf("Synthesizing, audio chunk size %d.\n", len(event.Result.AudioData))
+}
+
+func synthesizedHandler(event speech.SpeechSynthesisEventArgs) {
+ defer event.Close()
+ fmt.Printf("Synthesized, audio length %d.\n", len(event.Result.AudioData))
+}
+
+func cancelledHandler(event speech.SpeechSynthesisEventArgs) {
+ defer event.Close()
+ fmt.Println("Received a cancellation.")
+}
+
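+// SynthesisToSpeaker synthesizes text read from the console directly to the default speaker output.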
+func SynthesisToSpeaker(subscription string, region string, file string) {
+ audioConfig, err := audio.NewAudioConfigFromDefaultSpeakerOutput()
+ if err != nil {
+ fmt.Println("Got an error: ", err)
+ return
+ }
+ defer audioConfig.Close()
+ config, err := speech.NewSpeechConfigFromSubscription(subscription, region)
+ if err != nil {
+ fmt.Println("Got an error: ", err)
+ return
+ }
+ defer config.Close()
+ speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, audioConfig)
+ if err != nil {
+ fmt.Println("Got an error: ", err)
+ return
+ }
+ defer speechSynthesizer.Close()
+
+ speechSynthesizer.SynthesisStarted(synthesizeStartedHandler)
+ speechSynthesizer.Synthesizing(synthesizingHandler)
+ speechSynthesizer.SynthesisCompleted(synthesizedHandler)
+ speechSynthesizer.SynthesisCanceled(cancelledHandler)
+
+ for {
+ fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ")
+ text, _ := bufio.NewReader(os.Stdin).ReadString('\n')
+ text = strings.TrimSuffix(text, "\n")
+ if len(text) == 0 {
+ break
+ }
+
+ task := speechSynthesizer.SpeakTextAsync(text)
+ var outcome speech.SpeechSynthesisOutcome
+ select {
+ case outcome = <-task:
+ case <-time.After(60 * time.Second):
+ fmt.Println("Timed out")
+ return
+ }
+ defer outcome.Close()
+ if outcome.Error != nil {
+ fmt.Println("Got an error: ", outcome.Error)
+ return
+ }
+
+ if outcome.Result.Reason == common.SynthesizingAudioCompleted {
+ fmt.Printf("Speech synthesized to speaker for text [%s].\n", text)
+ } else {
+ cancellation, _ := speech.NewCancellationDetailsFromSpeechSynthesisResult(outcome.Result)
+ fmt.Printf("CANCELED: Reason=%d.\n", cancellation.Reason)
+
+ if cancellation.Reason == common.Error {
+ fmt.Printf("CANCELED: ErrorCode=%d\nCANCELED: ErrorDetails=[%s]\nCANCELED: Did you update the subscription info?\n",
+ cancellation.ErrorCode,
+ cancellation.ErrorDetails)
+ }
+ }
+ }
+}
diff --git a/speech/auto_detect_source_language_config.go b/speech/auto_detect_source_language_config.go
index 007ae81..278709a 100644
--- a/speech/auto_detect_source_language_config.go
+++ b/speech/auto_detect_source_language_config.go
@@ -33,6 +33,16 @@ func newAutoDetectSourceLanguageConfigFromHandle(handle C.SPXHANDLE) (*AutoDetec
return config, nil
}
+// NewAutoDetectSourceLanguageConfigFromOpenRange creates an instance of AutoDetectSourceLanguageConfig with an open range of source languages.
+func NewAutoDetectSourceLanguageConfigFromOpenRange() (*AutoDetectSourceLanguageConfig, error) {
+ var handle C.SPXHANDLE
+ ret := uintptr(C.create_auto_detect_source_lang_config_from_open_range(&handle))
+ if ret != C.SPX_NOERROR {
+ return nil, common.NewCarbonError(ret)
+ }
+ return newAutoDetectSourceLanguageConfigFromHandle(handle)
+}
+
// NewAutoDetectSourceLanguageConfigFromLanguages creates an instance of the AutoDetectSourceLanguageConfig with source languages
func NewAutoDetectSourceLanguageConfigFromLanguages(languages []string) (*AutoDetectSourceLanguageConfig, error) {
var handle C.SPXHANDLE
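
A construction sketch mirroring TestSynthesisWithLanguageAutoDetection below; the constructor name NewSpeechSynthesizerFomAutoDetectSourceLangConfig is spelled as it appears in the test, and the credentials are placeholders:

```go
package main

import (
	"fmt"

	"github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
	config, err := speech.NewSpeechConfigFromSubscription("YourSubscriptionKey", "YourServiceRegion")
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer config.Close()
	languageConfig, err := speech.NewAutoDetectSourceLanguageConfigFromOpenRange()
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer languageConfig.Close()
	// A nil audio config keeps the synthesized audio in the result instead of playing it.
	synthesizer, err := speech.NewSpeechSynthesizerFomAutoDetectSourceLangConfig(config, languageConfig, nil)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer synthesizer.Close()
	fmt.Println("Synthesizer with open-range language detection created.")
}
```
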
diff --git a/speech/speech_synthesizer_test.go b/speech/speech_synthesizer_test.go
index b9f8a01..d0d7315 100644
--- a/speech/speech_synthesizer_test.go
+++ b/speech/speech_synthesizer_test.go
@@ -226,7 +226,8 @@ func TestSynthesisToAudioDataStream(t *testing.T) {
t.Error("audio data is not equal.")
}
- saveOutcome := stream.SaveToWavFileAsync("tmp_synthesis.mp3")
+	// TODO: uncomment the following lines after 1.17 is released.
+ /*saveOutcome := stream.SaveToWavFileAsync("tmp_synthesis.mp3")
select {
case err = <-saveOutcome:
if err != nil {
@@ -242,7 +243,7 @@ func TestSynthesisToAudioDataStream(t *testing.T) {
file.Read(audioData4)
if !bytes.Equal(audioData2, audioData4) {
t.Error("audio data is not equal.")
- }
+ }*/
}
func TestSynthesisWithInvalidVoice(t *testing.T) {
@@ -315,13 +316,43 @@ func TestSynthesisToPullAudioOutputStream(t *testing.T) {
}
}
-// word boundary, viseme received, and bookmark reached
+// viseme received
+func TestSynthesizerVisemeEvents(t *testing.T) {
+ synthesizer := createSpeechSynthesizerFromAudioConfig(t, nil)
+ defer synthesizer.Close()
+ visemeReceivedFuture := make(chan string)
+ synthesizer.VisemeReceived(func(event SpeechSynthesisVisemeEventArgs) {
+ defer event.Close()
+ t.Logf("viseme received event, audio offset [%d], viseme ID [%d]", event.AudioOffset, event.VisemeID)
+ if event.AudioOffset <= 0 {
+ t.Error("viseme received audio offset")
+ }
+ select {
+ case visemeReceivedFuture <- "visemeReceivedFuture":
+ default:
+ }
+ })
+ resultFuture := synthesizer.SpeakSsmlAsync("yet")
+ select {
+ case <-visemeReceivedFuture:
+ case <-time.After(timeout):
+ t.Error("Timeout waiting for VisemeReceived event.")
+ }
+ select {
+ case result := <-resultFuture:
+ defer result.Close()
+ checkSynthesisResult(t, result.Result, common.SynthesizingAudioCompleted)
+ case <-time.After(timeout):
+ t.Error("Timeout waiting for synthesis result.")
+ }
+}
+
+// word boundary and bookmark reached
func TestSynthesizerEvents2(t *testing.T) {
synthesizer := createSpeechSynthesizerFromAudioConfig(t, nil)
defer synthesizer.Close()
wordBoundaryFuture := make(chan bool)
synthesizer.WordBoundary(func(event SpeechSynthesisWordBoundaryEventArgs) {
- t.Log("9999")
defer event.Close()
t.Logf("word boundary event, audio offset [%d], text offset [%d], word length [%d]", event.AudioOffset, event.TextOffset, event.WordLength)
if event.AudioOffset <= 0 {
@@ -338,22 +369,10 @@ func TestSynthesizerEvents2(t *testing.T) {
default:
}
})
- visemeReceivedFuture := make(chan string)
- synthesizer.VisemeReceived(func(event SpeechSynthesisVisemeEventArgs) {
- defer event.Close()
- t.Logf("viseme received event, audio offset [%d], viseme ID [%d]", event.AudioOffset, event.VisemeID)
- if event.AudioOffset <= 0 {
- t.Error("viseme received audio offset")
- }
- select {
- case visemeReceivedFuture <- "visemeReceivedFuture":
- default:
- }
- })
bookmarkReachedFuture := make(chan string)
synthesizer.BookmarkReached(func(event SpeechSynthesisBookmarkEventArgs) {
defer event.Close()
- t.Logf("Bookmark reached event, audio offset [%d], text [%sl", event.AudioOffset, event.Text)
+ t.Logf("Bookmark reached event, audio offset [%d], text [%s]", event.AudioOffset, event.Text)
if event.AudioOffset <= 0 {
t.Error("bookmark audio offset error")
}
@@ -362,12 +381,7 @@ func TestSynthesizerEvents2(t *testing.T) {
}
bookmarkReachedFuture <- "bookmarkReachedFuture"
})
- resultFuture := synthesizer.SpeakSsmlAsync("yet")
- select {
- case <-visemeReceivedFuture:
- case <-time.After(timeout):
- t.Error("Timeout waiting for VisemeReceived event.")
- }
+ resultFuture := synthesizer.SpeakSsmlAsync("hello")
select {
case <-bookmarkReachedFuture:
t.Logf("Received BookmarkReached event.")
@@ -406,7 +420,51 @@ func TestSynthesisGetAvailableVoices(t *testing.T) {
t.Error("voice name error")
}
}
+ jenny := outcome.Result.Voices[0]
+ if jenny.LocalName != "Jenny" {
+ t.Errorf("The first en-US voice [%s] is not Jenny.", jenny.LocalName)
+ }
+ if jenny.VoiceType != common.OnlineNeural {
+ t.Error("Jenny's voice type is incorrect.")
+ }
+ if len(jenny.StyleList) < 2 {
+ t.Error("Jenny's style list error.")
+ }
+ if jenny.Gender != common.Female {
+ t.Error("Jenny's gender error.")
+ }
case <-time.After(timeout):
t.Error("Timeout waiting for synthesis voices result.")
}
}
+
+func TestSynthesisWithLanguageAutoDetection(t *testing.T) {
+ config := createSpeechConfig(t)
+ defer config.Close()
+ languageConfig, err := NewAutoDetectSourceLanguageConfigFromOpenRange()
+ if err != nil {
+ t.Error("Got an error: ", err)
+ }
+ defer languageConfig.Close()
+ synthesizer, err := NewSpeechSynthesizerFomAutoDetectSourceLangConfig(config, languageConfig, nil)
+ if err != nil {
+ t.Error("Got an error: ", err)
+ }
+ if synthesizer == nil {
+ t.Error("synthesizer creation failed")
+ }
+ defer synthesizer.Close()
+ textResultFuture := synthesizer.SpeakTextAsync("你好,世界。")
+
+ var textResult SpeechSynthesisOutcome
+ select {
+ case textResult = <-textResultFuture:
+ defer textResult.Close()
+ checkSynthesisResult(t, textResult.Result, common.SynthesizingAudioCompleted)
+ if len(textResult.Result.AudioData) < 32000 {
+ t.Error("audio should longer than 1s.")
+ }
+ case <-time.After(timeout):
+ t.Error("Timeout waiting for synthesis result.")
+ }
+}
diff --git a/speech/voice_info.go b/speech/voice_info.go
index 44d07f4..a93c87c 100644
--- a/speech/voice_info.go
+++ b/speech/voice_info.go
@@ -33,6 +33,9 @@ type VoiceInfo struct {
// LocalName specifies the local name of the voice
LocalName string
+ // Gender specifies the gender of the voice.
+ Gender common.SynthesisVoiceGender
+
// VoiceType specifies the voice type.
VoiceType common.SynthesisVoiceType
@@ -93,5 +96,13 @@ func NewVoiceInfoFromHandle(handle common.SPXHandle) (*VoiceInfo, error) {
return nil, common.NewCarbonError(ret)
}
voiceInfo.Properties = common.NewPropertyCollectionFromHandle(handle2uintptr(propBagHandle))
+ gender := voiceInfo.Properties.GetPropertyByString("Gender", "")
+ if gender == "Female" {
+ voiceInfo.Gender = common.Female
+ } else if gender == "Male" {
+ voiceInfo.Gender = common.Male
+ } else {
+ voiceInfo.Gender = common.GenderUnknown
+ }
return voiceInfo, nil
}
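
With the Gender field populated, voice listings can be filtered by gender. A sketch assuming GetVoicesAsync(locale) and a voices outcome following the same Error/Close pattern as SpeechSynthesisOutcome; credentials are placeholders:

```go
package main

import (
	"fmt"

	"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
	"github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

func main() {
	config, err := speech.NewSpeechConfigFromSubscription("YourSubscriptionKey", "YourServiceRegion")
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer config.Close()
	synthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, nil)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer synthesizer.Close()
	outcome := <-synthesizer.GetVoicesAsync("en-US")
	defer outcome.Close()
	if outcome.Error != nil {
		fmt.Println("Got an error: ", outcome.Error)
		return
	}
	// Print only the female voices for the locale.
	for _, voice := range outcome.Result.Voices {
		if voice.Gender == common.Female {
			fmt.Printf("Female voice: %s\n", voice.LocalName)
		}
	}
}
```
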