Skip to content

Commit

Permalink
TTS implementation: part 4 (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
yulin-li committed Apr 28, 2021
1 parent 44d2f09 commit a8d94ac
Show file tree
Hide file tree
Showing 13 changed files with 413 additions and 33 deletions.
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"golint",
"govet",
"unstaged",
"untracked"
"untracked",
"webm"
]
}
13 changes: 13 additions & 0 deletions audio/audio_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,19 @@ func NewAudioConfigFromDefaultSpeakerOutput() (*AudioConfig, error) {
return newAudioConfigFromHandle(handle)
}

// NewAudioConfigFromSpeakerOutput creates an AudioConfig object representing the specific audio output device
// (speaker) on the system.
func NewAudioConfigFromSpeakerOutput(deviceName string) (*AudioConfig, error) {
	device := C.CString(deviceName)
	defer C.free(unsafe.Pointer(device))
	var handle C.SPXHANDLE
	// Ask the native layer to bind the output to the named speaker device.
	if ret := uintptr(C.audio_config_create_audio_output_from_a_speaker(&handle, device)); ret != C.SPX_NOERROR {
		return nil, common.NewCarbonError(ret)
	}
	return newAudioConfigFromHandle(handle)
}

// NewAudioConfigFromWavFileOutput creates an AudioConfig object representing the specified file for audio output.
func NewAudioConfigFromWavFileOutput(filename string) (*AudioConfig, error) {
var handle C.SPXHANDLE
Expand Down
8 changes: 6 additions & 2 deletions common/cancellation_reason.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@ package common
// CancellationReason defines the possible reasons a result might be canceled.
type CancellationReason int

const (
	// Error indicates that an error occurred during speech recognition.
	Error CancellationReason = 1

	// EndOfStream indicates that the end of the audio stream was reached.
	EndOfStream CancellationReason = 2

	// CancelledByUser indicates that the request was cancelled by the user.
	// Added in version 1.17.0
	CancelledByUser CancellationReason = 3
)
41 changes: 40 additions & 1 deletion common/property_id.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ const (
// SpeechSessionID is the session id. This id is a universally unique identifier (aka UUID) representing a specific
// binding of an audio input stream and the underlying speech recognition instance to which it is bound. Under normal
// circumstances, you shouldn't have to use this property directly.
/// Instead use SessionEventArgs.SessionId.
// Instead use SessionEventArgs.SessionId.
SpeechSessionID PropertyID = 3002

// SpeechServiceConnectionUserDefinedQueryParameters are the query parameters provided by users. They will be passed
Expand All @@ -118,6 +118,14 @@ const (
// SpeechServiceConnectionSynthOutputFormat is the string to specify TTS output audio format.
SpeechServiceConnectionSynthOutputFormat PropertyID = 3102

// SpeechServiceConnectionSynthEnableCompressedAudioTransmission indicates whether to use a compressed
// audio format for speech synthesis audio transmission.
// This property only takes effect when SpeechServiceConnectionSynthOutputFormat is set to a pcm format.
// If this property is not set and GStreamer is available, the SDK will use a compressed format for synthesized
// audio transmission and decode it. You can set this property to "false" to use the raw pcm format for transmission on the wire.
// Added in version 1.17.0
SpeechServiceConnectionSynthEnableCompressedAudioTransmission PropertyID = 3103

// SpeechServiceConnectionInitialSilenceTimeoutMs is the initial silence timeout value (in milliseconds) used by the
// service.
SpeechServiceConnectionInitialSilenceTimeoutMs PropertyID = 3200
Expand Down Expand Up @@ -186,6 +194,29 @@ const (
// the speech service.
SpeechServiceResponseRecognitionLatencyMs PropertyID = 5002

// SpeechServiceResponseSynthesisFirstByteLatencyMs is the speech synthesis first byte latency in milliseconds.
// Read-only, available on final speech synthesis results.
// This measures the latency between when the synthesis starts being processed and the moment the first byte of audio is available.
// Added in version 1.17.0.
SpeechServiceResponseSynthesisFirstByteLatencyMs PropertyID = 5010

// SpeechServiceResponseSynthesisFinishLatencyMs is the speech synthesis all bytes latency in milliseconds.
// Read-only, available on final speech synthesis results.
// This measures the latency between when the synthesis is started to be processed, and the moment the whole audio is synthesized.
// Added in version 1.17.0.
SpeechServiceResponseSynthesisFinishLatencyMs PropertyID = 5011

// SpeechServiceResponseSynthesisUnderrunTimeMs is the underrun time for speech synthesis in milliseconds.
// Read-only, available on results in SynthesisCompleted events.
// This measures the total underrun time from when the playback buffer (of length AudioConfigPlaybackBufferLengthInMs) is first filled until synthesis completes.
// Added in version 1.17.0.
SpeechServiceResponseSynthesisUnderrunTimeMs PropertyID = 5012

// SpeechServiceResponseSynthesisBackend indicates which backend the synthesis is finished by.
// Read-only, available on speech synthesis results, except for the result in SynthesisStarted event
// Added in version 1.17.0.
SpeechServiceResponseSynthesisBackend PropertyID = 5020

// CancellationDetailsReason is the cancellation reason. Currently unused.
CancellationDetailsReason PropertyID = 6000

Expand Down Expand Up @@ -216,6 +247,14 @@ const (
// AudioConfigAudioSource is the audio source. Allowed values are "Microphones", "File", and "Stream".
AudioConfigAudioSource PropertyID = 8004

// AudioConfigDeviceNameForRender indicates the device name for audio render. Under normal circumstances,
// you shouldn't have to use this property directly. Instead, use NewAudioConfigFromDefaultSpeakerOutput.
// Added in version 1.17.0
AudioConfigDeviceNameForRender PropertyID = 8005

// AudioConfigPlaybackBufferLengthInMs indicates the playback buffer length in milliseconds, default is 50 milliseconds.
AudioConfigPlaybackBufferLengthInMs PropertyID = 8006

// SpeechLogFilename is the file name to write logs.
SpeechLogFilename PropertyID = 9001

Expand Down
38 changes: 38 additions & 0 deletions common/speech_synthesis_output_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ const (
Raw8Khz8BitMonoMULaw SpeechSynthesisOutputFormat = 1

// Riff16Khz16KbpsMonoSiren stands for riff-16khz-16kbps-mono-siren
// Unsupported by the service. Do not use this value.
Riff16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 2

// Audio16Khz16KbpsMonoSiren stands for audio-16khz-16kbps-mono-siren
// Unsupported by the service. Do not use this value.
Audio16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 3

// Audio16Khz32KBitRateMonoMp3 stands for audio-16khz-32kbitrate-mono-mp3
Expand Down Expand Up @@ -57,4 +59,40 @@ const (

// Raw8Khz16BitMonoPcm stands for raw-8khz-16bit-mono-pcm
Raw8Khz16BitMonoPcm SpeechSynthesisOutputFormat = 17

// Ogg16Khz16BitMonoOpus stands for ogg-16khz-16bit-mono-opus
Ogg16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 18

// Ogg24Khz16BitMonoOpus stands for ogg-24khz-16bit-mono-opus
Ogg24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 19

// Raw48Khz16BitMonoPcm stands for raw-48khz-16bit-mono-pcm
Raw48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 20

// Riff48Khz16BitMonoPcm stands for riff-48khz-16bit-mono-pcm
Riff48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 21

// Audio48Khz96KBitRateMonoMp3 stands for audio-48khz-96kbitrate-mono-mp3
Audio48Khz96KBitRateMonoMp3 SpeechSynthesisOutputFormat = 22

// Audio48Khz192KBitRateMonoMp3 stands for audio-48khz-192kbitrate-mono-mp3
Audio48Khz192KBitRateMonoMp3 SpeechSynthesisOutputFormat = 23

// Ogg48Khz16BitMonoOpus stands for ogg-48khz-16bit-mono-opus
Ogg48Khz16BitMonoOpus SpeechSynthesisOutputFormat = 24

// Webm16Khz16BitMonoOpus stands for webm-16khz-16bit-mono-opus
Webm16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 25

// Webm24Khz16BitMonoOpus stands for webm-24khz-16bit-mono-opus
Webm24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 26

// Raw24Khz16BitMonoTrueSilk stands for raw-24khz-16bit-mono-truesilk
Raw24Khz16BitMonoTrueSilk SpeechSynthesisOutputFormat = 27

// Raw8Khz8BitMonoALaw stands for raw-8khz-8bit-mono-alaw
Raw8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 28

// Riff8Khz8BitMonoALaw stands for riff-8khz-8bit-mono-alaw
Riff8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 29
)
18 changes: 18 additions & 0 deletions common/synthesis_voice_gender.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

package common

// SynthesisVoiceGender defines the gender of a synthesis voice.
type SynthesisVoiceGender int

// Gender values, in the order reported by the service (unknown, female, male).
const (
	// GenderUnknown means the gender is unknown.
	GenderUnknown SynthesisVoiceGender = iota

	// Female indicates female.
	Female

	// Male indicates male.
	Male
)
15 changes: 9 additions & 6 deletions samples/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/dialog_service_connector"
"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/recognizer"
"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/synthesizer"
)

type functionMap = map[string]func(string, string, string)
Expand All @@ -27,12 +28,14 @@ func printHelp(executableName string, samples functionMap) {

func main() {
samples := functionMap{
"speech_recognizer:RecognizeOnceFromWavFile": recognizer.RecognizeOnceFromWavFile,
"speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
"speech_recognizer:ContinuousFromMicrophone": recognizer.ContinuousFromMicrophone,
"dialog_service_connector:ListenOnce": dialog_service_connector.ListenOnce,
"dialog_service_connector:KWS": dialog_service_connector.KWS,
"dialog_service_connector:ListenOnceFromStream": dialog_service_connector.ListenOnceFromStream,
"speech_recognizer:RecognizeOnceFromWavFile": recognizer.RecognizeOnceFromWavFile,
"speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
"speech_recognizer:ContinuousFromMicrophone": recognizer.ContinuousFromMicrophone,
"dialog_service_connector:ListenOnce": dialog_service_connector.ListenOnce,
"dialog_service_connector:KWS": dialog_service_connector.KWS,
"dialog_service_connector:ListenOnceFromStream": dialog_service_connector.ListenOnceFromStream,
"speech_synthesizer:SynthesisToSpeaker": synthesizer.SynthesisToSpeaker,
"speech_synthesizer:SynthesisToAudioDataStream": synthesizer.SynthesisToAudioDataStream,
}
args := os.Args[1:]
if len(args) != 4 {
Expand Down
5 changes: 5 additions & 0 deletions samples/synthesizer/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

// Package synthesizer provides samples of text-to-speech synthesis.
package synthesizer
82 changes: 82 additions & 0 deletions samples/synthesizer/to_audio_data_stream.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

package synthesizer

import (
"bufio"
"fmt"
"io"
"os"
"strings"
"time"

"github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

// SynthesisToAudioDataStream reads lines of text from stdin, synthesizes each one,
// and drains the resulting audio through an AudioDataStream, reporting how many
// bytes were received. An empty line exits the loop. The file parameter is unused;
// it is kept so the sample signatures stay uniform.
func SynthesisToAudioDataStream(subscription string, region string, file string) {
	config, err := speech.NewSpeechConfigFromSubscription(subscription, region)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer config.Close()
	speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, nil)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer speechSynthesizer.Close()

	speechSynthesizer.SynthesisStarted(synthesizeStartedHandler)
	speechSynthesizer.Synthesizing(synthesizingHandler)
	speechSynthesizer.SynthesisCompleted(synthesizedHandler)
	speechSynthesizer.SynthesisCanceled(cancelledHandler)

	for {
		fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ")
		text, _ := bufio.NewReader(os.Stdin).ReadString('\n')
		text = strings.TrimSuffix(text, "\n")
		if len(text) == 0 {
			break
		}

		// Each synthesis runs in a helper so its defers (outcome/stream Close)
		// fire at the end of the iteration, not at function exit — a defer
		// inside this loop would hold every result open until return.
		if !synthesizeTextToStream(speechSynthesizer, text) {
			return
		}
	}
}

// synthesizeTextToStream synthesizes a single line of text and streams the audio
// out via an AudioDataStream. It returns false when the caller should stop looping
// (timeout or error).
func synthesizeTextToStream(speechSynthesizer *speech.SpeechSynthesizer, text string) bool {
	// StartSpeakingTextAsync sends the result to the channel when the synthesis starts.
	task := speechSynthesizer.StartSpeakingTextAsync(text)
	var outcome speech.SpeechSynthesisOutcome
	select {
	case outcome = <-task:
	case <-time.After(60 * time.Second):
		fmt.Println("Timed out")
		return false
	}
	defer outcome.Close()
	if outcome.Error != nil {
		fmt.Println("Got an error: ", outcome.Error)
		return false
	}

	// In most cases we want to receive the audio as a stream to lower the latency;
	// AudioDataStream supports that.
	stream, err := speech.NewAudioDataStreamFromSpeechSynthesisResult(outcome.Result)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return false
	}
	// Close only after the error check: on failure the stream may be nil.
	defer stream.Close()

	var allAudio []byte
	chunk := make([]byte, 2048)
	for {
		n, err := stream.Read(chunk)
		// Consume the data before inspecting err: a reader may return n > 0
		// together with io.EOF, and that tail must not be dropped.
		if n > 0 {
			allAudio = append(allAudio, chunk[:n]...)
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			// Surface real read errors instead of silently looping on them.
			fmt.Println("Got an error reading the stream: ", err)
			return false
		}
	}

	fmt.Printf("Read [%d] bytes from audio data stream.\n", len(allAudio))
	return true
}
Loading

0 comments on commit a8d94ac

Please sign in to comment.