Skip to content

Commit

Permalink
TTS implementation: part 4 (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
yulin-li committed Apr 28, 2021
1 parent 44d2f09 commit a8d94ac
Show file tree
Hide file tree
Showing 13 changed files with 413 additions and 33 deletions.
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"golint",
"govet",
"unstaged",
"untracked"
"untracked",
"webm"
]
}
13 changes: 13 additions & 0 deletions audio/audio_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,19 @@ func NewAudioConfigFromDefaultSpeakerOutput() (*AudioConfig, error) {
return newAudioConfigFromHandle(handle)
}

// NewAudioConfigFromSpeakerOutput creates an AudioConfig object representing the specific audio output device
// (speaker) on the system.
func NewAudioConfigFromSpeakerOutput(deviceName string) (*AudioConfig, error) {
	device := C.CString(deviceName)
	defer C.free(unsafe.Pointer(device))
	var handle C.SPXHANDLE
	// Ask the native layer to bind the output to the named speaker device.
	if ret := uintptr(C.audio_config_create_audio_output_from_a_speaker(&handle, device)); ret != C.SPX_NOERROR {
		return nil, common.NewCarbonError(ret)
	}
	return newAudioConfigFromHandle(handle)
}

// NewAudioConfigFromWavFileOutput creates an AudioConfig object representing the specified file for audio output.
func NewAudioConfigFromWavFileOutput(filename string) (*AudioConfig, error) {
var handle C.SPXHANDLE
Expand Down
8 changes: 6 additions & 2 deletions common/cancellation_reason.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@ package common
// CancellationReason defines the possible reasons a result might be canceled.
type CancellationReason int

const (
	// Error indicates that an error occurred during speech recognition.
	Error CancellationReason = 1

	// EndOfStream indicates that the end of the audio stream was reached.
	EndOfStream CancellationReason = 2

	// CancelledByUser indicates that the request was cancelled by the user.
	// Added in version 1.17.0
	CancelledByUser CancellationReason = 3
)
41 changes: 40 additions & 1 deletion common/property_id.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ const (
// SpeechSessionID is the session id. This id is a universally unique identifier (aka UUID) representing a specific
// binding of an audio input stream and the underlying speech recognition instance to which it is bound. Under normal
// circumstances, you shouldn't have to use this property directly.
/// Instead use SessionEventArgs.SessionId.
// Instead use SessionEventArgs.SessionId.
SpeechSessionID PropertyID = 3002

// SpeechServiceConnectionUserDefinedQueryParameters are the query parameters provided by users. They will be passed
Expand All @@ -118,6 +118,14 @@ const (
// SpeechServiceConnectionSynthOutputFormat is the string to specify TTS output audio format.
SpeechServiceConnectionSynthOutputFormat PropertyID = 3102

// SpeechServiceConnectionSynthEnableCompressedAudioTransmission indicates whether to use a compressed
// audio format for speech synthesis audio transmission.
// This property only takes effect when SpeechServiceConnectionSynthOutputFormat is set to a pcm format.
// If this property is not set and GStreamer is available, the SDK will use a compressed format for synthesized
// audio transmission and decode it. You can set this property to "false" to use the raw pcm format for transmission on the wire.
// Added in version 1.17.0
SpeechServiceConnectionSynthEnableCompressedAudioTransmission PropertyID = 3103

// SpeechServiceConnectionInitialSilenceTimeoutMs is the initial silence timeout value (in milliseconds) used by the
// service.
SpeechServiceConnectionInitialSilenceTimeoutMs PropertyID = 3200
Expand Down Expand Up @@ -186,6 +194,29 @@ const (
// the speech service.
SpeechServiceResponseRecognitionLatencyMs PropertyID = 5002

// SpeechServiceResponseSynthesisFirstByteLatencyMs is the speech synthesis first byte latency in milliseconds.
// Read-only, available on final speech synthesis results.
// This measures the latency between when the synthesis starts being processed and the moment the first byte of audio is available.
// Added in version 1.17.0.
SpeechServiceResponseSynthesisFirstByteLatencyMs PropertyID = 5010

// SpeechServiceResponseSynthesisFinishLatencyMs is the speech synthesis all bytes latency in milliseconds.
// Read-only, available on final speech synthesis results.
// This measures the latency between when the synthesis is started to be processed, and the moment the whole audio is synthesized.
// Added in version 1.17.0.
SpeechServiceResponseSynthesisFinishLatencyMs PropertyID = 5011

// SpeechServiceResponseSynthesisUnderrunTimeMs is the underrun time for speech synthesis in milliseconds.
// Read-only, available on results in SynthesisCompleted events.
// This measures the total underrun time from when the playback buffer (of length AudioConfigPlaybackBufferLengthInMs) is first filled until synthesis completes.
// Added in version 1.17.0.
SpeechServiceResponseSynthesisUnderrunTimeMs PropertyID = 5012

// SpeechServiceResponseSynthesisBackend indicates which backend the synthesis is finished by.
// Read-only, available on speech synthesis results, except for the result in SynthesisStarted event
// Added in version 1.17.0.
SpeechServiceResponseSynthesisBackend PropertyID = 5020

// CancellationDetailsReason is the cancellation reason. Currently unused.
CancellationDetailsReason PropertyID = 6000

Expand Down Expand Up @@ -216,6 +247,14 @@ const (
// AudioConfigAudioSource is the audio source. Allowed values are "Microphones", "File", and "Stream".
AudioConfigAudioSource PropertyID = 8004

// AudioConfigDeviceNameForRender indicates the device name for audio render. Under normal circumstances,
// you shouldn't have to use this property directly. Instead, use NewAudioConfigFromDefaultSpeakerOutput.
// Added in version 1.17.0
AudioConfigDeviceNameForRender PropertyID = 8005

// AudioConfigPlaybackBufferLengthInMs indicates the playback buffer length in milliseconds, default is 50 milliseconds.
AudioConfigPlaybackBufferLengthInMs PropertyID = 8006

// SpeechLogFilename is the file name to write logs.
SpeechLogFilename PropertyID = 9001

Expand Down
38 changes: 38 additions & 0 deletions common/speech_synthesis_output_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ const (
Raw8Khz8BitMonoMULaw SpeechSynthesisOutputFormat = 1

// Riff16Khz16KbpsMonoSiren stands for riff-16khz-16kbps-mono-siren
// Unsupported by the service. Do not use this value.
Riff16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 2

// Audio16Khz16KbpsMonoSiren stands for audio-16khz-16kbps-mono-siren
// Unsupported by the service. Do not use this value.
Audio16Khz16KbpsMonoSiren SpeechSynthesisOutputFormat = 3

// Audio16Khz32KBitRateMonoMp3 stands for audio-16khz-32kbitrate-mono-mp3
Expand Down Expand Up @@ -57,4 +59,40 @@ const (

// Raw8Khz16BitMonoPcm stands for raw-8khz-16bit-mono-pcm
Raw8Khz16BitMonoPcm SpeechSynthesisOutputFormat = 17

// Ogg16Khz16BitMonoOpus stands for ogg-16khz-16bit-mono-opus
Ogg16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 18

// Ogg24Khz16BitMonoOpus stands for ogg-24khz-16bit-mono-opus
Ogg24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 19

// Raw48Khz16BitMonoPcm stands for raw-48khz-16bit-mono-pcm
Raw48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 20

// Riff48Khz16BitMonoPcm stands for riff-48khz-16bit-mono-pcm
Riff48Khz16BitMonoPcm SpeechSynthesisOutputFormat = 21

// Audio48Khz96KBitRateMonoMp3 stands for audio-48khz-96kbitrate-mono-mp3
Audio48Khz96KBitRateMonoMp3 SpeechSynthesisOutputFormat = 22

// Audio48Khz192KBitRateMonoMp3 stands for audio-48khz-192kbitrate-mono-mp3
Audio48Khz192KBitRateMonoMp3 SpeechSynthesisOutputFormat = 23

// Ogg48Khz16BitMonoOpus stands for ogg-48khz-16bit-mono-opus
Ogg48Khz16BitMonoOpus SpeechSynthesisOutputFormat = 24

// Webm16Khz16BitMonoOpus stands for webm-16khz-16bit-mono-opus
Webm16Khz16BitMonoOpus SpeechSynthesisOutputFormat = 25

// Webm24Khz16BitMonoOpus stands for webm-24khz-16bit-mono-opus
Webm24Khz16BitMonoOpus SpeechSynthesisOutputFormat = 26

// Raw24Khz16BitMonoTrueSilk stands for raw-24khz-16bit-mono-truesilk
Raw24Khz16BitMonoTrueSilk SpeechSynthesisOutputFormat = 27

// Raw8Khz8BitMonoALaw stands for raw-8khz-8bit-mono-alaw
Raw8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 28

// Riff8Khz8BitMonoALaw stands for riff-8khz-8bit-mono-alaw
Riff8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 29
)
18 changes: 18 additions & 0 deletions common/synthesis_voice_gender.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

package common

// SynthesisVoiceGender defines the gender of a synthesis voice.
type SynthesisVoiceGender int

// Gender values, in the order reported by the service (unknown, female, male).
const (
	// GenderUnknown means the gender is unknown.
	GenderUnknown SynthesisVoiceGender = iota

	// Female indicates female.
	Female

	// Male indicates male.
	Male
)
15 changes: 9 additions & 6 deletions samples/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/dialog_service_connector"
"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/recognizer"
"github.com/Microsoft/cognitive-services-speech-sdk-go/samples/synthesizer"
)

type functionMap = map[string]func(string, string, string)
Expand All @@ -27,12 +28,14 @@ func printHelp(executableName string, samples functionMap) {

func main() {
samples := functionMap{
"speech_recognizer:RecognizeOnceFromWavFile": recognizer.RecognizeOnceFromWavFile,
"speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
"speech_recognizer:ContinuousFromMicrophone": recognizer.ContinuousFromMicrophone,
"dialog_service_connector:ListenOnce": dialog_service_connector.ListenOnce,
"dialog_service_connector:KWS": dialog_service_connector.KWS,
"dialog_service_connector:ListenOnceFromStream": dialog_service_connector.ListenOnceFromStream,
"speech_recognizer:RecognizeOnceFromWavFile": recognizer.RecognizeOnceFromWavFile,
"speech_recognizer:RecognizeOnceFromCompressedFile": recognizer.RecognizeOnceFromCompressedFile,
"speech_recognizer:ContinuousFromMicrophone": recognizer.ContinuousFromMicrophone,
"dialog_service_connector:ListenOnce": dialog_service_connector.ListenOnce,
"dialog_service_connector:KWS": dialog_service_connector.KWS,
"dialog_service_connector:ListenOnceFromStream": dialog_service_connector.ListenOnceFromStream,
"speech_synthesizer:SynthesisToSpeaker": synthesizer.SynthesisToSpeaker,
"speech_synthesizer:SynthesisToAudioDataStream": synthesizer.SynthesisToAudioDataStream,
}
args := os.Args[1:]
if len(args) != 4 {
Expand Down
5 changes: 5 additions & 0 deletions samples/synthesizer/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

// Package synthesizer provides samples of text-to-speech synthesis.
package synthesizer
82 changes: 82 additions & 0 deletions samples/synthesizer/to_audio_data_stream.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

package synthesizer

import (
"bufio"
"fmt"
"io"
"os"
"strings"
"time"

"github.com/Microsoft/cognitive-services-speech-sdk-go/speech"
)

// SynthesisToAudioDataStream reads lines of text from stdin, synthesizes each one,
// and drains the resulting audio through an AudioDataStream, reporting how many
// bytes were received. An empty line exits the loop. The file parameter is unused;
// it is kept so the sample signatures stay uniform.
func SynthesisToAudioDataStream(subscription string, region string, file string) {
	config, err := speech.NewSpeechConfigFromSubscription(subscription, region)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer config.Close()
	speechSynthesizer, err := speech.NewSpeechSynthesizerFromConfig(config, nil)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return
	}
	defer speechSynthesizer.Close()

	speechSynthesizer.SynthesisStarted(synthesizeStartedHandler)
	speechSynthesizer.Synthesizing(synthesizingHandler)
	speechSynthesizer.SynthesisCompleted(synthesizedHandler)
	speechSynthesizer.SynthesisCanceled(cancelledHandler)

	for {
		fmt.Printf("Enter some text that you want to speak, or enter empty text to exit.\n> ")
		text, _ := bufio.NewReader(os.Stdin).ReadString('\n')
		text = strings.TrimSuffix(text, "\n")
		if len(text) == 0 {
			break
		}

		// Each synthesis runs in a helper so its defers (outcome/stream Close)
		// fire at the end of the iteration, not at function exit — a defer
		// inside this loop would hold every result open until return.
		if !synthesizeTextToStream(speechSynthesizer, text) {
			return
		}
	}
}

// synthesizeTextToStream synthesizes a single line of text and streams the audio
// out via an AudioDataStream. It returns false when the caller should stop looping
// (timeout or error).
func synthesizeTextToStream(speechSynthesizer *speech.SpeechSynthesizer, text string) bool {
	// StartSpeakingTextAsync sends the result to the channel when the synthesis starts.
	task := speechSynthesizer.StartSpeakingTextAsync(text)
	var outcome speech.SpeechSynthesisOutcome
	select {
	case outcome = <-task:
	case <-time.After(60 * time.Second):
		fmt.Println("Timed out")
		return false
	}
	defer outcome.Close()
	if outcome.Error != nil {
		fmt.Println("Got an error: ", outcome.Error)
		return false
	}

	// In most cases we want to receive the audio as a stream to lower the latency;
	// AudioDataStream supports that.
	stream, err := speech.NewAudioDataStreamFromSpeechSynthesisResult(outcome.Result)
	if err != nil {
		fmt.Println("Got an error: ", err)
		return false
	}
	// Close only after the error check: on failure the stream may be nil.
	defer stream.Close()

	var allAudio []byte
	chunk := make([]byte, 2048)
	for {
		n, err := stream.Read(chunk)
		// Consume the data before inspecting err: a reader may return n > 0
		// together with io.EOF, and that tail must not be dropped.
		if n > 0 {
			allAudio = append(allAudio, chunk[:n]...)
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			// Surface real read errors instead of silently looping on them.
			fmt.Println("Got an error reading the stream: ", err)
			return false
		}
	}

	fmt.Printf("Read [%d] bytes from audio data stream.\n", len(allAudio))
	return true
}
Loading

0 comments on commit a8d94ac

Please sign in to comment.