Add duration property of speech synthesis result, update word boundary

microsoft · Mar 3, 2022 · 72701e8 · 72701e8
1 parent 31e045e
commit 72701e8
Show file tree

Hide file tree

Showing 8 changed files with 112 additions and 13 deletions.
diff --git a/common/property_id.go b/common/property_id.go
@@ -178,6 +178,20 @@ const (
 	// partial results by omitting words in the end.
 	SpeechServiceResponseTranslationRequestStablePartialResult PropertyID = 4100
 
+	// SpeechServiceResponseRequestWordBoundary is a boolean value specifying whether to request WordBoundary events.
+	// Added in version 1.21.0.
+	SpeechServiceResponseRequestWordBoundary PropertyID = 4200
+
+	// SpeechServiceResponseRequestPunctuationBoundary is a boolean value specifying whether to request punctuation boundary
+	// in WordBoundary Events. Default is true.
+	// Added in version 1.21.0.
+	SpeechServiceResponseRequestPunctuationBoundary PropertyID = 4201
+
+	// SpeechServiceResponseRequestSentenceBoundary is a boolean value specifying whether to request sentence boundary
+	// in WordBoundary Events. Default is false.
+	// Added in version 1.21.0.
+	SpeechServiceResponseRequestSentenceBoundary PropertyID = 4202
+
 	// SpeechServiceResponseJSONResult is the Cognitive Services Speech Service response output (in JSON format). This
 	// property is available on recognition result objects only.
 	SpeechServiceResponseJSONResult PropertyID = 5000

diff --git a/common/speech_synthesis_boundary_type.go b/common/speech_synthesis_boundary_type.go
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+
+package common
+
+// SpeechSynthesisBoundaryType defines the boundary type of speech synthesis boundary event.
+type SpeechSynthesisBoundaryType int
+
+const (
+	// WordBoundary indicates word boundary.
+	WordBoundary SpeechSynthesisBoundaryType = 1
+
+	// PunctuationBoundary indicates punctuation boundary.
+	PunctuationBoundary SpeechSynthesisBoundaryType = 2
+
+	// SentenceBoundary indicates sentence boundary.
+	SentenceBoundary SpeechSynthesisBoundaryType = 3
+)
diff --git a/common/speech_synthesis_output_format.go b/common/speech_synthesis_output_format.go
@@ -95,4 +95,20 @@ const (
 
 	// Riff8Khz8BitMonoALaw stands for riff-8khz-8bit-mono-alaw
 	Riff8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 29
+
+	// Webm24Khz16Bit24KbpsMonoOpus stands for webm-24khz-16bit-24kbps-mono-opus
+	// Audio compressed by OPUS codec in a WebM container, with bitrate of 24kbps, optimized for IoT scenario.
+	Webm24Khz16Bit24KbpsMonoOpus SpeechSynthesisOutputFormat = 30
+
+	// Audio16Khz16Bit32KbpsMonoOpus stands for audio-16khz-16bit-32kbps-mono-opus
+	// Audio compressed by OPUS codec without container, with bitrate of 32kbps.
+	Audio16Khz16Bit32KbpsMonoOpus SpeechSynthesisOutputFormat = 31
+
+	// Audio24Khz16Bit48KbpsMonoOpus stands for audio-24khz-16bit-48kbps-mono-opus
+	// Audio compressed by OPUS codec without container, with bitrate of 48kbps.
+	Audio24Khz16Bit48KbpsMonoOpus SpeechSynthesisOutputFormat = 32
+
+	// Audio24Khz16Bit24KbpsMonoOpus stands for audio-24khz-16bit-24kbps-mono-opus
+	// Audio compressed by OPUS codec without container, with bitrate of 24kbps.
+	Audio24Khz16Bit24KbpsMonoOpus SpeechSynthesisOutputFormat = 33
 )
diff --git a/speech/speech_synthesis_bookmark_event_args.go b/speech/speech_synthesis_bookmark_event_args.go
@@ -14,9 +14,13 @@ import "C"
 
 // SpeechSynthesisBookmarkEventArgs represents the speech synthesis bookmark event arguments.
 type SpeechSynthesisBookmarkEventArgs struct {
-	handle      C.SPXHANDLE
+	handle C.SPXHANDLE
+
+	// AudioOffset is the audio offset of the bookmark event, in ticks (100 nanoseconds).
 	AudioOffset uint64
-	Text        string
+
+	// Text is the text of the bookmark.
+	Text string
 }
 
 // Close releases the underlying resources
@@ -36,7 +40,7 @@ func NewSpeechSynthesisBookmarkEventArgsFromHandle(handle common.SPXHandle) (*Sp
 	}
 	event.AudioOffset = uint64(cAudioOffset)
 	/* Text */
-	value := C.synthesizer_bookmark_event_get_text(event.handle)
+	value := C.synthesizer_event_get_text(event.handle)
 	event.Text = C.GoString(value)
 	C.property_bag_free_string(value)
 	return event, nil

diff --git a/speech/speech_synthesis_result.go b/speech/speech_synthesis_result.go
@@ -4,6 +4,7 @@
 package speech
 
 import (
+	"time"
 	"unsafe"
 
 	"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
@@ -30,6 +31,9 @@ type SpeechSynthesisResult struct {
 	// AudioData presents the synthesized audio.
 	AudioData []byte
 
+	// AudioDuration presents the time duration of synthesized audio.
+	AudioDuration time.Duration
+
 	// Collection of additional synthesisResult properties.
 	Properties *common.PropertyCollection
 }
@@ -45,12 +49,14 @@ func NewSpeechSynthesisResultFromHandle(handle common.SPXHandle) (*SpeechSynthes
 
 	result := new(SpeechSynthesisResult)
 	result.handle = uintptr2handle(handle)
-	/* AudioData length */
+	/* AudioData length and duration */
 	var cAudioLength C.uint32_t
-	ret := uintptr(C.synth_result_get_audio_length(result.handle, &cAudioLength))
+	var cAudioDuration C.uint64_t
+	ret := uintptr(C.synth_result_get_audio_length_duration(result.handle, &cAudioLength, &cAudioDuration))
 	if ret != C.SPX_NOERROR {
 		return nil, common.NewCarbonError(ret)
 	}
+	result.AudioDuration = time.Duration(cAudioDuration*100) * time.Nanosecond
 	// using max(1024, cAudioLength) as buffer size
 	if cAudioLength < 1024 {
 		cAudioLength = 1024

diff --git a/speech/speech_synthesis_viseme_event_args.go b/speech/speech_synthesis_viseme_event_args.go
@@ -14,10 +14,16 @@ import "C"
 
 // SpeechSynthesisVisemeEventArgs represents the speech synthesis viseme event arguments.
 type SpeechSynthesisVisemeEventArgs struct {
-	handle      C.SPXHANDLE
+	handle C.SPXHANDLE
+
+	// AudioOffset is the audio offset of the viseme event, in ticks (100 nanoseconds).
 	AudioOffset uint64
-	VisemeID    uint
-	Animation   string
+
+	// VisemeID is the viseme ID.
+	VisemeID uint
+
+	// Animation is the animation.
+	Animation string
 }
 
 // Close releases the underlying resources

diff --git a/speech/speech_synthesis_word_boundary_event_args.go b/speech/speech_synthesis_word_boundary_event_args.go
@@ -4,6 +4,8 @@
 package speech
 
 import (
+	"time"
+
 	"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
 )
 
@@ -14,10 +16,25 @@ import "C"
 
 // SpeechSynthesisWordBoundaryEventArgs represents the speech synthesis word boundary event arguments.
 type SpeechSynthesisWordBoundaryEventArgs struct {
-	handle      C.SPXHANDLE
+	handle C.SPXHANDLE
+
+	// AudioOffset is the audio offset of the word boundary event, in ticks (100 nanoseconds).
 	AudioOffset uint64
-	TextOffset  uint
-	WordLength  uint
+
+	// Duration is the duration of the word boundary event.
+	Duration time.Duration
+
+	// TextOffset is the text offset.
+	TextOffset uint
+
+	// WordLength is the length of the word.
+	WordLength uint
+
+	// Text is the text of the word boundary event.
+	Text string
+
+	// BoundaryType is the boundary type.
+	BoundaryType common.SpeechSynthesisBoundaryType
 }
 
 // Close releases the underlying resources
@@ -29,15 +46,22 @@ func (event SpeechSynthesisWordBoundaryEventArgs) Close() {
 func NewSpeechSynthesisWordBoundaryEventArgsFromHandle(handle common.SPXHandle) (*SpeechSynthesisWordBoundaryEventArgs, error) {
 	event := new(SpeechSynthesisWordBoundaryEventArgs)
 	event.handle = uintptr2handle(handle)
-	var cAudioOffset C.uint64_t
+	var cAudioOffset, cDuration C.uint64_t
 	var cTextOffset, cWordLength C.uint32_t
-	ret := uintptr(C.synthesizer_word_boundary_event_get_values(event.handle, &cAudioOffset, &cTextOffset, &cWordLength))
+	var cBoundaryType C.SpeechSynthesis_BoundaryType
+	ret := uintptr(C.synthesizer_word_boundary_event_get_values(event.handle, &cAudioOffset, &cDuration, &cTextOffset, &cWordLength, &cBoundaryType))
 	if ret != C.SPX_NOERROR {
 		return nil, common.NewCarbonError(ret)
 	}
 	event.AudioOffset = uint64(cAudioOffset)
+	event.Duration = time.Duration(cDuration*100) * time.Nanosecond
 	event.TextOffset = uint(cTextOffset)
 	event.WordLength = uint(cWordLength)
+	event.BoundaryType = (common.SpeechSynthesisBoundaryType)(cBoundaryType)
+	/* Text */
+	value := C.synthesizer_event_get_text(event.handle)
+	event.Text = C.GoString(value)
+	C.property_bag_free_string(value)
 	return event, nil
 }
 

diff --git a/speech/speech_synthesizer_test.go b/speech/speech_synthesizer_test.go
@@ -5,6 +5,7 @@ package speech
 
 import (
 	"bytes"
+	"math"
 	"os"
 	"strings"
 	"testing"
@@ -81,6 +82,10 @@ func checkBinaryEqual(t *testing.T, result1 *SpeechSynthesisResult, result2 *Spe
 	}
 }
 
+func almostEqual(expected, actual, threshold float64) bool {
+	return math.Abs(expected-actual) <= threshold
+}
+
 func TestSynthesizerEvents(t *testing.T) {
 	synthesizer := createSpeechSynthesizerFromAudioConfig(t, nil)
 	if synthesizer == nil {
@@ -110,6 +115,9 @@ func TestSynthesizerEvents(t *testing.T) {
 		defer event.Close()
 		t.Logf("SynthesisCompleted, audio length %d", len(event.Result.AudioData))
 		checkSynthesisResult(t, &event.Result, common.SynthesizingAudioCompleted)
+		if !almostEqual((float64)(event.Result.AudioDuration/time.Millisecond), (float64)(len(event.Result.AudioData)/32000), 100) {
+			t.Errorf("Synthesis duration incorrect")
+		}
 		synthesisCompletedFuture <- "synthesisCompletedFuture"
 	})
 	resultFuture := synthesizer.SpeakTextAsync("test")
@@ -359,6 +367,9 @@ func TestSynthesizerEvents2(t *testing.T) {
 		if event.AudioOffset <= 0 {
 			t.Error("word boundary audio offset")
 		}
+		if event.Duration <= 0 {
+			t.Error("word boundary duration")
+		}
 		if event.TextOffset <= 0 {
 			t.Error("word boundary text offset")
 		}