Skip to content

Commit

Permalink
Add duration property of speech synthesis result, update word boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
Yulin Li committed Mar 3, 2022
1 parent 31e045e commit 72701e8
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 13 deletions.
14 changes: 14 additions & 0 deletions common/property_id.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,20 @@ const (
// partial results by omitting words in the end.
SpeechServiceResponseTranslationRequestStablePartialResult PropertyID = 4100

// SpeechServiceResponseRequestWordBoundary is a boolean value specifying whether to request WordBoundary events.
// Added in version 1.21.0.
SpeechServiceResponseRequestWordBoundary PropertyID = 4200

// SpeechServiceResponseRequestPunctuationBoundary is a boolean value specifying whether to request punctuation boundary
// in WordBoundary Events. Default is true.
// Added in version 1.21.0.
SpeechServiceResponseRequestPunctuationBoundary PropertyID = 4201

// SpeechServiceResponseRequestSentenceBoundary is a boolean value specifying whether to request sentence boundary
// in WordBoundary Events. Default is false.
// Added in version 1.21.0.
SpeechServiceResponseRequestSentenceBoundary PropertyID = 4202

// SpeechServiceResponseJSONResult is the Cognitive Services Speech Service response output (in JSON format). This
// property is available on recognition result objects only.
SpeechServiceResponseJSONResult PropertyID = 5000
Expand Down
18 changes: 18 additions & 0 deletions common/speech_synthesis_boundary_type.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

package common

// SpeechSynthesisBoundaryType identifies which kind of boundary a speech
// synthesis boundary event refers to.
type SpeechSynthesisBoundaryType int

// Boundary kinds. Each value is written out explicitly (rather than derived
// with iota) so that every constant keeps a fixed numeric value.
const (
	// WordBoundary marks a boundary event for a word.
	WordBoundary = SpeechSynthesisBoundaryType(1)

	// PunctuationBoundary marks a boundary event for a punctuation mark.
	PunctuationBoundary = SpeechSynthesisBoundaryType(2)

	// SentenceBoundary marks a boundary event for a sentence.
	SentenceBoundary = SpeechSynthesisBoundaryType(3)
)
16 changes: 16 additions & 0 deletions common/speech_synthesis_output_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,20 @@ const (

// Riff8Khz8BitMonoALaw stands for riff-8khz-8bit-mono-alaw
Riff8Khz8BitMonoALaw SpeechSynthesisOutputFormat = 29

// Webm24Khz16Bit24KbpsMonoOpus stands for webm-24khz-16bit-24kbps-mono-opus
// Audio compressed by OPUS codec in a WebM container, with bitrate of 24kbps, optimized for IoT scenario.
Webm24Khz16Bit24KbpsMonoOpus SpeechSynthesisOutputFormat = 30

// Audio16Khz16Bit32KbpsMonoOpus stands for audio-16khz-16bit-32kbps-mono-opus
// Audio compressed by OPUS codec without container, with bitrate of 32kbps.
Audio16Khz16Bit32KbpsMonoOpus SpeechSynthesisOutputFormat = 31

// Audio24Khz16Bit48KbpsMonoOpus stands for audio-24khz-16bit-48kbps-mono-opus
// Audio compressed by OPUS codec without container, with bitrate of 48kbps.
Audio24Khz16Bit48KbpsMonoOpus SpeechSynthesisOutputFormat = 32

// Audio24Khz16Bit24KbpsMonoOpus stands for audio-24khz-16bit-24kbps-mono-opus
// Audio compressed by OPUS codec without container, with bitrate of 24kbps.
Audio24Khz16Bit24KbpsMonoOpus SpeechSynthesisOutputFormat = 33
)
10 changes: 7 additions & 3 deletions speech/speech_synthesis_bookmark_event_args.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@ import "C"

// SpeechSynthesisBookmarkEventArgs represents the speech synthesis bookmark event arguments.
type SpeechSynthesisBookmarkEventArgs struct {
handle C.SPXHANDLE
handle C.SPXHANDLE

// AudioOffset is the audio offset of the bookmark event, in ticks (100 nanoseconds).
AudioOffset uint64
Text string

// Text is the text of the bookmark.
Text string
}

// Close releases the underlying resources
Expand All @@ -36,7 +40,7 @@ func NewSpeechSynthesisBookmarkEventArgsFromHandle(handle common.SPXHandle) (*Sp
}
event.AudioOffset = uint64(cAudioOffset)
/* Text */
value := C.synthesizer_bookmark_event_get_text(event.handle)
value := C.synthesizer_event_get_text(event.handle)
event.Text = C.GoString(value)
C.property_bag_free_string(value)
return event, nil
Expand Down
10 changes: 8 additions & 2 deletions speech/speech_synthesis_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package speech

import (
"time"
"unsafe"

"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
Expand All @@ -30,6 +31,9 @@ type SpeechSynthesisResult struct {
// AudioData presents the synthesized audio.
AudioData []byte

// AudioDuration presents the time duration of synthesized audio.
AudioDuration time.Duration

// Collection of additional synthesisResult properties.
Properties *common.PropertyCollection
}
Expand All @@ -45,12 +49,14 @@ func NewSpeechSynthesisResultFromHandle(handle common.SPXHandle) (*SpeechSynthes

result := new(SpeechSynthesisResult)
result.handle = uintptr2handle(handle)
/* AudioData length */
/* AudioData length and duration */
var cAudioLength C.uint32_t
ret := uintptr(C.synth_result_get_audio_length(result.handle, &cAudioLength))
var cAudioDuration C.uint64_t
ret := uintptr(C.synth_result_get_audio_length_duration(result.handle, &cAudioLength, &cAudioDuration))
if ret != C.SPX_NOERROR {
return nil, common.NewCarbonError(ret)
}
result.AudioDuration = time.Duration(cAudioDuration*100) * time.Nanosecond
// using max(1024, cAudioLength) as buffer size
if cAudioLength < 1024 {
cAudioLength = 1024
Expand Down
12 changes: 9 additions & 3 deletions speech/speech_synthesis_viseme_event_args.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,16 @@ import "C"

// SpeechSynthesisVisemeEventArgs represents the speech synthesis viseme event arguments.
type SpeechSynthesisVisemeEventArgs struct {
handle C.SPXHANDLE
handle C.SPXHANDLE

// AudioOffset is the audio offset of the viseme event, in ticks (100 nanoseconds).
AudioOffset uint64
VisemeID uint
Animation string

// VisemeID is the viseme ID.
VisemeID uint

// Animation is the animation.
Animation string
}

// Close releases the underlying resources
Expand Down
34 changes: 29 additions & 5 deletions speech/speech_synthesis_word_boundary_event_args.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package speech

import (
"time"

"github.com/Microsoft/cognitive-services-speech-sdk-go/common"
)

Expand All @@ -14,10 +16,25 @@ import "C"

// SpeechSynthesisWordBoundaryEventArgs represents the speech synthesis word boundary event arguments.
type SpeechSynthesisWordBoundaryEventArgs struct {
handle C.SPXHANDLE
handle C.SPXHANDLE

// AudioOffset is the audio offset of the word boundary event, in ticks (100 nanoseconds).
AudioOffset uint64
TextOffset uint
WordLength uint

// Duration is the duration of the word boundary event.
Duration time.Duration

// TextOffset is the text offset.
TextOffset uint

// WordLength is the length of the word.
WordLength uint

// Text is the text.
Text string

// BoundaryType is the boundary type.
BoundaryType common.SpeechSynthesisBoundaryType
}

// Close releases the underlying resources
Expand All @@ -29,15 +46,22 @@ func (event SpeechSynthesisWordBoundaryEventArgs) Close() {
func NewSpeechSynthesisWordBoundaryEventArgsFromHandle(handle common.SPXHandle) (*SpeechSynthesisWordBoundaryEventArgs, error) {
event := new(SpeechSynthesisWordBoundaryEventArgs)
event.handle = uintptr2handle(handle)
var cAudioOffset C.uint64_t
var cAudioOffset, cDuration C.uint64_t
var cTextOffset, cWordLength C.uint32_t
ret := uintptr(C.synthesizer_word_boundary_event_get_values(event.handle, &cAudioOffset, &cTextOffset, &cWordLength))
var cBoundaryType C.SpeechSynthesis_BoundaryType
ret := uintptr(C.synthesizer_word_boundary_event_get_values(event.handle, &cAudioOffset, &cDuration, &cTextOffset, &cWordLength, &cBoundaryType))
if ret != C.SPX_NOERROR {
return nil, common.NewCarbonError(ret)
}
event.AudioOffset = uint64(cAudioOffset)
event.Duration = time.Duration(cDuration*100) * time.Nanosecond
event.TextOffset = uint(cTextOffset)
event.WordLength = uint(cWordLength)
event.BoundaryType = (common.SpeechSynthesisBoundaryType)(cBoundaryType)
/* Text */
value := C.synthesizer_event_get_text(event.handle)
event.Text = C.GoString(value)
C.property_bag_free_string(value)
return event, nil
}

Expand Down
11 changes: 11 additions & 0 deletions speech/speech_synthesizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package speech

import (
"bytes"
"math"
"os"
"strings"
"testing"
Expand Down Expand Up @@ -81,6 +82,10 @@ func checkBinaryEqual(t *testing.T, result1 *SpeechSynthesisResult, result2 *Spe
}
}

// almostEqual reports whether actual lies within threshold of expected, i.e.
// whether the absolute difference between the two values is at most threshold.
// The comparison is inclusive, and any NaN operand makes the result false.
func almostEqual(expected, actual, threshold float64) bool {
	delta := math.Abs(actual - expected)
	return delta <= threshold
}

func TestSynthesizerEvents(t *testing.T) {
synthesizer := createSpeechSynthesizerFromAudioConfig(t, nil)
if synthesizer == nil {
Expand Down Expand Up @@ -110,6 +115,9 @@ func TestSynthesizerEvents(t *testing.T) {
defer event.Close()
t.Logf("SynthesisCompleted, audio length %d", len(event.Result.AudioData))
checkSynthesisResult(t, &event.Result, common.SynthesizingAudioCompleted)
if !almostEqual((float64)(event.Result.AudioDuration/time.Millisecond), (float64)(len(event.Result.AudioData)/32000), 100) {
t.Errorf("Synthesis duration incorrect")
}
synthesisCompletedFuture <- "synthesisCompletedFuture"
})
resultFuture := synthesizer.SpeakTextAsync("test")
Expand Down Expand Up @@ -359,6 +367,9 @@ func TestSynthesizerEvents2(t *testing.T) {
if event.AudioOffset <= 0 {
t.Error("word boundary audio offset")
}
if event.Duration <= 0 {
t.Error("word boundary duration")
}
if event.TextOffset <= 0 {
t.Error("word boundary text offset")
}
Expand Down

0 comments on commit 72701e8

Please sign in to comment.