forked from DingKe/yoyo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_loop.py
126 lines (102 loc) · 3.65 KB
/
main_loop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 23:09:04 2018
@author: keding
"""
from __future__ import print_function
import os
import collections
import wave
import webrtcvad
from wakeup import Snowboy
from asr import BasicStreamASR
from tts import BasicTTS
from audio import play, AudioStream
import pyaudio
from utils import no_alsa_error
def wakeup_alert(keyword):
    """Play an audible cue acknowledging a detected hotword.

    Intended as the ``callback`` for ``Snowboy.start_detect``: keyword 1
    plays ``ding.wav``; any other keyword index plays ``dong.wav``.

    Args:
        keyword: index of the detected hotword model (1-based from Snowboy).
    """
    resource_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'snowboy/resources')
    # Build only the path that is actually played (the original computed both).
    sound = 'ding.wav' if keyword == 1 else 'dong.wav'
    play(os.path.join(resource_dir, sound))
if __name__ == '__main__':
    # NOTE(review): credentials are masked placeholders in this snapshot —
    # real values must be supplied before running.
    app_id = '11????605'
    app_key = 'XeJaa????7uNHeyC'
    # initialize TTS
    tts_engine = BasicTTS(app_id=app_id, app_key=app_key)
    # initialize ASR
    asr_engine = BasicStreamASR(app_id=app_id, app_key=app_key)
    # WebRTC voice-activity detector, aggressiveness 0 (least aggressive).
    vad = webrtcvad.Vad(0)
    # Suppress spurious ALSA warnings while PyAudio initializes.
    with no_alsa_error():
        pa = pyaudio.PyAudio()
    # set up wakeup
    models = ['snowboy/resources/models/snowboy.umdl',
              'snowboy/resources/models/alexa_02092017.umdl']
    keyword_detector = Snowboy(models, 0.5)
    # Spoken startup greeting ("hello"); speaker 6 selects a TTS voice.
    tts_engine.say("你好", speaker=6)
    while True:
        # Block until a hotword fires; wakeup_alert plays the audible cue.
        keyword_detector.start_detect(callback=wakeup_alert)
        # speech streaming
        rate = 16000
        chunk_duration = 30  # in ms (a webrtcvad-supported frame length)
        chunk = rate // 1000 * chunk_duration  # samples per VAD frame
        stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate,
                         input=True, frames_per_buffer=chunk)
        stream.start_stream()
        # feed speech into ASR engine
        buffer_duration = 1500  # 1.5s of pre-trigger audio kept in the ring buffer
        num_buffer_chunks = buffer_duration // chunk_duration
        # Sliding windows of recent VAD decisions: ~400 ms window detects
        # speech onset; a window twice as long detects the end point.
        num_window_chunks = 400 // chunk_duration
        num_window_chunks_end = 2 * num_window_chunks
        max_waiting_chunks = 2000 // chunk_duration  # give up after ~2 s without speech
        ring_buffer = collections.deque(maxlen=num_buffer_chunks)
        triggered = False  # True once speech onset has been detected
        count = 0
        flags = [0] * num_window_chunks          # circular onset window
        flags_end = [1] * num_window_chunks_end  # circular end-point window
        idx, idx_end = 0, 0
        voiced_frames = []  # raw PCM chunks of the captured utterance
        seq = 0   # NOTE(review): unused in this block
        text = ''
        while True:
            '''adapted from https://github.com/wangshub/python-vad
            '''
            count += 1
            frames = stream.read(chunk)
            status = vad.is_speech(frames, rate)
            active = 1 if status else 0
            # Record the VAD decision in both circular windows.
            flags[idx] = flags_end[idx_end] = active
            idx = (idx + 1) % num_window_chunks
            idx_end = (idx_end + 1) % num_window_chunks_end
            if not triggered:
                # time out
                if count > max_waiting_chunks:
                    break
                ring_buffer.append(frames)
                # Trigger when more than 80% of the onset window is voiced.
                if sum(flags) > 0.8 * len(flags):
                    triggered = True
                    # Prepend the buffered pre-trigger audio so the start of
                    # the utterance is not clipped.
                    voiced_frames.append(b''.join(ring_buffer))
                    ring_buffer.clear()
            else:  # check end point
                voiced_frames.append(frames)
                # Stop when less than 10% of the longer window is voiced.
                if sum(flags_end) < 0.1 * len(flags_end):
                    triggered = False
                    break
        stream.stop_stream()
        stream.close()
        # save wave
        wav_file = 'wav/test_main_loop.wav'
        wf = wave.open(wav_file, 'wb')
        wf.setnchannels(1)      # mono
        wf.setframerate(16000)  # matches the capture rate above
        wf.setsampwidth(2)      # 16-bit samples (paInt16)
        wf.writeframes(b''.join(voiced_frames))
        wf.close()
        # Echo the recording, then send it to the ASR engine.
        play(wav_file)
        text = asr_engine.stt(wav_file)
        print(text)
    # NOTE(review): appears unreachable — the outer `while True` never breaks;
    # confirm intended indentation against the upstream file.
    pa.terminate()