From a29e96a65e2d7eb078b08a30c97a8f72e01edbde Mon Sep 17 00:00:00 2001 From: janvarev Date: Mon, 9 May 2022 15:17:02 +0300 Subject: [PATCH] =?UTF-8?q?5.3=20runva=5Fvoskrem.py=20-=20=D0=B7=D0=B0?= =?UTF-8?q?=D0=BF=D1=83=D1=81=D0=BA=D0=B0=D0=BB=D1=8C=D1=89=D0=B8=D0=BA=20?= =?UTF-8?q?=D0=98=D1=80=D0=B8=D0=BD=D1=8B=20=D1=81=20=D1=80=D0=B0=D1=81?= =?UTF-8?q?=D0=BF=D0=BE=D0=B7=D0=BD=D0=B0=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5?= =?UTF-8?q?=D0=BC=20=D0=B3=D0=BE=D0=BB=D0=BE=D1=81=D0=B0=20=D1=87=D0=B5?= =?UTF-8?q?=D1=80=D0=B5=D0=B7=20VOSK=20=D0=94=D0=BE=D0=BA=D0=B5=D1=80.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 21 +++++++++++ runva_voskrem.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ vacore.py | 2 +- 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 runva_voskrem.py diff --git a/README.md b/README.md index 7664a70..05012c9 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,27 @@ https://github.com/timhok/IreneVA-hassio-script-trigger-plugin [Подробнее про настройку клиент-серверного режима](/docs/INSTALL_MULTI.md) +## Speech-to-Text через VOSK remote + +Если у вас проблемы с установкой VOSK (например, на Mac), то вы можете воспользоваться +работой через VOSK Auto Speech Recognition Server, который запускается через Докер. + +- Запустите `docker run -d -p 2700:2700 alphacep/kaldi-ru:latest` +(детали: https://alphacephei.com/vosk/server ) + - или как вариант, вы можете запустить `vosk_asr_server.py`, переопределив внутри параметры + +```python + args.interface = os.environ.get('VOSK_SERVER_INTERFACE', "0.0.0.0") + args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700)) +``` + +- Запустите `runva_voskrem.py`. Он будет читать данные с микрофона и отправлять на сервер +для распознавания. + +В случае, если надо запустить распознавание на другой машине - +используйте параметр -u (--uri): `runva_voskrem.py -u=ws://100.100.100.100:2700` +для уточнения адреса сервера.
+
 ## Speech-to-Text через SpeechRecognition
 
 SpeechRecognition - классический движок для запуска распознавания через Google и ряд других сервисов.
diff --git a/runva_voskrem.py b/runva_voskrem.py
new file mode 100644
index 0000000..4eebb3b
--- /dev/null
+++ b/runva_voskrem.py
@@ -0,0 +1,109 @@
+# VOSK-remote based type of Irene
+# Speech will be recognized on Server
+# Run Docker server: docker run -d -p 2700:2700 alphacep/kaldi-ru:latest
+# Details: https://alphacephei.com/vosk/server
+
+import json
+import asyncio
+import websockets
+import logging
+import sounddevice as sd
+import argparse
+
+from vacore import VACore
+
+mic_blocked = False
+
+def block_mic():
+    """Raise the module-level mic_blocked flag; run_test() passes this function to core.run_input_str."""
+    global mic_blocked
+    #print("Blocking microphone...")
+    mic_blocked = True
+
+# ------------------- vosk ------------------
+def int_or_str(text):
+    """Helper function for argument parsing: return int(text) when possible, else text unchanged."""
+    try:
+        return int(text)
+    except ValueError:
+        return text
+
+def callback(indata, frames, time, status):
+    """This is called (from a separate thread) for each audio block."""
+    if status:
+        # Surface problems reported by the audio stream instead of ignoring them.
+        logging.warning("Audio input status: %s", status)
+    # This runs outside the event loop's thread, so the bytes are handed over thread-safely.
+    loop.call_soon_threadsafe(audio_queue.put_nowait, bytes(indata))
+
+async def run_test():
+    """Stream microphone audio to the recognition server and pass recognized text to VACore.
+
+    Reads the module-level args, loop and audio_queue that main() prepares.
+    """
+    # Without this declaration the reset below would bind a function-local name
+    # and the flag raised by block_mic() would never be cleared.
+    global mic_blocked
+
+    # initing core
+    core = VACore()
+    core.init_with_plugins()
+
+    with sd.RawInputStream(samplerate=args.samplerate, blocksize = 4000, device=args.device, dtype='int16',
+                           channels=1, callback=callback) as device:
+
+        async with websockets.connect(args.uri) as websocket:
+            await websocket.send('{ "config" : { "sample_rate" : %d } }' % (device.samplerate))
+
+            while True:
+                data = await audio_queue.get()
+                await websocket.send(data)
+                #print (await websocket.recv())
+                res = await websocket.recv()
+                resj = json.loads(res)
+                # .get() keeps voice_input_str bound for replies that carry no "text" key.
+                voice_input_str = resj.get("text", "")
+
+                if voice_input_str != "":
+                    core.run_input_str(voice_input_str,block_mic)
+                    mic_blocked = False
+
+            # NOTE(review): the loop above has no exit path, so this end-of-stream
+            # handshake is currently never reached.
+            await websocket.send('{"eof" : 1}')
+            print (await websocket.recv())
+
+async def main():
+    """Parse the command line, create the shared loop/queue globals and start run_test()."""
+
+    global args
+    global loop
+    global audio_queue
+
+    # First pass: a minimal parser that only knows -l/--list-devices; other options are kept in `remaining`.
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument('-l', '--list-devices', action='store_true',
+                        help='show list of audio devices and exit')
+    args, remaining = parser.parse_known_args()
+    if args.list_devices:
+        print(sd.query_devices())
+        parser.exit(0)
+    # Second pass: the full parser, built on top of the first one via parents=[...].
+    parser = argparse.ArgumentParser(description="ASR Server",
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,
+                                     parents=[parser])
+    parser.add_argument('-u', '--uri', type=str, metavar='URL',
+                        help='Server URL', default='ws://localhost:2700')
+    parser.add_argument('-d', '--device', type=int_or_str,
+                        help='input device (numeric ID or substring)')
+    parser.add_argument('-r', '--samplerate', type=int, help='sampling rate', default=16000)
+    args = parser.parse_args(remaining)
+    loop = asyncio.get_running_loop()
+    audio_queue = asyncio.Queue()
+
+    logging.basicConfig(level=logging.INFO)
+
+    await run_test()
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/vacore.py b/vacore.py
index a91c72a..4edeea8 100644
--- a/vacore.py
+++ b/vacore.py
@@ -6,7 +6,7 @@ from jaa import JaaCore
 
 
 
-version = "5.2"
+version = "5.3"
 
 # main VACore class
 