Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[respeaker_ros] Add publish_multichannel option for publishing multi channel audio #350

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
5cd8291
[respeaker_ros] Publish audio_info
iory Jun 11, 2022
950ba8d
[respeaker_ros] Enable speech to text for multi channel.
iory Jun 11, 2022
dffe449
[respeaker_ros] Add audio_info arg in sample
iory Jun 11, 2022
227708e
[respeaker_ros] Add publish_multichannel option for publishing multi …
iory Jun 11, 2022
206aaaa
[respeaker_ros] Restore audio input to use argment's audio
iory Jun 12, 2022
c31b13c
Merge branch 'master' into respeaker
k-okada Jun 13, 2022
7f40f50
[respeaker_ros] Add publish_multichannel option to fix launch file er…
iory Jun 13, 2022
305343a
[respeaker_ros] Add roslaunch_add_file_check to test launch file format
iory Jun 13, 2022
296cc6a
[respeaker_ros] Set audio_info topic name and check length of it.
iory Jun 13, 2022
7d3e2c9
[respeaker_ros] Add comment for why we add publish_multichannel option.
iory Jun 13, 2022
4b7632b
[respeaker_ros] Remove publish_multichannel option and publish raw mu…
iory Jun 13, 2022
f872a10
[respeaker_ros] Publish spech_audio_raw
iory Jun 13, 2022
6e49927
[respeaker_ros] Add comment to know more defails
iory Jun 13, 2022
e8ae9fe
[respeaker_ros] Publish speech audio raw
iory Jun 13, 2022
92477a5
[respeaker_ros] Add parameters for respaker ros
iory Jun 13, 2022
8caae81
[respeaker_ros] Fixed publishing audio topic's namespace
iory Jun 13, 2022
383670f
[respeaker_ros] Add parameters for speech_to_text
iory Jun 13, 2022
2f87dbb
[respeaker_ros] Avoid AudioInfo import for backward compatibility
iory Jun 14, 2022
4b99d01
[respeaker_ros] Fixed bytes calculation 'self.n_channe - 2' -> '(self…
iory Jun 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion respeaker_ros/launch/sample_respeaker.launch
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,20 @@
<arg name="language" default="en-US"/>
<!-- self cancellation -->
<arg name="self_cancellation" default="true"/>
<!-- audio info topic name -->
<arg name="audio_info" default="audio_info"/>
708yamaguchi marked this conversation as resolved.
Show resolved Hide resolved

<node if="$(arg publish_tf)"
name="static_transformer" pkg="tf" type="static_transform_publisher"
args="0 0 0 0 0 0 1 map respeaker_base 100"/>

<node if="$(arg launch_respeaker)"
name="respeaker_node" pkg="respeaker_ros" type="respeaker_node.py"
respawn="true" respawn_delay="10" />
respawn="true" respawn_delay="10" >
<rosparam subst_value="true" >
publish_multichannel: $(arg publish_multichannel)
708yamaguchi marked this conversation as resolved.
Show resolved Hide resolved
</rosparam>
</node>

<node if="$(arg launch_soundplay)"
name="sound_play" pkg="sound_play" type="soundplay_node.py"/>
Expand All @@ -30,6 +36,7 @@
<remap from="audio" to="$(arg audio)"/>
<remap from="speech_to_text" to="$(arg speech_to_text)"/>
<rosparam subst_value="true">
audio_info: $(arg audio_info)
language: $(arg language)
self_cancellation: $(arg self_cancellation)
tts_tolerance: 0.5
Expand Down
35 changes: 29 additions & 6 deletions respeaker_ros/scripts/respeaker_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import sys
import time
from audio_common_msgs.msg import AudioData
from audio_common_msgs.msg import AudioInfo
708yamaguchi marked this conversation as resolved.
Show resolved Hide resolved
from geometry_msgs.msg import PoseStamped
from std_msgs.msg import Bool, Int32, ColorRGBA
from dynamic_reconfigure.server import Server
Expand Down Expand Up @@ -221,7 +222,9 @@ def close(self):


class RespeakerAudio(object):
def __init__(self, on_audio, channel=0, suppress_error=True):
def __init__(self, on_audio, channel=0, suppress_error=True,
publish_multichannel=False):
self.publish_multichannel = publish_multichannel
self.on_audio = on_audio
with ignore_stderr(enable=suppress_error):
self.pyaudio = pyaudio.PyAudio()
Expand Down Expand Up @@ -254,7 +257,6 @@ def __init__(self, on_audio, channel=0, suppress_error=True):
if self.channels != 6:
rospy.logwarn("%d channel is found for respeaker" % self.channels)
rospy.logwarn("You may have to update firmware.")
self.channel = min(self.channels - 1, max(0, self.channel))

self.stream = self.pyaudio.open(
input=True, start=False,
Expand Down Expand Up @@ -284,7 +286,10 @@ def stream_callback(self, in_data, frame_count, time_info, status):
data = np.frombuffer(in_data, dtype=np.int16)
chunk_per_channel = int(len(data) / self.channels)
data = np.reshape(data, (chunk_per_channel, self.channels))
chan_data = data[:, self.channel]
if self.publish_multichannel is True:
chan_data = data
else:
chan_data = data[:, self.channel]
# invoke callback
self.on_audio(chan_data.tobytes())
return None, pyaudio.paContinue
Expand All @@ -306,6 +311,7 @@ def __init__(self):
self.doa_xy_offset = rospy.get_param("~doa_xy_offset", 0.0)
self.doa_yaw_offset = rospy.get_param("~doa_yaw_offset", 90.0)
self.speech_prefetch = rospy.get_param("~speech_prefetch", 0.5)
self.publish_multichannel = rospy.get_param("~publish_multichannel", False)
self.speech_continuation = rospy.get_param("~speech_continuation", 0.5)
self.speech_max_duration = rospy.get_param("~speech_max_duration", 7.0)
self.speech_min_duration = rospy.get_param("~speech_min_duration", 0.1)
Expand All @@ -322,21 +328,38 @@ def __init__(self):
self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True)
self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True)
self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10)
self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo,
queue_size=1, latch=True)
self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10)
# init config
self.config = None
self.dyn_srv = Server(RespeakerConfig, self.on_config)
# start
self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error,
publish_multichannel=self.publish_multichannel)
self.n_channel = 1
if self.publish_multichannel:
self.n_channel = self.respeaker_audio.channels
tkmtnt7000 marked this conversation as resolved.
Show resolved Hide resolved
self.speech_prefetch_bytes = int(
self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
self.n_channel
* self.speech_prefetch
* self.respeaker_audio.rate
* self.respeaker_audio.bitdepth / 8.0)
self.speech_prefetch_buffer = b""
self.respeaker_audio.start()
self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
self.on_timer)
self.timer_led = None
self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)

info_msg = AudioInfo(
channels=self.n_channel,
sample_rate=self.respeaker_audio.rate,
sample_format='S16LE',
bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth,
coding_format='WAVE')
self.pub_audio_info.publish(info_msg)

def on_shutdown(self):
try:
self.respeaker.close()
Expand Down Expand Up @@ -423,7 +446,7 @@ def on_timer(self, event):
self.speech_audio_buffer = b""
self.is_speeching = False
duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth / self.n_channel
rospy.loginfo("Speech detected for %.3f seconds" % duration)
if self.speech_min_duration <= duration < self.speech_max_duration:

Expand Down
32 changes: 29 additions & 3 deletions respeaker_ros/scripts/speech_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,46 @@
# -*- coding: utf-8 -*-
# Author: Yuki Furuta <[email protected]>

from __future__ import division

import actionlib
import rospy
try:
import speech_recognition as SR
except ImportError as e:
raise ImportError(str(e) + '\nplease try "pip install speechrecognition"')

import numpy as np
from actionlib_msgs.msg import GoalStatus, GoalStatusArray
from audio_common_msgs.msg import AudioData
from audio_common_msgs.msg import AudioInfo
from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
from speech_recognition_msgs.msg import SpeechRecognitionCandidates


class SpeechToText(object):
def __init__(self):
# format of input audio data
self.sample_rate = rospy.get_param("~sample_rate", 16000)
self.sample_width = rospy.get_param("~sample_width", 2)
if rospy.get_param('~audio_info', None):
708yamaguchi marked this conversation as resolved.
Show resolved Hide resolved
rospy.loginfo('Extract audio info params from {}'.format(
rospy.get_param('~audio_info')))
audio_info_msg = rospy.wait_for_message(
rospy.get_param('~audio_info'), AudioInfo)
self.sample_rate = audio_info_msg.sample_rate
self.sample_width = audio_info_msg.bitrate // self.sample_rate // 8
self.channels = audio_info_msg.channels
else:
self.sample_rate = rospy.get_param("~sample_rate", 16000)
self.sample_width = rospy.get_param("~sample_width", 2)
self.channels = rospy.get_param("~channels", 1)
if self.sample_width == 2:
self.dtype = 'int16'
elif self.sample_width == 4:
self.dtype = 'int32'
else:
raise NotImplementedError('sample_width {} is not supported'
.format(self.sample_width))
self.target_channel = rospy.get_param("~target_channel", 0)
# language of STT service
self.language = rospy.get_param("~language", "ja-JP")
# ignore voice input while the robot is speaking
Expand Down Expand Up @@ -78,7 +100,11 @@ def audio_cb(self, msg):
if self.is_canceling:
rospy.loginfo("Speech is cancelled")
return
data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)

data = SR.AudioData(
np.frombuffer(msg.data, dtype=self.dtype)[
self.target_channel::self.channels].tobytes(),
self.sample_rate, self.sample_width)
try:
rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
result = self.recognizer.recognize_google(
Expand Down