From 8000c791961ac88f446480a75226791f3dc9299d Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 21 Oct 2024 18:49:53 +0100 Subject: [PATCH 1/3] fix:handle empty string transcriptions --- ovos_dinkum_listener/service.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ovos_dinkum_listener/service.py b/ovos_dinkum_listener/service.py index ae5d05a..5c9b8dc 100644 --- a/ovos_dinkum_listener/service.py +++ b/ovos_dinkum_listener/service.py @@ -659,17 +659,17 @@ def _record_end_signal(self): ) self.bus.emit(Message("recognizer_loop:record_end")) - def _stt_text(self, transcripts: List[Tuple[str, float]], - stt_context: dict): + def _stt_text(self, transcripts: List[Tuple[str, float]], stt_context: dict): # Report utterance to intent service - if transcripts: - utts = [u[0] for u in transcripts] # filter confidence + utts = [u[0] for u in transcripts if u[0].strip()] # filter confidence and empty strings + if utts: lang = stt_context.get("lang") or Configuration().get("lang", "en-us") LOG.debug(f"STT: {utts}") - payload = {"utterances": utts, - "lang": lang} + payload = {"utterances": utts, "lang": lang} self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context)) - elif self.voice_loop.listen_mode == ListeningMode.CONTINUOUS: + return + + if self.voice_loop.listen_mode == ListeningMode.CONTINUOUS: LOG.debug("ignoring transcription failure") else: self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=stt_context)) From 45e37d189ae1b80b5854f3d977e6284600e8e7ba Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 21 Oct 2024 19:19:30 +0100 Subject: [PATCH 2/3] fix:handle empty string transcriptions --- ovos_dinkum_listener/service.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ovos_dinkum_listener/service.py b/ovos_dinkum_listener/service.py index 5c9b8dc..4394ccc 100644 --- a/ovos_dinkum_listener/service.py +++ b/ovos_dinkum_listener/service.py @@ -660,19 +660,17 @@ def _record_end_signal(self): self.bus.emit(Message("recognizer_loop:record_end")) def _stt_text(self, transcripts: List[Tuple[str, float]], stt_context: dict): - # Report utterance to intent service - utts = [u[0] for u in transcripts if u[0].strip()] # filter confidence and empty strings + utts = [u[0] for u in transcripts if u[0].strip()] + LOG.debug(f"STT: {utts}") if utts: lang = stt_context.get("lang") or Configuration().get("lang", "en-us") - LOG.debug(f"STT: {utts}") - payload = {"utterances": utts, "lang": lang} + payload = {"utterances": utts, "lang": lang} self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context)) - return - - if self.voice_loop.listen_mode == ListeningMode.CONTINUOUS: - LOG.debug("ignoring transcription failure") else: - self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=stt_context)) + if self.voice_loop.listen_mode != ListeningMode.CONTINUOUS: + self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=stt_context)) + else: + LOG.debug("Ignoring empty transcription in continuous listening mode") def _save_stt(self, audio_bytes, stt_meta, save_path=None): LOG.info("Saving Utterance Recording") From 51bf8ef250807e0293b462f6ea346535bee5dcda Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 21 Oct 2024 19:43:53 +0100 Subject: [PATCH 3/3] fix:hallucinations --- ovos_dinkum_listener/service.py | 22 ++++++++++++++++++- ovos_dinkum_listener/voice_loop/voice_loop.py | 6 ++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/ovos_dinkum_listener/service.py b/ovos_dinkum_listener/service.py index 4394ccc..daee843 100644 --- a/ovos_dinkum_listener/service.py +++ b/ovos_dinkum_listener/service.py @@ -659,8 +659,27 @@ def _record_end_signal(self): ) self.bus.emit(Message("recognizer_loop:record_end")) + def __normtranscripts(self, transcripts: List[Tuple[str, float]]) -> List[str]: + # unfortunately common enough when using whisper to deserve a setting + # mainly happens on silent audio, not as a mistranscription + default_hallucinations = [ + "thanks for watching!", + 'thank you for watching!', + "so", + "beep!" + # "Thank you" # this one can also be valid!! + ] + hallucinations = self.config.get("hallucination_list", default_hallucinations) \ + if self.config.get("filter_hallucinations", True) else [] + utts = [u[0].lstrip(" \"'").strip(" \"'") for u in transcripts] + filtered_hutts = [u for u in utts if u and u.lower() not in hallucinations] + hutts = [u for u in utts if u and u not in filtered_hutts] + if hutts: + LOG.debug(f"Filtered hallucinations: {hutts}") + return filtered_hutts + def _stt_text(self, transcripts: List[Tuple[str, float]], stt_context: dict): - utts = [u[0] for u in transcripts if u[0].strip()] + utts = self.__normtranscripts(transcripts) LOG.debug(f"STT: {utts}") if utts: lang = stt_context.get("lang") or Configuration().get("lang", "en-us") @@ -668,6 +687,7 @@ def _stt_text(self, transcripts: List[Tuple[str, float]], stt_context: dict): self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context)) else: if self.voice_loop.listen_mode != ListeningMode.CONTINUOUS: + LOG.error("Empty transcription, either recorded silence or STT failed!") self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=stt_context)) else: LOG.debug("Ignoring empty transcription in continuous listening mode") diff --git a/ovos_dinkum_listener/voice_loop/voice_loop.py b/ovos_dinkum_listener/voice_loop/voice_loop.py index d0f6e77..02f3794 100644 --- a/ovos_dinkum_listener/voice_loop/voice_loop.py +++ b/ovos_dinkum_listener/voice_loop/voice_loop.py @@ -781,12 +781,10 @@ def _after_cmd(self, chunk: bytes): self._vad_remove_silence() utts, stt_context = self._get_tx(stt_context) - + LOG.info(f"Raw transcription: {utts}") if utts: LOG.debug(f"transformers metadata: {stt_context}") - LOG.info(f"transcribed: {utts}") - else: - LOG.info("nothing transcribed") + # Voice command has finished recording if self.stt_audio_callback is not None: self.stt_audio_callback(self.stt_audio_bytes, stt_context)