From 9de49c71a1c8210e755443247ad06f8d87a4d7c7 Mon Sep 17 00:00:00 2001 From: Fabio Manganiello Date: Sat, 13 Apr 2024 19:49:58 +0200 Subject: [PATCH] [assistant.picovoice] Conversation flow improvements. - The `Responding` state should be modelled as an extra event/binary flag, not as an assistant state. The assistant may be listening for hotwords even while the `tts` plugin is responding, and we don't want the two states to interfere with each other, nor do we want to build a more complex state machine that also needs to take concurrent states into account. - Stop any response being rendered by the `tts` plugin when a new hotword audio is detected. If e.g. I say "Ok Google", I should always be able to trigger the assistant and stop any concurrent audio process. - `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest audio frame results weren't marked as final, and the speech detection window timed out. Cheetah's `is_final` detection seems to be quite buggy sometimes, and it may not properly detect the end of utterances, especially with non-native accents. The workaround is to flush out whatever text is available (if at least some speech was detected) into a `SpeechRecognizedEvent` upon timeout. 
--- .../plugins/assistant/picovoice/__init__.py | 11 +++----- .../plugins/assistant/picovoice/_assistant.py | 26 ++++++++++++++++++- .../plugins/assistant/picovoice/_recorder.py | 19 ++++++++++++++ .../plugins/assistant/picovoice/_state.py | 1 - 4 files changed, 48 insertions(+), 9 deletions(-) diff --git a/platypush/plugins/assistant/picovoice/__init__.py b/platypush/plugins/assistant/picovoice/__init__.py index d275494d9..52fad4b27 100644 --- a/platypush/plugins/assistant/picovoice/__init__.py +++ b/platypush/plugins/assistant/picovoice/__init__.py @@ -165,16 +165,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin): def _on_response_render_start(self, text: Optional[str]): if self._assistant: - self._assistant.state = AssistantState.RESPONDING + self._assistant.set_responding(True) return super()._on_response_render_start(text) def _on_response_render_end(self): if self._assistant: - self._assistant.state = ( - AssistantState.DETECTING_HOTWORD - if self._assistant.hotword_enabled - else AssistantState.IDLE - ) + self._assistant.set_responding(False) return super()._on_response_render_end() @@ -257,7 +253,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin): with Assistant(**self._assistant_args) as self._assistant: try: for event in self._assistant: - self.logger.debug('Picovoice assistant event: %s', event) + if event is not None: + self.logger.debug('Picovoice assistant event: %s', event) except KeyboardInterrupt: break except Exception as e: diff --git a/platypush/plugins/assistant/picovoice/_assistant.py b/platypush/plugins/assistant/picovoice/_assistant.py index 11da4c88a..a713f5247 100644 --- a/platypush/plugins/assistant/picovoice/_assistant.py +++ b/platypush/plugins/assistant/picovoice/_assistant.py @@ -63,6 +63,7 @@ class Assistant: self.keywords = list(keywords or []) self.keyword_paths = None self.keyword_model_path = None + self._responding = Event() self.frame_expiration = frame_expiration self.endpoint_duration = 
endpoint_duration self.enable_automatic_punctuation = enable_automatic_punctuation @@ -113,6 +114,10 @@ class Assistant: self._porcupine: Optional[pvporcupine.Porcupine] = None self._rhino: Optional[pvrhino.Rhino] = None + @property + def is_responding(self): + return self._responding.is_set() + @property def speech_model_path(self): return self._speech_model_path_override or self._speech_model_path @@ -123,6 +128,12 @@ class Assistant: assert p, 'Picovoice TTS plugin not configured/found' return p + def set_responding(self, responding: bool): + if responding: + self._responding.set() + else: + self._responding.clear() + def should_stop(self): return self._stop_event.is_set() @@ -194,6 +205,9 @@ class Assistant: return self._cheetah[self.speech_model_path] def __enter__(self): + """ + Get the assistant ready to start processing audio frames. + """ if self.should_stop(): return self @@ -223,6 +237,9 @@ class Assistant: return self def __exit__(self, *_): + """ + Stop the assistant and release all resources. + """ if self._recorder: self._recorder.__exit__(*_) self._recorder = None @@ -246,9 +263,15 @@ class Assistant: self._rhino = None def __iter__(self): + """ + Iterate over processed assistant events. + """ return self def __next__(self): + """ + Process the next audio frame and return the corresponding event. 
+ """ has_data = False if self.should_stop() or not self._recorder: raise StopIteration @@ -285,6 +308,7 @@ class Assistant: if self.start_conversation_on_hotword: self.state = AssistantState.DETECTING_SPEECH + self.tts.stop() self._on_hotword_detected(hotword=self.keywords[keyword_index]) return HotwordDetectedEvent(hotword=self.keywords[keyword_index]) @@ -311,7 +335,7 @@ class Assistant: phrase = self._ctx.transcript phrase = phrase[:1].lower() + phrase[1:] - if self._ctx.is_final and phrase: + if phrase: event = SpeechRecognizedEvent(phrase=phrase) self._on_speech_recognized(phrase=phrase) else: diff --git a/platypush/plugins/assistant/picovoice/_recorder.py b/platypush/plugins/assistant/picovoice/_recorder.py index e0c23a8e9..ac8e1bacd 100644 --- a/platypush/plugins/assistant/picovoice/_recorder.py +++ b/platypush/plugins/assistant/picovoice/_recorder.py @@ -42,11 +42,17 @@ class AudioRecorder: ) def __enter__(self): + """ + Start the audio stream. + """ self._stop_event.clear() self.stream.start() return self def __exit__(self, *_): + """ + Stop the audio stream. + """ self.stop() def _audio_callback(self, indata, *_): @@ -59,6 +65,13 @@ class AudioRecorder: self.logger.warning('Audio queue is full, dropping audio frame') def read(self, timeout: Optional[float] = None): + """ + Read an audio frame from the queue. + + :param timeout: Timeout in seconds. If None, the method will block until + an audio frame is available. + :return: Audio frame or None if the timeout has expired. + """ try: return self._audio_queue.get(timeout=timeout) except TimeoutError: @@ -66,6 +79,9 @@ class AudioRecorder: return None def stop(self): + """ + Stop the audio stream. + """ self._stop_event.set() self.stream.stop() @@ -73,4 +89,7 @@ class AudioRecorder: return self._stop_event.is_set() or self._upstream_stop_event.is_set() def wait(self, timeout: Optional[float] = None): + """ + Wait until the audio stream is stopped. 
+ """ wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout) diff --git a/platypush/plugins/assistant/picovoice/_state.py b/platypush/plugins/assistant/picovoice/_state.py index 22e1ee743..e0eb7e719 100644 --- a/platypush/plugins/assistant/picovoice/_state.py +++ b/platypush/plugins/assistant/picovoice/_state.py @@ -9,7 +9,6 @@ class AssistantState(Enum): IDLE = 'idle' DETECTING_HOTWORD = 'detecting_hotword' DETECTING_SPEECH = 'detecting_speech' - RESPONDING = 'responding' # vim:sw=4:ts=4:et: