[assistant.picovoice] Conversation flow improvements.

- The `Responding` state should be modelled as an extra event/binary flag, not as an assistant state. The assistant may be listening for hotwords even while the `tts` plugin is responding, and we don't want the two states to interfere with each other - nor do we want to build a more complex state machine that also needs to take concurrent states into account.

- Stop any response being rendered by the `tts` plugin when a new hotword is detected. If e.g. I say "Ok Google", I should always be able to trigger the assistant and stop any concurrent audio process.

- `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest audio frame results weren't marked as final and the speech detection window timed out. Cheetah's `is_final` detection seems to be quite buggy sometimes, and it may not properly detect the end of utterances, especially with non-native accents. The workaround is to flush out whatever text is available (if at least some speech was detected) into a `SpeechRecognizedEvent` upon timeout.
2024-04-13 19:49:58 +02:00 · 2024-04-13 19:49:58 +02:00 · 2b287b569f
parent 24e93ad160
commit 2b287b569f
3 changed files with 29 additions and 9 deletions
--- a/platypush/plugins/assistant/picovoice/__init__.py
+++ b/platypush/plugins/assistant/picovoice/__init__.py
@@ -165,16 +165,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):

    def _on_response_render_start(self, text: Optional[str]):
        if self._assistant:
-            self._assistant.state = AssistantState.RESPONDING
+            self._assistant.set_responding(True)
        return super()._on_response_render_start(text)

    def _on_response_render_end(self):
        if self._assistant:
-            self._assistant.state = (
-                AssistantState.DETECTING_HOTWORD
-                if self._assistant.hotword_enabled
-                else AssistantState.IDLE
-            )
+            self._assistant.set_responding(False)

        return super()._on_response_render_end()

@@ -257,7 +253,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
            with Assistant(**self._assistant_args) as self._assistant:
                try:
                    for event in self._assistant:
-                        self.logger.debug('Picovoice assistant event: %s', event)
+                        if event is not None:
+                            self.logger.debug('Picovoice assistant event: %s', event)
                except KeyboardInterrupt:
                    break
                except Exception as e:
--- a/platypush/plugins/assistant/picovoice/_assistant.py
+++ b/platypush/plugins/assistant/picovoice/_assistant.py
@@ -63,6 +63,7 @@ class Assistant:
        self.keywords = list(keywords or [])
        self.keyword_paths = None
        self.keyword_model_path = None
+        self._responding = Event()
        self.frame_expiration = frame_expiration
        self.endpoint_duration = endpoint_duration
        self.enable_automatic_punctuation = enable_automatic_punctuation
@@ -113,6 +114,10 @@ class Assistant:
        self._porcupine: Optional[pvporcupine.Porcupine] = None
        self._rhino: Optional[pvrhino.Rhino] = None

+    @property
+    def is_responding(self):
+        return self._responding.is_set()
+
    @property
    def speech_model_path(self):
        return self._speech_model_path_override or self._speech_model_path
@@ -123,6 +128,12 @@ class Assistant:
        assert p, 'Picovoice TTS plugin not configured/found'
        return p

+    def set_responding(self, responding: bool):
+        if responding:
+            self._responding.set()
+        else:
+            self._responding.clear()
+
    def should_stop(self):
        return self._stop_event.is_set()

@@ -194,6 +205,9 @@ class Assistant:
        return self._cheetah[self.speech_model_path]

    def __enter__(self):
+        """
+        Get the assistant ready to start processing audio frames.
+        """
        if self.should_stop():
            return self

@@ -223,6 +237,9 @@ class Assistant:
        return self

    def __exit__(self, *_):
+        """
+        Stop the assistant and release all resources.
+        """
        if self._recorder:
            self._recorder.__exit__(*_)
            self._recorder = None
@@ -246,9 +263,15 @@ class Assistant:
            self._rhino = None

    def __iter__(self):
+        """
+        Iterate over processed assistant events.
+        """
        return self

    def __next__(self):
+        """
+        Process the next audio frame and return the corresponding event.
+        """
        has_data = False
        if self.should_stop() or not self._recorder:
            raise StopIteration
@@ -285,6 +308,7 @@ class Assistant:
            if self.start_conversation_on_hotword:
                self.state = AssistantState.DETECTING_SPEECH

+            self.tts.stop()
            self._on_hotword_detected(hotword=self.keywords[keyword_index])
            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])

@@ -311,7 +335,7 @@ class Assistant:
            phrase = self._ctx.transcript
            phrase = phrase[:1].lower() + phrase[1:]

-            if self._ctx.is_final and phrase:
+            if phrase:
                event = SpeechRecognizedEvent(phrase=phrase)
                self._on_speech_recognized(phrase=phrase)
            else:
--- a/platypush/plugins/assistant/picovoice/_state.py
+++ b/platypush/plugins/assistant/picovoice/_state.py
@@ -9,7 +9,6 @@ class AssistantState(Enum):
    IDLE = 'idle'
    DETECTING_HOTWORD = 'detecting_hotword'
    DETECTING_SPEECH = 'detecting_speech'
-    RESPONDING = 'responding'


 # vim:sw=4:ts=4:et: