[assistant.picovoice] Conversation flow improvements.

- The `Responding` state should be modelled as an extra event/binary flag, not as an assistant state. The assistant may be listening for hotwords even while the `tts` plugin is responding, and we don't want the two states to interfere with each other - nor do we want to build a more complex state machine that also needs to take concurrent states into account.
- Stop any responses being rendered by the `tts` plugin when a new hotword is detected in the audio. If e.g. I say "Ok Google", I should always be able to trigger the assistant and stop any concurrent audio process.
- `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest audio frame results weren't marked as final and the speech detection window timed out. Cheetah's `is_final` detection seems to be quite buggy sometimes, and it may not properly detect the end of utterances, especially with non-native accents. The workaround is to flush out whatever text is available (if at least some speech was detected) into a `SpeechRecognizedEvent` upon timeout.
2024-04-13 19:49:58 +02:00 · 2024-04-13 19:49:58 +02:00 · 9de49c71a1
commit 9de49c71a1
parent a6f7b6e790
4 changed files with 48 additions and 9 deletions
--- a/platypush/plugins/assistant/picovoice/__init__.py
+++ b/platypush/plugins/assistant/picovoice/__init__.py
@ -165,16 +165,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
    def _on_response_render_start(self, text: Optional[str]):
        if self._assistant:
-            self._assistant.state = AssistantState.RESPONDING
+            self._assistant.set_responding(True)
        return super()._on_response_render_start(text)
    def _on_response_render_end(self):
        if self._assistant:
-            self._assistant.state = (
+            self._assistant.set_responding(False)
                AssistantState.DETECTING_HOTWORD
                if self._assistant.hotword_enabled
                else AssistantState.IDLE
            )
        return super()._on_response_render_end()
@ -257,6 +253,7 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
            with Assistant(**self._assistant_args) as self._assistant:
                try:
                    for event in self._assistant:
                        if event is not None:
                            self.logger.debug('Picovoice assistant event: %s', event)
                except KeyboardInterrupt:
                    break
--- a/platypush/plugins/assistant/picovoice/_assistant.py
+++ b/platypush/plugins/assistant/picovoice/_assistant.py
@ -63,6 +63,7 @@ class Assistant:
        self.keywords = list(keywords or [])
        self.keyword_paths = None
        self.keyword_model_path = None
        self._responding = Event()
        self.frame_expiration = frame_expiration
        self.endpoint_duration = endpoint_duration
        self.enable_automatic_punctuation = enable_automatic_punctuation
@ -113,6 +114,10 @@ class Assistant:
        self._porcupine: Optional[pvporcupine.Porcupine] = None
        self._rhino: Optional[pvrhino.Rhino] = None
    @property
    def is_responding(self):
        return self._responding.is_set()
    @property
    def speech_model_path(self):
        return self._speech_model_path_override or self._speech_model_path
@ -123,6 +128,12 @@ class Assistant:
        assert p, 'Picovoice TTS plugin not configured/found'
        return p
    def set_responding(self, responding: bool):
        if responding:
            self._responding.set()
        else:
            self._responding.clear()
    def should_stop(self):
        return self._stop_event.is_set()
@ -194,6 +205,9 @@ class Assistant:
        return self._cheetah[self.speech_model_path]
    def __enter__(self):
        """
        Get the assistant ready to start processing audio frames.
        """
        if self.should_stop():
            return self
@ -223,6 +237,9 @@ class Assistant:
        return self
    def __exit__(self, *_):
        """
        Stop the assistant and release all resources.
        """
        if self._recorder:
            self._recorder.__exit__(*_)
            self._recorder = None
@ -246,9 +263,15 @@ class Assistant:
            self._rhino = None
    def __iter__(self):
        """
        Iterate over processed assistant events.
        """
        return self
    def __next__(self):
        """
        Process the next audio frame and return the corresponding event.
        """
        has_data = False
        if self.should_stop() or not self._recorder:
            raise StopIteration
@ -285,6 +308,7 @@ class Assistant:
            if self.start_conversation_on_hotword:
                self.state = AssistantState.DETECTING_SPEECH
            self.tts.stop()
            self._on_hotword_detected(hotword=self.keywords[keyword_index])
            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
@ -311,7 +335,7 @@ class Assistant:
            phrase = self._ctx.transcript
            phrase = phrase[:1].lower() + phrase[1:]
-            if self._ctx.is_final and phrase:
+            if phrase:
                event = SpeechRecognizedEvent(phrase=phrase)
                self._on_speech_recognized(phrase=phrase)
            else:
--- a/platypush/plugins/assistant/picovoice/_recorder.py
+++ b/platypush/plugins/assistant/picovoice/_recorder.py
@ -42,11 +42,17 @@ class AudioRecorder:
        )
    def __enter__(self):
        """
        Start the audio stream.
        """
        self._stop_event.clear()
        self.stream.start()
        return self
    def __exit__(self, *_):
        """
        Stop the audio stream.
        """
        self.stop()
    def _audio_callback(self, indata, *_):
@ -59,6 +65,13 @@ class AudioRecorder:
            self.logger.warning('Audio queue is full, dropping audio frame')
    def read(self, timeout: Optional[float] = None):
        """
        Read an audio frame from the queue.
        :param timeout: Timeout in seconds. If None, the method will block until
            an audio frame is available.
        :return: Audio frame or None if the timeout has expired.
        """
        try:
            return self._audio_queue.get(timeout=timeout)
        except TimeoutError:
@ -66,6 +79,9 @@ class AudioRecorder:
            return None
    def stop(self):
        """
        Stop the audio stream.
        """
        self._stop_event.set()
        self.stream.stop()
@ -73,4 +89,7 @@ class AudioRecorder:
        return self._stop_event.is_set() or self._upstream_stop_event.is_set()
    def wait(self, timeout: Optional[float] = None):
        """
        Wait on the recorder's own stop event and the upstream stop event.

        Both events are handed to ``wait_for_either`` together with the
        timeout.

        :param timeout: Timeout forwarded to ``wait_for_either``.
        """
        stop_events = (self._stop_event, self._upstream_stop_event)
        wait_for_either(*stop_events, timeout=timeout)
--- a/platypush/plugins/assistant/picovoice/_state.py
+++ b/platypush/plugins/assistant/picovoice/_state.py
@ -9,7 +9,6 @@ class AssistantState(Enum):
    IDLE = 'idle'
    DETECTING_HOTWORD = 'detecting_hotword'
    DETECTING_SPEECH = 'detecting_speech'
    RESPONDING = 'responding'
 # vim:sw=4:ts=4:et: