From 9de49c71a1c8210e755443247ad06f8d87a4d7c7 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello <fabio@manganiello.tech>
Date: Sat, 13 Apr 2024 19:49:58 +0200
Subject: [PATCH] [assistant.picovoice] Conversation flow improvements.

- The `Responding` state should be modelled as an extra event/binary
  flag, not as an assistant state. The assistant may be listening for
  hotwords even while the `tts` plugin is responding, and we don't want
  the two states to interfere with each other - nor to build a more
  complex state machine that also needs to take concurrent states into
  account.

- Stop any responses being rendered by the `tts` plugin when a new
  hotword is detected. If e.g. I say "Ok Google", I should always
  be able to trigger the assistant and stop any concurrent audio
  process.

- `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest
  audio frame results weren't marked as final, and the speech detection
  window timed out. Cheetah's `is_final` detection seems to be quite
  buggy sometimes, and it may not properly detect the end of utterances,
  especially with non-native accents. The workaround is to flush out
  whatever text is available (if at least some speech was detected) into
  a `SpeechRecognizedEvent` upon timeout.
---
 .../plugins/assistant/picovoice/__init__.py   | 11 +++-----
 .../plugins/assistant/picovoice/_assistant.py | 26 ++++++++++++++++++-
 .../plugins/assistant/picovoice/_recorder.py  | 19 ++++++++++++++
 .../plugins/assistant/picovoice/_state.py     |  1 -
 4 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/platypush/plugins/assistant/picovoice/__init__.py b/platypush/plugins/assistant/picovoice/__init__.py
index d275494d9..52fad4b27 100644
--- a/platypush/plugins/assistant/picovoice/__init__.py
+++ b/platypush/plugins/assistant/picovoice/__init__.py
@@ -165,16 +165,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
 
     def _on_response_render_start(self, text: Optional[str]):
         if self._assistant:
-            self._assistant.state = AssistantState.RESPONDING
+            self._assistant.set_responding(True)
         return super()._on_response_render_start(text)
 
     def _on_response_render_end(self):
         if self._assistant:
-            self._assistant.state = (
-                AssistantState.DETECTING_HOTWORD
-                if self._assistant.hotword_enabled
-                else AssistantState.IDLE
-            )
+            self._assistant.set_responding(False)
 
         return super()._on_response_render_end()
 
@@ -257,7 +253,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             with Assistant(**self._assistant_args) as self._assistant:
                 try:
                     for event in self._assistant:
-                        self.logger.debug('Picovoice assistant event: %s', event)
+                        if event is not None:
+                            self.logger.debug('Picovoice assistant event: %s', event)
                 except KeyboardInterrupt:
                     break
                 except Exception as e:
diff --git a/platypush/plugins/assistant/picovoice/_assistant.py b/platypush/plugins/assistant/picovoice/_assistant.py
index 11da4c88a..a713f5247 100644
--- a/platypush/plugins/assistant/picovoice/_assistant.py
+++ b/platypush/plugins/assistant/picovoice/_assistant.py
@@ -63,6 +63,7 @@ class Assistant:
         self.keywords = list(keywords or [])
         self.keyword_paths = None
         self.keyword_model_path = None
+        self._responding = Event()
         self.frame_expiration = frame_expiration
         self.endpoint_duration = endpoint_duration
         self.enable_automatic_punctuation = enable_automatic_punctuation
@@ -113,6 +114,10 @@ class Assistant:
         self._porcupine: Optional[pvporcupine.Porcupine] = None
         self._rhino: Optional[pvrhino.Rhino] = None
 
+    @property
+    def is_responding(self):
+        return self._responding.is_set()
+
     @property
     def speech_model_path(self):
         return self._speech_model_path_override or self._speech_model_path
@@ -123,6 +128,12 @@ class Assistant:
         assert p, 'Picovoice TTS plugin not configured/found'
         return p
 
+    def set_responding(self, responding: bool):
+        if responding:
+            self._responding.set()
+        else:
+            self._responding.clear()
+
     def should_stop(self):
         return self._stop_event.is_set()
 
@@ -194,6 +205,9 @@ class Assistant:
         return self._cheetah[self.speech_model_path]
 
     def __enter__(self):
+        """
+        Get the assistant ready to start processing audio frames.
+        """
         if self.should_stop():
             return self
 
@@ -223,6 +237,9 @@ class Assistant:
         return self
 
     def __exit__(self, *_):
+        """
+        Stop the assistant and release all resources.
+        """
         if self._recorder:
             self._recorder.__exit__(*_)
             self._recorder = None
@@ -246,9 +263,15 @@ class Assistant:
             self._rhino = None
 
     def __iter__(self):
+        """
+        Iterate over processed assistant events.
+        """
         return self
 
     def __next__(self):
+        """
+        Process the next audio frame and return the corresponding event.
+        """
         has_data = False
         if self.should_stop() or not self._recorder:
             raise StopIteration
@@ -285,6 +308,7 @@ class Assistant:
             if self.start_conversation_on_hotword:
                 self.state = AssistantState.DETECTING_SPEECH
 
+            self.tts.stop()
             self._on_hotword_detected(hotword=self.keywords[keyword_index])
             return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
 
@@ -311,7 +335,7 @@ class Assistant:
             phrase = self._ctx.transcript
             phrase = phrase[:1].lower() + phrase[1:]
 
-            if self._ctx.is_final and phrase:
+            if phrase:
                 event = SpeechRecognizedEvent(phrase=phrase)
                 self._on_speech_recognized(phrase=phrase)
             else:
diff --git a/platypush/plugins/assistant/picovoice/_recorder.py b/platypush/plugins/assistant/picovoice/_recorder.py
index e0c23a8e9..ac8e1bacd 100644
--- a/platypush/plugins/assistant/picovoice/_recorder.py
+++ b/platypush/plugins/assistant/picovoice/_recorder.py
@@ -42,11 +42,17 @@ class AudioRecorder:
         )
 
     def __enter__(self):
+        """
+        Start the audio stream.
+        """
         self._stop_event.clear()
         self.stream.start()
         return self
 
     def __exit__(self, *_):
+        """
+        Stop the audio stream.
+        """
         self.stop()
 
     def _audio_callback(self, indata, *_):
@@ -59,6 +65,13 @@ class AudioRecorder:
             self.logger.warning('Audio queue is full, dropping audio frame')
 
     def read(self, timeout: Optional[float] = None):
+        """
+        Read an audio frame from the queue.
+
+        :param timeout: Timeout in seconds. If None, the method will block until
+            an audio frame is available.
+        :return: Audio frame or None if the timeout has expired.
+        """
         try:
             return self._audio_queue.get(timeout=timeout)
         except TimeoutError:
@@ -66,6 +79,9 @@ class AudioRecorder:
             return None
 
     def stop(self):
+        """
+        Stop the audio stream.
+        """
         self._stop_event.set()
         self.stream.stop()
 
@@ -73,4 +89,7 @@ class AudioRecorder:
         return self._stop_event.is_set() or self._upstream_stop_event.is_set()
 
     def wait(self, timeout: Optional[float] = None):
+        """
+        Wait until the audio stream is stopped.
+        """
         wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)
diff --git a/platypush/plugins/assistant/picovoice/_state.py b/platypush/plugins/assistant/picovoice/_state.py
index 22e1ee743..e0eb7e719 100644
--- a/platypush/plugins/assistant/picovoice/_state.py
+++ b/platypush/plugins/assistant/picovoice/_state.py
@@ -9,7 +9,6 @@ class AssistantState(Enum):
     IDLE = 'idle'
     DETECTING_HOTWORD = 'detecting_hotword'
     DETECTING_SPEECH = 'detecting_speech'
-    RESPONDING = 'responding'
 
 
 # vim:sw=4:ts=4:et: