From aa333db05cbcd4c0899a82a387ea6a5ddcf23cdd Mon Sep 17 00:00:00 2001
From: Fabio Manganiello <fabio@manganiello.tech>
Date: Wed, 10 Apr 2024 22:26:45 +0200
Subject: [PATCH] [assistant.picovoice] More features.

- Added wiring between `assistant.picovoice` and `tts.picovoice`.

- Added `RESPONDING` status to the assistant.

- Added ability to override the default speech model upon
  `start_conversation`.

- Better handling of conversation timeouts.

- Cache Cheetah objects in a `model -> object` map. At least the
  default model should be pre-loaded, since loading a model at runtime
  seems to take a while, and that could impair the ability to detect
  speech in the first seconds after a hotword is detected.
---
 .../plugins/assistant/picovoice/__init__.py   | 48 ++++++++++++++-
 .../plugins/assistant/picovoice/_assistant.py | 59 +++++++++++++------
 .../plugins/assistant/picovoice/_context.py   | 12 +++-
 .../plugins/assistant/picovoice/_state.py     |  1 +
 4 files changed, 96 insertions(+), 24 deletions(-)

diff --git a/platypush/plugins/assistant/picovoice/__init__.py b/platypush/plugins/assistant/picovoice/__init__.py
index 9426ba6081..d275494d99 100644
--- a/platypush/plugins/assistant/picovoice/__init__.py
+++ b/platypush/plugins/assistant/picovoice/__init__.py
@@ -1,7 +1,10 @@
+import os
 from typing import Optional, Sequence
 
+from platypush.context import get_plugin
 from platypush.plugins import RunnablePlugin, action
 from platypush.plugins.assistant import AssistantPlugin
+from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
 
 from ._assistant import Assistant
 from ._state import AssistantState
@@ -96,7 +99,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             using a language other than English, you can provide the path to the
             model file for that language. Model files are available for all the
             supported languages through the `Picovoice repository
-            <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
+            <https://github.com/Picovoice/cheetah/tree/master/lib/common>`_.
+            You can also use the `Picovoice console
+            <https://console.picovoice.ai/cat>`_
+            to train your custom models. You can use a base model and fine-tune
+            it by boosting the detection of your own words and phrases and
+            editing the phonetic representation of the words you want to detect.
         :param endpoint_duration: If set, the assistant will stop listening when
             no speech is detected for the specified duration (in seconds) after
             the end of an utterance.
@@ -146,15 +154,47 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             'on_hotword_detected': self._on_hotword_detected,
         }
 
+    @property
+    def tts(self) -> TtsPicovoicePlugin:
+        p = get_plugin('tts.picovoice')
+        assert p, 'Picovoice TTS plugin not configured/found'
+        return p
+
+    def _get_tts_plugin(self) -> TtsPicovoicePlugin:
+        return self.tts
+
+    def _on_response_render_start(self, text: Optional[str]):
+        if self._assistant:
+            self._assistant.state = AssistantState.RESPONDING
+        return super()._on_response_render_start(text)
+
+    def _on_response_render_end(self):
+        if self._assistant:
+            self._assistant.state = (
+                AssistantState.DETECTING_HOTWORD
+                if self._assistant.hotword_enabled
+                else AssistantState.IDLE
+            )
+
+        return super()._on_response_render_end()
+
     @action
-    def start_conversation(self, *_, **__):
+    def start_conversation(self, *_, model_file: Optional[str] = None, **__):
         """
-        Programmatically start a conversation with the assistant
+        Programmatically start a conversation with the assistant.
+
+        :param model_file: Override the model file to be used to detect speech
+            in this conversation. If not set, the configured
+            ``speech_model_path`` will be used.
         """
         if not self._assistant:
             self.logger.warning('Assistant not initialized')
             return
 
+        if model_file:
+            model_file = os.path.expanduser(model_file)
+
+        self._assistant.override_speech_model(model_file)
         self._assistant.state = AssistantState.DETECTING_SPEECH
 
     @action
@@ -166,6 +206,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             self.logger.warning('Assistant not initialized')
             return
 
+        self._assistant.override_speech_model(None)
+
         if self._assistant.hotword_enabled:
             self._assistant.state = AssistantState.DETECTING_HOTWORD
         else:
diff --git a/platypush/plugins/assistant/picovoice/_assistant.py b/platypush/plugins/assistant/picovoice/_assistant.py
index 761fe981a8..11da4c88a9 100644
--- a/platypush/plugins/assistant/picovoice/_assistant.py
+++ b/platypush/plugins/assistant/picovoice/_assistant.py
@@ -9,11 +9,13 @@ import pvleopard
 import pvporcupine
 import pvrhino
 
+from platypush.context import get_plugin
 from platypush.message.event.assistant import (
     ConversationTimeoutEvent,
     HotwordDetectedEvent,
     SpeechRecognizedEvent,
 )
+from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
 
 from ._context import ConversationContext
 from ._recorder import AudioRecorder
@@ -25,6 +27,7 @@ class Assistant:
     A facade class that wraps the Picovoice engines under an assistant API.
     """
 
+    @staticmethod
     def _default_callback(*_, **__):
         pass
 
@@ -61,11 +64,12 @@ class Assistant:
         self.keyword_paths = None
         self.keyword_model_path = None
         self.frame_expiration = frame_expiration
-        self.speech_model_path = speech_model_path
         self.endpoint_duration = endpoint_duration
         self.enable_automatic_punctuation = enable_automatic_punctuation
         self.start_conversation_on_hotword = start_conversation_on_hotword
         self.audio_queue_size = audio_queue_size
+        self._speech_model_path = speech_model_path
+        self._speech_model_path_override = None
 
         self._on_conversation_start = on_conversation_start
         self._on_conversation_end = on_conversation_end
@@ -103,11 +107,22 @@ class Assistant:
 
                 self.keyword_model_path = keyword_model_path
 
-        self._cheetah: Optional[pvcheetah.Cheetah] = None
+        # Model path -> model instance cache
+        self._cheetah = {}
         self._leopard: Optional[pvleopard.Leopard] = None
         self._porcupine: Optional[pvporcupine.Porcupine] = None
         self._rhino: Optional[pvrhino.Rhino] = None
 
+    @property
+    def speech_model_path(self):
+        return self._speech_model_path_override or self._speech_model_path
+
+    @property
+    def tts(self) -> TtsPicovoicePlugin:
+        p = get_plugin('tts.picovoice')
+        assert p, 'Picovoice TTS plugin not configured/found'
+        return p
+
     def should_stop(self):
         return self._stop_event.is_set()
 
@@ -130,12 +145,18 @@ class Assistant:
             return
 
         if prev_state == AssistantState.DETECTING_SPEECH:
+            self.tts.stop()
             self._ctx.stop()
+            self._speech_model_path_override = None
             self._on_conversation_end()
         elif new_state == AssistantState.DETECTING_SPEECH:
             self._ctx.start()
             self._on_conversation_start()
 
+        if new_state == AssistantState.DETECTING_HOTWORD:
+            self.tts.stop()
+            self._ctx.reset()
+
     @property
     def porcupine(self) -> Optional[pvporcupine.Porcupine]:
         if not self.hotword_enabled:
@@ -159,7 +180,7 @@ class Assistant:
         if not self.stt_enabled:
             return None
 
-        if not self._cheetah:
+        if not self._cheetah.get(self.speech_model_path):
             args: Dict[str, Any] = {'access_key': self._access_key}
             if self.speech_model_path:
                 args['model_path'] = self.speech_model_path
@@ -168,9 +189,9 @@ class Assistant:
             if self.enable_automatic_punctuation:
                 args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
 
-            self._cheetah = pvcheetah.create(**args)
+            self._cheetah[self.speech_model_path] = pvcheetah.create(**args)
 
-        return self._cheetah
+        return self._cheetah[self.speech_model_path]
 
     def __enter__(self):
         if self.should_stop():
@@ -178,10 +199,9 @@ class Assistant:
 
         if self._recorder:
             self.logger.info('A recording stream already exists')
-        elif self.porcupine or self.cheetah:
+        elif self.hotword_enabled or self.stt_enabled:
             sample_rate = (self.porcupine or self.cheetah).sample_rate  # type: ignore
             frame_length = (self.porcupine or self.cheetah).frame_length  # type: ignore
-
             self._recorder = AudioRecorder(
                 stop_event=self._stop_event,
                 sample_rate=sample_rate,
@@ -190,6 +210,9 @@ class Assistant:
                 channels=1,
             )
 
+            if self.stt_enabled:
+                self._cheetah[self.speech_model_path] = self.cheetah
+
             self._recorder.__enter__()
 
             if self.porcupine:
@@ -205,10 +228,10 @@ class Assistant:
             self._recorder = None
 
         self.state = AssistantState.IDLE
-
-        if self._cheetah:
-            self._cheetah.delete()
-            self._cheetah = None
+        for model in [*self._cheetah.keys()]:
+            cheetah = self._cheetah.pop(model, None)
+            if cheetah:
+                cheetah.delete()
 
         if self._leopard:
             self._leopard.delete()
@@ -242,10 +265,10 @@ class Assistant:
                 )
                 continue  # The audio frame is too old
 
-            if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
+            if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
                 return self._process_hotword(frame)
 
-            if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
+            if self.stt_enabled and self.state == AssistantState.DETECTING_SPEECH:
                 return self._process_speech(frame)
 
         raise StopIteration
@@ -283,15 +306,12 @@ class Assistant:
             )
 
         if self._ctx.is_final or self._ctx.timed_out:
-            phrase = ''
-            if self.cheetah:
-                phrase = self.cheetah.flush()
-
+            phrase = self.cheetah.flush() or ''
             self._ctx.transcript += phrase
             phrase = self._ctx.transcript
             phrase = phrase[:1].lower() + phrase[1:]
 
-            if self._ctx.is_final or phrase:
+            if self._ctx.is_final and phrase:
                 event = SpeechRecognizedEvent(phrase=phrase)
                 self._on_speech_recognized(phrase=phrase)
             else:
@@ -304,5 +324,8 @@ class Assistant:
 
         return event
 
+    def override_speech_model(self, model_path: Optional[str]):
+        self._speech_model_path_override = model_path
+
 
 # vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/assistant/picovoice/_context.py b/platypush/plugins/assistant/picovoice/_context.py
index 1a53407398..e3696601c9 100644
--- a/platypush/plugins/assistant/picovoice/_context.py
+++ b/platypush/plugins/assistant/picovoice/_context.py
@@ -9,7 +9,7 @@ class ConversationContext:
     Context of the conversation process.
     """
 
-    partial_transcript: str = ''
+    transcript: str = ''
     is_final: bool = False
     timeout: Optional[float] = None
     t_start: Optional[float] = None
@@ -24,7 +24,7 @@ class ConversationContext:
         self.t_end = time()
 
     def reset(self):
-        self.partial_transcript = ''
+        self.transcript = ''
         self.is_final = False
         self.t_start = None
         self.t_end = None
@@ -32,11 +32,17 @@ class ConversationContext:
     @property
     def timed_out(self):
         return (
-            not self.partial_transcript
+            not self.transcript
             and not self.is_final
             and self.timeout
             and self.t_start
             and time() - self.t_start > self.timeout
+        ) or (
+            self.transcript
+            and not self.is_final
+            and self.timeout
+            and self.t_start
+            and time() - self.t_start > self.timeout * 2
         )
 
 
diff --git a/platypush/plugins/assistant/picovoice/_state.py b/platypush/plugins/assistant/picovoice/_state.py
index e0eb7e719a..22e1ee7430 100644
--- a/platypush/plugins/assistant/picovoice/_state.py
+++ b/platypush/plugins/assistant/picovoice/_state.py
@@ -9,6 +9,7 @@ class AssistantState(Enum):
     IDLE = 'idle'
     DETECTING_HOTWORD = 'detecting_hotword'
     DETECTING_SPEECH = 'detecting_speech'
+    RESPONDING = 'responding'
 
 
 # vim:sw=4:ts=4:et: