From f7517eb321545b888d234b22ee2afb5b48492f54 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello <fabio@manganiello.tech>
Date: Mon, 8 Apr 2024 01:54:26 +0200
Subject: [PATCH] [WIP] Added speech detection logic over Cheetah.

---
 docs/source/backends.rst                      |   2 -
 .../backend/stt.picovoice.hotword.rst         |   5 -
 .../backend/stt.picovoice.speech.rst          |   5 -
 docs/source/platypush/plugins/picovoice.rst   |   5 +
 .../plugins/stt.picovoice.hotword.rst         |   5 -
 .../plugins/stt.picovoice.speech.rst          |   5 -
 docs/source/plugins.rst                       |   3 +-
 platypush/plugins/picovoice/__init__.py       |  63 +++++-
 platypush/plugins/picovoice/_assistant.py     | 198 ++++++++++++++----
 platypush/plugins/picovoice/_context.py       |  43 ++++
 platypush/plugins/picovoice/_recorder.py      |   3 +-
 11 files changed, 263 insertions(+), 74 deletions(-)
 delete mode 100644 docs/source/platypush/backend/stt.picovoice.hotword.rst
 delete mode 100644 docs/source/platypush/backend/stt.picovoice.speech.rst
 create mode 100644 docs/source/platypush/plugins/picovoice.rst
 delete mode 100644 docs/source/platypush/plugins/stt.picovoice.hotword.rst
 delete mode 100644 docs/source/platypush/plugins/stt.picovoice.speech.rst
 create mode 100644 platypush/plugins/picovoice/_context.py

diff --git a/docs/source/backends.rst b/docs/source/backends.rst
index 2a43daeec..4171e8825 100644
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@@ -10,6 +10,4 @@ Backends
     platypush/backend/midi.rst
     platypush/backend/nodered.rst
     platypush/backend/redis.rst
-    platypush/backend/stt.picovoice.hotword.rst
-    platypush/backend/stt.picovoice.speech.rst
     platypush/backend/tcp.rst
diff --git a/docs/source/platypush/backend/stt.picovoice.hotword.rst b/docs/source/platypush/backend/stt.picovoice.hotword.rst
deleted file mode 100644
index 858386889..000000000
--- a/docs/source/platypush/backend/stt.picovoice.hotword.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.backend.stt.picovoice.hotword
-    :members:
diff --git a/docs/source/platypush/backend/stt.picovoice.speech.rst b/docs/source/platypush/backend/stt.picovoice.speech.rst
deleted file mode 100644
index 8b5809662..000000000
--- a/docs/source/platypush/backend/stt.picovoice.speech.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.backend.stt.picovoice.speech
-    :members:
diff --git a/docs/source/platypush/plugins/picovoice.rst b/docs/source/platypush/plugins/picovoice.rst
new file mode 100644
index 000000000..f1f8acded
--- /dev/null
+++ b/docs/source/platypush/plugins/picovoice.rst
@@ -0,0 +1,5 @@
+``picovoice``
+=============
+
+.. automodule:: platypush.plugins.picovoice
+    :members:
diff --git a/docs/source/platypush/plugins/stt.picovoice.hotword.rst b/docs/source/platypush/plugins/stt.picovoice.hotword.rst
deleted file mode 100644
index 11eb37dd5..000000000
--- a/docs/source/platypush/plugins/stt.picovoice.hotword.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.hotword
-    :members:
diff --git a/docs/source/platypush/plugins/stt.picovoice.speech.rst b/docs/source/platypush/plugins/stt.picovoice.speech.rst
deleted file mode 100644
index 890c904cc..000000000
--- a/docs/source/platypush/plugins/stt.picovoice.speech.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.speech
-    :members:
diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst
index 5e583f5e5..783cb841e 100644
--- a/docs/source/plugins.rst
+++ b/docs/source/plugins.rst
@@ -95,6 +95,7 @@ Plugins
     platypush/plugins/nmap.rst
     platypush/plugins/ntfy.rst
     platypush/plugins/otp.rst
+    platypush/plugins/picovoice.rst
     platypush/plugins/pihole.rst
     platypush/plugins/ping.rst
     platypush/plugins/printer.cups.rst
@@ -119,8 +120,6 @@ Plugins
     platypush/plugins/smartthings.rst
     platypush/plugins/sound.rst
     platypush/plugins/ssh.rst
-    platypush/plugins/stt.picovoice.hotword.rst
-    platypush/plugins/stt.picovoice.speech.rst
     platypush/plugins/sun.rst
     platypush/plugins/switch.tplink.rst
     platypush/plugins/switch.wemo.rst
diff --git a/platypush/plugins/picovoice/__init__.py b/platypush/plugins/picovoice/__init__.py
index a861f66bd..c1e55570f 100644
--- a/platypush/plugins/picovoice/__init__.py
+++ b/platypush/plugins/picovoice/__init__.py
@@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
         keywords: Optional[Sequence[str]] = None,
         keyword_paths: Optional[Sequence[str]] = None,
         keyword_model_path: Optional[str] = None,
+        speech_model_path: Optional[str] = None,
+        endpoint_duration: Optional[float] = 0.5,
+        enable_automatic_punctuation: bool = False,
+        start_conversation_on_hotword: bool = True,
+        audio_queue_size: int = 100,
+        conversation_timeout: Optional[float] = 5.0,
         **kwargs,
     ):
         """
         :param access_key: Your Picovoice access key. You can get it by signing
             up at the `Picovoice console <https://console.picovoice.ai/>`.
         :param hotword_enabled: Enable the wake-word engine (default: True).
-            .. note:: The wake-word engine requires you to add Porcupine to the
-                products available in your Picovoice account.
+            **Note**: The wake-word engine requires you to add Porcupine to the
+            products available in your Picovoice account.
         :param stt_enabled: Enable the speech-to-text engine (default: True).
-            .. note:: The speech-to-text engine requires you to add Cheetah to
-                the products available in your Picovoice account.
+            **Note**: The speech-to-text engine requires you to add Cheetah to
+            the products available in your Picovoice account.
         :param intent_enabled: Enable the intent recognition engine (default:
             False).
-            .. note:: The intent recognition engine requires you to add Rhino
-                to the products available in your Picovoice account.
+            **Note**: The intent recognition engine requires you to add Rhino
+            to the products available in your Picovoice account.
         :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
-            google``...). Either ``keywords`` or ``keyword_paths`` must be
-            provided if the wake-word engine is enabled. This list can include
-            any of the default Picovoice keywords (available on the `Picovoice
-            repository
+            google``...). This is required if the wake-word engine is enabled
+            (the stock keywords are listed in the `Picovoice repository
             <https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
+            If you have a custom model, you can pass its path to the
+            ``keyword_paths`` parameter and its filename (without the path and the
+            platform extension) here.
         :param keyword_paths: List of paths to the keyword files to listen for.
             Custom keyword files can be created using the `Picovoice console
             <https://console.picovoice.ai/ppn>`_ and downloaded from the
@@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
             for its language. Model files are available for all the supported
             languages through the `Picovoice repository
             <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
+        :param speech_model_path: Path to the speech model file. If you are
+            using a language other than English, you can provide the path to the
+            model file for that language. Model files are available for all the
+            supported languages through the `Picovoice repository
+            <https://github.com/Picovoice/cheetah/tree/master/lib/common>`_.
+        :param endpoint_duration: If set, the assistant will stop listening when
+            no speech is detected for the specified duration (in seconds) after
+            the end of an utterance.
+        :param enable_automatic_punctuation: Enable automatic punctuation
+            insertion.
+        :param start_conversation_on_hotword: If set to True (default), a speech
+            detection session will be started when the hotword is detected. If
+            set to False, you may want to start the conversation programmatically
+            by calling the :meth:`.start_conversation` method instead, or run any
+            custom hotword detection logic. This can be particularly useful
+            when you want to run the assistant in a push-to-talk mode, or when you
+            want different hotwords to trigger conversations with different models
+            or languages.
+        :param audio_queue_size: Maximum number of audio frames to hold in the
+            processing queue. You may want to increase this value if you are
+            running this integration on a slow device and/or the logs report
+            audio frame drops too often. Keep in mind that increasing this value
+            will increase the memory usage of the integration. Also, a higher
+            value may result in higher accuracy at the cost of higher latency.
+        :param conversation_timeout: Maximum time to wait for some speech to be
+            detected after the hotword is detected. If no speech is detected
+            within this time, the conversation will time out and the plugin will
+            go back into hotword detection mode, if the mode is enabled. Default:
+            5 seconds.
         """
         super().__init__(**kwargs)
         self._assistant_args = {
@@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
             'keywords': keywords,
             'keyword_paths': keyword_paths,
             'keyword_model_path': keyword_model_path,
+            'speech_model_path': speech_model_path,
+            'endpoint_duration': endpoint_duration,
+            'enable_automatic_punctuation': enable_automatic_punctuation,
+            'start_conversation_on_hotword': start_conversation_on_hotword,
+            'audio_queue_size': audio_queue_size,
+            'conversation_timeout': conversation_timeout,
         }
 
     @action
@@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
                 try:
                     for event in assistant:
                         if event:
+                            event.args['assistant'] = 'picovoice'
                             get_bus().post(event)
                 except KeyboardInterrupt:
                     break
diff --git a/platypush/plugins/picovoice/_assistant.py b/platypush/plugins/picovoice/_assistant.py
index 5181aa572..27a727129 100644
--- a/platypush/plugins/picovoice/_assistant.py
+++ b/platypush/plugins/picovoice/_assistant.py
@@ -1,6 +1,6 @@
 import logging
 import os
-from threading import Event
+from threading import Event, RLock
 from time import time
 from typing import Any, Dict, Optional, Sequence
 
@@ -9,9 +9,18 @@ import pvleopard
 import pvporcupine
 import pvrhino
 
-from platypush.message.event.assistant import HotwordDetectedEvent
+from platypush.context import get_bus
+from platypush.message.event.assistant import (
+    ConversationStartEvent,
+    ConversationEndEvent,
+    ConversationTimeoutEvent,
+    HotwordDetectedEvent,
+    SpeechRecognizedEvent,
+)
 
+from ._context import SpeechDetectionContext
 from ._recorder import AudioRecorder
+from ._state import AssistantState
 
 
 class Assistant:
@@ -30,10 +39,16 @@ class Assistant:
         keyword_paths: Optional[Sequence[str]] = None,
         keyword_model_path: Optional[str] = None,
         frame_expiration: float = 3.0,  # Don't process audio frames older than this
+        speech_model_path: Optional[str] = None,
+        endpoint_duration: Optional[float] = None,
+        enable_automatic_punctuation: bool = False,
+        start_conversation_on_hotword: bool = False,
+        audio_queue_size: int = 100,
+        conversation_timeout: Optional[float] = None,
     ):
-        self.logger = logging.getLogger(__name__)
         self._access_key = access_key
         self._stop_event = stop_event
+        self.logger = logging.getLogger(__name__)
         self.hotword_enabled = hotword_enabled
         self.stt_enabled = stt_enabled
         self.intent_enabled = intent_enabled
@@ -41,9 +56,23 @@ class Assistant:
         self.keyword_paths = None
         self.keyword_model_path = None
         self.frame_expiration = frame_expiration
+        self.speech_model_path = speech_model_path
+        self.endpoint_duration = endpoint_duration
+        self.enable_automatic_punctuation = enable_automatic_punctuation
+        self.start_conversation_on_hotword = start_conversation_on_hotword
+        self.audio_queue_size = audio_queue_size
+
         self._recorder = None
+        self._state = AssistantState.IDLE
+        self._state_lock = RLock()
+        self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)
 
         if hotword_enabled:
+            if not keywords:
+                raise ValueError(
+                    'You need to provide a list of keywords if the wake-word engine is enabled'
+                )
+
             if keyword_paths:
                 keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
                 missing_paths = [
@@ -74,46 +103,89 @@ class Assistant:
     def wait_stop(self):
         self._stop_event.wait()
 
-    def _create_porcupine(self):
-        if not self.hotword_enabled:
-            return None
+    @property
+    def state(self) -> AssistantState:
+        with self._state_lock:
+            return self._state
 
-        args: Dict[str, Any] = {'access_key': self._access_key}
-        if not (self.keywords or self.keyword_paths):
-            raise ValueError(
-                'You need to provide either a list of keywords or a list of '
-                'keyword paths if the wake-word engine is enabled'
-            )
+    @state.setter
+    def state(self, state: AssistantState):
+        with self._state_lock:
+            prev_state = self._state
+            self._state = state
+            new_state = self.state
 
-        if self.keywords:
-            args['keywords'] = self.keywords
-        if self.keyword_paths:
-            args['keyword_paths'] = self.keyword_paths
-        if self.keyword_model_path:
-            args['model_path'] = self.keyword_model_path
+        if prev_state == new_state:
+            return
 
-        return pvporcupine.create(**args)
+        if prev_state == AssistantState.DETECTING_SPEECH:
+            self._speech_ctx.stop()
+            self._post_event(ConversationEndEvent())
+        elif new_state == AssistantState.DETECTING_SPEECH:
+            self._speech_ctx.start()
+            self._post_event(ConversationStartEvent())
 
     @property
     def porcupine(self) -> Optional[pvporcupine.Porcupine]:
+        if not self.hotword_enabled:
+            return None
+
         if not self._porcupine:
-            self._porcupine = self._create_porcupine()
+            args: Dict[str, Any] = {'access_key': self._access_key}
+            if self.keywords:
+                args['keywords'] = self.keywords
+            if self.keyword_paths:
+                args['keyword_paths'] = self.keyword_paths
+            if self.keyword_model_path:
+                args['model_path'] = self.keyword_model_path
+
+            self._porcupine = pvporcupine.create(**args)
 
         return self._porcupine
 
+    @property
+    def cheetah(self) -> Optional[pvcheetah.Cheetah]:
+        if not self.stt_enabled:
+            return None
+
+        if not self._cheetah:
+            args: Dict[str, Any] = {'access_key': self._access_key}
+            if self.speech_model_path:
+                args['model_path'] = self.speech_model_path
+            if self.endpoint_duration:
+                args['endpoint_duration_sec'] = self.endpoint_duration
+            if self.enable_automatic_punctuation:
+                args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
+
+            self._cheetah = pvcheetah.create(**args)
+
+        return self._cheetah
+
     def __enter__(self):
+        if self.should_stop():
+            return self
+
         if self._recorder:
             self.logger.info('A recording stream already exists')
-        elif self.porcupine:
+        elif self.porcupine or self.cheetah:
+            sample_rate = (self.porcupine or self.cheetah).sample_rate  # type: ignore
+            frame_length = (self.porcupine or self.cheetah).frame_length  # type: ignore
+
             self._recorder = AudioRecorder(
                 stop_event=self._stop_event,
-                sample_rate=self.porcupine.sample_rate,
-                frame_size=self.porcupine.frame_length,
+                sample_rate=sample_rate,
+                frame_size=frame_length,
+                queue_size=self.audio_queue_size,
                 channels=1,
             )
 
             self._recorder.__enter__()
 
+            if self.porcupine:
+                self.state = AssistantState.DETECTING_HOTWORD
+            else:
+                self.state = AssistantState.DETECTING_SPEECH
+
         return self
 
     def __exit__(self, *_):
@@ -121,6 +193,8 @@ class Assistant:
             self._recorder.__exit__(*_)
             self._recorder = None
 
+        self.state = AssistantState.IDLE
+
         if self._cheetah:
             self._cheetah.delete()
             self._cheetah = None
@@ -146,26 +220,74 @@ class Assistant:
             raise StopIteration
 
         while not (self.should_stop() or has_data):
-            if self.porcupine:  # TODO also check current state
-                data = self._recorder.read()
-                if data is None:
-                    continue
+            data = self._recorder.read()
+            if data is None:
+                continue
 
-                frame, t = data
-                if time() - t > self.frame_expiration:
-                    self.logger.info(
-                        'Skipping audio frame older than %ss', self.frame_expiration
-                    )
-                    continue  # The audio frame is too old
+            frame, t = data
+            if time() - t > self.frame_expiration:
+                self.logger.info(
+                    'Skipping audio frame older than %ss', self.frame_expiration
+                )
+                continue  # The audio frame is too old
 
-                keyword_index = self.porcupine.process(frame)
-                if keyword_index is None:
-                    continue  # No keyword detected
+            if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
+                return self._process_hotword(frame)
 
-                if keyword_index >= 0 and self.keywords:
-                    return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+            if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
+                return self._process_speech(frame)
 
         raise StopIteration
 
+    def _post_event(self, event):
+        if event:
+            event.args['assistant'] = 'picovoice'
+            get_bus().post(event)
+
+    def _process_hotword(self, frame):
+        if not self.porcupine:
+            return None
+
+        keyword_index = self.porcupine.process(frame)
+        if keyword_index is None:
+            return None  # No keyword detected
+
+        if keyword_index >= 0 and self.keywords:
+            if self.start_conversation_on_hotword:
+                self.state = AssistantState.DETECTING_SPEECH
+
+            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+
+        return None
+
+    def _process_speech(self, frame):
+        if not self.cheetah:
+            return None
+
+        event = None
+        (
+            self._speech_ctx.partial_transcript,
+            self._speech_ctx.is_final,
+        ) = self.cheetah.process(frame)
+
+        if self._speech_ctx.partial_transcript:
+            self.logger.info(
+                'Partial transcript: %s, is_final: %s',
+                self._speech_ctx.partial_transcript,
+                self._speech_ctx.is_final,
+            )
+
+        if self._speech_ctx.is_final or self._speech_ctx.timed_out:
+            event = (
+                ConversationTimeoutEvent()
+                if self._speech_ctx.timed_out
+                else SpeechRecognizedEvent(phrase=self.cheetah.flush())
+            )
+
+            if self.porcupine:
+                self.state = AssistantState.DETECTING_HOTWORD
+
+        return event
+
 
 # vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/picovoice/_context.py b/platypush/plugins/picovoice/_context.py
new file mode 100644
index 000000000..cb7546105
--- /dev/null
+++ b/platypush/plugins/picovoice/_context.py
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+from time import time
+from typing import Optional
+
+
+@dataclass
+class SpeechDetectionContext:
+    """
+    Context of the speech detection process.
+    """
+
+    partial_transcript: str = ''
+    is_final: bool = False
+    timeout: Optional[float] = None
+    t_start: Optional[float] = None
+    t_end: Optional[float] = None
+
+    def start(self):
+        self.reset()
+        self.t_start = time()
+
+    def stop(self):
+        self.reset()
+        self.t_end = time()
+
+    def reset(self):
+        self.partial_transcript = ''
+        self.is_final = False
+        self.t_start = None
+        self.t_end = None
+
+    @property
+    def timed_out(self):
+        return (
+            not self.partial_transcript
+            and not self.is_final
+            and self.timeout
+            and self.t_start
+            and time() - self.t_start > self.timeout
+        )
+
+
+# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/picovoice/_recorder.py b/platypush/plugins/picovoice/_recorder.py
index 9df81e7c9..e0c23a8e9 100644
--- a/platypush/plugins/picovoice/_recorder.py
+++ b/platypush/plugins/picovoice/_recorder.py
@@ -26,7 +26,7 @@ class AudioRecorder:
         frame_size: int,
         channels: int,
         dtype: str = 'int16',
-        queue_size: int = 20,
+        queue_size: int = 100,
     ):
         self.logger = getLogger(__name__)
         self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
@@ -48,7 +48,6 @@ class AudioRecorder:
 
     def __exit__(self, *_):
         self.stop()
-        # self.stream.close()
 
     def _audio_callback(self, indata, *_):
         if self.should_stop():