[WIP] Added speech detection logic over Cheetah.

2024-04-08 01:54:26 +02:00 · 2024-04-08 01:54:26 +02:00 · f021b471aa
commit f021b471aa
parent 01dec0b7a4
11 changed files with 263 additions and 74 deletions
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@ -10,6 +10,4 @@ Backends
    platypush/backend/midi.rst
    platypush/backend/nodered.rst
    platypush/backend/redis.rst
-    platypush/backend/stt.picovoice.hotword.rst
-    platypush/backend/stt.picovoice.speech.rst
    platypush/backend/tcp.rst
--- a/docs/source/platypush/backend/stt.picovoice.hotword.rst
+++ b/docs/source/platypush/backend/stt.picovoice.hotword.rst
@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.backend.stt.picovoice.hotword
-    :members:
--- a/docs/source/platypush/backend/stt.picovoice.speech.rst
+++ b/docs/source/platypush/backend/stt.picovoice.speech.rst
@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.backend.stt.picovoice.speech
-    :members:
--- a/docs/source/platypush/plugins/picovoice.rst
+++ b/docs/source/platypush/plugins/picovoice.rst
@ -0,0 +1,5 @@
+``picovoice``
+=============
+
+.. automodule:: platypush.plugins.picovoice
+    :members:
--- a/docs/source/platypush/plugins/stt.picovoice.hotword.rst
+++ b/docs/source/platypush/plugins/stt.picovoice.hotword.rst
@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.hotword
-    :members:
--- a/docs/source/platypush/plugins/stt.picovoice.speech.rst
+++ b/docs/source/platypush/plugins/stt.picovoice.speech.rst
@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.speech
-    :members:
--- a/docs/source/plugins.rst
+++ b/docs/source/plugins.rst
@ -95,6 +95,7 @@ Plugins
    platypush/plugins/nmap.rst
    platypush/plugins/ntfy.rst
    platypush/plugins/otp.rst
+    platypush/plugins/picovoice.rst
    platypush/plugins/pihole.rst
    platypush/plugins/ping.rst
    platypush/plugins/printer.cups.rst
@ -119,8 +120,6 @@ Plugins
    platypush/plugins/smartthings.rst
    platypush/plugins/sound.rst
    platypush/plugins/ssh.rst
-    platypush/plugins/stt.picovoice.hotword.rst
-    platypush/plugins/stt.picovoice.speech.rst
    platypush/plugins/sun.rst
    platypush/plugins/switch.tplink.rst
    platypush/plugins/switch.wemo.rst
--- a/platypush/plugins/picovoice/__init__.py
+++ b/platypush/plugins/picovoice/__init__.py
@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
        keywords: Optional[Sequence[str]] = None,
        keyword_paths: Optional[Sequence[str]] = None,
        keyword_model_path: Optional[str] = None,
+        speech_model_path: Optional[str] = None,
+        endpoint_duration: Optional[float] = 0.5,
+        enable_automatic_punctuation: bool = False,
+        start_conversation_on_hotword: bool = True,
+        audio_queue_size: int = 100,
+        conversation_timeout: Optional[float] = 5.0,
        **kwargs,
    ):
        """
        :param access_key: Your Picovoice access key. You can get it by signing
            up at the `Picovoice console <https://console.picovoice.ai/>`.
        :param hotword_enabled: Enable the wake-word engine (default: True).
-            .. note:: The wake-word engine requires you to add Porcupine to the
+            **Note**: The wake-word engine requires you to add Porcupine to the
            products available in your Picovoice account.
        :param stt_enabled: Enable the speech-to-text engine (default: True).
-            .. note:: The speech-to-text engine requires you to add Cheetah to
+            **Note**: The speech-to-text engine requires you to add Cheetah to
            the products available in your Picovoice account.
        :param intent_enabled: Enable the intent recognition engine (default:
            False).
-            .. note:: The intent recognition engine requires you to add Rhino
+            **Note**: The intent recognition engine requires you to add Rhino
            to the products available in your Picovoice account.
        :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
-            google``...). Either ``keywords`` or ``keyword_paths`` must be
-            provided if the wake-word engine is enabled. This list can include
-            any of the default Picovoice keywords (available on the `Picovoice
-            repository
+            google``...). This is required if the wake-word engine is enabled.
+            See the `Picovoice repository
            <https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
+            for a list of the stock keywords available. If you have a custom
+            model, you can pass its path to the ``keyword_paths`` parameter and
+            its filename (without the path and the platform extension) here.
        :param keyword_paths: List of paths to the keyword files to listen for.
            Custom keyword files can be created using the `Picovoice console
            <https://console.picovoice.ai/ppn>`_ and downloaded from the
@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
            for its language. Model files are available for all the supported
            languages through the `Picovoice repository
            <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
+        :param speech_model_path: Path to the speech model file. If you are
+            using a language other than English, you can provide the path to the
+            model file for that language. Model files are available for all the
+            supported languages through the `Picovoice repository
+            <https://github.com/Picovoice/cheetah/tree/master/lib/common>`_.
+        :param endpoint_duration: If set, the assistant will stop listening when
+            no speech is detected for the specified duration (in seconds) after
+            the end of an utterance.
+        :param enable_automatic_punctuation: Enable automatic punctuation
+            insertion.
+        :param start_conversation_on_hotword: If set to True (default), a speech
+            detection session will be started when the hotword is detected. If
+            set to False, you may want to start the conversation programmatically
+            by calling the :meth:`.start_conversation` method instead, or run any
+            custom hotword detection logic. This can be particularly useful
+            when you want to run the assistant in a push-to-talk mode, or when you
+            want different hotwords to trigger conversations with different models
+            or languages.
+        :param audio_queue_size: Maximum number of audio frames to hold in the
+            processing queue. You may want to increase this value if you are
+            running this integration on a slow device and/or the logs report
+            audio frame drops too often. Keep in mind that increasing this value
+            will increase the memory usage of the integration. Also, a higher
+            value may result in higher accuracy at the cost of higher latency.
+        :param conversation_timeout: Maximum time to wait for some speech to be
+            detected after the hotword is detected. If no speech is detected
+            within this time, the conversation will time out and the plugin will
+            go back into hotword detection mode, if the mode is enabled. Default:
+            5 seconds.
        """
        super().__init__(**kwargs)
        self._assistant_args = {
@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
            'keywords': keywords,
            'keyword_paths': keyword_paths,
            'keyword_model_path': keyword_model_path,
+            'speech_model_path': speech_model_path,
+            'endpoint_duration': endpoint_duration,
+            'enable_automatic_punctuation': enable_automatic_punctuation,
+            'start_conversation_on_hotword': start_conversation_on_hotword,
+            'audio_queue_size': audio_queue_size,
+            'conversation_timeout': conversation_timeout,
        }

    @action
@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
                try:
                    for event in assistant:
                        if event:
+                            event.args['assistant'] = 'picovoice'
                            get_bus().post(event)
                except KeyboardInterrupt:
                    break
--- a/platypush/plugins/picovoice/_assistant.py
+++ b/platypush/plugins/picovoice/_assistant.py
@ -1,6 +1,6 @@
 import logging
 import os
-from threading import Event
+from threading import Event, RLock
 from time import time
 from typing import Any, Dict, Optional, Sequence

@ -9,9 +9,18 @@ import pvleopard
 import pvporcupine
 import pvrhino

-from platypush.message.event.assistant import HotwordDetectedEvent
+from platypush.context import get_bus
+from platypush.message.event.assistant import (
+    ConversationStartEvent,
+    ConversationEndEvent,
+    ConversationTimeoutEvent,
+    HotwordDetectedEvent,
+    SpeechRecognizedEvent,
+)

+from ._context import SpeechDetectionContext
 from ._recorder import AudioRecorder
+from ._state import AssistantState


 class Assistant:
@ -30,10 +39,16 @@ class Assistant:
        keyword_paths: Optional[Sequence[str]] = None,
        keyword_model_path: Optional[str] = None,
        frame_expiration: float = 3.0,  # Don't process audio frames older than this
+        speech_model_path: Optional[str] = None,
+        endpoint_duration: Optional[float] = None,
+        enable_automatic_punctuation: bool = False,
+        start_conversation_on_hotword: bool = False,
+        audio_queue_size: int = 100,
+        conversation_timeout: Optional[float] = None,
    ):
-        self.logger = logging.getLogger(__name__)
        self._access_key = access_key
        self._stop_event = stop_event
+        self.logger = logging.getLogger(__name__)
        self.hotword_enabled = hotword_enabled
        self.stt_enabled = stt_enabled
        self.intent_enabled = intent_enabled
@ -41,9 +56,23 @@ class Assistant:
        self.keyword_paths = None
        self.keyword_model_path = None
        self.frame_expiration = frame_expiration
+        self.speech_model_path = speech_model_path
+        self.endpoint_duration = endpoint_duration
+        self.enable_automatic_punctuation = enable_automatic_punctuation
+        self.start_conversation_on_hotword = start_conversation_on_hotword
+        self.audio_queue_size = audio_queue_size
+
        self._recorder = None
+        self._state = AssistantState.IDLE
+        self._state_lock = RLock()
+        self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)

        if hotword_enabled:
+            if not keywords:
+                raise ValueError(
+                    'You need to provide a list of keywords if the wake-word engine is enabled'
+                )
+
            if keyword_paths:
                keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
                missing_paths = [
@ -74,17 +103,35 @@ class Assistant:
    def wait_stop(self):
        self._stop_event.wait()

-    def _create_porcupine(self):
+    @property
+    def state(self) -> AssistantState:
+        with self._state_lock:
+            return self._state
+
+    @state.setter
+    def state(self, state: AssistantState):
+        with self._state_lock:
+            prev_state = self._state
+            self._state = state
+            new_state = self.state
+
+        if prev_state == new_state:
+            return
+
+        if prev_state == AssistantState.DETECTING_SPEECH:
+            self._speech_ctx.stop()
+            self._post_event(ConversationEndEvent())
+        elif new_state == AssistantState.DETECTING_SPEECH:
+            self._speech_ctx.start()
+            self._post_event(ConversationStartEvent())
+
+    @property
+    def porcupine(self) -> Optional[pvporcupine.Porcupine]:
        if not self.hotword_enabled:
            return None

+        if not self._porcupine:
            args: Dict[str, Any] = {'access_key': self._access_key}
-        if not (self.keywords or self.keyword_paths):
-            raise ValueError(
-                'You need to provide either a list of keywords or a list of '
-                'keyword paths if the wake-word engine is enabled'
-            )
-
            if self.keywords:
                args['keywords'] = self.keywords
            if self.keyword_paths:
@ -92,28 +139,53 @@ class Assistant:
            if self.keyword_model_path:
                args['model_path'] = self.keyword_model_path

-        return pvporcupine.create(**args)
-
-    @property
-    def porcupine(self) -> Optional[pvporcupine.Porcupine]:
-        if not self._porcupine:
-            self._porcupine = self._create_porcupine()
+            self._porcupine = pvporcupine.create(**args)

        return self._porcupine

+    @property
+    def cheetah(self) -> Optional[pvcheetah.Cheetah]:
+        if not self.stt_enabled:
+            return None
+
+        if not self._cheetah:
+            args: Dict[str, Any] = {'access_key': self._access_key}
+            if self.speech_model_path:
+                args['model_path'] = self.speech_model_path
+            if self.endpoint_duration:
+                args['endpoint_duration_sec'] = self.endpoint_duration
+            if self.enable_automatic_punctuation:
+                args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
+
+            self._cheetah = pvcheetah.create(**args)
+
+        return self._cheetah
+
    def __enter__(self):
+        if self.should_stop():
+            return self
+
        if self._recorder:
            self.logger.info('A recording stream already exists')
-        elif self.porcupine:
+        elif self.porcupine or self.cheetah:
+            sample_rate = (self.porcupine or self.cheetah).sample_rate  # type: ignore
+            frame_length = (self.porcupine or self.cheetah).frame_length  # type: ignore
+
            self._recorder = AudioRecorder(
                stop_event=self._stop_event,
-                sample_rate=self.porcupine.sample_rate,
-                frame_size=self.porcupine.frame_length,
+                sample_rate=sample_rate,
+                frame_size=frame_length,
+                queue_size=self.audio_queue_size,
                channels=1,
            )

            self._recorder.__enter__()

+            if self.porcupine:
+                self.state = AssistantState.DETECTING_HOTWORD
+            else:
+                self.state = AssistantState.DETECTING_SPEECH
+
        return self

    def __exit__(self, *_):
@ -121,6 +193,8 @@ class Assistant:
            self._recorder.__exit__(*_)
            self._recorder = None

+        self.state = AssistantState.IDLE
+
        if self._cheetah:
            self._cheetah.delete()
            self._cheetah = None
@ -146,7 +220,6 @@ class Assistant:
            raise StopIteration

        while not (self.should_stop() or has_data):
-            if self.porcupine:  # TODO also check current state
            data = self._recorder.read()
            if data is None:
                continue
@ -158,14 +231,63 @@ class Assistant:
                )
                continue  # The audio frame is too old

-                keyword_index = self.porcupine.process(frame)
-                if keyword_index is None:
-                    continue  # No keyword detected
+            if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
+                return self._process_hotword(frame)

-                if keyword_index >= 0 and self.keywords:
-                    return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+            if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
+                return self._process_speech(frame)

        raise StopIteration

+    def _post_event(self, event):
+        if event:
+            event.args['assistant'] = 'picovoice'
+            get_bus().post(event)
+
+    def _process_hotword(self, frame):
+        if not self.porcupine:
+            return None
+
+        keyword_index = self.porcupine.process(frame)
+        if keyword_index is None:
+            return None  # No keyword detected
+
+        if keyword_index >= 0 and self.keywords:
+            if self.start_conversation_on_hotword:
+                self.state = AssistantState.DETECTING_SPEECH
+
+            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+
+        return None
+
+    def _process_speech(self, frame):
+        if not self.cheetah:
+            return None
+
+        event = None
+        (
+            self._speech_ctx.partial_transcript,
+            self._speech_ctx.is_final,
+        ) = self.cheetah.process(frame)
+
+        if self._speech_ctx.partial_transcript:
+            self.logger.info(
+                'Partial transcript: %s, is_final: %s',
+                self._speech_ctx.partial_transcript,
+                self._speech_ctx.is_final,
+            )
+
+        if self._speech_ctx.is_final or self._speech_ctx.timed_out:
+            event = (
+                ConversationTimeoutEvent()
+                if self._speech_ctx.timed_out
+                else SpeechRecognizedEvent(phrase=self.cheetah.flush())
+            )
+
+            if self.porcupine:
+                self.state = AssistantState.DETECTING_HOTWORD
+
+        return event
+

 # vim:sw=4:ts=4:et:
--- a/platypush/plugins/picovoice/_context.py
+++ b/platypush/plugins/picovoice/_context.py
@ -0,0 +1,43 @@
+from dataclasses import dataclass
+from time import time
+from typing import Optional
+
+
+@dataclass
+class SpeechDetectionContext:
+    """
+    Context of the speech detection process.
+    """
+
+    partial_transcript: str = ''
+    is_final: bool = False
+    timeout: Optional[float] = None
+    t_start: Optional[float] = None
+    t_end: Optional[float] = None
+
+    def start(self):
+        self.reset()
+        self.t_start = time()
+
+    def stop(self):
+        self.reset()
+        self.t_end = time()
+
+    def reset(self):
+        self.partial_transcript = ''
+        self.is_final = False
+        self.t_start = None
+        self.t_end = None
+
+    @property
+    def timed_out(self):
+        return (
+            not self.partial_transcript
+            and not self.is_final
+            and self.timeout
+            and self.t_start
+            and time() - self.t_start > self.timeout
+        )
+
+
+# vim:sw=4:ts=4:et:
--- a/platypush/plugins/picovoice/_recorder.py
+++ b/platypush/plugins/picovoice/_recorder.py
@ -26,7 +26,7 @@ class AudioRecorder:
        frame_size: int,
        channels: int,
        dtype: str = 'int16',
-        queue_size: int = 20,
+        queue_size: int = 100,
    ):
        self.logger = getLogger(__name__)
        self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
@ -48,7 +48,6 @@ class AudioRecorder:

    def __exit__(self, *_):
        self.stop()
-        # self.stream.close()

    def _audio_callback(self, indata, *_):
        if self.should_stop():