[WIP] Added speech detection logic over Cheetah.
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Fabio Manganiello 2024-04-08 01:54:26 +02:00
parent 01dec0b7a4
commit f021b471aa
Signed by: blacklight
GPG Key ID: D90FBA7F76362774
11 changed files with 263 additions and 74 deletions

View File

@ -10,6 +10,4 @@ Backends
platypush/backend/midi.rst platypush/backend/midi.rst
platypush/backend/nodered.rst platypush/backend/nodered.rst
platypush/backend/redis.rst platypush/backend/redis.rst
platypush/backend/stt.picovoice.hotword.rst
platypush/backend/stt.picovoice.speech.rst
platypush/backend/tcp.rst platypush/backend/tcp.rst

View File

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.backend.stt.picovoice.hotword
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.backend.stt.picovoice.speech
:members:

View File

@ -0,0 +1,5 @@
``picovoice``
=============
.. automodule:: platypush.plugins.picovoice
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.plugins.stt.picovoice.hotword
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.plugins.stt.picovoice.speech
:members:

View File

@ -95,6 +95,7 @@ Plugins
platypush/plugins/nmap.rst platypush/plugins/nmap.rst
platypush/plugins/ntfy.rst platypush/plugins/ntfy.rst
platypush/plugins/otp.rst platypush/plugins/otp.rst
platypush/plugins/picovoice.rst
platypush/plugins/pihole.rst platypush/plugins/pihole.rst
platypush/plugins/ping.rst platypush/plugins/ping.rst
platypush/plugins/printer.cups.rst platypush/plugins/printer.cups.rst
@ -119,8 +120,6 @@ Plugins
platypush/plugins/smartthings.rst platypush/plugins/smartthings.rst
platypush/plugins/sound.rst platypush/plugins/sound.rst
platypush/plugins/ssh.rst platypush/plugins/ssh.rst
platypush/plugins/stt.picovoice.hotword.rst
platypush/plugins/stt.picovoice.speech.rst
platypush/plugins/sun.rst platypush/plugins/sun.rst
platypush/plugins/switch.tplink.rst platypush/plugins/switch.tplink.rst
platypush/plugins/switch.wemo.rst platypush/plugins/switch.wemo.rst

View File

@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
keywords: Optional[Sequence[str]] = None, keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None, keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None, keyword_model_path: Optional[str] = None,
speech_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = 0.5,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = True,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = 5.0,
**kwargs, **kwargs,
): ):
""" """
:param access_key: Your Picovoice access key. You can get it by signing :param access_key: Your Picovoice access key. You can get it by signing
up at the `Picovoice console <https://console.picovoice.ai/>`. up at the `Picovoice console <https://console.picovoice.ai/>`.
:param hotword_enabled: Enable the wake-word engine (default: True). :param hotword_enabled: Enable the wake-word engine (default: True).
.. note:: The wake-word engine requires you to add Porcupine to the **Note**: The wake-word engine requires you to add Porcupine to the
products available in your Picovoice account. products available in your Picovoice account.
:param stt_enabled: Enable the speech-to-text engine (default: True). :param stt_enabled: Enable the speech-to-text engine (default: True).
.. note:: The speech-to-text engine requires you to add Cheetah to **Note**: The speech-to-text engine requires you to add Cheetah to
the products available in your Picovoice account. the products available in your Picovoice account.
:param intent_enabled: Enable the intent recognition engine (default: :param intent_enabled: Enable the intent recognition engine (default:
False). False).
.. note:: The intent recognition engine requires you to add Rhino **Note**: The intent recognition engine requires you to add Rhino
to the products available in your Picovoice account. to the products available in your Picovoice account.
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
google``...). Either ``keywords`` or ``keyword_paths`` must be google``...). This is required if the wake-word engine is enabled.
provided if the wake-word engine is enabled. This list can include See the `Picovoice repository
any of the default Picovoice keywords (available on the `Picovoice
repository
<https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_). <https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
for a list of the stock keywords available. If you have a custom
model, you can pass its path to the ``keyword_paths`` parameter and
its filename (without the path and the platform extension) here.
:param keyword_paths: List of paths to the keyword files to listen for. :param keyword_paths: List of paths to the keyword files to listen for.
Custom keyword files can be created using the `Picovoice console Custom keyword files can be created using the `Picovoice console
<https://console.picovoice.ai/ppn>`_ and downloaded from the <https://console.picovoice.ai/ppn>`_ and downloaded from the
@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
for its language. Model files are available for all the supported for its language. Model files are available for all the supported
languages through the `Picovoice repository languages through the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_. <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param speech_model_path: Path to the speech model file. If you are
using a language other than English, you can provide the path to the
model file for that language. Model files are available for all the
supported languages through the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param endpoint_duration: If set, the assistant will stop listening when
no speech is detected for the specified duration (in seconds) after
the end of an utterance.
:param enable_automatic_punctuation: Enable automatic punctuation
insertion.
:param start_conversation_on_hotword: If set to True (default), a speech
detection session will be started when the hotword is detected. If
set to False, you may want to start the conversation programmatically
by calling the :meth:`.start_conversation` method instead, or run any
custom hotword detection logic. This can be particularly useful
when you want to run the assistant in a push-to-talk mode, or when you
want different hotwords to trigger conversations with different models
or languages.
:param audio_queue_size: Maximum number of audio frames to hold in the
processing queue. You may want to increase this value if you are
running this integration on a slow device and/or the logs report
audio frame drops too often. Keep in mind that increasing this value
will increase the memory usage of the integration. Also, a higher
value may result in higher accuracy at the cost of higher latency.
:param conversation_timeout: Maximum time to wait for some speech to be
detected after the hotword is detected. If no speech is detected
within this time, the conversation will time out and the plugin will
go back into hotword detection mode, if the mode is enabled. Default:
5 seconds.
""" """
super().__init__(**kwargs) super().__init__(**kwargs)
self._assistant_args = { self._assistant_args = {
@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
'keywords': keywords, 'keywords': keywords,
'keyword_paths': keyword_paths, 'keyword_paths': keyword_paths,
'keyword_model_path': keyword_model_path, 'keyword_model_path': keyword_model_path,
'speech_model_path': speech_model_path,
'endpoint_duration': endpoint_duration,
'enable_automatic_punctuation': enable_automatic_punctuation,
'start_conversation_on_hotword': start_conversation_on_hotword,
'audio_queue_size': audio_queue_size,
'conversation_timeout': conversation_timeout,
} }
@action @action
@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
try: try:
for event in assistant: for event in assistant:
if event: if event:
event.args['assistant'] = 'picovoice'
get_bus().post(event) get_bus().post(event)
except KeyboardInterrupt: except KeyboardInterrupt:
break break

View File

@ -1,6 +1,6 @@
import logging import logging
import os import os
from threading import Event from threading import Event, RLock
from time import time from time import time
from typing import Any, Dict, Optional, Sequence from typing import Any, Dict, Optional, Sequence
@ -9,9 +9,18 @@ import pvleopard
import pvporcupine import pvporcupine
import pvrhino import pvrhino
from platypush.message.event.assistant import HotwordDetectedEvent from platypush.context import get_bus
from platypush.message.event.assistant import (
ConversationStartEvent,
ConversationEndEvent,
ConversationTimeoutEvent,
HotwordDetectedEvent,
SpeechRecognizedEvent,
)
from ._context import SpeechDetectionContext
from ._recorder import AudioRecorder from ._recorder import AudioRecorder
from ._state import AssistantState
class Assistant: class Assistant:
@ -30,10 +39,16 @@ class Assistant:
keyword_paths: Optional[Sequence[str]] = None, keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None, keyword_model_path: Optional[str] = None,
frame_expiration: float = 3.0, # Don't process audio frames older than this frame_expiration: float = 3.0, # Don't process audio frames older than this
speech_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = None,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = False,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = None,
): ):
self.logger = logging.getLogger(__name__)
self._access_key = access_key self._access_key = access_key
self._stop_event = stop_event self._stop_event = stop_event
self.logger = logging.getLogger(__name__)
self.hotword_enabled = hotword_enabled self.hotword_enabled = hotword_enabled
self.stt_enabled = stt_enabled self.stt_enabled = stt_enabled
self.intent_enabled = intent_enabled self.intent_enabled = intent_enabled
@ -41,9 +56,23 @@ class Assistant:
self.keyword_paths = None self.keyword_paths = None
self.keyword_model_path = None self.keyword_model_path = None
self.frame_expiration = frame_expiration self.frame_expiration = frame_expiration
self.speech_model_path = speech_model_path
self.endpoint_duration = endpoint_duration
self.enable_automatic_punctuation = enable_automatic_punctuation
self.start_conversation_on_hotword = start_conversation_on_hotword
self.audio_queue_size = audio_queue_size
self._recorder = None self._recorder = None
self._state = AssistantState.IDLE
self._state_lock = RLock()
self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)
if hotword_enabled: if hotword_enabled:
if not keywords:
raise ValueError(
'You need to provide a list of keywords if the wake-word engine is enabled'
)
if keyword_paths: if keyword_paths:
keyword_paths = [os.path.expanduser(path) for path in keyword_paths] keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
missing_paths = [ missing_paths = [
@ -74,46 +103,89 @@ class Assistant:
def wait_stop(self): def wait_stop(self):
self._stop_event.wait() self._stop_event.wait()
def _create_porcupine(self): @property
if not self.hotword_enabled: def state(self) -> AssistantState:
return None with self._state_lock:
return self._state
args: Dict[str, Any] = {'access_key': self._access_key} @state.setter
if not (self.keywords or self.keyword_paths): def state(self, state: AssistantState):
raise ValueError( with self._state_lock:
'You need to provide either a list of keywords or a list of ' prev_state = self._state
'keyword paths if the wake-word engine is enabled' self._state = state
) new_state = self.state
if self.keywords: if prev_state == new_state:
args['keywords'] = self.keywords return
if self.keyword_paths:
args['keyword_paths'] = self.keyword_paths
if self.keyword_model_path:
args['model_path'] = self.keyword_model_path
return pvporcupine.create(**args) if prev_state == AssistantState.DETECTING_SPEECH:
self._speech_ctx.stop()
self._post_event(ConversationEndEvent())
elif new_state == AssistantState.DETECTING_SPEECH:
self._speech_ctx.start()
self._post_event(ConversationStartEvent())
@property @property
def porcupine(self) -> Optional[pvporcupine.Porcupine]: def porcupine(self) -> Optional[pvporcupine.Porcupine]:
if not self.hotword_enabled:
return None
if not self._porcupine: if not self._porcupine:
self._porcupine = self._create_porcupine() args: Dict[str, Any] = {'access_key': self._access_key}
if self.keywords:
args['keywords'] = self.keywords
if self.keyword_paths:
args['keyword_paths'] = self.keyword_paths
if self.keyword_model_path:
args['model_path'] = self.keyword_model_path
self._porcupine = pvporcupine.create(**args)
return self._porcupine return self._porcupine
@property
def cheetah(self) -> Optional[pvcheetah.Cheetah]:
if not self.stt_enabled:
return None
if not self._cheetah:
args: Dict[str, Any] = {'access_key': self._access_key}
if self.speech_model_path:
args['model_path'] = self.speech_model_path
if self.endpoint_duration:
args['endpoint_duration_sec'] = self.endpoint_duration
if self.enable_automatic_punctuation:
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
self._cheetah = pvcheetah.create(**args)
return self._cheetah
def __enter__(self): def __enter__(self):
if self.should_stop():
return self
if self._recorder: if self._recorder:
self.logger.info('A recording stream already exists') self.logger.info('A recording stream already exists')
elif self.porcupine: elif self.porcupine or self.cheetah:
sample_rate = (self.porcupine or self.cheetah).sample_rate # type: ignore
frame_length = (self.porcupine or self.cheetah).frame_length # type: ignore
self._recorder = AudioRecorder( self._recorder = AudioRecorder(
stop_event=self._stop_event, stop_event=self._stop_event,
sample_rate=self.porcupine.sample_rate, sample_rate=sample_rate,
frame_size=self.porcupine.frame_length, frame_size=frame_length,
queue_size=self.audio_queue_size,
channels=1, channels=1,
) )
self._recorder.__enter__() self._recorder.__enter__()
if self.porcupine:
self.state = AssistantState.DETECTING_HOTWORD
else:
self.state = AssistantState.DETECTING_SPEECH
return self return self
def __exit__(self, *_): def __exit__(self, *_):
@ -121,6 +193,8 @@ class Assistant:
self._recorder.__exit__(*_) self._recorder.__exit__(*_)
self._recorder = None self._recorder = None
self.state = AssistantState.IDLE
if self._cheetah: if self._cheetah:
self._cheetah.delete() self._cheetah.delete()
self._cheetah = None self._cheetah = None
@ -146,26 +220,74 @@ class Assistant:
raise StopIteration raise StopIteration
while not (self.should_stop() or has_data): while not (self.should_stop() or has_data):
if self.porcupine: # TODO also check current state data = self._recorder.read()
data = self._recorder.read() if data is None:
if data is None: continue
continue
frame, t = data frame, t = data
if time() - t > self.frame_expiration: if time() - t > self.frame_expiration:
self.logger.info( self.logger.info(
'Skipping audio frame older than %ss', self.frame_expiration 'Skipping audio frame older than %ss', self.frame_expiration
) )
continue # The audio frame is too old continue # The audio frame is too old
keyword_index = self.porcupine.process(frame) if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
if keyword_index is None: return self._process_hotword(frame)
continue # No keyword detected
if keyword_index >= 0 and self.keywords: if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
return HotwordDetectedEvent(hotword=self.keywords[keyword_index]) return self._process_speech(frame)
raise StopIteration raise StopIteration
def _post_event(self, event):
    """
    Tag the given event with the ``picovoice`` assistant identifier and
    post it to the application bus. No-op if the event is falsy.
    """
    if not event:
        return

    event.args['assistant'] = 'picovoice'
    get_bus().post(event)
def _process_hotword(self, frame):
    """
    Feed an audio frame to the Porcupine wake-word engine.

    :return: A ``HotwordDetectedEvent`` if one of the configured keywords
        was detected in the frame, otherwise ``None``.
    """
    if not self.porcupine:
        return None

    index = self.porcupine.process(frame)
    # No keyword detected in this frame (engine returned None or a
    # negative index), or no keyword list to map the index back to.
    if index is None or index < 0 or not self.keywords:
        return None

    # Optionally switch straight into speech detection mode so the
    # utterance following the hotword is transcribed.
    if self.start_conversation_on_hotword:
        self.state = AssistantState.DETECTING_SPEECH

    return HotwordDetectedEvent(hotword=self.keywords[index])
def _process_speech(self, frame):
    """
    Feed an audio frame to the Cheetah speech-to-text engine.

    :return: A ``SpeechRecognizedEvent`` with the recognized phrase when
        the engine reports the end of the utterance, a
        ``ConversationTimeoutEvent`` when the conversation timed out
        (see ``SpeechDetectionContext.timed_out``), or ``None`` while
        detection is still in progress.
    """
    if not self.cheetah:
        return None

    event = None
    # Each call overwrites the context's partial transcript and the
    # is_final flag with the engine's output for this frame.
    (
        self._speech_ctx.partial_transcript,
        self._speech_ctx.is_final,
    ) = self.cheetah.process(frame)

    if self._speech_ctx.partial_transcript:
        self.logger.info(
            'Partial transcript: %s, is_final: %s',
            self._speech_ctx.partial_transcript,
            self._speech_ctx.is_final,
        )

    if self._speech_ctx.is_final or self._speech_ctx.timed_out:
        # NOTE(review): only ``cheetah.flush()`` is used as the recognized
        # phrase — the partial transcripts stored across previous frames
        # are overwritten, not accumulated. Confirm whether the full
        # phrase should be assembled from the per-frame partials too.
        event = (
            ConversationTimeoutEvent()
            if self._speech_ctx.timed_out
            else SpeechRecognizedEvent(phrase=self.cheetah.flush())
        )

        # Drop back into hotword detection mode when a wake-word engine
        # is configured; otherwise keep listening for speech.
        if self.porcupine:
            self.state = AssistantState.DETECTING_HOTWORD

    return event
# vim:sw=4:ts=4:et: # vim:sw=4:ts=4:et:

View File

@ -0,0 +1,43 @@
from dataclasses import dataclass
from time import time
from typing import Optional
@dataclass
class SpeechDetectionContext:
    """
    Context of a speech detection session.

    Tracks the latest partial transcript, whether the engine reported the
    utterance as final, and the session timestamps used to detect a
    conversation timeout.
    """

    # Latest partial transcript reported by the STT engine.
    partial_transcript: str = ''
    # Whether the STT engine flagged the current utterance as final.
    is_final: bool = False
    # Maximum time (in seconds) to wait for speech after the session
    # starts. None (or 0) disables the timeout.
    timeout: Optional[float] = None
    # Epoch timestamp of when the session started, or None if not running.
    t_start: Optional[float] = None
    # Epoch timestamp of when the session last stopped, or None.
    t_end: Optional[float] = None

    def start(self):
        """Reset the context and mark the beginning of a new session."""
        self.reset()
        self.t_start = time()

    def stop(self):
        """Reset the context and record the time the session ended."""
        self.reset()
        self.t_end = time()

    def reset(self):
        """Clear the transcript state and both timestamps."""
        self.partial_transcript = ''
        self.is_final = False
        self.t_start = None
        self.t_end = None

    @property
    def timed_out(self) -> bool:
        """
        ``True`` iff the session is running, no speech has been detected
        yet, and more than ``timeout`` seconds have elapsed since
        :meth:`start`.
        """
        # Fix: always return a proper bool. The original ``and`` chain
        # could short-circuit and leak a non-boolean value to callers
        # (e.g. None when ``timeout`` or ``t_start`` is unset, or '' when
        # a partial transcript is present).
        return bool(
            not self.partial_transcript
            and not self.is_final
            and self.timeout
            and self.t_start is not None
            and time() - self.t_start > self.timeout
        )
# vim:sw=4:ts=4:et:

View File

@ -26,7 +26,7 @@ class AudioRecorder:
frame_size: int, frame_size: int,
channels: int, channels: int,
dtype: str = 'int16', dtype: str = 'int16',
queue_size: int = 20, queue_size: int = 100,
): ):
self.logger = getLogger(__name__) self.logger = getLogger(__name__)
self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size) self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
@ -48,7 +48,6 @@ class AudioRecorder:
def __exit__(self, *_): def __exit__(self, *_):
self.stop() self.stop()
# self.stream.close()
def _audio_callback(self, indata, *_): def _audio_callback(self, indata, *_):
if self.should_stop(): if self.should_stop():