New architecture for the assistant speech detection logic.

The assistant object now runs in its own thread and delegates to an
external `SpeechProcessor`, which uses two worker threads to scan each
audio frame for both intents and speech in parallel.
Fabio Manganiello 2024-04-20 17:24:03 +02:00
parent cb2077eddc
commit 8df34a13f1
Signed by: blacklight
GPG Key ID: D90FBA7F76362774
13 changed files with 704 additions and 106 deletions
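In practice, each audio frame is fanned out to both engines and the first finalized result wins, with intent matches taking priority over plain transcriptions. Below is a toy sketch of the pattern, not the actual implementation — the real one is `SpeechProcessor`/`BaseProcessor` in this commit, which waits for *either* worker via `wait_for_either` and adds timeouts and error handling:

```python
from queue import Queue
from threading import Event, Thread

class Worker(Thread):
    """Toy stand-in for the SttProcessor/IntentProcessor threads."""

    def __init__(self, name, recognize):
        super().__init__(name=name, daemon=True)
        self.frames = Queue()
        self.results = Queue()
        self.done = Event()          # mirrors BaseProcessor._processing_done
        self._recognize = recognize  # frame -> result or None

    def run(self):
        while True:
            frame = self.frames.get()
            if frame is None:        # poison pill
                break
            result = self._recognize(frame)
            if result is not None:
                self.results.put(result)
            self.done.set()          # tell the producer we're ready again

stt = Worker('stt', lambda f: f'transcript({f})')
intent = Worker('intent', lambda f: None)  # no intent matched in this toy run
stt.start()
intent.start()

for frame in ('frame-1', 'frame-2'):
    stt.done.clear()            # cleared on the producer side, as in enqueue()
    intent.done.clear()
    stt.frames.put(frame)       # fan the frame out to both scanners
    intent.frames.put(frame)
    stt.done.wait()
    intent.done.wait()
    # Intent results take priority over plain transcriptions
    if not intent.results.empty():
        print(intent.results.get())
    elif not stt.results.empty():
        print(stt.results.get())

stt.frames.put(None)
intent.frames.put(None)
```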


@@ -107,13 +107,17 @@ class ResponseEndEvent(ConversationEndEvent):
    Event triggered when a response has been rendered on the assistant.
    """
-    def __init__(self, *args, with_follow_on_turn: bool = False, **kwargs):
+    def __init__(
+        self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs
+    ):
        """
+        :param response_text: Response text rendered on the assistant.
        :param with_follow_on_turn: Set to true if the conversation expects a
            user follow-up, false otherwise.
        """
        super().__init__(
            *args,
+            response_text=response_text,
            with_follow_on_turn=with_follow_on_turn,
            **kwargs,
        )
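Since the event now carries the rendered text, downstream hooks can read it straight off the event payload. A hypothetical user hook, assuming the stock `@when` decorator from recent platypush releases:

```python
from platypush import when
from platypush.message.event.assistant import ResponseEndEvent

@when(ResponseEndEvent)
def on_response_end(event: ResponseEndEvent, **_):
    # `response_text` is the new payload added by this commit
    print('Assistant said:', event.args.get('response_text'))
```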


@@ -244,7 +244,7 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
    def _on_response_render_end(self):
        from platypush.message.event.assistant import ResponseEndEvent
-        self._send_event(ResponseEndEvent)
+        self._send_event(ResponseEndEvent, response_text=self._last_response)
def _on_hotword_detected(self, hotword: Optional[str]):
from platypush.message.event.assistant import HotwordDetectedEvent


@@ -216,6 +216,7 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
            'on_conversation_end': self._on_conversation_end,
            'on_conversation_timeout': self._on_conversation_timeout,
            'on_speech_recognized': self._on_speech_recognized,
+            'on_intent_matched': self._on_intent_matched,
            'on_hotword_detected': self._on_hotword_detected,
        }


@@ -1,28 +1,28 @@
import logging
import os
-from threading import Event, RLock
+from queue import Full, Queue
+from threading import Event, RLock, Thread
from time import time
from typing import Any, Dict, Optional, Sequence
import pvcheetah
import pvleopard
import pvporcupine
import pvrhino
from platypush.context import get_plugin
from platypush.message.event.assistant import (
AssistantEvent,
ConversationTimeoutEvent,
HotwordDetectedEvent,
IntentMatchedEvent,
SpeechRecognizedEvent,
)
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
from ._context import ConversationContext
from ._recorder import AudioRecorder
+from ._speech import SpeechProcessor
from ._state import AssistantState
-class Assistant:
+class Assistant(Thread):
"""
A facade class that wraps the Picovoice engines under an assistant API.
"""
@@ -43,6 +43,7 @@ class Assistant:
        keyword_model_path: Optional[str] = None,
        frame_expiration: float = 3.0,  # Don't process audio frames older than this
        speech_model_path: Optional[str] = None,
+        intent_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = None,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = False,
@@ -53,8 +54,13 @@
        on_conversation_end=_default_callback,
        on_conversation_timeout=_default_callback,
        on_speech_recognized=_default_callback,
+        on_intent_matched=_default_callback,
        on_hotword_detected=_default_callback,
    ):
+        super().__init__(name='picovoice:Assistant')
+        if intent_enabled:
+            assert intent_model_path, 'Intent model path not provided'
self._access_key = access_key
self._stop_event = stop_event
self.logger = logging.getLogger(__name__)
@@ -64,26 +70,40 @@
self.keywords = list(keywords or [])
self.keyword_paths = None
self.keyword_model_path = None
self._responding = Event()
self.frame_expiration = frame_expiration
self.endpoint_duration = endpoint_duration
self.enable_automatic_punctuation = enable_automatic_punctuation
self.start_conversation_on_hotword = start_conversation_on_hotword
self.audio_queue_size = audio_queue_size
self._responding = Event()
self._muted = muted
self._speech_model_path = speech_model_path
self._speech_model_path_override = None
+        self._intent_model_path = intent_model_path
+        self._intent_model_path_override = None
+        self._in_ctx = False
+        self._speech_processor = SpeechProcessor(
+            stop_event=stop_event,
+            stt_enabled=stt_enabled,
+            intent_enabled=intent_enabled,
+            conversation_timeout=conversation_timeout,
+            model_path=speech_model_path,
+            get_cheetah_args=self._get_speech_engine_args,
+            get_rhino_args=self._get_speech_engine_args,
+        )
self._on_conversation_start = on_conversation_start
self._on_conversation_end = on_conversation_end
self._on_conversation_timeout = on_conversation_timeout
self._on_speech_recognized = on_speech_recognized
+        self._on_intent_matched = on_intent_matched
self._on_hotword_detected = on_hotword_detected
self._recorder = None
self._state = AssistantState.IDLE
self._state_lock = RLock()
self._ctx = ConversationContext(timeout=conversation_timeout)
+        self._evt_queue = Queue(maxsize=100)
if hotword_enabled:
if not keywords:
@@ -110,11 +130,7 @@ class Assistant:
        self.keyword_model_path = keyword_model_path
-        # Model path -> model instance cache
-        self._cheetah = {}
-        self._leopard: Optional[pvleopard.Leopard] = None
        self._porcupine: Optional[pvporcupine.Porcupine] = None
-        self._rhino: Optional[pvrhino.Rhino] = None
@property
def is_responding(self):
@@ -124,6 +140,10 @@
    def speech_model_path(self):
        return self._speech_model_path_override or self._speech_model_path
+    @property
+    def intent_model_path(self):
+        return self._intent_model_path_override or self._intent_model_path
@property
def tts(self) -> TtsPicovoicePlugin:
p = get_plugin('tts.picovoice')
@@ -157,18 +177,23 @@
        if prev_state == new_state:
            return
        self.logger.info('Assistant state transition: %s -> %s', prev_state, new_state)
        if prev_state == AssistantState.DETECTING_SPEECH:
            self.tts.stop()
            self._ctx.stop()
            self._speech_model_path_override = None
+            self._intent_model_path_override = None
+            self._speech_processor.on_conversation_end()
            self._on_conversation_end()
        elif new_state == AssistantState.DETECTING_SPEECH:
            self._ctx.start()
+            self._speech_processor.on_conversation_start()
            self._on_conversation_start()
        if new_state == AssistantState.DETECTING_HOTWORD:
            self.tts.stop()
            self._ctx.reset()
+            self._speech_processor.on_conversation_reset()
+            # Put a null event on the event queue to unblock next_event
+            self._evt_queue.put(None)
@property
def porcupine(self) -> Optional[pvporcupine.Porcupine]:
@@ -188,23 +213,16 @@
        return self._porcupine
-    @property
-    def cheetah(self) -> Optional[pvcheetah.Cheetah]:
-        if not self.stt_enabled:
-            return None
+    def _get_speech_engine_args(self) -> dict:
+        args: Dict[str, Any] = {'access_key': self._access_key}
+        if self.speech_model_path:
+            args['model_path'] = self.speech_model_path
+        if self.endpoint_duration:
+            args['endpoint_duration_sec'] = self.endpoint_duration
+        if self.enable_automatic_punctuation:
+            args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
-        if not self._cheetah.get(self.speech_model_path):
-            args: Dict[str, Any] = {'access_key': self._access_key}
-            if self.speech_model_path:
-                args['model_path'] = self.speech_model_path
-            if self.endpoint_duration:
-                args['endpoint_duration_sec'] = self.endpoint_duration
-            if self.enable_automatic_punctuation:
-                args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
-            self._cheetah[self.speech_model_path] = pvcheetah.create(**args)
-        return self._cheetah[self.speech_model_path]
+        return args
def __enter__(self):
"""
@@ -213,11 +231,14 @@
        if self.should_stop():
            return self
+        assert not self.is_alive(), 'The assistant is already running'
+        self._in_ctx = True
        if self._recorder:
            self.logger.info('A recording stream already exists')
-        elif self.hotword_enabled or self.stt_enabled:
-            sample_rate = (self.porcupine or self.cheetah).sample_rate  # type: ignore
-            frame_length = (self.porcupine or self.cheetah).frame_length  # type: ignore
+        elif self.hotword_enabled or self.stt_enabled or self.intent_enabled:
+            sample_rate = (self.porcupine or self._speech_processor).sample_rate
+            frame_length = (self.porcupine or self._speech_processor).frame_length
self._recorder = AudioRecorder(
stop_event=self._stop_event,
sample_rate=sample_rate,
@@ -227,9 +248,7 @@
            channels=1,
        )
-        if self.stt_enabled:
-            self._cheetah[self.speech_model_path] = self.cheetah
+        self._speech_processor.__enter__()
self._recorder.__enter__()
if self.porcupine:
@@ -237,33 +256,25 @@
        else:
            self.state = AssistantState.DETECTING_SPEECH
+        self.start()
        return self
    def __exit__(self, *_):
        """
        Stop the assistant and release all resources.
        """
+        self._in_ctx = False
        if self._recorder:
            self._recorder.__exit__(*_)
            self._recorder = None
        self.state = AssistantState.IDLE
-        for model in [*self._cheetah.keys()]:
-            cheetah = self._cheetah.pop(model, None)
-            if cheetah:
-                cheetah.delete()
-        if self._leopard:
-            self._leopard.delete()
-            self._leopard = None
        if self._porcupine:
            self._porcupine.delete()
            self._porcupine = None
-        if self._rhino:
-            self._rhino.delete()
-            self._rhino = None
+        self._speech_processor.__exit__(*_)
def __iter__(self):
"""
@@ -275,29 +286,36 @@
"""
Process the next audio frame and return the corresponding event.
"""
-        has_data = False
        if self.should_stop() or not self._recorder:
            raise StopIteration
-        while not (self.should_stop() or has_data):
-            data = self._recorder.read()
-            if data is None:
-                continue
+        if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
+            return self._evt_queue.get()
-            frame, t = data
-            if time() - t > self.frame_expiration:
-                self.logger.info(
-                    'Skipping audio frame older than %ss', self.frame_expiration
-                )
-                continue  # The audio frame is too old
+        evt = None
+        if (
+            self._speech_processor.enabled
+            and self.state == AssistantState.DETECTING_SPEECH
+        ):
+            evt = self._speech_processor.next_event()
-            if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
-                return self._process_hotword(frame)
+        if isinstance(evt, SpeechRecognizedEvent):
+            self._on_speech_recognized(phrase=evt.args['phrase'])
+        if isinstance(evt, IntentMatchedEvent):
+            self._on_intent_matched(
+                intent=evt.args['intent'], slots=evt.args.get('slots', {})
+            )
+        if isinstance(evt, ConversationTimeoutEvent):
+            self._on_conversation_timeout()
-            if self.stt_enabled and self.state == AssistantState.DETECTING_SPEECH:
-                return self._process_speech(frame)
+        if (
+            evt
+            and self.state == AssistantState.DETECTING_SPEECH
+            and self.hotword_enabled
+        ):
+            self.state = AssistantState.DETECTING_HOTWORD
-        raise StopIteration
+        return evt
def mute(self):
self._muted = True
@@ -321,7 +339,7 @@
        else:
            self.mute()
-    def _process_hotword(self, frame):
+    def _process_hotword(self, frame) -> Optional[HotwordDetectedEvent]:
if not self.porcupine:
return None
@@ -333,48 +351,61 @@
        if self.start_conversation_on_hotword:
            self.state = AssistantState.DETECTING_SPEECH
-        self.tts.stop()
+        self.tts.stop()  # Stop any ongoing TTS when the hotword is detected
self._on_hotword_detected(hotword=self.keywords[keyword_index])
return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
return None
-    def _process_speech(self, frame):
-        if not self.cheetah:
-            return None
-        event = None
-        partial_transcript, self._ctx.is_final = self.cheetah.process(frame)
-        if partial_transcript:
-            self._ctx.transcript += partial_transcript
-            self.logger.info(
-                'Partial transcript: %s, is_final: %s',
-                self._ctx.transcript,
-                self._ctx.is_final,
-            )
-        if self._ctx.is_final or self._ctx.timed_out:
-            phrase = self.cheetah.flush() or ''
-            self._ctx.transcript += phrase
-            phrase = self._ctx.transcript
-            phrase = phrase[:1].lower() + phrase[1:]
-            if phrase:
-                event = SpeechRecognizedEvent(phrase=phrase)
-                self._on_speech_recognized(phrase=phrase)
-            else:
-                event = ConversationTimeoutEvent()
-                self._on_conversation_timeout()
-            self._ctx.reset()
-            if self.hotword_enabled:
-                self.state = AssistantState.DETECTING_HOTWORD
-        return event
def override_speech_model(self, model_path: Optional[str]):
self._speech_model_path_override = model_path
+    def override_intent_model(self, model_path: Optional[str]):
+        self._intent_model_path_override = model_path
+    def _put_event(self, evt: AssistantEvent):
+        try:
+            self._evt_queue.put_nowait(evt)
+        except Full:
+            self.logger.warning('The assistant event queue is full')
+    def run(self):
+        assert (
+            self._in_ctx
+        ), 'The assistant can only be started through a context manager'
+        super().run()
+        while not self.should_stop() and self._recorder:
+            self._recorder.wait_start()
+            if self.should_stop():
+                break
+            data = self._recorder.read()
+            if data is None:
+                continue
+            frame, t = data
+            if time() - t > self.frame_expiration:
+                self.logger.info(
+                    'Skipping audio frame older than %ss', self.frame_expiration
+                )
+                continue  # The audio frame is too old
+            if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
+                evt = self._process_hotword(frame)
+                if evt:
+                    self._put_event(evt)
+                continue
+            if (
+                self._speech_processor.enabled
+                and self.state == AssistantState.DETECTING_SPEECH
+            ):
+                self._speech_processor.process(frame, block=False)
+        self.logger.info('Assistant stopped')
# vim:sw=4:ts=4:et:
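Net effect on the lifecycle: `__enter__` sets up the recorder and the speech processor and starts the thread, `run()` refuses to start outside of a context manager, and consumers simply iterate on the object. A sketch of the intended usage (constructor arguments abbreviated, the access key is a placeholder; see the full signature in the hunks above):

```python
from threading import Event

stop_event = Event()
assistant = Assistant(
    access_key='<picovoice-access-key>',  # placeholder
    stop_event=stop_event,
    hotword_enabled=True,
    keywords=['computer'],
    stt_enabled=True,
)

with assistant:              # starts the recorder, the processors and the thread
    for event in assistant:  # __next__ -> _evt_queue / SpeechProcessor.next_event()
        if event:
            print(event)     # HotwordDetectedEvent, SpeechRecognizedEvent, ...
```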


@@ -2,6 +2,8 @@ from dataclasses import dataclass
from time import time
from typing import Optional
+from ._intent import Intent
@dataclass
class ConversationContext:
@@ -11,6 +13,7 @@ class ConversationContext:
    transcript: str = ''
    is_final: bool = False
+    intent: Optional[Intent] = None
timeout: Optional[float] = None
t_start: Optional[float] = None
t_end: Optional[float] = None
@@ -25,6 +28,7 @@
    def reset(self):
        self.transcript = ''
+        self.intent = None
self.is_final = False
self.t_start = None
self.t_end = None
@@ -32,14 +36,18 @@
    @property
    def timed_out(self):
        return (
-            not self.transcript
-            and not self.is_final
+            (
+                (not self.transcript and not self.is_final)
+                or (not self.intent and not self.is_final)
+            )
            and self.timeout
            and self.t_start
            and time() - self.t_start > self.timeout
        ) or (
-            self.transcript
-            and not self.is_final
+            (
+                (self.transcript and not self.is_final)
+                or (self.intent and not self.is_final)
+            )
            and self.timeout
            and self.t_start
            and time() - self.t_start > self.timeout * 2
        )


@@ -0,0 +1,11 @@
from dataclasses import dataclass, field
@dataclass
class Intent:
"""
Speech intent data class.
"""
name: str
slots: dict = field(default_factory=dict)


@@ -178,3 +178,14 @@ class AudioRecorder:
        Wait until the audio stream is stopped.
        """
        wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)
+    def wait_start(self, timeout: Optional[float] = None):
+        """
+        Wait until the audio stream is started.
+        """
+        wait_for_either(
+            self._stop_event,
+            self._upstream_stop_event,
+            self._paused_state._recording_event,
+            timeout=timeout,
+        )


@@ -0,0 +1,3 @@
from ._processor import SpeechProcessor
__all__ = ['SpeechProcessor']


@@ -0,0 +1,152 @@
import logging
from abc import ABC, abstractmethod
from queue import Empty, Queue
from threading import Event, Thread, get_ident
from typing import Optional, Sequence
from platypush.message.event.assistant import AssistantEvent
from .._context import ConversationContext
class BaseProcessor(ABC, Thread):
"""
Base speech processor class. It is implemented by the ``SttProcessor`` and
the ``IntentProcessor`` classes.
"""
def __init__(
self,
*args,
stop_event: Event,
conversation_timeout: Optional[float] = None,
**kwargs,
):
super().__init__(*args, name=f'picovoice:{self.__class__.__name__}', **kwargs)
self.logger = logging.getLogger(self.name)
self._audio_queue = Queue()
self._stop_event = stop_event
self._ctx = ConversationContext(timeout=conversation_timeout)
self._event_queue = Queue()
# This event is set if the upstream processor is waiting for an event
# from this processor
self._event_wait = Event()
# This event is set when the processor is done with the audio
# processing and it's ready to accept a new audio frame
self._processing_done = Event()
self._processing_done.set()
def should_stop(self) -> bool:
return self._stop_event.is_set()
def wait_stop(self, timeout: Optional[float] = None) -> bool:
return self._stop_event.wait(timeout)
def enqueue(self, audio: Sequence[int]):
self._event_wait.set()
self._processing_done.clear()
self._audio_queue.put_nowait(audio)
@property
def processing_done(self) -> Event:
return self._processing_done
@property
@abstractmethod
def _model_path(self) -> Optional[str]:
"""
Return the model path.
"""
@property
@abstractmethod
def sample_rate(self) -> int:
"""
:return: The sample rate wanted by Cheetah/Rhino.
"""
@property
@abstractmethod
def frame_length(self) -> int:
"""
:return: The frame length wanted by Cheetah/Rhino.
"""
def last_event(self) -> Optional[AssistantEvent]:
"""
:return: The latest event that was processed by the processor.
"""
evt = None
try:
while True:
evt = self._event_queue.get_nowait()
except Empty:
pass
if evt:
self._event_wait.clear()
return evt
def clear_wait(self):
self._event_wait.clear()
@abstractmethod
def process(self, audio: Sequence[int]) -> Optional[AssistantEvent]:
"""
Process speech events from a raw audio input.
"""
def run(self):
super().run()
self._ctx.reset()
self._processing_done.clear()
self.logger.info('Processor started: %s', self.name)
while not self.should_stop():
audio = self._audio_queue.get()
# The thread is stopped when it receives a None object
if audio is None:
break
# Don't process the audio if the upstream processor is not waiting
# for an event
if not self._event_wait.is_set():
continue
try:
self._processing_done.clear()
event = self.process(audio)
if event:
self._event_queue.put_nowait(event)
self._processing_done.set()
except Exception as e:
self.logger.error(
'An error occurred while processing the audio on %s: %s',
self.name,
e,
exc_info=e,
)
self.wait_stop(timeout=1)
self._processing_done.set()
continue
self._ctx.reset()
self.logger.info('Processor stopped: %s', self.name)
def stop(self):
self._audio_queue.put_nowait(None)
if self.is_alive() and self.ident != get_ident():
self.logger.debug('Stopping %s', self.name)
self.join()
def on_conversation_start(self):
self._ctx.start()
def on_conversation_end(self):
self._ctx.stop()
def on_conversation_reset(self):
self._ctx.reset()
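For orientation, a hypothetical no-op subclass showing the minimum surface a concrete processor has to provide (reusing this module's imports); the queueing, the `processing_done` handshake and the error handling all come from the inherited `run()`:

```python
class NullProcessor(BaseProcessor):
    """Toy processor that never recognizes anything."""

    @property
    def _model_path(self) -> Optional[str]:
        return None   # no model file for this toy engine

    @property
    def sample_rate(self) -> int:
        return 16000  # whatever the wrapped engine expects

    @property
    def frame_length(self) -> int:
        return 512

    def process(self, audio: Sequence[int]) -> Optional[AssistantEvent]:
        # A real engine buffers `audio` here and returns an event only
        # once a transcription/intent has been finalized
        return None
```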


@@ -0,0 +1,86 @@
from typing import Callable, Optional, Sequence, Union
import pvrhino
from platypush.message.event.assistant import (
ConversationTimeoutEvent,
IntentMatchedEvent,
)
from ._base import BaseProcessor
class IntentProcessor(BaseProcessor):
"""
Implementation of the speech-to-intent processor using the Picovoice Rhino
engine.
"""
def __init__(
self, *args, get_rhino_args: Callable[[], dict] = lambda: {}, **kwargs
):
super().__init__(*args, **kwargs)
self._get_rhino_args = get_rhino_args
# model_path -> Rhino instance cache
self._rhino = {}
@property
def _model_path(self) -> Optional[str]:
return self._get_rhino_args().get('model_path')
@property
def sample_rate(self) -> int:
return self._get_rhino().sample_rate
@property
def frame_length(self) -> int:
return self._get_rhino().frame_length
def _get_rhino(self) -> pvrhino.Rhino:
if not self._rhino.get(self._model_path):
self._rhino[self._model_path] = pvrhino.create(**self._get_rhino_args())
return self._rhino[self._model_path]
def process(
self, audio: Sequence[int]
) -> Optional[Union[IntentMatchedEvent, ConversationTimeoutEvent]]:
"""
Process the audio and return an ``IntentMatchedEvent`` if the intent was
understood, or a ``ConversationTimeoutEvent`` if the conversation timed
out, or ``None`` if the intent processing is not yet finalized.
"""
event = None
rhino = self._get_rhino()
self._ctx.is_final = rhino.process(audio)
if self._ctx.is_final:
inference = rhino.get_inference()
self.logger.debug(
'Intent detection finalized. Inference understood: %s',
inference.is_understood,
)
if inference.is_understood:
event = IntentMatchedEvent(
intent=inference.intent,
slots={slot.key: slot.value for slot in inference.slots},
)
if not event and self._ctx.timed_out:
event = ConversationTimeoutEvent()
if event:
self._ctx.reset()
if event:
self.logger.debug('Intent event: %s', event)
return event
def stop(self):
super().stop()
objs = self._rhino.copy()
for key, obj in objs.items():
obj.delete()
self._rhino.pop(key)
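For illustration, this is the event shape a match produces — intent and slot names here are hypothetical and depend entirely on the trained Rhino context:

```python
from platypush.message.event.assistant import IntentMatchedEvent

# Hypothetical result for "turn on the living room lights":
evt = IntentMatchedEvent(
    intent='turnLights',
    slots={'room': 'living room', 'state': 'on'},
)
assert evt.args['intent'] == 'turnLights'
```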


@@ -0,0 +1,196 @@
import logging
from queue import Queue
from threading import Event
from typing import Callable, Optional, Sequence
from platypush.message.event.assistant import AssistantEvent
from platypush.utils import wait_for_either
from ._intent import IntentProcessor
from ._stt import SttProcessor
class SpeechProcessor:
"""
Speech processor class that wraps the STT and Intent processors under the
same interface.
"""
def __init__(
self,
stop_event: Event,
model_path: Optional[str] = None,
stt_enabled: bool = True,
intent_enabled: bool = False,
conversation_timeout: Optional[float] = None,
get_cheetah_args: Callable[[], dict] = lambda: {},
get_rhino_args: Callable[[], dict] = lambda: {},
):
self.logger = logging.getLogger(self.__class__.__name__)
self._stt_enabled = stt_enabled
self._intent_enabled = intent_enabled
self._model_path = model_path
self._conversation_timeout = conversation_timeout
self._audio_queue = Queue()
self._stop_event = stop_event
self._get_cheetah_args = get_cheetah_args
self._get_rhino_args = get_rhino_args
self._stt_processor = SttProcessor(
conversation_timeout=conversation_timeout,
stop_event=stop_event,
get_cheetah_args=get_cheetah_args,
)
self._intent_processor = IntentProcessor(
conversation_timeout=conversation_timeout,
stop_event=stop_event,
get_rhino_args=get_rhino_args,
)
@property
def enabled(self) -> bool:
"""
The processor is enabled if either the STT or the Intent processor are
enabled.
"""
return self._stt_enabled or self._intent_enabled
def should_stop(self) -> bool:
return self._stop_event.is_set()
def next_event(self, timeout: Optional[float] = None) -> Optional[AssistantEvent]:
evt = None
# Wait for either the STT or Intent processor to finish processing the audio
completed = wait_for_either(
self._stt_processor.processing_done,
self._intent_processor.processing_done,
self._stop_event,
timeout=timeout,
)
if not completed:
self.logger.warning('Timeout while waiting for the processors to finish')
# Immediately return if the stop event is set
if self.should_stop():
return evt
# Priority to the intent processor event, if the processor is enabled
if self._intent_enabled:
evt = self._intent_processor.last_event()
if evt:
self.logger.debug('Intent processor event: %s', evt)
# If the intent processor didn't return any event, then return the STT
# processor event
if not evt and self._stt_enabled:
evt = self._stt_processor.last_event()
if evt:
self.logger.debug('STT processor event: %s', evt)
if evt:
self._stt_processor.clear_wait()
self._intent_processor.clear_wait()
return evt
def process(
self, audio: Sequence[int], block: bool = True, timeout: Optional[float] = None
) -> Optional[AssistantEvent]:
"""
Process an audio frame.
The audio frame is enqueued to both the STT and Intent processors, if
enabled. The function waits for either processor to finish processing
the audio, and returns the event from the first processor that returns
a result.
Priority is given to the Intent processor if enabled, otherwise the STT
processor is used.
"""
# Enqueue the audio to both the STT and Intent processors if enabled
if self._stt_enabled:
self._stt_processor.enqueue(audio)
if self._intent_enabled:
self._intent_processor.enqueue(audio)
if not block:
return None
return self.next_event(timeout=timeout)
def __enter__(self):
"""
Context manager entry point - it wraps :meth:`start`.
"""
self.start()
def __exit__(self, *_, **__):
"""
Context manager exit point - it wraps :meth:`stop`.
"""
self.stop()
def start(self):
"""
Start the STT and Intent processors.
"""
self._stt_processor.start()
self._intent_processor.start()
def stop(self):
"""
Stop the STT and Intent processors.
"""
self._stt_processor.stop()
self._intent_processor.stop()
def on_conversation_start(self):
if self._stt_enabled:
self._stt_processor.on_conversation_start()
if self._intent_enabled:
self._intent_processor.on_conversation_start()
def on_conversation_end(self):
if self._stt_enabled:
self._stt_processor.on_conversation_end()
if self._intent_enabled:
self._intent_processor.on_conversation_end()
def on_conversation_reset(self):
if self._stt_enabled:
self._stt_processor.on_conversation_reset()
if self._intent_enabled:
self._intent_processor.on_conversation_reset()
@property
def sample_rate(self) -> int:
"""
The sample rate of the audio frames.
"""
if self._intent_enabled:
return self._intent_processor.sample_rate
if self._stt_enabled:
return self._stt_processor.sample_rate
raise ValueError('No processor enabled')
@property
def frame_length(self) -> int:
"""
The frame length of the audio frames.
"""
if self._intent_enabled:
return self._intent_processor.frame_length
if self._stt_enabled:
return self._stt_processor.frame_length
raise ValueError('No processor enabled')
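Putting the pieces together, this is roughly how a caller drives the class — the real driver is `Assistant.run()` above; `ACCESS_KEY`, `CONTEXT_PATH` and the frame source are placeholders:

```python
from threading import Event

ACCESS_KEY = '<picovoice-access-key>'   # placeholder
CONTEXT_PATH = '<path/to/context.rhn>'  # placeholder: trained Rhino context

def audio_frames():
    yield from ()  # placeholder: real frames come from AudioRecorder

stop_event = Event()
processor = SpeechProcessor(
    stop_event=stop_event,
    stt_enabled=True,
    intent_enabled=True,
    conversation_timeout=7.5,
    get_cheetah_args=lambda: {'access_key': ACCESS_KEY},
    get_rhino_args=lambda: {'access_key': ACCESS_KEY, 'context_path': CONTEXT_PATH},
)

processor.start()
try:
    for frame in audio_frames():
        processor.process(frame, block=False)    # non-blocking fan-out...
        evt = processor.next_event(timeout=0.5)  # ...poll for the first result
        if evt:
            print(evt)
finally:
    stop_event.set()
    processor.stop()
```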


@@ -0,0 +1,92 @@
from typing import Callable, Optional, Sequence, Union
import pvcheetah
from platypush.message.event.assistant import (
ConversationTimeoutEvent,
SpeechRecognizedEvent,
)
from ._base import BaseProcessor
class SttProcessor(BaseProcessor):
"""
Implementation of the speech-to-text processor using the Picovoice Cheetah
engine.
"""
def __init__(
self, *args, get_cheetah_args: Callable[[], dict] = lambda: {}, **kwargs
):
super().__init__(*args, **kwargs)
self._get_cheetah_args = get_cheetah_args
# model_path -> Cheetah instance cache
self._cheetah = {self._model_path: pvcheetah.create(**self._get_cheetah_args())}
@property
def _model_path(self) -> Optional[str]:
return self._get_cheetah_args().get('model_path')
@property
def sample_rate(self) -> int:
return self._get_cheetah().sample_rate
@property
def frame_length(self) -> int:
return self._get_cheetah().frame_length
def _get_cheetah(self) -> pvcheetah.Cheetah:
if not self._cheetah.get(self._model_path):
self.logger.debug(
'Creating Cheetah instance for model %s', self._model_path
)
self._cheetah[self._model_path] = pvcheetah.create(
**self._get_cheetah_args()
)
self.logger.debug('Cheetah instance created for model %s', self._model_path)
return self._cheetah[self._model_path]
def process(
self, audio: Sequence[int]
) -> Optional[Union[SpeechRecognizedEvent, ConversationTimeoutEvent]]:
event = None
cheetah = self._get_cheetah()
partial_transcript, self._ctx.is_final = cheetah.process(audio)
# Concatenate the partial transcript to the context
if partial_transcript:
self._ctx.transcript += partial_transcript
self.logger.info(
'Partial transcript: %s, is_final: %s',
self._ctx.transcript,
self._ctx.is_final,
)
# If the transcript is final or the conversation timed out, then
# process and return whatever is available in the context
if self._ctx.is_final or self._ctx.timed_out:
phrase = cheetah.flush() or ''
self._ctx.transcript += phrase
phrase = self._ctx.transcript
phrase = phrase[:1].lower() + phrase[1:]
event = (
SpeechRecognizedEvent(phrase=phrase)
if phrase
else ConversationTimeoutEvent()
)
self._ctx.reset()
if event:
self.logger.debug('STT event: %s', event)
return event
def stop(self):
super().stop()
objs = self._cheetah.copy()
for key, obj in objs.items():
obj.delete()
self._cheetah.pop(key)


@@ -6,9 +6,11 @@ manifest:
    - platypush.message.event.assistant.ConversationStartEvent
    - platypush.message.event.assistant.ConversationTimeoutEvent
    - platypush.message.event.assistant.HotwordDetectedEvent
+    - platypush.message.event.assistant.IntentMatchedEvent
    - platypush.message.event.assistant.MicMutedEvent
    - platypush.message.event.assistant.MicUnmutedEvent
    - platypush.message.event.assistant.NoResponseEvent
+    - platypush.message.event.assistant.ResponseEndEvent
- platypush.message.event.assistant.ResponseEvent
- platypush.message.event.assistant.SpeechRecognizedEvent
install:
@@ -22,6 +24,7 @@ manifest:
      - ffmpeg
      - python-sounddevice
    pip:
+      - num2words  # Temporary dependency
- pvcheetah
- pvleopard
- pvorca