platypush/platypush/plugins/assistant/picovoice/_speech/_processor.py

import logging
from queue import Queue
from threading import Event
from typing import Callable, Optional, Sequence

from platypush.message.event.assistant import AssistantEvent, ConversationTimeoutEvent
from platypush.utils import wait_for_either

from ._intent import IntentProcessor
from ._stt import SttProcessor


class SpeechProcessor:
    """
    Speech processor class that wraps the STT and Intent processors under the
    same interface.

    Audio frames passed to :meth:`process` are forwarded to every enabled
    processor, and :meth:`next_event` returns the resulting event, giving
    priority to the Intent processor when it is enabled.

    The object can be used as a context manager: entering it calls
    :meth:`start`, leaving it calls :meth:`stop`.

    :param stop_event: Event passed to both processors. Once it is set,
        :meth:`next_event` returns ``None`` without inspecting the processors.
    :param model_path: Stored on the instance as ``_model_path``. It is not
        forwarded to either processor by this class.
    :param stt_enabled: Whether the STT processor is enabled.
    :param intent_enabled: Whether the Intent processor is enabled.
    :param conversation_timeout: Forwarded unchanged to both processors.
    :param get_cheetah_args: Callable forwarded to the STT processor.
    :param get_rhino_args: Callable forwarded to the Intent processor.
    """

    def __init__(
        self,
        stop_event: Event,
        model_path: Optional[str] = None,
        stt_enabled: bool = True,
        intent_enabled: bool = False,
        conversation_timeout: Optional[float] = None,
        get_cheetah_args: Callable[[], dict] = lambda: {},
        get_rhino_args: Callable[[], dict] = lambda: {},
    ):
        self.logger = logging.getLogger(self.__class__.__name__)
        self._stt_enabled = stt_enabled
        self._intent_enabled = intent_enabled
        self._model_path = model_path
        self._conversation_timeout = conversation_timeout
        # NOTE(review): this queue is not read or written anywhere in this
        # class. It is kept in case external code relies on the attribute.
        self._audio_queue = Queue()
        self._stop_event = stop_event
        self._get_cheetah_args = get_cheetah_args
        self._get_rhino_args = get_rhino_args

        # Both processors are always instantiated; the ``enabled`` flag is
        # passed down and also checked by this wrapper before delegating.
        self._stt_processor = SttProcessor(
            conversation_timeout=conversation_timeout,
            stop_event=stop_event,
            enabled=stt_enabled,
            get_cheetah_args=get_cheetah_args,
        )

        self._intent_processor = IntentProcessor(
            conversation_timeout=conversation_timeout,
            stop_event=stop_event,
            enabled=intent_enabled,
            get_rhino_args=get_rhino_args,
        )

    @property
    def enabled(self) -> bool:
        """
        The processor is enabled if either the STT or the Intent processor are
        enabled.
        """
        return self._stt_enabled or self._intent_enabled

    def should_stop(self) -> bool:
        """
        :return: True if the stop event has been set.
        """
        return self._stop_event.is_set()

    def next_event(self, timeout: Optional[float] = None) -> Optional[AssistantEvent]:
        """
        Wait for the next event produced by the processors.

        The call waits until either processor signals ``processing_done`` or
        the stop event is set. If the wait reports no completion, a warning is
        logged and the processors are inspected anyway.

        :param timeout: Forwarded to the wait; ``None`` by default.
        :return: ``None`` if the stop event is set or no event is available.
            Otherwise the Intent processor's last event if that processor is
            enabled, falling back to the STT processor's last event when the
            Intent processor returned nothing or a
            :class:`ConversationTimeoutEvent` and the STT processor is
            enabled. Both processors are reset when an event is returned.
        """
        evt = None

        # Wait for either the STT or Intent processor to finish processing the audio
        completed = wait_for_either(
            self._stt_processor.processing_done,
            self._intent_processor.processing_done,
            self._stop_event,
            timeout=timeout,
        )

        if not completed:
            self.logger.warning('Timeout while waiting for the processors to finish')

        # Immediately return if the stop event is set
        if self.should_stop():
            return evt

        # Hold both state locks so the events are read and the processors are
        # reset as a single step.
        with self._stt_processor._state_lock, self._intent_processor._state_lock:
            # Priority to the intent processor event, if the processor is enabled
            if self._intent_enabled:
                evt = self._intent_processor.last_event()

            # If the intent processor didn't return any event (or only a
            # conversation timeout), then return the STT processor event
            if (
                not evt or isinstance(evt, ConversationTimeoutEvent)
            ) and self._stt_enabled:
                evt = self._stt_processor.last_event()

            if evt:
                self.reset()

            return evt

    def process(
        self, audio: Sequence[int], block: bool = True, timeout: Optional[float] = None
    ) -> Optional[AssistantEvent]:
        """
        Process an audio frame.

        The audio frame is enqueued to both the STT and Intent processors, if
        enabled. The function waits for either processor to finish processing
        the audio, and returns the event from the first processor that returns
        a result.

        Priority is given to the Intent processor if enabled, otherwise the STT
        processor is used.

        :param audio: The audio frame to enqueue.
        :param block: If False, return ``None`` right after enqueuing the
            frame instead of waiting for an event.
        :param timeout: Forwarded to :meth:`next_event` when ``block`` is True.
        :return: The result of :meth:`next_event`, or ``None`` if ``block`` is
            False.
        """
        # Enqueue the audio to both the STT and Intent processors if enabled
        if self._stt_enabled:
            self._stt_processor.enqueue(audio)

        if self._intent_enabled:
            self._intent_processor.enqueue(audio)

        if not block:
            return None

        return self.next_event(timeout=timeout)

    def __enter__(self) -> 'SpeechProcessor':
        """
        Context manager entry point - it wraps :meth:`start`.

        :return: This instance, so that ``with SpeechProcessor(...) as p``
            binds ``p`` to the processor rather than to ``None``.
        """
        self.start()
        return self

    def __exit__(self, *_, **__):
        """
        Context manager exit point - it wraps :meth:`stop`.
        """
        self.stop()

    def start(self):
        """
        Start the STT and Intent processors.

        Only the enabled processors are started.
        """
        if self._stt_enabled:
            self._stt_processor.start()

        if self._intent_enabled:
            self._intent_processor.start()

    def stop(self):
        """
        Stop the STT and Intent processors.

        Both processors are stopped, regardless of whether they are enabled.
        """
        self._stt_processor.stop()
        self._intent_processor.stop()

    def on_conversation_start(self):
        """
        Notify the enabled processors that a conversation has started.
        """
        if self._stt_enabled:
            self._stt_processor.on_conversation_start()

        if self._intent_enabled:
            self._intent_processor.on_conversation_start()

    def on_conversation_end(self):
        """
        Notify the enabled processors that a conversation has ended.
        """
        if self._stt_enabled:
            self._stt_processor.on_conversation_end()

        if self._intent_enabled:
            self._intent_processor.on_conversation_end()

    def on_conversation_reset(self):
        """
        Notify the enabled processors that the conversation has been reset.
        """
        if self._stt_enabled:
            self._stt_processor.on_conversation_reset()

        if self._intent_enabled:
            self._intent_processor.on_conversation_reset()

    def reset(self):
        """
        Reset the state of the STT and Intent processors.

        Both processors are reset, regardless of whether they are enabled.
        """
        self._stt_processor.reset()
        self._intent_processor.reset()

    @property
    def _active_processor(self):
        """
        The processor used to report the audio parameters: the Intent
        processor if enabled, otherwise the STT processor.

        :raises ValueError: If neither processor is enabled.
        """
        if self._intent_enabled:
            return self._intent_processor

        if self._stt_enabled:
            return self._stt_processor

        raise ValueError('No processor enabled')

    @property
    def sample_rate(self) -> int:
        """
        The sample rate of the audio frames.

        :raises ValueError: If neither processor is enabled.
        """
        return self._active_processor.sample_rate

    @property
    def frame_length(self) -> int:
        """
        The frame length of the audio frames.

        :raises ValueError: If neither processor is enabled.
        """
        return self._active_processor.frame_length