platypush/platypush/plugins/stt/__init__.py

import queue
import threading
from abc import ABC, abstractmethod
from typing import Optional, Union, List

import sounddevice as sd

from platypush.context import get_bus
from platypush.message.event.stt import SpeechDetectionStartedEvent, SpeechDetectionStoppedEvent, SpeechStartedEvent, \
    SpeechDetectedEvent, HotwordDetectedEvent, ConversationDetectedEvent
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import Plugin, action


class SttPlugin(ABC, Plugin):
    """
    Abstract class for speech-to-text plugins.

    Audio is captured by ``recording_thread`` and pushed to an internal queue, which is consumed by
    ``detection_thread``. Derived classes must implement ``detect_speech`` and ``detect``.

    Triggers:

        * :class:`platypush.message.event.stt.SpeechStartedEvent` when speech starts being detected.
        * :class:`platypush.message.event.stt.SpeechDetectedEvent` when speech is detected.
        * :class:`platypush.message.event.stt.SpeechDetectionStartedEvent` when speech detection starts.
        * :class:`platypush.message.event.stt.SpeechDetectionStoppedEvent` when speech detection stops.
        * :class:`platypush.message.event.stt.HotwordDetectedEvent` when a user-defined hotword is detected.
        * :class:`platypush.message.event.stt.ConversationDetectedEvent` when speech is detected after a hotword.

    """

    # Maximum time (in seconds) that ``stop_detection`` waits for each worker thread to terminate.
    _thread_stop_timeout = 10.0
    # Maximum time (in seconds) that ``detection_thread`` blocks on the audio queue before it checks again
    # whether the detection has been stopped.
    _queue_poll_timeout = 0.5
    # Sample rate and number of channels used to open the audio input stream.
    rate = 16000
    channels = 1

    def __init__(self,
                 input_device: Optional[Union[int, str]] = None,
                 hotword: Optional[str] = None,
                 hotwords: Optional[List[str]] = None,
                 conversation_timeout: Optional[float] = 10.0,
                 block_duration: float = 1.0):
        """
        :param input_device: PortAudio device index or name that will be used for recording speech (default: default
            system audio input device).
        :param hotword: When this word is detected, the plugin will trigger a
            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
            assistants. If set, it takes precedence over ``hotwords``.
        :param hotwords: Use a list of hotwords instead of a single one.
        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
            the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
            instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
            here to run any logic depending on the detected speech - it can emulate a kind of
            "OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds).
        :param block_duration: Duration of the acquired audio blocks (default: 1 second).
        """

        super().__init__()
        self.input_device = input_device
        self.conversation_timeout = conversation_timeout
        self.block_duration = block_duration

        self.hotwords = set(hotwords or [])
        if hotword:
            self.hotwords = {hotword}

        # Set while a conversation window (opened by a hotword) is active.
        self._conversation_event = threading.Event()
        # Timer that closes the current conversation window, if any.
        self._conversation_timer: Optional[threading.Timer] = None
        # Set by ``stop_detection`` to tell the recording thread to exit, even if the stop request arrives
        # before the thread has opened the audio stream.
        self._stop_requested = threading.Event()
        self._input_stream: Optional[sd.InputStream] = None
        self._recording_thread: Optional[threading.Thread] = None
        self._detection_thread: Optional[threading.Thread] = None
        self._audio_queue: Optional[queue.Queue] = None
        self._current_text = ''

    def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
        """
        Get the index of the input device by index or name.

        :param device: Device index or name. If None is set then the function will return the index of the
            default audio input device.
        :return: Index of the audio input device.
        :raises AssertionError: If the device index is out of range or no device with the given name exists.
        """
        # Only None and the empty string mean "not specified": 0 is a valid device index.
        if device is None or device == '':
            device = self.input_device
        if device is None or device == '':
            return sd.query_hostapis()[0].get('default_input_device')

        if isinstance(device, int):
            n_devices = len(sd.query_devices())
            if not 0 <= device < n_devices:
                raise AssertionError('Invalid device index {} ({} devices available)'.format(device, n_devices))
            return device

        for i, dev in enumerate(sd.query_devices()):
            if dev['name'] == device:
                return i

        raise AssertionError('Device {} not found'.format(device))

    def _open_conversation_window(self) -> None:
        """
        Flag the conversation as active for the next ``conversation_timeout`` seconds. A window that is still
        open is extended: its timer is cancelled so that it can't clear the flag before the new timeout expires.
        """
        previous_timer = self._conversation_timer
        if previous_timer is not None:
            previous_timer.cancel()

        self._conversation_event.set()
        timer = threading.Timer(self.conversation_timeout, self._conversation_event.clear)
        self._conversation_timer = timer
        timer.start()

    def on_speech_detected(self, speech: str) -> None:
        """
        Hook called when speech is detected. Triggers the right event depending on the current context.

        :param speech: Detected speech.
        """
        speech = speech.strip()

        if speech in self.hotwords:
            event = HotwordDetectedEvent(hotword=speech)
            if self.conversation_timeout:
                self._open_conversation_window()
        elif self._conversation_event.is_set():
            event = ConversationDetectedEvent(speech=speech)
        else:
            event = SpeechDetectedEvent(speech=speech)

        get_bus().post(event)

    @staticmethod
    def convert_frames(frames: bytes) -> bytes:
        """
        Conversion method for raw audio frames. It just returns the input frames as bytes. Override it if required
        by your logic.

        :param frames: Input audio frames, as bytes.
        :return: The audio frames as passed on the input. Override if required.
        """
        return frames

    def on_detection_started(self) -> None:
        """
        Method called when the ``detection_thread`` starts. Initialize your context variables and models here if
        required.
        """
        pass

    def on_detection_ended(self) -> None:
        """
        Method called when the ``detection_thread`` stops. Clean up your context variables and models here.
        """
        pass

    def before_recording(self) -> None:
        """
        Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the
        recording thread starts.
        """
        pass

    def on_recording_started(self) -> None:
        """
        Method called after the ``recording_thread`` opens the audio device. Put here any logic that you may want to
        run after the recording starts.
        """
        pass

    def on_recording_ended(self) -> None:
        """
        Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio
        device is closed.
        """
        pass

    @abstractmethod
    def detect_speech(self, frames) -> str:
        """
        Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented
        by the derived classes.

        :param frames: Audio frames, as returned by ``convert_frames``.
        :return: Detected text, as a string. Returns an empty string if no text has been detected.
        """
        raise NotImplementedError

    def process_text(self, text: str) -> None:
        """
        Process the text returned by ``detect_speech`` for the latest audio block.

        The speech is considered complete, and ``on_speech_detected`` is called with the last intermediate result,
        when either no text is returned after some text had been detected or the same text is returned twice in a
        row. Otherwise the text is stored as the current intermediate result, and a
        :class:`platypush.message.event.stt.SpeechStartedEvent` is posted on the first non-empty result.

        :param text: Text detected on the latest audio block (empty if no text was detected).
        """
        if (not text and self._current_text) or (text and text == self._current_text):
            self.on_speech_detected(self._current_text)
            self._current_text = ''
        else:
            if text:
                if not self._current_text:
                    get_bus().post(SpeechStartedEvent())
                self.logger.info('Intermediate speech results: [{}]'.format(text))

            self._current_text = text

    def detection_thread(self) -> None:
        """
        This thread reads frames from ``_audio_queue``, performs the speech-to-text detection through
        ``detect_speech`` and passes the detected text to ``process_text``. It runs until ``_audio_queue`` is
        reset to None (see ``stop_detection``).
        """
        self._current_text = ''
        self.logger.debug('Detection thread started')
        self.on_detection_started()

        try:
            while True:
                # Work on a snapshot: stop_detection may reset the attribute at any time.
                audio_queue = self._audio_queue
                if audio_queue is None:
                    break

                try:
                    # Don't block forever, otherwise the thread would never notice that the detection
                    # has been stopped while no audio is being queued.
                    frames = audio_queue.get(timeout=self._queue_poll_timeout)
                except queue.Empty:
                    continue

                try:
                    frames = self.convert_frames(frames)
                except Exception as e:
                    self.logger.warning('Error while feeding audio to the model: {}'.format(str(e)))
                    continue

                text = self.detect_speech(frames).strip()
                self.process_text(text)
        finally:
            # Always give the derived class the chance to release its resources.
            self.on_detection_ended()
            self.logger.debug('Detection thread terminated')

    @staticmethod
    def _close_stream(stream) -> None:
        """
        Stop and close an audio input stream, ignoring any errors (e.g. if it was already stopped or closed).
        """
        stream.stop(ignore_errors=True)
        stream.close(ignore_errors=True)

    def recording_thread(self, block_duration: Optional[float] = None, block_size: Optional[int] = None,
                         input_device: Optional[Union[int, str]] = None) -> None:
        """
        Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.

        :param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``.
        :param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``.
        :param input_device: Input device
        """
        assert (block_duration or block_size) and not (block_duration and block_size), \
            'Please specify either block_duration or block_size'

        if not block_size:
            block_size = int(self.rate * self.channels * block_duration)

        self.before_recording()
        self.logger.debug('Recording thread started')
        device = self._get_input_device(input_device)
        self._input_stream = sd.InputStream(samplerate=self.rate, device=device,
                                            channels=self.channels, dtype='int16', latency=0,
                                            blocksize=block_size)
        self._input_stream.start()
        self.on_recording_started()
        get_bus().post(SpeechDetectionStartedEvent())

        while not self._stop_requested.is_set():
            # Work on snapshots: stop_detection may reset these attributes at any time.
            stream = self._input_stream
            if stream is None:
                break

            try:
                frames = stream.read(block_size)[0]
            except Exception as e:
                self.logger.warning('Error while reading from the audio input: {}'.format(str(e)))
                continue

            audio_queue = self._audio_queue
            if audio_queue is not None:
                audio_queue.put(frames)

        # If the stop was requested before the stream was opened then stop_detection couldn't close it:
        # release the audio device here.
        stream, self._input_stream = self._input_stream, None
        if stream is not None:
            self._close_stream(stream)

        get_bus().post(SpeechDetectionStoppedEvent())
        self.on_recording_ended()
        self.logger.debug('Recording thread terminated')

    @abstractmethod
    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file. Must be implemented by the derived classes.

        :param audio_file: Path to the audio file.
        """
        raise NotImplementedError

    def __enter__(self):
        """
        Context manager enter. Starts detection and returns self.
        """
        self.start_detection()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Context manager exit. Stops detection.
        """
        self.stop_detection()

    @action
    def start_detection(self, input_device: Optional[Union[int, str]] = None, seconds: Optional[float] = None,
                        block_duration: Optional[float] = None) -> None:
        """
        Start the speech detection engine.

        :param input_device: Audio input device name/index override
        :param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll
            start running until ``stop_detection`` is called or application stop.
        :param block_duration: ``block_duration`` override.
        """
        assert not self._input_stream and not self._recording_thread, 'Speech detection is already running'
        block_duration = block_duration or self.block_duration
        input_device = input_device if input_device is not None else self.input_device
        self._stop_requested.clear()
        self._audio_queue = queue.Queue()
        self._recording_thread = threading.Thread(
            target=lambda: self.recording_thread(block_duration=block_duration, input_device=input_device))

        self._recording_thread.start()
        self._detection_thread = threading.Thread(target=lambda: self.detection_thread())
        self._detection_thread.start()

        if seconds:
            threading.Timer(seconds, lambda: self.stop_detection()).start()

    @action
    def stop_detection(self) -> None:
        """
        Stop the speech detection engine.

        It closes the audio input stream and waits up to ``_thread_stop_timeout`` seconds for each of the
        recording and detection threads to terminate.
        """
        # The stream is opened asynchronously by the recording thread, so the detection also counts as
        # running if the thread exists but the stream is not available (not opened yet, or failed to open).
        assert self._input_stream or self._recording_thread, 'Speech detection is not running'
        self._stop_requested.set()

        # Unpublish the stream before closing it, so the recording loop exits instead of retrying reads
        # on a closed stream.
        stream, self._input_stream = self._input_stream, None
        if stream is not None:
            self._close_stream(stream)

        if self._recording_thread:
            self._recording_thread.join(timeout=self._thread_stop_timeout)
            self._recording_thread = None

        # Resetting the queue is the termination signal for the detection thread.
        self._audio_queue = None
        if self._detection_thread:
            self._detection_thread.join(timeout=self._thread_stop_timeout)
            self._detection_thread = None


# vim:sw=4:ts=4:et: