Implemented Mozilla DeepSpeech speech-to-text integration [closes #126]

2020-03-06 00:38:24 +01:00 · 2020-03-06 00:38:24 +01:00 · b0339754b2
commit b0339754b2
parent fc949ed9f1
17 changed files with 477 additions and 1 deletions
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@ -53,6 +53,7 @@ Backends
    platypush/backend/sensor.mcp3008.rst
    platypush/backend/sensor.motion.pwm3901.rst
    platypush/backend/sensor.serial.rst
    platypush/backend/stt.deepspeech.rst
    platypush/backend/tcp.rst
    platypush/backend/todoist.rst
    platypush/backend/travisci.rst
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -240,6 +240,7 @@ autodoc_mock_imports = ['googlesamples.assistant.grpc.audio_helpers',
                        'cpuinfo',
                        'psutil',
                        'openzwave',
                        'deepspeech',
                        ]
 sys.path.insert(0, os.path.abspath('../..'))
--- a/docs/source/events.rst
+++ b/docs/source/events.rst
@ -45,6 +45,7 @@ Events
    platypush/events/sensor.light.rst
    platypush/events/serial.rst
    platypush/events/sound.rst
    platypush/events/stt.rst
    platypush/events/todoist.rst
    platypush/events/torrent.rst
    platypush/events/travisci.rst
--- a/docs/source/platypush/backend/stt.deepspeech.rst
+++ b/docs/source/platypush/backend/stt.deepspeech.rst
@ -0,0 +1,5 @@
 ``platypush.backend.stt.deepspeech``
 ====================================
 .. automodule:: platypush.backend.stt.deepspeech
    :members:
--- a/docs/source/platypush/events/stt.rst
+++ b/docs/source/platypush/events/stt.rst
@ -0,0 +1,5 @@
 ``platypush.message.event.stt``
 ===============================
 .. automodule:: platypush.message.event.stt
    :members:
--- a/docs/source/platypush/plugins/stt.deepspeech.rst
+++ b/docs/source/platypush/plugins/stt.deepspeech.rst
@ -0,0 +1,5 @@
 ``platypush.plugins.stt.deepspeech``
 ====================================
 .. automodule:: platypush.plugins.stt.deepspeech
    :members:
--- a/docs/source/platypush/responses/stt.rst
+++ b/docs/source/platypush/responses/stt.rst
@ -0,0 +1,5 @@
 ``platypush.message.response.stt``
 ==================================
 .. automodule:: platypush.message.response.stt
    :members:
--- a/docs/source/plugins.rst
+++ b/docs/source/plugins.rst
@ -90,6 +90,7 @@ Plugins
    platypush/plugins/serial.rst
    platypush/plugins/shell.rst
    platypush/plugins/sound.rst
    platypush/plugins/stt.deepspeech.rst
    platypush/plugins/switch.rst
    platypush/plugins/switch.switchbot.rst
    platypush/plugins/switch.tplink.rst
--- a/docs/source/responses.rst
+++ b/docs/source/responses.rst
@ -6,15 +6,16 @@ Responses
    :maxdepth: 2
    :caption: Responses:
    platypush/responses/.rst
    platypush/responses/bluetooth.rst
    platypush/responses/camera.rst
    platypush/responses/camera.android.rst
    platypush/responses/chat.telegram.rst
    platypush/responses/deepspeech.rst
    platypush/responses/google.drive.rst
    platypush/responses/pihole.rst
    platypush/responses/ping.rst
    platypush/responses/printer.cups.rst
    platypush/responses/stt.rst
    platypush/responses/system.rst
    platypush/responses/todoist.rst
    platypush/responses/trello.rst
--- a/platypush/backend/stt/init.py
+++ b/platypush/backend/stt/init.py
--- a/platypush/backend/stt/deepspeech.py
+++ b/platypush/backend/stt/deepspeech.py
@ -0,0 +1,48 @@
 import time
 from platypush.backend import Backend
 from platypush.context import get_plugin
 from platypush.plugins.stt.deepspeech import SttDeepspeechPlugin
 class SttDeepspeechBackend(Backend):
    """
    Backend for the Mozilla Deepspeech speech-to-text engine plugin. Enable this backend if you
    want to run the speech-to-text engine continuously instead of programmatically using
    ``start_detection`` and ``stop_detection``.

    Requires:

        - The :class:`platypush.plugins.stt.deepspeech.SttDeepspeechPlugin` plugin configured and its dependencies
          installed, as well as the language model files.
    """
    def __init__(self, retry_sleep: float = 5.0, *args, **kwargs):
        """
        :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
            (default: 5 seconds).
        """
        super().__init__(*args, **kwargs)
        self.retry_sleep = retry_sleep
    def run(self):
        """
        Backend main loop. Until the backend is asked to stop, it starts the detection on the
        ``stt.deepspeech`` plugin and waits for its detection thread to terminate. Any error is logged and
        the loop retries after ``retry_sleep`` seconds.
        """
        super().run()
        self.logger.info('Starting Mozilla Deepspeech speech-to-text backend')
        while not self.should_stop():
            try:
                plugin: SttDeepspeechPlugin = get_plugin('stt.deepspeech')
                # Entering the plugin context already calls ``start_detection()``. Calling it a second
                # time here would either trip the "already running" assertion or spawn a duplicate set of
                # recording/detection threads, so we only wait for the detection thread.
                with plugin:
                    # noinspection PyProtectedMember
                    plugin._detection_thread.join()
            except Exception as e:
                self.logger.exception(e)
                self.logger.warning('Deepspeech backend encountered an unexpected error, retrying in {} seconds'.
                                    format(self.retry_sleep))
                time.sleep(self.retry_sleep)
 # vim:sw=4:ts=4:et:
--- a/platypush/message/event/stt.py
+++ b/platypush/message/event/stt.py
@ -0,0 +1,61 @@
 from platypush.message.event import Event
 class SttEvent(Event):
    """
    Common parent class of all the speech-to-text events.
    """
    def __init__(self, *args, **kwargs):
        # No state of its own: every argument is forwarded untouched to the parent event class.
        super(SttEvent, self).__init__(*args, **kwargs)
 class SpeechStartedEvent(SttEvent):
    """
    Event fired when the engine begins to pick up speech.
    """
 class SpeechDetectedEvent(SttEvent):
    """
    Event fired when some speech has been detected.
    """
    def __init__(self, speech: str, *args, **kwargs):
        """
        :param speech: Detected speech, as a string. Leading and trailing whitespaces are stripped before
            the value is stored on the event.
        """
        cleaned_speech = speech.strip()
        super().__init__(*args, speech=cleaned_speech, **kwargs)
 class ConversationDetectedEvent(SpeechDetectedEvent):
    """
    Event fired when some speech is detected right after a hotword.
    """
 class HotwordDetectedEvent(SttEvent):
    """
    Event fired when a user-defined hotword has been detected.
    """
    def __init__(self, hotword: str = '', *args, **kwargs):
        """
        :param hotword: The hotword that was detected (default: empty string).
        """
        # ``hotword`` is a named parameter, so it can never already be part of ``kwargs``.
        kwargs['hotword'] = hotword
        super().__init__(*args, **kwargs)
 class SpeechDetectionStartedEvent(SttEvent):
    """
    Event fired when the speech detection engine is started.
    """
 class SpeechDetectionStoppedEvent(SttEvent):
    """
    Event fired when the speech detection engine is stopped.
    """
 # vim:sw=4:ts=4:et:
--- a/platypush/message/response/stt.py
+++ b/platypush/message/response/stt.py
@ -0,0 +1,11 @@
 from platypush.message.response import Response
 class SpeechDetectedResponse(Response):
    """
    Response whose output is a dictionary holding the detected text under the ``speech`` key.
    """
    def __init__(self, *args, speech: str, **kwargs):
        """
        :param speech: Detected speech, as a string (keyword-only).
        """
        payload = {'speech': speech}
        super().__init__(*args, output=payload, **kwargs)
 # vim:sw=4:ts=4:et:
--- a/platypush/plugins/stt/init.py
+++ b/platypush/plugins/stt/init.py
--- a/platypush/plugins/stt/deepspeech.py
+++ b/platypush/plugins/stt/deepspeech.py
@ -0,0 +1,324 @@
 from __future__ import annotations
 import queue
 import os
 import threading
 from typing import Optional, Union, List
 import deepspeech
 import numpy as np
 import sounddevice as sd
 import wave
 from platypush.context import get_bus
 from platypush.message.event.stt import SpeechDetectionStartedEvent, SpeechDetectionStoppedEvent, SpeechStartedEvent, \
    SpeechDetectedEvent, HotwordDetectedEvent, ConversationDetectedEvent
 from platypush.message.response.stt import SpeechDetectedResponse
 from platypush.plugins import Plugin, action
 class SttDeepspeechPlugin(Plugin):
    """
    This plugin performs speech-to-text and speech detection using the
    `Mozilla DeepSpeech <https://github.com/mozilla/DeepSpeech>`_ engine.
    Triggers:
        * :class:`platypush.message.event.stt.SpeechStartedEvent` when speech starts being detected.
        * :class:`platypush.message.event.stt.SpeechDetectedEvent` when speech is detected.
        * :class:`platypush.message.event.stt.SpeechDetectionStartedEvent` when speech detection starts.
        * :class:`platypush.message.event.stt.SpeechDetectionStoppedEvent` when speech detection stops.
        * :class:`platypush.message.event.stt.HotwordDetectedEvent` when a user-defined hotword is detected.
        * :class:`platypush.message.event.stt.ConversationDetectedEvent` when speech is detected after a hotword.
    Requires:
        * **deepspeech** (``pip install 'deepspeech>=0.6.0'``)
        * **numpy** (``pip install numpy``)
        * **sounddevice** (``pip install sounddevice``)
    """
    _thread_stop_timeout = 10.0
    rate = 16000
    channels = 1
    def __init__(self,
                 model_file: str,
                 lm_file: str,
                 trie_file: str,
                 lm_alpha: float = 0.75,
                 lm_beta: float = 1.85,
                 beam_width: int = 500,
                 input_device: Optional[Union[int, str]] = None,
                 hotword: Optional[str] = None,
                 hotwords: Optional[List[str]] = None,
                 conversation_timeout: Optional[float] = None,
                 block_duration: float = 1.0):
        """
        In order to run the speech-to-text engine you'll need to download the right model files for the
        Deepspeech engine that you have installed:

        .. code-block:: shell

            # Create the working folder for the models
            export MODELS_DIR=~/models
            mkdir -p $MODELS_DIR
            cd $MODELS_DIR
            # Download and extract the model files for your version of Deepspeech. This may take a while.
            export DEEPSPEECH_VERSION=0.6.1
            wget https://github.com/mozilla/DeepSpeech/releases/download/v$DEEPSPEECH_VERSION/deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
            tar -xvzf deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
            x deepspeech-0.6.1-models/
            x deepspeech-0.6.1-models/lm.binary
            x deepspeech-0.6.1-models/output_graph.pbmm
            x deepspeech-0.6.1-models/output_graph.pb
            x deepspeech-0.6.1-models/trie
            x deepspeech-0.6.1-models/output_graph.tflite

        :param model_file: Path to the model file (usually named ``output_graph.pb`` or ``output_graph.pbmm``).
            Note that ``.pbmm`` files usually perform better and are smaller.
        :param lm_file: Path to the language model binary file (usually named ``lm.binary``).
        :param trie_file: The path to the trie file built from the same vocabulary as the language model binary
            (usually named ``trie``).
        :param lm_alpha: The alpha hyperparameter of the CTC decoder - Language Model weight.
            See <https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0>.
        :param lm_beta: The beta hyperparameter of the CTC decoder - Word Insertion weight.
            See <https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0>.
        :param beam_width: Decoder beam width (see beam scoring in KenLM language model).
        :param input_device: PortAudio device index or name that will be used for recording speech (default: default
            system audio input device).
        :param hotword: When this word is detected, the plugin will trigger a
            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
            assistants. If set, it takes precedence over ``hotwords``.
        :param hotwords: Use a list of hotwords instead of a single one.
        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
            the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
            instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
            here to run any logic depending on the detected speech - it can emulate a kind of
            "OK, Google. Turn on the lights" interaction without using an external assistant.
        :param block_duration: Duration of the acquired audio blocks (default: 1 second).
        """
        super().__init__()
        # Model resources: user-relative paths (``~``) are expanded and made absolute.
        self.model_file, self.lm_file, self.trie_file = (
            os.path.abspath(os.path.expanduser(path))
            for path in (model_file, lm_file, trie_file)
        )
        # Decoder settings
        self.lm_alpha = lm_alpha
        self.lm_beta = lm_beta
        self.beam_width = beam_width
        # Audio acquisition settings
        self.input_device = input_device
        self.block_duration = block_duration
        # Hotword/conversation settings. A single ``hotword`` replaces the ``hotwords`` list.
        self.conversation_timeout = conversation_timeout
        self.hotwords = set(hotwords) if hotwords else set()
        if hotword:
            self.hotwords = {hotword}
        # Runtime state, populated when the model is loaded or the detection is started.
        self._conversation_event = threading.Event()
        self._model: Optional[deepspeech.Model] = None
        self._input_stream: Optional[sd.InputStream] = None
        self._recording_thread: Optional[threading.Thread] = None
        self._detection_thread: Optional[threading.Thread] = None
        self._audio_queue: Optional[queue.Queue] = None
    def _get_model(self) -> deepspeech.Model:
        """
        Return the Deepspeech model, creating it on the first call.

        On creation the model is loaded from ``model_file`` with the configured beam width, and the
        language model decoder is enabled with ``lm_file``, ``trie_file``, ``lm_alpha`` and ``lm_beta``.
        Later calls return the cached instance.
        """
        if self._model:
            return self._model
        self._model = deepspeech.Model(self.model_file, self.beam_width)
        self._model.enableDecoderWithLM(self.lm_file, self.trie_file, self.lm_alpha, self.lm_beta)
        return self._model
    def _detect(self, data: Union[bytes, np.ndarray]) -> str:
        """
        Run a one-shot speech-to-text pass over a whole audio buffer.

        :param data: Raw audio, as bytes or as a numpy array.
        :return: The text returned by the model.
        """
        samples = self._convert_data(data)
        return self._get_model().stt(samples)
    @staticmethod
    def _convert_data(data: Union[np.ndarray, bytes]) -> np.ndarray:
        """
        Reinterpret a raw audio buffer as a flat array of 16-bit signed integers.

        :param data: Audio buffer (bytes or numpy array).
        :return: One-dimensional ``int16`` view over the buffer.
        """
        samples = np.frombuffer(data, dtype=np.int16)
        return samples
    def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
        """
        Get the index of the input device by index or name.

        :param device: Device index or name. If not set, the plugin's configured ``input_device`` is used; if that
            is not set either, the default input device of the first PortAudio host API is returned.
        :return: Index of the audio input device.
        :raises AssertionError: If the index is beyond the available devices, or no device has the given name.
        """
        # Compare against None/'' explicitly: 0 is a valid PortAudio device index and must not be
        # mistaken for "no device specified".
        if device is None or device == '':
            device = self.input_device
        if device is None or device == '':
            return sd.query_hostapis()[0].get('default_input_device')
        devices = sd.query_devices()
        if isinstance(device, int):
            # Valid indices stop at len(devices) - 1. The error is raised explicitly (instead of using
            # ``assert``) so that the check also survives ``python -O``.
            if device >= len(devices):
                raise AssertionError('Device index {} is out of range'.format(device))
            return device
        for i, dev in enumerate(devices):
            if dev['name'] == device:
                return i
        raise AssertionError('Device {} not found'.format(device))
    def _on_speech_detected(self, speech: str) -> None:
        """
        Hook invoked whenever some speech is detected. It posts on the bus:

            - A ``ConversationDetectedEvent`` if a conversation is currently open.
            - A ``HotwordDetectedEvent`` if the text matches one of the configured hotwords. If
              ``conversation_timeout`` is set, a conversation is also opened and a timer closes it
              after that many seconds.
            - A ``SpeechDetectedEvent`` in any other case.

        :param speech: Detected speech.
        """
        text = speech.strip()
        if self._conversation_event.is_set():
            event = ConversationDetectedEvent(speech=text)
        elif text not in self.hotwords:
            event = SpeechDetectedEvent(speech=text)
        else:
            event = HotwordDetectedEvent(hotword=text)
            if self.conversation_timeout:
                self._conversation_event.set()
                reset_timer = threading.Timer(self.conversation_timeout, self._conversation_event.clear)
                reset_timer.start()
        get_bus().post(event)
    def detection_thread(self) -> None:
        """
        Speech detection thread. Reads from the ``audio_queue`` and uses the Deepspeech model to detect
        speech real-time.

        Each audio block is fed to a model stream and an intermediate decode is performed. When the decoded
        text stops changing between two consecutive blocks, a non-empty text is reported through
        ``_on_speech_detected`` and the stream is finished. The thread exits once ``_audio_queue`` is unset.
        """
        self.logger.debug('Detection thread started')
        model = self._get_model()
        current_text = ''
        context = None
        # Maximum time (in seconds) a single queue read may block. Without a timeout an idle thread would
        # wait forever on the old queue object and never notice that ``_audio_queue`` has been unset.
        poll_timeout = 1.0
        while True:
            # Work on a local reference, so that a concurrent reset of the attribute can't break the read.
            audio_queue = self._audio_queue
            if not audio_queue:
                break
            if not context:
                context = model.createStream()
            try:
                frames = audio_queue.get(timeout=poll_timeout)
                frames = self._convert_data(frames)
            except queue.Empty:
                # No new audio within the polling interval: just check again whether we should stop.
                continue
            except Exception as e:
                self.logger.warning('Error while feeding audio to the model: {}'.format(str(e)))
                continue
            model.feedAudioContent(context, frames)
            text = model.intermediateDecode(context)
            if text == current_text:
                if current_text:
                    self._on_speech_detected(current_text)
                    model.finishStream(context)
                    context = None
                current_text = ''
            else:
                if not current_text:
                    get_bus().post(SpeechStartedEvent())
                self.logger.info('Intermediate speech results: [{}]'.format(text))
                current_text = text
        if context:
            # Don't leave a dangling model stream behind when the detection is stopped.
            try:
                model.finishStream(context)
            except Exception as e:
                self.logger.warning('Error while closing the speech stream: {}'.format(str(e)))
        self.logger.debug('Detection thread terminated')
    def recording_thread(self, block_duration: float, input_device: Optional[str] = None) -> None:
        """
        Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.

        A ``SpeechDetectionStartedEvent`` is posted once the input stream is started, and a
        ``SpeechDetectionStoppedEvent`` once the stream has been unset and the read loop ends.

        :param block_duration: Audio blocks duration, in seconds.
        :param input_device: Input device
        """
        self.logger.debug('Recording thread started')
        device = self._get_input_device(input_device)
        # PortAudio block sizes are expressed in frames (one sample per channel), so the number of
        # channels must not be part of the computation.
        blocksize = int(self.rate * block_duration)
        self._input_stream = sd.InputStream(samplerate=self.rate, device=device,
                                            channels=self.channels, dtype='int16', latency=0,
                                            blocksize=blocksize)
        self._input_stream.start()
        get_bus().post(SpeechDetectionStartedEvent())
        while self._input_stream:
            try:
                # Read exactly one block, so that ``block_duration`` is honoured (a fixed read of
                # ``self.rate`` frames would always return one second of audio).
                frames = self._input_stream.read(blocksize)[0]
            except Exception as e:
                self.logger.warning('Error while reading from the audio input: {}'.format(str(e)))
                continue
            self._audio_queue.put(frames)
        get_bus().post(SpeechDetectionStoppedEvent())
        self.logger.debug('Recording thread terminated')
    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file.

        The file is opened with the ``wave`` module and all of its frames are passed to the model as 16-bit
        samples. No check or conversion of the sample rate or number of channels is performed here.

        :param audio_file: Path to the audio file.
        :return: A :class:`platypush.message.response.stt.SpeechDetectedResponse` with the detected speech.
        """
        audio_file = os.path.abspath(os.path.expanduser(audio_file))
        # The context manager makes sure that the file handle is released even if the read fails.
        with wave.open(audio_file, 'rb') as wav:
            buffer = wav.readframes(wav.getnframes())
        speech = self._detect(buffer)
        return SpeechDetectedResponse(speech=speech)
    def __enter__(self) -> SttDeepspeechPlugin:
        """
        Enter the plugin context: the speech detection is started with the default settings.

        :return: The plugin itself.
        """
        self.start_detection()
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Leave the plugin context: the speech detection is stopped. The exception details, if any, are
        ignored and nothing is returned, so an exception raised inside the ``with`` body keeps propagating.
        """
        self.stop_detection()
    @action
    def start_detection(self, input_device: Optional[str] = None, seconds: Optional[float] = None,
                        block_duration: Optional[float] = None) -> None:
        """
        Start the speech detection engine. A recording thread and a detection thread are spawned, and they
        communicate through a freshly created audio queue.

        :param input_device: Audio input device name/index override
        :param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll
            keep running until ``stop_detection`` is called or the application stops.
        :param block_duration: ``block_duration`` override.
        """
        assert not self._input_stream, 'Speech detection is already running'
        # Fall back to the plugin configuration for the settings that haven't been overridden.
        if not block_duration:
            block_duration = self.block_duration
        if input_device is None:
            input_device = self.input_device
        self._audio_queue = queue.Queue()
        recorder_args = {'block_duration': block_duration, 'input_device': input_device}
        self._recording_thread = threading.Thread(target=self.recording_thread, kwargs=recorder_args)
        self._recording_thread.start()
        self._detection_thread = threading.Thread(target=self.detection_thread)
        self._detection_thread.start()
        if seconds:
            # Schedule the automatic stop.
            stop_timer = threading.Timer(seconds, self.stop_detection)
            stop_timer.start()
    @action
    def stop_detection(self) -> None:
        """
        Stop the speech detection engine.

        The input stream is stopped, closed and unset, then the recording thread is joined. After that the
        audio queue is unset and the detection thread is joined. Each join waits up to
        ``_thread_stop_timeout`` seconds.
        """
        stream = self._input_stream
        assert stream, 'Speech detection is not running'
        stream.stop(ignore_errors=True)
        stream.close(ignore_errors=True)
        self._input_stream = None
        recorder = self._recording_thread
        if recorder:
            recorder.join(timeout=self._thread_stop_timeout)
        # The queue is only unset once the recording thread has been joined.
        self._audio_queue = None
        detector = self._detection_thread
        if detector:
            detector.join(timeout=self._thread_stop_timeout)
 # vim:sw=4:ts=4:et:
--- a/requirements.txt
+++ b/requirements.txt
@ -229,3 +229,8 @@ croniter
 # Support for Z-Wave
 # python-openzwave
 # Support for DeepSpeech
 # deepspeech
 # numpy
 # sounddevice
--- a/setup.py
+++ b/setup.py
@ -283,5 +283,7 @@ setup(
        'zigbee': ['paho-mqtt'],
        # Support for Z-Wave
        'zwave': ['python-openzwave'],
        # Support for DeepSpeech
        'deepspeech': ['deepspeech', 'numpy','sounddevice'],
    },
 )