From b0339754b25f72cd2d3adc4062c1b70845d3fb24 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello
Date: Fri, 6 Mar 2020 00:38:24 +0100
Subject: [PATCH] Implemented Mozilla DeepSpeech speech-to-text integration
 [closes #126]

---
 docs/source/backends.rst                         |   1 +
 docs/source/conf.py                              |   1 +
 docs/source/events.rst                           |   1 +
 .../platypush/backend/stt.deepspeech.rst         |   5 +
 docs/source/platypush/events/stt.rst             |   5 +
 .../platypush/plugins/stt.deepspeech.rst         |   5 +
 docs/source/platypush/responses/stt.rst          |   5 +
 docs/source/plugins.rst                          |   1 +
 docs/source/responses.rst                        |   2 +-
 platypush/backend/stt/__init__.py                |   0
 platypush/backend/stt/deepspeech.py              |  48 +++
 platypush/message/event/stt.py                   |  61 ++++
 platypush/message/response/stt.py                |  11 +
 platypush/plugins/stt/__init__.py                |   0
 platypush/plugins/stt/deepspeech.py              | 324 ++++++++++++++++++
 requirements.txt                                 |   5 +
 setup.py                                         |   2 +
 17 files changed, 476 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/platypush/backend/stt.deepspeech.rst
 create mode 100644 docs/source/platypush/events/stt.rst
 create mode 100644 docs/source/platypush/plugins/stt.deepspeech.rst
 create mode 100644 docs/source/platypush/responses/stt.rst
 create mode 100644 platypush/backend/stt/__init__.py
 create mode 100644 platypush/backend/stt/deepspeech.py
 create mode 100644 platypush/message/event/stt.py
 create mode 100644 platypush/message/response/stt.py
 create mode 100644 platypush/plugins/stt/__init__.py
 create mode 100644 platypush/plugins/stt/deepspeech.py

diff --git a/docs/source/backends.rst b/docs/source/backends.rst
index bc4adf3a..866748e4 100644
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@@ -53,6 +53,7 @@ Backends
     platypush/backend/sensor.mcp3008.rst
     platypush/backend/sensor.motion.pwm3901.rst
     platypush/backend/sensor.serial.rst
+    platypush/backend/stt.deepspeech.rst
     platypush/backend/tcp.rst
     platypush/backend/todoist.rst
     platypush/backend/travisci.rst
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2156de70..d8026923 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -240,6 +240,7 @@ autodoc_mock_imports = ['googlesamples.assistant.grpc.audio_helpers',
                         'cpuinfo',
                         'psutil',
                         'openzwave',
+                        'deepspeech',
                         ]
 
 sys.path.insert(0, os.path.abspath('../..'))
diff --git a/docs/source/events.rst b/docs/source/events.rst
index a52cb0d9..708ab392 100644
--- a/docs/source/events.rst
+++ b/docs/source/events.rst
@@ -45,6 +45,7 @@ Events
     platypush/events/sensor.light.rst
     platypush/events/serial.rst
     platypush/events/sound.rst
+    platypush/events/stt.rst
     platypush/events/todoist.rst
     platypush/events/torrent.rst
     platypush/events/travisci.rst
diff --git a/docs/source/platypush/backend/stt.deepspeech.rst b/docs/source/platypush/backend/stt.deepspeech.rst
new file mode 100644
index 00000000..06f73f1e
--- /dev/null
+++ b/docs/source/platypush/backend/stt.deepspeech.rst
@@ -0,0 +1,5 @@
+``platypush.backend.stt.deepspeech``
+====================================
+
+.. automodule:: platypush.backend.stt.deepspeech
+    :members:
diff --git a/docs/source/platypush/events/stt.rst b/docs/source/platypush/events/stt.rst
new file mode 100644
index 00000000..cb6d3bdf
--- /dev/null
+++ b/docs/source/platypush/events/stt.rst
@@ -0,0 +1,5 @@
+``platypush.message.event.stt``
+===============================
+
+.. automodule:: platypush.message.event.stt
+    :members:
diff --git a/docs/source/platypush/plugins/stt.deepspeech.rst b/docs/source/platypush/plugins/stt.deepspeech.rst
new file mode 100644
index 00000000..a996d53c
--- /dev/null
+++ b/docs/source/platypush/plugins/stt.deepspeech.rst
@@ -0,0 +1,5 @@
+``platypush.plugins.stt.deepspeech``
+====================================
+
+.. automodule:: platypush.plugins.stt.deepspeech
+    :members:
diff --git a/docs/source/platypush/responses/stt.rst b/docs/source/platypush/responses/stt.rst
new file mode 100644
index 00000000..84acce1b
--- /dev/null
+++ b/docs/source/platypush/responses/stt.rst
@@ -0,0 +1,5 @@
+``platypush.message.response.stt``
+==================================
+
+.. automodule:: platypush.message.response.stt
+    :members:
diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst
index e9836fc5..137f74fc 100644
--- a/docs/source/plugins.rst
+++ b/docs/source/plugins.rst
@@ -90,6 +90,7 @@ Plugins
     platypush/plugins/serial.rst
     platypush/plugins/shell.rst
     platypush/plugins/sound.rst
+    platypush/plugins/stt.deepspeech.rst
     platypush/plugins/switch.rst
     platypush/plugins/switch.switchbot.rst
     platypush/plugins/switch.tplink.rst
diff --git a/docs/source/responses.rst b/docs/source/responses.rst
index 38556fd2..0bbca198 100644
--- a/docs/source/responses.rst
+++ b/docs/source/responses.rst
@@ -6,15 +6,15 @@ Responses
     :maxdepth: 2
     :caption: Responses:
 
     platypush/responses/bluetooth.rst
     platypush/responses/camera.rst
     platypush/responses/camera.android.rst
     platypush/responses/chat.telegram.rst
-    platypush/responses/deepspeech.rst
     platypush/responses/google.drive.rst
     platypush/responses/pihole.rst
     platypush/responses/ping.rst
     platypush/responses/printer.cups.rst
+    platypush/responses/stt.rst
     platypush/responses/system.rst
     platypush/responses/todoist.rst
     platypush/responses/trello.rst
diff --git a/platypush/backend/stt/__init__.py b/platypush/backend/stt/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/platypush/backend/stt/deepspeech.py b/platypush/backend/stt/deepspeech.py
new file mode 100644
index 00000000..22916874
--- /dev/null
+++ b/platypush/backend/stt/deepspeech.py
@@ -0,0 +1,48 @@
+import time
+
+from platypush.backend import Backend
+from platypush.context import get_plugin
+from platypush.plugins.stt.deepspeech import SttDeepspeechPlugin
+
+
+class SttDeepspeechBackend(Backend):
+    """
+    Backend for the Mozilla DeepSpeech speech-to-text engine plugin. Enable this backend if you want to run the
+    speech-to-text engine continuously, instead of starting and stopping it programmatically through
+    ``start_detection`` and ``stop_detection``.
+
+    Requires:
+
+        * The :class:`platypush.plugins.stt.deepspeech.SttDeepspeechPlugin` plugin configured, its dependencies
+          installed and the language model files available.
+
+    """
+
+    def __init__(self, retry_sleep: float = 5.0, *args, **kwargs):
+        """
+        :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
+            (default: 5 seconds).
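+
+        A minimal configuration sketch for this backend, assuming that the ``stt.deepspeech`` plugin is
+        configured as well (see the plugin documentation for its options):
+
+        .. code-block:: yaml
+
+            backend.stt.deepspeech:
+                retry_sleep: 10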
+ """ + super().__init__(*args, **kwargs) + self.retry_sleep = retry_sleep + + def run(self): + super().run() + self.logger.info('Starting Mozilla Deepspeech speech-to-text backend') + + while not self.should_stop(): + try: + plugin: SttDeepspeechPlugin = get_plugin('stt.deepspeech') + with plugin: + plugin.start_detection() + # noinspection PyProtectedMember + plugin._detection_thread.join() + except Exception as e: + self.logger.exception(e) + self.logger.warning('Deepspeech backend encountered an unexpected error, retrying in {} seconds'. + format(self.retry_sleep)) + + time.sleep(self.retry_sleep) + + +# vim:sw=4:ts=4:et: diff --git a/platypush/message/event/stt.py b/platypush/message/event/stt.py new file mode 100644 index 00000000..794c501f --- /dev/null +++ b/platypush/message/event/stt.py @@ -0,0 +1,61 @@ +from platypush.message.event import Event + + +class SttEvent(Event): + """ Base class for speech-to-text events """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class SpeechStartedEvent(SttEvent): + """ + Event triggered when speech starts being detected. + """ + pass + + +class SpeechDetectedEvent(SttEvent): + """ + Event triggered when speech is detected. + """ + + def __init__(self, speech: str, *args, **kwargs): + """ + :param speech: Speech detected, as a string + """ + super().__init__(*args, speech=speech.strip(), **kwargs) + + +class ConversationDetectedEvent(SpeechDetectedEvent): + """ + Event triggered when speech is detected after a hotword. + """ + pass + +class HotwordDetectedEvent(SttEvent): + """ + Event triggered when a custom hotword is detected. + """ + + def __init__(self, hotword: str = '', *args, **kwargs): + """ + :param hotword: The detected user hotword. + """ + super().__init__(*args, hotword=hotword, **kwargs) + + +class SpeechDetectionStartedEvent(SttEvent): + """ + Event triggered when the speech detection engine starts. + """ + pass + + +class SpeechDetectionStoppedEvent(SttEvent): + """ + Event triggered when the speech detection engine stops. 
+ """ + pass + + +# vim:sw=4:ts=4:et: diff --git a/platypush/message/response/stt.py b/platypush/message/response/stt.py new file mode 100644 index 00000000..7ba37920 --- /dev/null +++ b/platypush/message/response/stt.py @@ -0,0 +1,11 @@ +from platypush.message.response import Response + + +class SpeechDetectedResponse(Response): + def __init__(self, *args, speech: str, **kwargs): + super().__init__(*args, output={ + 'speech': speech + }, **kwargs) + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/stt/__init__.py b/platypush/plugins/stt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/platypush/plugins/stt/deepspeech.py b/platypush/plugins/stt/deepspeech.py new file mode 100644 index 00000000..84fd8b6d --- /dev/null +++ b/platypush/plugins/stt/deepspeech.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import queue +import os +import threading +from typing import Optional, Union, List + +import deepspeech +import numpy as np +import sounddevice as sd +import wave + +from platypush.context import get_bus +from platypush.message.event.stt import SpeechDetectionStartedEvent, SpeechDetectionStoppedEvent, SpeechStartedEvent, \ + SpeechDetectedEvent, HotwordDetectedEvent, ConversationDetectedEvent +from platypush.message.response.stt import SpeechDetectedResponse +from platypush.plugins import Plugin, action + + +class SttDeepspeechPlugin(Plugin): + """ + This plugin performs speech-to-text and speech detection using the + `Mozilla DeepSpeech `_ engine. + + Triggers: + + * :class:`platypush.message.event.stt.SpeechStartedEvent` when speech starts being detected. + * :class:`platypush.message.event.stt.SpeechDetectedEvent` when speech is detected. + * :class:`platypush.message.event.stt.SpeechDetectionStartedEvent` when speech detection starts. + * :class:`platypush.message.event.stt.SpeechDetectionStoppedEvent` when speech detection stops. + * :class:`platypush.message.event.stt.HotwordDetectedEvent` when a user-defined hotword is detected. + * :class:`platypush.message.event.stt.ConversationDetectedEvent` when speech is detected after a hotword. + + Requires: + + * **deepspeech** (``pip install 'deepspeech>=0.6.0'``) + * **numpy** (``pip install numpy``) + * **sounddevice** (``pip install sounddevice``) + + """ + + _thread_stop_timeout = 10.0 + rate = 16000 + channels = 1 + + def __init__(self, + model_file: str, + lm_file: str, + trie_file: str, + lm_alpha: float = 0.75, + lm_beta: float = 1.85, + beam_width: int = 500, + input_device: Optional[Union[int, str]] = None, + hotword: Optional[str] = None, + hotwords: Optional[List[str]] = None, + conversation_timeout: Optional[float] = None, + block_duration: float = 1.0): + """ + In order to run the speech-to-text engine you'll need to download the right model files for the + Deepspeech engine that you have installed: + + .. code-block:: shell + + # Create the working folder for the models + export MODELS_DIR=~/models + mkdir -p $MODELS_DIR + cd $MODELS_DIR + + # Download and extract the model files for your version of Deepspeech. This may take a while. 
+
+        :param model_file: Path to the model file (usually named ``output_graph.pb`` or ``output_graph.pbmm``).
+            Note that ``.pbmm`` models usually perform better and are smaller.
+        :param lm_file: Path to the language model binary file (usually named ``lm.binary``).
+        :param trie_file: Path to the trie file built from the same vocabulary as the language model binary
+            (usually named ``trie``).
+        :param lm_alpha: The alpha hyperparameter of the CTC decoder - the language model weight. See the
+            DeepSpeech documentation for details.
+        :param lm_beta: The beta hyperparameter of the CTC decoder - the word insertion weight. See the
+            DeepSpeech documentation for details.
+        :param beam_width: Decoder beam width (see beam scoring in the KenLM language model).
+        :param input_device: PortAudio device index or name that will be used for recording speech (default: the
+            default system audio input device).
+        :param hotword: When this word is detected, the plugin will trigger a
+            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
+            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events to hook
+            other assistants.
+        :param hotwords: Use a list of hotwords instead of a single one.
+        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set together with
+            ``conversation_timeout``, the next speech detected within this timeout will trigger a
+            :class:`platypush.message.event.stt.ConversationDetectedEvent` instead of a
+            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can attach custom event hooks
+            to run any logic depending on the detected speech - for instance, to emulate an
+            "OK, Google. Turn on the lights" interaction without an external assistant.
+        :param block_duration: Duration of the acquired audio blocks, in seconds (default: 1 second).
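+
+        A hypothetical configuration sketch for this plugin (the model paths and the hotword are assumptions
+        to be adapted to your setup):
+
+        .. code-block:: yaml
+
+            stt.deepspeech:
+                model_file: ~/models/deepspeech-0.6.1-models/output_graph.pbmm
+                lm_file: ~/models/deepspeech-0.6.1-models/lm.binary
+                trie_file: ~/models/deepspeech-0.6.1-models/trie
+                hotword: computer
+                conversation_timeout: 5.0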
+ """ + + super().__init__() + self.model_file = os.path.abspath(os.path.expanduser(model_file)) + self.lm_file = os.path.abspath(os.path.expanduser(lm_file)) + self.trie_file = os.path.abspath(os.path.expanduser(trie_file)) + self.lm_alpha = lm_alpha + self.lm_beta = lm_beta + self.beam_width = beam_width + self.input_device = input_device + self.conversation_timeout = conversation_timeout + self.block_duration = block_duration + + self.hotwords = set(hotwords or []) + if hotword: + self.hotwords = {hotword} + + self._conversation_event = threading.Event() + self._model: Optional[deepspeech.Model] = None + self._input_stream: Optional[sd.InputStream] = None + self._recording_thread: Optional[threading.Thread] = None + self._detection_thread: Optional[threading.Thread] = None + self._audio_queue: Optional[queue.Queue] = None + + def _get_model(self) -> deepspeech.Model: + if not self._model: + self._model = deepspeech.Model(self.model_file, self.beam_width) + self._model.enableDecoderWithLM(self.lm_file, self.trie_file, self.lm_alpha, self.lm_beta) + + return self._model + + def _detect(self, data: Union[bytes, np.ndarray]) -> str: + data = self._convert_data(data) + model = self._get_model() + return model.stt(data) + + @staticmethod + def _convert_data(data: Union[np.ndarray, bytes]) -> np.ndarray: + return np.frombuffer(data, dtype=np.int16) + + def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int: + """ + Get the index of the input device by index or name. + + :param device: Device index or name. If None is set then the function will return the index of the + default audio input device. + :return: Index of the audio input device. + """ + if not device: + device = self.input_device + if not device: + return sd.query_hostapis()[0].get('default_input_device') + + if isinstance(device, int): + assert device <= len(sd.query_devices()) + return device + + for i, dev in enumerate(sd.query_devices()): + if dev['name'] == device: + return i + + raise AssertionError('Device {} not found'.format(device)) + + def _on_speech_detected(self, speech: str) -> None: + """ + Hook called when speech is detected. Triggers the right event depending on the current context. + + :param speech: Detected speech. + """ + speech = speech.strip() + + if self._conversation_event.is_set(): + event = ConversationDetectedEvent(speech=speech) + elif speech in self.hotwords: + event = HotwordDetectedEvent(hotword=speech) + if self.conversation_timeout: + self._conversation_event.set() + threading.Timer(self.conversation_timeout, lambda: self._conversation_event.clear()).start() + else: + event = SpeechDetectedEvent(speech=speech) + + get_bus().post(event) + + def detection_thread(self) -> None: + """ + Speech detection thread. Reads from the ``audio_queue`` and uses the Deepspeech model to detect + speech real-time. 
+ """ + self.logger.debug('Detection thread started') + model = self._get_model() + current_text = '' + context = None + + while self._audio_queue: + if not context: + context = model.createStream() + + try: + frames = self._audio_queue.get() + frames = self._convert_data(frames) + except Exception as e: + self.logger.warning('Error while feeding audio to the model: {}'.format(str(e))) + continue + + model.feedAudioContent(context, frames) + text = model.intermediateDecode(context) + + if text == current_text: + if current_text: + self._on_speech_detected(current_text) + model.finishStream(context) + context = None + + current_text = '' + else: + if not current_text: + get_bus().post(SpeechStartedEvent()) + + self.logger.info('Intermediate speech results: [{}]'.format(text)) + current_text = text + + self.logger.debug('Detection thread terminated') + + def recording_thread(self, block_duration: float, input_device: Optional[str] = None) -> None: + """ + Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``. + + :param block_duration: Audio blocks duration. + :param input_device: Input device + """ + self.logger.debug('Recording thread started') + device = self._get_input_device(input_device) + blocksize = int(self.rate * self.channels * block_duration) + self._input_stream = sd.InputStream(samplerate=self.rate, device=device, + channels=self.channels, dtype='int16', latency=0, + blocksize=blocksize) + self._input_stream.start() + get_bus().post(SpeechDetectionStartedEvent()) + + while self._input_stream: + try: + frames = self._input_stream.read(self.rate)[0] + except Exception as e: + self.logger.warning('Error while reading from the audio input: {}'.format(str(e))) + continue + + self._audio_queue.put(frames) + + get_bus().post(SpeechDetectionStoppedEvent()) + self.logger.debug('Recording thread terminated') + + @action + def detect(self, audio_file: str) -> SpeechDetectedResponse: + """ + Perform speech-to-text analysis on an audio file. + + :param audio_file: Path to the audio file. + """ + audio_file = os.path.abspath(os.path.expanduser(audio_file)) + wav = wave.open(audio_file, 'r') + buffer = wav.readframes(wav.getnframes()) + speech = self._detect(buffer) + return SpeechDetectedResponse(speech=speech) + + def __enter__(self) -> SttDeepspeechPlugin: + """ + Context manager enter. Starts detection and returns self. + """ + self.start_detection() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Context manager exit. Stops detection. + """ + self.stop_detection() + + @action + def start_detection(self, input_device: Optional[str] = None, seconds: Optional[float] = None, + block_duration: Optional[float] = None) -> None: + """ + Start the speech detection engine. + + :param input_device: Audio input device name/index override + :param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll + start running until ``stop_detection`` is called or application stop. + :param block_duration: ``block_duration`` override. 
+ """ + assert not self._input_stream, 'Speech detection is already running' + block_duration = block_duration or self.block_duration + input_device = input_device if input_device is not None else self.input_device + self._audio_queue = queue.Queue() + self._recording_thread = threading.Thread( + target=lambda: self.recording_thread(block_duration=block_duration, input_device=input_device)) + + self._recording_thread.start() + self._detection_thread = threading.Thread(target=lambda: self.detection_thread()) + self._detection_thread.start() + + if seconds: + threading.Timer(seconds, lambda: self.stop_detection()).start() + + @action + def stop_detection(self) -> None: + """ + Stop the speech detection engine. + """ + assert self._input_stream, 'Speech detection is not running' + self._input_stream.stop(ignore_errors=True) + self._input_stream.close(ignore_errors=True) + self._input_stream = None + + if self._recording_thread: + self._recording_thread.join(timeout=self._thread_stop_timeout) + + self._audio_queue = None + if self._detection_thread: + self._detection_thread.join(timeout=self._thread_stop_timeout) + + +# vim:sw=4:ts=4:et: diff --git a/requirements.txt b/requirements.txt index c4f2c4a3..cdd720ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -229,3 +229,8 @@ croniter # Support for Z-Wave # python-openzwave + +# Support for DeepSpeech +# deepspeech +# numpy +# sounddevice diff --git a/setup.py b/setup.py index f9ee64d8..92838578 100755 --- a/setup.py +++ b/setup.py @@ -283,5 +283,7 @@ setup( 'zigbee': ['paho-mqtt'], # Support for Z-Wave 'zwave': ['python-openzwave'], + # Support for DeepSpeech + 'deepspeech': ['deepspeech', 'numpy','sounddevice'], }, )