Implemented Mozilla DeepSpeech speech-to-text integration [closes #126]

Fabio Manganiello 2020-03-06 00:38:24 +01:00
parent fc949ed9f1
commit b0339754b2
17 changed files with 477 additions and 1 deletion

docs/source/backends.rst View File

@@ -53,6 +53,7 @@ Backends
platypush/backend/sensor.mcp3008.rst
platypush/backend/sensor.motion.pwm3901.rst
platypush/backend/sensor.serial.rst
platypush/backend/stt.deepspeech.rst
platypush/backend/tcp.rst
platypush/backend/todoist.rst
platypush/backend/travisci.rst

docs/source/conf.py View File

@@ -240,6 +240,7 @@ autodoc_mock_imports = ['googlesamples.assistant.grpc.audio_helpers',
'cpuinfo',
'psutil',
'openzwave',
'deepspeech',
]
sys.path.insert(0, os.path.abspath('../..'))
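Listing ``deepspeech`` in ``autodoc_mock_imports`` lets Sphinx import and document the new modules without the native DeepSpeech library being installed on the docs build host.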

docs/source/events.rst View File

@@ -45,6 +45,7 @@ Events
platypush/events/sensor.light.rst
platypush/events/serial.rst
platypush/events/sound.rst
platypush/events/stt.rst
platypush/events/todoist.rst
platypush/events/torrent.rst
platypush/events/travisci.rst

docs/source/platypush/backend/stt.deepspeech.rst View File

@@ -0,0 +1,5 @@
``platypush.backend.stt.deepspeech``
====================================
.. automodule:: platypush.backend.stt.deepspeech
:members:

docs/source/platypush/events/stt.rst View File

@@ -0,0 +1,5 @@
``platypush.message.event.stt``
===============================
.. automodule:: platypush.message.event.stt
:members:

docs/source/platypush/plugins/stt.deepspeech.rst View File

@@ -0,0 +1,5 @@
``platypush.plugins.stt.deepspeech``
====================================
.. automodule:: platypush.plugins.stt.deepspeech
:members:

docs/source/platypush/responses/stt.rst View File

@@ -0,0 +1,5 @@
``platypush.message.response.stt``
==================================
.. automodule:: platypush.message.response.stt
:members:

docs/source/plugins.rst View File

@@ -90,6 +90,7 @@ Plugins
platypush/plugins/serial.rst
platypush/plugins/shell.rst
platypush/plugins/sound.rst
platypush/plugins/stt.deepspeech.rst
platypush/plugins/switch.rst
platypush/plugins/switch.switchbot.rst
platypush/plugins/switch.tplink.rst

docs/source/responses.rst View File

@@ -6,15 +6,16 @@ Responses
:maxdepth: 2
:caption: Responses:
platypush/responses/.rst
platypush/responses/bluetooth.rst
platypush/responses/camera.rst
platypush/responses/camera.android.rst
platypush/responses/chat.telegram.rst
platypush/responses/deepspeech.rst
platypush/responses/google.drive.rst
platypush/responses/pihole.rst
platypush/responses/ping.rst
platypush/responses/printer.cups.rst
platypush/responses/stt.rst
platypush/responses/system.rst
platypush/responses/todoist.rst
platypush/responses/trello.rst

platypush/backend/stt/__init__.py View File

platypush/backend/stt/deepspeech.py View File

@@ -0,0 +1,48 @@
import time
from platypush.backend import Backend
from platypush.context import get_plugin
from platypush.plugins.stt.deepspeech import SttDeepspeechPlugin
class SttDeepspeechBackend(Backend):
"""
    Backend for the Mozilla Deepspeech speech-to-text engine plugin. Enable this backend if you
    want to run the speech-to-text engine continuously, instead of triggering it programmatically
    through ``start_detection`` and ``stop_detection``.
Requires:
- The :class:`platypush.plugins.stt.deepspeech.SttDeepspeechPlugin` plugin configured and its dependencies
installed, as well as the language model files.
"""
def __init__(self, retry_sleep: float = 5.0, *args, **kwargs):
"""
:param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
(default: 5 seconds).
"""
super().__init__(*args, **kwargs)
self.retry_sleep = retry_sleep
def run(self):
super().run()
self.logger.info('Starting Mozilla Deepspeech speech-to-text backend')
while not self.should_stop():
try:
plugin: SttDeepspeechPlugin = get_plugin('stt.deepspeech')
with plugin:
plugin.start_detection()
# noinspection PyProtectedMember
plugin._detection_thread.join()
except Exception as e:
self.logger.exception(e)
self.logger.warning('Deepspeech backend encountered an unexpected error, retrying in {} seconds'.
format(self.retry_sleep))
time.sleep(self.retry_sleep)
# vim:sw=4:ts=4:et:
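For reference, a minimal sketch of the programmatic alternative mentioned in the docstring above, i.e. driving the plugin directly instead of keeping this backend enabled. It assumes a configured ``stt.deepspeech`` plugin and relies on the context-manager protocol defined in the plugin further down in this commit:

    from platypush.context import get_plugin

    # Hypothetical one-shot session mirroring the backend's run() loop:
    # __enter__ calls start_detection(), __exit__ calls stop_detection()
    plugin = get_plugin('stt.deepspeech')
    with plugin:
        plugin._detection_thread.join(timeout=30)  # listen for ~30 seconds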

platypush/message/event/stt.py View File

@@ -0,0 +1,61 @@
from platypush.message.event import Event
class SttEvent(Event):
""" Base class for speech-to-text events """
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
class SpeechStartedEvent(SttEvent):
"""
Event triggered when speech starts being detected.
"""
pass
class SpeechDetectedEvent(SttEvent):
"""
Event triggered when speech is detected.
"""
def __init__(self, speech: str, *args, **kwargs):
"""
        :param speech: The detected speech, as a string (leading/trailing whitespace is stripped).
"""
super().__init__(*args, speech=speech.strip(), **kwargs)
class ConversationDetectedEvent(SpeechDetectedEvent):
"""
Event triggered when speech is detected after a hotword.
"""
pass
class HotwordDetectedEvent(SttEvent):
"""
Event triggered when a custom hotword is detected.
"""
def __init__(self, hotword: str = '', *args, **kwargs):
"""
:param hotword: The detected user hotword.
"""
super().__init__(*args, hotword=hotword, **kwargs)
class SpeechDetectionStartedEvent(SttEvent):
"""
Event triggered when the speech detection engine starts.
"""
pass
class SpeechDetectionStoppedEvent(SttEvent):
"""
Event triggered when the speech detection engine stops.
"""
pass
# vim:sw=4:ts=4:et:
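A quick sketch of how these events carry their payload, assuming the base ``Event`` class keeps its keyword arguments in its ``args`` attribute, as platypush events conventionally do:

    from platypush.message.event.stt import HotwordDetectedEvent, SpeechDetectedEvent

    evt = SpeechDetectedEvent(speech='  turn on the lights  ')
    print(evt.args['speech'])   # 'turn on the lights' (stripped by the constructor)

    hw = HotwordDetectedEvent(hotword='computer')
    print(hw.args['hotword'])   # 'computer'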

platypush/message/response/stt.py View File

@@ -0,0 +1,11 @@
from platypush.message.response import Response
class SpeechDetectedResponse(Response):
def __init__(self, *args, speech: str, **kwargs):
super().__init__(*args, output={
'speech': speech
}, **kwargs)
# vim:sw=4:ts=4:et:
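A small usage sketch, assuming the base platypush ``Response`` exposes the ``output`` value passed to its constructor:

    from platypush.message.response.stt import SpeechDetectedResponse

    r = SpeechDetectedResponse(speech='hello world')
    print(r.output)   # {'speech': 'hello world'}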

platypush/plugins/stt/__init__.py View File

platypush/plugins/stt/deepspeech.py View File

@@ -0,0 +1,324 @@
from __future__ import annotations
import queue
import os
import threading
from typing import Optional, Union, List
import deepspeech
import numpy as np
import sounddevice as sd
import wave
from platypush.context import get_bus
from platypush.message.event.stt import SpeechDetectionStartedEvent, SpeechDetectionStoppedEvent, SpeechStartedEvent, \
SpeechDetectedEvent, HotwordDetectedEvent, ConversationDetectedEvent
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import Plugin, action
class SttDeepspeechPlugin(Plugin):
"""
This plugin performs speech-to-text and speech detection using the
`Mozilla DeepSpeech <https://github.com/mozilla/DeepSpeech>`_ engine.
Triggers:
* :class:`platypush.message.event.stt.SpeechStartedEvent` when speech starts being detected.
* :class:`platypush.message.event.stt.SpeechDetectedEvent` when speech is detected.
* :class:`platypush.message.event.stt.SpeechDetectionStartedEvent` when speech detection starts.
* :class:`platypush.message.event.stt.SpeechDetectionStoppedEvent` when speech detection stops.
* :class:`platypush.message.event.stt.HotwordDetectedEvent` when a user-defined hotword is detected.
* :class:`platypush.message.event.stt.ConversationDetectedEvent` when speech is detected after a hotword.
Requires:
* **deepspeech** (``pip install 'deepspeech>=0.6.0'``)
* **numpy** (``pip install numpy``)
* **sounddevice** (``pip install sounddevice``)
"""
_thread_stop_timeout = 10.0
rate = 16000
channels = 1
def __init__(self,
model_file: str,
lm_file: str,
trie_file: str,
lm_alpha: float = 0.75,
lm_beta: float = 1.85,
beam_width: int = 500,
input_device: Optional[Union[int, str]] = None,
hotword: Optional[str] = None,
hotwords: Optional[List[str]] = None,
conversation_timeout: Optional[float] = None,
block_duration: float = 1.0):
"""
        In order to run the speech-to-text engine you'll need to download the model files matching
        the version of the Deepspeech engine that you have installed:
.. code-block:: shell
# Create the working folder for the models
export MODELS_DIR=~/models
mkdir -p $MODELS_DIR
cd $MODELS_DIR
# Download and extract the model files for your version of Deepspeech. This may take a while.
export DEEPSPEECH_VERSION=0.6.1
wget https://github.com/mozilla/DeepSpeech/releases/download/v$DEEPSPEECH_VERSION/deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
tar -xvzf deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
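            # The archive extracts the following files: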
x deepspeech-0.6.1-models/
x deepspeech-0.6.1-models/lm.binary
x deepspeech-0.6.1-models/output_graph.pbmm
x deepspeech-0.6.1-models/output_graph.pb
x deepspeech-0.6.1-models/trie
x deepspeech-0.6.1-models/output_graph.tflite
:param model_file: Path to the model file (usually named ``output_graph.pb`` or ``output_graph.pbmm``).
            Note that ``.pbmm`` models usually perform better and are smaller.
:param lm_file: Path to the language model binary file (usually named ``lm.binary``).
        :param trie_file: Path to the trie file built from the same vocabulary as the language model binary
            (usually named ``trie``).
:param lm_alpha: The alpha hyperparameter of the CTC decoder - Language Model weight.
See <https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0>.
:param lm_beta: The beta hyperparameter of the CTC decoder - Word Insertion weight.
See <https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0>.
:param beam_width: Decoder beam width (see beam scoring in KenLM language model).
:param input_device: PortAudio device index or name that will be used for recording speech (default: default
system audio input device).
:param hotword: When this word is detected, the plugin will trigger a
:class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events to hook
            other assistants.
:param hotwords: Use a list of hotwords instead of a single one.
        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set, a speech detected event within
            ``conversation_timeout`` seconds of a hotword will trigger a
            :class:`platypush.message.event.stt.ConversationDetectedEvent` instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can attach custom event hooks
            to run any logic depending on the detected speech: this emulates a kind of
            "OK, Google. Turn on the lights" interaction without relying on an external assistant.
:param block_duration: Duration of the acquired audio blocks (default: 1 second).
"""
super().__init__()
self.model_file = os.path.abspath(os.path.expanduser(model_file))
self.lm_file = os.path.abspath(os.path.expanduser(lm_file))
self.trie_file = os.path.abspath(os.path.expanduser(trie_file))
self.lm_alpha = lm_alpha
self.lm_beta = lm_beta
self.beam_width = beam_width
self.input_device = input_device
self.conversation_timeout = conversation_timeout
self.block_duration = block_duration
self.hotwords = set(hotwords or [])
        if hotword:
            self.hotwords.add(hotword)
self._conversation_event = threading.Event()
self._model: Optional[deepspeech.Model] = None
self._input_stream: Optional[sd.InputStream] = None
self._recording_thread: Optional[threading.Thread] = None
self._detection_thread: Optional[threading.Thread] = None
self._audio_queue: Optional[queue.Queue] = None
def _get_model(self) -> deepspeech.Model:
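        # Lazily initialize the model and enable the language model decoder on first use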
if not self._model:
self._model = deepspeech.Model(self.model_file, self.beam_width)
self._model.enableDecoderWithLM(self.lm_file, self.trie_file, self.lm_alpha, self.lm_beta)
return self._model
def _detect(self, data: Union[bytes, np.ndarray]) -> str:
data = self._convert_data(data)
model = self._get_model()
return model.stt(data)
@staticmethod
def _convert_data(data: Union[np.ndarray, bytes]) -> np.ndarray:
return np.frombuffer(data, dtype=np.int16)
def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
"""
Get the index of the input device by index or name.
        :param device: Device index or name. If None, the index of the default audio input device
            is returned.
:return: Index of the audio input device.
"""
        if device is None:
            device = self.input_device
        if device is None:
            return sd.query_hostapis()[0].get('default_input_device')
        if isinstance(device, int):
            # Validate the index against the list of available devices
            assert device < len(sd.query_devices()), 'No such device index: {}'.format(device)
            return device
for i, dev in enumerate(sd.query_devices()):
if dev['name'] == device:
return i
raise AssertionError('Device {} not found'.format(device))
def _on_speech_detected(self, speech: str) -> None:
"""
Hook called when speech is detected. Triggers the right event depending on the current context.
:param speech: Detected speech.
"""
speech = speech.strip()
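        # Dispatch order: an open conversation window wins, then configured hotwords,
        # then plain speech detection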
if self._conversation_event.is_set():
event = ConversationDetectedEvent(speech=speech)
elif speech in self.hotwords:
event = HotwordDetectedEvent(hotword=speech)
if self.conversation_timeout:
self._conversation_event.set()
threading.Timer(self.conversation_timeout, lambda: self._conversation_event.clear()).start()
else:
event = SpeechDetectedEvent(speech=speech)
get_bus().post(event)
def detection_thread(self) -> None:
"""
        Speech detection thread. Reads from the ``audio_queue`` and uses the Deepspeech model to detect
        speech in real time.
"""
self.logger.debug('Detection thread started')
model = self._get_model()
current_text = ''
context = None
while self._audio_queue:
if not context:
context = model.createStream()
            try:
                # Wake up periodically so that the loop can exit once stop_detection()
                # tears down the audio queue
                frames = self._audio_queue.get(timeout=1.0)
                frames = self._convert_data(frames)
            except queue.Empty:
                continue
            except Exception as e:
                self.logger.warning('Error while feeding audio to the model: {}'.format(str(e)))
                continue
model.feedAudioContent(context, frames)
text = model.intermediateDecode(context)
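            # Naive endpointing: if the intermediate transcript hasn't changed over a full
            # audio block, assume that the utterance is complete and finalize the stream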
if text == current_text:
if current_text:
self._on_speech_detected(current_text)
model.finishStream(context)
context = None
current_text = ''
else:
if not current_text:
get_bus().post(SpeechStartedEvent())
self.logger.info('Intermediate speech results: [{}]'.format(text))
current_text = text
self.logger.debug('Detection thread terminated')
def recording_thread(self, block_duration: float, input_device: Optional[str] = None) -> None:
"""
Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.
        :param block_duration: Duration of the acquired audio blocks, in seconds.
        :param input_device: Input device name or index (default: the configured ``input_device``).
"""
self.logger.debug('Recording thread started')
device = self._get_input_device(input_device)
blocksize = int(self.rate * self.channels * block_duration)
self._input_stream = sd.InputStream(samplerate=self.rate, device=device,
channels=self.channels, dtype='int16', latency=0,
blocksize=blocksize)
self._input_stream.start()
get_bus().post(SpeechDetectionStartedEvent())
while self._input_stream:
try:
                frames = self._input_stream.read(int(self.rate * block_duration))[0]
except Exception as e:
self.logger.warning('Error while reading from the audio input: {}'.format(str(e)))
continue
self._audio_queue.put(frames)
get_bus().post(SpeechDetectionStoppedEvent())
self.logger.debug('Recording thread terminated')
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file.
:param audio_file: Path to the audio file.
"""
audio_file = os.path.abspath(os.path.expanduser(audio_file))
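        # Note: the model expects 16 kHz, 16-bit mono audio; the file is not resampled here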
wav = wave.open(audio_file, 'r')
buffer = wav.readframes(wav.getnframes())
speech = self._detect(buffer)
return SpeechDetectedResponse(speech=speech)
def __enter__(self) -> SttDeepspeechPlugin:
"""
Context manager enter. Starts detection and returns self.
"""
self.start_detection()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Context manager exit. Stops detection.
"""
self.stop_detection()
@action
def start_detection(self, input_device: Optional[str] = None, seconds: Optional[float] = None,
block_duration: Optional[float] = None) -> None:
"""
Start the speech detection engine.
:param input_device: Audio input device name/index override
        :param seconds: If set, the detection engine will stop after this many seconds, otherwise it will
            keep running until ``stop_detection`` is called or the application stops.
:param block_duration: ``block_duration`` override.
"""
assert not self._input_stream, 'Speech detection is already running'
block_duration = block_duration or self.block_duration
input_device = input_device if input_device is not None else self.input_device
self._audio_queue = queue.Queue()
self._recording_thread = threading.Thread(
target=lambda: self.recording_thread(block_duration=block_duration, input_device=input_device))
self._recording_thread.start()
self._detection_thread = threading.Thread(target=lambda: self.detection_thread())
self._detection_thread.start()
if seconds:
threading.Timer(seconds, lambda: self.stop_detection()).start()
@action
def stop_detection(self) -> None:
"""
Stop the speech detection engine.
"""
assert self._input_stream, 'Speech detection is not running'
self._input_stream.stop(ignore_errors=True)
self._input_stream.close(ignore_errors=True)
self._input_stream = None
if self._recording_thread:
self._recording_thread.join(timeout=self._thread_stop_timeout)
self._audio_queue = None
if self._detection_thread:
self._detection_thread.join(timeout=self._thread_stop_timeout)
# vim:sw=4:ts=4:et:
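A usage sketch for the ``detect`` action above, with a hypothetical file path; like any platypush action, it can also be invoked through the API as ``stt.deepspeech.detect``:

    from platypush.context import get_plugin

    # Offline transcription of a 16 kHz, 16-bit mono WAV file
    stt = get_plugin('stt.deepspeech')
    response = stt.detect(audio_file='~/audio/sample.wav')
    print(response.output['speech'])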

requirements.txt View File

@@ -229,3 +229,8 @@ croniter
# Support for Z-Wave
# python-openzwave
# Support for DeepSpeech
# deepspeech
# numpy
# sounddevice

setup.py View File

@@ -283,5 +283,7 @@ setup(
'zigbee': ['paho-mqtt'],
# Support for Z-Wave
'zwave': ['python-openzwave'],
# Support for DeepSpeech
        'deepspeech': ['deepspeech', 'numpy', 'sounddevice'],
},
)
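With this extras entry in place, the optional dependencies above can be pulled in with ``pip install 'platypush[deepspeech]'``.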