# 2020-08-24 01:14:40 +02:00
#
# 153 lines
# 6.2 KiB

import os
from typing import Optional, Union
import numpy as np
import wave
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import action
from platypush.plugins.stt import SttPlugin
class SttDeepspeechPlugin(SttPlugin):
    """
    This plugin performs speech-to-text and speech detection using the
    `Mozilla DeepSpeech <https://github.com/mozilla/DeepSpeech>`_ engine.

    Requires:

        * **deepspeech** (``pip install 'deepspeech>=0.6.0'``)
        * **numpy** (``pip install numpy``)
        * **sounddevice** (``pip install sounddevice``)

    """

    def __init__(self,
                 model_file: str,
                 lm_file: str,
                 trie_file: str,
                 lm_alpha: float = 0.75,
                 lm_beta: float = 1.85,
                 beam_width: int = 500,
                 *args, **kwargs):
        """
        In order to run the speech-to-text engine you'll need to download the right model files for the
        Deepspeech engine that you have installed:

        .. code-block:: shell

            # Create the working folder for the models
            export MODELS_DIR=~/models
            mkdir -p $MODELS_DIR
            cd $MODELS_DIR

            # Download and extract the model files for your version of Deepspeech. This may take a while.
            export DEEPSPEECH_VERSION=0.6.1
            wget https://github.com/mozilla/DeepSpeech/releases/download/v$DEEPSPEECH_VERSION/deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
            tar -xvzf deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
            x deepspeech-0.6.1-models/
            x deepspeech-0.6.1-models/lm.binary
            x deepspeech-0.6.1-models/output_graph.pbmm
            x deepspeech-0.6.1-models/output_graph.pb
            x deepspeech-0.6.1-models/trie
            x deepspeech-0.6.1-models/output_graph.tflite

        :param model_file: Path to the model file (usually named ``output_graph.pb`` or ``output_graph.pbmm``).
            Note that ``.pbmm`` usually perform better and are smaller.
        :param lm_file: Path to the language model binary file (usually named ``lm.binary``).
        :param trie_file: The path to the trie file built from the same vocabulary as the language model binary
            (usually named ``trie``).
        :param lm_alpha: The alpha hyperparameter of the CTC decoder - Language Model weight
            (see the DeepSpeech documentation).
        :param lm_beta: The beta hyperparameter of the CTC decoder - Word Insertion weight
            (see the DeepSpeech documentation).
        :param beam_width: Decoder beam width (see beam scoring in KenLM language model).
        :param input_device: PortAudio device index or name that will be used for recording speech (default: default
            system audio input device).
        :param hotword: When this word is detected, the plugin will trigger a
            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
            assistants or custom logic.
        :param hotwords: Use a list of hotwords instead of a single one.
        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
            the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
            instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
            here to run any logic depending on the detected speech - it can emulate a kind of
            "OK, Google. Turn on the lights" interaction without using an external assistant.
        :param block_duration: Duration of the acquired audio blocks (default: 1 second).
        """

        # Imported inside the method so that loading this module does not
        # itself require deepspeech to be installed.
        import deepspeech

        # Any argument not consumed here (e.g. input_device, hotword) is
        # forwarded to the parent plugin.
        super().__init__(*args, **kwargs)

        # Normalize the paths once: "~" is expanded and the result is absolute.
        self.model_file = os.path.abspath(os.path.expanduser(model_file))
        self.lm_file = os.path.abspath(os.path.expanduser(lm_file))
        self.trie_file = os.path.abspath(os.path.expanduser(trie_file))
        self.lm_alpha = lm_alpha
        self.lm_beta = lm_beta
        self.beam_width = beam_width

        # Both are created lazily by _get_model() / _get_context().
        self._model: Optional[deepspeech.Model] = None
        self._context = None

    def _get_model(self):
        """
        Return the DeepSpeech model, creating it and enabling the language
        model decoder on first use.
        """
        import deepspeech

        if not self._model:
            self._model = deepspeech.Model(self.model_file, self.beam_width)
            self._model.enableDecoderWithLM(self.lm_file, self.trie_file, self.lm_alpha, self.lm_beta)

        return self._model

    def _get_context(self):
        """
        Return the current streaming context, creating a new stream on the
        model if none is active.
        """
        # _get_model() already caches the instance on self._model.
        model = self._get_model()
        if not self._context:
            self._context = model.createStream()

        return self._context

    @staticmethod
    def convert_frames(frames: Union[np.ndarray, bytes]) -> np.ndarray:
        """
        Reinterpret a buffer of audio frames as an array of 16-bit signed integers.

        :param frames: Audio frames, as bytes or as a numpy array.
        :return: A ``numpy.int16`` array built on top of the same buffer.
        """
        return np.frombuffer(frames, dtype=np.int16)

    def on_detection_started(self):
        """
        Make sure that a streaming context is available when detection starts.
        """
        self._context = self._get_context()

    def on_detection_ended(self):
        """
        Close the active streaming context, if any, when detection ends.
        """
        if self._model and self._context:
            # Finish the stream before dropping the reference, so the stream
            # is closed on the engine side rather than simply forgotten.
            self._model.finishStream(self._context)

        self._context = None

    def detect_speech(self, frames) -> str:
        """
        Feed a block of audio frames to the active stream and return the
        intermediate decoding of everything fed to that stream so far.

        :param frames: Audio frames to be appended to the stream.
        :return: The text decoded so far.
        """
        model = self._get_model()
        context = self._get_context()
        model.feedAudioContent(context, frames)
        return model.intermediateDecode(context)

    def on_speech_detected(self, speech: str) -> None:
        """
        Close the current stream once some speech has been detected, so that
        the next detection starts from a new stream.

        NOTE(review): if ``SttPlugin.on_speech_detected`` performs event
        dispatching, it should also be invoked here through ``super()`` -
        confirm against the parent class.

        :param speech: The detected speech. Nothing is done if it is empty.
        """
        if not speech:
            return

        model = self._get_model()
        context = self._get_context()
        # Finish the stream before discarding the context (see on_detection_ended).
        model.finishStream(context)
        self._context = None

    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file.

        :param audio_file: Path to the audio file.
        :return: A :class:`platypush.message.response.stt.SpeechDetectedResponse` wrapping the detected text.
        """
        audio_file = os.path.abspath(os.path.expanduser(audio_file))

        # The context manager guarantees that the file is closed even if
        # reading the frames fails.
        with wave.open(audio_file, 'r') as wav:
            buffer = wav.readframes(wav.getnframes())

        data = self.convert_frames(buffer)
        model = self._get_model()
        speech = model.stt(data)
        return SpeechDetectedResponse(speech=speech)

# vim:sw=4:ts=4:et: