forked from platypush/platypush
Implemented Mozilla DeepSpeech speech-to-text integration [closes #126]
This commit is contained in:
parent
fc949ed9f1
commit
b0339754b2
17 changed files with 477 additions and 1 deletions
|
@ -53,6 +53,7 @@ Backends
|
||||||
platypush/backend/sensor.mcp3008.rst
|
platypush/backend/sensor.mcp3008.rst
|
||||||
platypush/backend/sensor.motion.pwm3901.rst
|
platypush/backend/sensor.motion.pwm3901.rst
|
||||||
platypush/backend/sensor.serial.rst
|
platypush/backend/sensor.serial.rst
|
||||||
|
platypush/backend/stt.deepspeech.rst
|
||||||
platypush/backend/tcp.rst
|
platypush/backend/tcp.rst
|
||||||
platypush/backend/todoist.rst
|
platypush/backend/todoist.rst
|
||||||
platypush/backend/travisci.rst
|
platypush/backend/travisci.rst
|
||||||
|
|
|
@ -240,6 +240,7 @@ autodoc_mock_imports = ['googlesamples.assistant.grpc.audio_helpers',
|
||||||
'cpuinfo',
|
'cpuinfo',
|
||||||
'psutil',
|
'psutil',
|
||||||
'openzwave',
|
'openzwave',
|
||||||
|
'deepspeech',
|
||||||
]
|
]
|
||||||
|
|
||||||
sys.path.insert(0, os.path.abspath('../..'))
|
sys.path.insert(0, os.path.abspath('../..'))
|
||||||
|
|
|
@ -45,6 +45,7 @@ Events
|
||||||
platypush/events/sensor.light.rst
|
platypush/events/sensor.light.rst
|
||||||
platypush/events/serial.rst
|
platypush/events/serial.rst
|
||||||
platypush/events/sound.rst
|
platypush/events/sound.rst
|
||||||
|
platypush/events/stt.rst
|
||||||
platypush/events/todoist.rst
|
platypush/events/todoist.rst
|
||||||
platypush/events/torrent.rst
|
platypush/events/torrent.rst
|
||||||
platypush/events/travisci.rst
|
platypush/events/travisci.rst
|
||||||
|
|
5
docs/source/platypush/backend/stt.deepspeech.rst
Normal file
5
docs/source/platypush/backend/stt.deepspeech.rst
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
``platypush.backend.stt.deepspeech``
|
||||||
|
====================================
|
||||||
|
|
||||||
|
.. automodule:: platypush.backend.stt.deepspeech
|
||||||
|
:members:
|
5
docs/source/platypush/events/stt.rst
Normal file
5
docs/source/platypush/events/stt.rst
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
``platypush.message.event.stt``
|
||||||
|
===============================
|
||||||
|
|
||||||
|
.. automodule:: platypush.message.event.stt
|
||||||
|
:members:
|
5
docs/source/platypush/plugins/stt.deepspeech.rst
Normal file
5
docs/source/platypush/plugins/stt.deepspeech.rst
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
``platypush.plugins.stt.deepspeech``
|
||||||
|
====================================
|
||||||
|
|
||||||
|
.. automodule:: platypush.plugins.stt.deepspeech
|
||||||
|
:members:
|
5
docs/source/platypush/responses/stt.rst
Normal file
5
docs/source/platypush/responses/stt.rst
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
``platypush.message.response.stt``
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. automodule:: platypush.message.response.stt
|
||||||
|
:members:
|
|
@ -90,6 +90,7 @@ Plugins
|
||||||
platypush/plugins/serial.rst
|
platypush/plugins/serial.rst
|
||||||
platypush/plugins/shell.rst
|
platypush/plugins/shell.rst
|
||||||
platypush/plugins/sound.rst
|
platypush/plugins/sound.rst
|
||||||
|
platypush/plugins/stt.deepspeech.rst
|
||||||
platypush/plugins/switch.rst
|
platypush/plugins/switch.rst
|
||||||
platypush/plugins/switch.switchbot.rst
|
platypush/plugins/switch.switchbot.rst
|
||||||
platypush/plugins/switch.tplink.rst
|
platypush/plugins/switch.tplink.rst
|
||||||
|
|
|
@ -6,15 +6,16 @@ Responses
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
:caption: Responses:
|
:caption: Responses:
|
||||||
|
|
||||||
|
platypush/responses/.rst
|
||||||
platypush/responses/bluetooth.rst
|
platypush/responses/bluetooth.rst
|
||||||
platypush/responses/camera.rst
|
platypush/responses/camera.rst
|
||||||
platypush/responses/camera.android.rst
|
platypush/responses/camera.android.rst
|
||||||
platypush/responses/chat.telegram.rst
|
platypush/responses/chat.telegram.rst
|
||||||
platypush/responses/deepspeech.rst
|
|
||||||
platypush/responses/google.drive.rst
|
platypush/responses/google.drive.rst
|
||||||
platypush/responses/pihole.rst
|
platypush/responses/pihole.rst
|
||||||
platypush/responses/ping.rst
|
platypush/responses/ping.rst
|
||||||
platypush/responses/printer.cups.rst
|
platypush/responses/printer.cups.rst
|
||||||
|
platypush/responses/stt.rst
|
||||||
platypush/responses/system.rst
|
platypush/responses/system.rst
|
||||||
platypush/responses/todoist.rst
|
platypush/responses/todoist.rst
|
||||||
platypush/responses/trello.rst
|
platypush/responses/trello.rst
|
||||||
|
|
0
platypush/backend/stt/__init__.py
Normal file
0
platypush/backend/stt/__init__.py
Normal file
48
platypush/backend/stt/deepspeech.py
Normal file
48
platypush/backend/stt/deepspeech.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
import time
|
||||||
|
|
||||||
|
from platypush.backend import Backend
|
||||||
|
from platypush.context import get_plugin
|
||||||
|
from platypush.plugins.stt.deepspeech import SttDeepspeechPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class SttDeepspeechBackend(Backend):
    """
    Backend for the Mozilla Deepspeech speech-to-text engine plugin. Enable this backend if you
    want to run the speech-to-text engine continuously instead of programmatically using
    ``start_detection`` and ``stop_detection``.

    Requires:

        - The :class:`platypush.plugins.stt.deepspeech.SttDeepspeechPlugin` plugin configured and its dependencies
          installed, as well as the language model files.

    """

    def __init__(self, retry_sleep: float = 5.0, *args, **kwargs):
        """
        :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
            (default: 5 seconds).
        """
        super().__init__(*args, **kwargs)
        self.retry_sleep = retry_sleep

    def run(self):
        """
        Main loop: keep the ``stt.deepspeech`` plugin detecting until the backend is asked to stop,
        waiting ``retry_sleep`` seconds before retrying after a failure.
        """
        super().run()
        self.logger.info('Starting Mozilla Deepspeech speech-to-text backend')

        while not self.should_stop():
            try:
                plugin: SttDeepspeechPlugin = get_plugin('stt.deepspeech')

                # Entering the plugin context already calls ``start_detection()`` and leaving it
                # calls ``stop_detection()``, so detection must NOT be started a second time here:
                # doing so either trips the "already running" assertion or spawns duplicate
                # recording/detection threads.
                with plugin:
                    # Block until the detection thread terminates.
                    # noinspection PyProtectedMember
                    plugin._detection_thread.join()
            except Exception as e:
                self.logger.exception(e)
                self.logger.warning('Deepspeech backend encountered an unexpected error, retrying in {} seconds'.
                                    format(self.retry_sleep))

                time.sleep(self.retry_sleep)
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
61
platypush/message/event/stt.py
Normal file
61
platypush/message/event/stt.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
from platypush.message.event import Event
|
||||||
|
|
||||||
|
|
||||||
|
class SttEvent(Event):
    """
    Common parent of all the speech-to-text events.
    """

    def __init__(self, *args, **kwargs):
        """
        Forwards all the positional and keyword arguments, unchanged, to the parent event constructor.
        """
        super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechStartedEvent(SttEvent):
    """
    Event fired at the moment speech begins to be detected.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDetectedEvent(SttEvent):
    """
    Event fired once some speech has been detected.
    """

    def __init__(self, speech: str, *args, **kwargs):
        """
        :param speech: The detected speech, as a string. Leading and trailing whitespace is stripped
            before the value is attached to the event.
        """
        cleaned_speech = speech.strip()
        super().__init__(*args, speech=cleaned_speech, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class ConversationDetectedEvent(SpeechDetectedEvent):
    """
    Event fired when the detected speech follows a hotword.
    """
|
||||||
|
|
||||||
|
class HotwordDetectedEvent(SttEvent):
    """
    Event fired when one of the custom hotwords is detected.
    """

    def __init__(self, hotword: str = '', *args, **kwargs):
        """
        :param hotword: The hotword that was detected (default: empty string).
        """
        # ``hotword`` is a named parameter, so it can never already be present in ``kwargs``.
        kwargs['hotword'] = hotword
        super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDetectionStartedEvent(SttEvent):
    """
    Event fired when the speech detection engine is started.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDetectionStoppedEvent(SttEvent):
    """
    Event fired when the speech detection engine is stopped.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
11
platypush/message/response/stt.py
Normal file
11
platypush/message/response/stt.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
from platypush.message.response import Response
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDetectedResponse(Response):
    """
    Response that carries the text produced by a speech-to-text analysis.
    """

    def __init__(self, *args, speech: str, **kwargs):
        """
        :param speech: The detected speech. It is exposed under the ``speech`` key of the response output.
        """
        payload = {'speech': speech}
        super().__init__(*args, output=payload, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
0
platypush/plugins/stt/__init__.py
Normal file
0
platypush/plugins/stt/__init__.py
Normal file
324
platypush/plugins/stt/deepspeech.py
Normal file
324
platypush/plugins/stt/deepspeech.py
Normal file
|
@ -0,0 +1,324 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import queue
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
from typing import Optional, Union, List
|
||||||
|
|
||||||
|
import deepspeech
|
||||||
|
import numpy as np
|
||||||
|
import sounddevice as sd
|
||||||
|
import wave
|
||||||
|
|
||||||
|
from platypush.context import get_bus
|
||||||
|
from platypush.message.event.stt import SpeechDetectionStartedEvent, SpeechDetectionStoppedEvent, SpeechStartedEvent, \
|
||||||
|
SpeechDetectedEvent, HotwordDetectedEvent, ConversationDetectedEvent
|
||||||
|
from platypush.message.response.stt import SpeechDetectedResponse
|
||||||
|
from platypush.plugins import Plugin, action
|
||||||
|
|
||||||
|
|
||||||
|
class SttDeepspeechPlugin(Plugin):
    """
    This plugin performs speech-to-text and speech detection using the
    `Mozilla DeepSpeech <https://github.com/mozilla/DeepSpeech>`_ engine.

    Triggers:

        * :class:`platypush.message.event.stt.SpeechStartedEvent` when speech starts being detected.
        * :class:`platypush.message.event.stt.SpeechDetectedEvent` when speech is detected.
        * :class:`platypush.message.event.stt.SpeechDetectionStartedEvent` when speech detection starts.
        * :class:`platypush.message.event.stt.SpeechDetectionStoppedEvent` when speech detection stops.
        * :class:`platypush.message.event.stt.HotwordDetectedEvent` when a user-defined hotword is detected.
        * :class:`platypush.message.event.stt.ConversationDetectedEvent` when speech is detected after a hotword.

    Requires:

        * **deepspeech** (``pip install 'deepspeech>=0.6.0'``)
        * **numpy** (``pip install numpy``)
        * **sounddevice** (``pip install sounddevice``)

    """

    # Maximum time (in seconds) to wait for a worker thread to terminate when detection is stopped.
    _thread_stop_timeout = 10.0
    # Interval (in seconds) at which the detection thread wakes up to check whether it should stop.
    _queue_poll_timeout = 1.0
    # Sample rate (in Hz) and number of channels used to open the audio input stream.
    rate = 16000
    channels = 1

    def __init__(self,
                 model_file: str,
                 lm_file: str,
                 trie_file: str,
                 lm_alpha: float = 0.75,
                 lm_beta: float = 1.85,
                 beam_width: int = 500,
                 input_device: Optional[Union[int, str]] = None,
                 hotword: Optional[str] = None,
                 hotwords: Optional[List[str]] = None,
                 conversation_timeout: Optional[float] = None,
                 block_duration: float = 1.0):
        """
        In order to run the speech-to-text engine you'll need to download the right model files for the
        Deepspeech engine that you have installed:

        .. code-block:: shell

            # Create the working folder for the models
            export MODELS_DIR=~/models
            mkdir -p $MODELS_DIR
            cd $MODELS_DIR

            # Download and extract the model files for your version of Deepspeech. This may take a while.
            export DEEPSPEECH_VERSION=0.6.1
            wget https://github.com/mozilla/DeepSpeech/releases/download/v$DEEPSPEECH_VERSION/deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
            tar -xvzf deepspeech-$DEEPSPEECH_VERSION-models.tar.gz
            x deepspeech-0.6.1-models/
            x deepspeech-0.6.1-models/lm.binary
            x deepspeech-0.6.1-models/output_graph.pbmm
            x deepspeech-0.6.1-models/output_graph.pb
            x deepspeech-0.6.1-models/trie
            x deepspeech-0.6.1-models/output_graph.tflite

        :param model_file: Path to the model file (usually named ``output_graph.pb`` or ``output_graph.pbmm``).
            Note that ``.pbmm`` files usually perform better and are smaller.

        :param lm_file: Path to the language model binary file (usually named ``lm.binary``).
        :param trie_file: The path to the trie file built from the same vocabulary as the language model binary
            (usually named ``trie``).
        :param lm_alpha: The alpha hyperparameter of the CTC decoder - Language Model weight.
            See <https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0>.
        :param lm_beta: The beta hyperparameter of the CTC decoder - Word Insertion weight.
            See <https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0>.
        :param beam_width: Decoder beam width (see beam scoring in KenLM language model).
        :param input_device: PortAudio device index or name that will be used for recording speech (default: default
            system audio input device).
        :param hotword: When this word is detected, the plugin will trigger a
            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
            assistants. If ``hotwords`` is set as well, this word is added to that list.
        :param hotwords: Use a list of hotwords instead of a single one.
        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
            the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
            instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
            here to run any logic depending on the detected speech - it can emulate a kind of
            "OK, Google. Turn on the lights" interaction without using an external assistant.
        :param block_duration: Duration of the acquired audio blocks (default: 1 second).
        """

        super().__init__()
        self.model_file = os.path.abspath(os.path.expanduser(model_file))
        self.lm_file = os.path.abspath(os.path.expanduser(lm_file))
        self.trie_file = os.path.abspath(os.path.expanduser(trie_file))
        self.lm_alpha = lm_alpha
        self.lm_beta = lm_beta
        self.beam_width = beam_width
        self.input_device = input_device
        self.conversation_timeout = conversation_timeout
        self.block_duration = block_duration

        # Merge the single hotword into the list instead of silently discarding ``hotwords``
        # when both options are provided.
        self.hotwords = set(hotwords or [])
        if hotword:
            self.hotwords.add(hotword)

        # Set while a conversation is open, i.e. for ``conversation_timeout`` seconds after a hotword.
        self._conversation_event = threading.Event()
        # The model is loaded lazily on first use, see ``_get_model``.
        self._model: Optional[deepspeech.Model] = None
        # The members below are only populated while the detection is running.
        self._input_stream: Optional[sd.InputStream] = None
        self._recording_thread: Optional[threading.Thread] = None
        self._detection_thread: Optional[threading.Thread] = None
        self._audio_queue: Optional[queue.Queue] = None

    def _get_model(self) -> deepspeech.Model:
        """
        Get the Deepspeech model, loading it and enabling the language model decoder on the first call.
        """
        if not self._model:
            self._model = deepspeech.Model(self.model_file, self.beam_width)
            self._model.enableDecoderWithLM(self.lm_file, self.trie_file, self.lm_alpha, self.lm_beta)

        return self._model

    def _detect(self, data: Union[bytes, np.ndarray]) -> str:
        """
        Run the speech-to-text model over a whole buffer of audio data.

        :param data: Audio data, interpreted as 16-bit integer samples.
        :return: The detected text.
        """
        data = self._convert_data(data)
        model = self._get_model()
        return model.stt(data)

    @staticmethod
    def _convert_data(data: Union[np.ndarray, bytes]) -> np.ndarray:
        """
        Re-interpret a buffer of audio data as a flat array of 16-bit integer samples.
        """
        return np.frombuffer(data, dtype=np.int16)

    def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
        """
        Get the index of the input device by index or name.

        :param device: Device index or name. If None is set then the function will return the index of the
            default audio input device.
        :return: Index of the audio input device.
        """
        # Only None and the empty string mean "not set": a plain truthiness test would also
        # discard device index 0, which is a perfectly valid index.
        if device is None or device == '':
            device = self.input_device
        if device is None or device == '':
            return sd.query_hostapis()[0].get('default_input_device')

        if isinstance(device, int):
            # Valid indices go up to len - 1
            assert device < len(sd.query_devices()), 'Invalid device index: {}'.format(device)
            return device

        for i, dev in enumerate(sd.query_devices()):
            if dev['name'] == device:
                return i

        raise AssertionError('Device {} not found'.format(device))

    def _on_speech_detected(self, speech: str) -> None:
        """
        Hook called when speech is detected. Triggers the right event depending on the current context.

        :param speech: Detected speech.
        """
        speech = speech.strip()

        if self._conversation_event.is_set():
            event = ConversationDetectedEvent(speech=speech)
        elif speech in self.hotwords:
            event = HotwordDetectedEvent(hotword=speech)
            if self.conversation_timeout:
                # Open the conversation window and schedule its expiration
                self._conversation_event.set()
                threading.Timer(self.conversation_timeout, self._conversation_event.clear).start()
        else:
            event = SpeechDetectedEvent(speech=speech)

        get_bus().post(event)

    def detection_thread(self) -> None:
        """
        Speech detection thread. Reads from the ``audio_queue`` and uses the Deepspeech model to detect
        speech real-time.
        """
        self.logger.debug('Detection thread started')
        model = self._get_model()
        current_text = ''
        context = None

        try:
            while True:
                # ``stop_detection`` signals the termination by resetting the queue to None
                audio_queue = self._audio_queue
                if audio_queue is None:
                    break

                try:
                    # Poll with a timeout: a blocking ``get()`` would never notice that the queue
                    # has been reset, and the thread would hang forever after ``stop_detection``.
                    frames = audio_queue.get(timeout=self._queue_poll_timeout)
                except queue.Empty:
                    continue

                try:
                    frames = self._convert_data(frames)
                except Exception as e:
                    self.logger.warning('Error while feeding audio to the model: {}'.format(str(e)))
                    continue

                if context is None:
                    context = model.createStream()

                model.feedAudioContent(context, frames)
                text = model.intermediateDecode(context)

                if text == current_text:
                    # The text has not changed since the previous block: the sentence is over
                    if current_text:
                        self._on_speech_detected(current_text)
                        model.finishStream(context)
                        context = None

                    current_text = ''
                else:
                    if not current_text:
                        get_bus().post(SpeechStartedEvent())

                    self.logger.info('Intermediate speech results: [{}]'.format(text))
                    current_text = text
        finally:
            # Do not leave a stream open when the thread terminates; this is the same call
            # that the loop uses to end a stream.
            if context is not None:
                model.finishStream(context)

        self.logger.debug('Detection thread terminated')

    def recording_thread(self, block_duration: float, input_device: Optional[Union[int, str]] = None) -> None:
        """
        Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.

        :param block_duration: Audio blocks duration.
        :param input_device: Input device
        """
        self.logger.debug('Recording thread started')
        device = self._get_input_device(input_device)
        blocksize = int(self.rate * self.channels * block_duration)
        self._input_stream = sd.InputStream(samplerate=self.rate, device=device,
                                            channels=self.channels, dtype='int16', latency=0,
                                            blocksize=blocksize)
        self._input_stream.start()
        get_bus().post(SpeechDetectionStartedEvent())

        while True:
            # Work on local references: ``stop_detection`` resets both members to None from
            # another thread, possibly between the check and the use.
            stream = self._input_stream
            audio_queue = self._audio_queue
            if stream is None or audio_queue is None:
                break

            try:
                # Read one block at a time, so that ``block_duration`` is actually honoured
                frames = stream.read(blocksize)[0]
            except Exception as e:
                if self._input_stream is None:
                    # The stream has been closed by ``stop_detection``: not an error
                    break

                self.logger.warning('Error while reading from the audio input: {}'.format(str(e)))
                continue

            audio_queue.put(frames)

        get_bus().post(SpeechDetectionStoppedEvent())
        self.logger.debug('Recording thread terminated')

    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file.

        :param audio_file: Path to the audio file.
        """
        audio_file = os.path.abspath(os.path.expanduser(audio_file))

        # Make sure that the file is closed once its frames have been read
        with wave.open(audio_file, 'r') as wav:
            buffer = wav.readframes(wav.getnframes())

        speech = self._detect(buffer)
        return SpeechDetectedResponse(speech=speech)

    def __enter__(self) -> SttDeepspeechPlugin:
        """
        Context manager enter. Starts detection and returns self.
        """
        self.start_detection()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Context manager exit. Stops detection.
        """
        self.stop_detection()

    @action
    def start_detection(self, input_device: Optional[str] = None, seconds: Optional[float] = None,
                        block_duration: Optional[float] = None) -> None:
        """
        Start the speech detection engine.

        :param input_device: Audio input device name/index override
        :param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll
            keep running until ``stop_detection`` is called or application stop.
        :param block_duration: ``block_duration`` override.
        """
        assert not self._input_stream, 'Speech detection is already running'
        block_duration = block_duration or self.block_duration
        input_device = input_device if input_device is not None else self.input_device
        self._audio_queue = queue.Queue()
        self._recording_thread = threading.Thread(
            target=self.recording_thread,
            kwargs={'block_duration': block_duration, 'input_device': input_device})

        self._recording_thread.start()
        self._detection_thread = threading.Thread(target=self.detection_thread)
        self._detection_thread.start()

        if seconds:
            threading.Timer(seconds, lambda: self.stop_detection()).start()

    @action
    def stop_detection(self) -> None:
        """
        Stop the speech detection engine.
        """
        assert self._input_stream, 'Speech detection is not running'
        self._input_stream.stop(ignore_errors=True)
        self._input_stream.close(ignore_errors=True)
        # Resetting the stream is what tells the recording thread to terminate
        self._input_stream = None

        if self._recording_thread:
            self._recording_thread.join(timeout=self._thread_stop_timeout)

        # Resetting the queue is what tells the detection thread to terminate
        self._audio_queue = None
        if self._detection_thread:
            self._detection_thread.join(timeout=self._thread_stop_timeout)
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
|
@ -229,3 +229,8 @@ croniter
|
||||||
|
|
||||||
# Support for Z-Wave
|
# Support for Z-Wave
|
||||||
# python-openzwave
|
# python-openzwave
|
||||||
|
|
||||||
|
# Support for DeepSpeech
|
||||||
|
# deepspeech
|
||||||
|
# numpy
|
||||||
|
# sounddevice
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -283,5 +283,7 @@ setup(
|
||||||
'zigbee': ['paho-mqtt'],
|
'zigbee': ['paho-mqtt'],
|
||||||
# Support for Z-Wave
|
# Support for Z-Wave
|
||||||
'zwave': ['python-openzwave'],
|
'zwave': ['python-openzwave'],
|
||||||
|
# Support for DeepSpeech
|
||||||
|
'deepspeech': ['deepspeech', 'numpy','sounddevice'],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue