From a5c08ed3e4839a440b8a88bc3d7c00521992e505 Mon Sep 17 00:00:00 2001 From: Fabio Manganiello Date: Sat, 7 Mar 2020 02:00:35 +0100 Subject: [PATCH] Added PicoVoice plugin with support for hotwords [see #130] --- docs/source/backends.rst | 2 + docs/source/conf.py | 1 + .../platypush/backend/stt.picovoice.rst | 5 + docs/source/platypush/backend/stt.rst | 5 + .../platypush/plugins/stt.picovoice.rst | 5 + docs/source/platypush/plugins/stt.rst | 5 + docs/source/plugins.rst | 2 + docs/source/responses.rst | 1 - platypush/backend/stt/__init__.py | 40 +++++++ platypush/backend/stt/deepspeech.py | 34 +----- platypush/backend/stt/picovoice.py | 21 ++++ platypush/config/__init__.py | 3 + platypush/plugins/stt/__init__.py | 88 ++++++++++----- platypush/plugins/stt/deepspeech.py | 2 +- platypush/plugins/stt/picovoice.py | 103 ++++++++++++++++++ requirements.txt | 5 +- setup.py | 4 +- 17 files changed, 265 insertions(+), 61 deletions(-) create mode 100644 docs/source/platypush/backend/stt.picovoice.rst create mode 100644 docs/source/platypush/backend/stt.rst create mode 100644 docs/source/platypush/plugins/stt.picovoice.rst create mode 100644 docs/source/platypush/plugins/stt.rst create mode 100644 platypush/backend/stt/picovoice.py create mode 100644 platypush/plugins/stt/picovoice.py diff --git a/docs/source/backends.rst b/docs/source/backends.rst index 866748e4e..2328d9e13 100644 --- a/docs/source/backends.rst +++ b/docs/source/backends.rst @@ -53,7 +53,9 @@ Backends platypush/backend/sensor.mcp3008.rst platypush/backend/sensor.motion.pwm3901.rst platypush/backend/sensor.serial.rst + platypush/backend/stt.rst platypush/backend/stt.deepspeech.rst + platypush/backend/stt.picovoice.rst platypush/backend/tcp.rst platypush/backend/todoist.rst platypush/backend/travisci.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index a7cb379e3..9f8051075 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -242,6 +242,7 @@ autodoc_mock_imports = 
['googlesamples.assistant.grpc.audio_helpers', 'openzwave', 'deepspeech', 'wave', + 'pvporcupine', ] sys.path.insert(0, os.path.abspath('../..')) diff --git a/docs/source/platypush/backend/stt.picovoice.rst b/docs/source/platypush/backend/stt.picovoice.rst new file mode 100644 index 000000000..20f7e34c8 --- /dev/null +++ b/docs/source/platypush/backend/stt.picovoice.rst @@ -0,0 +1,5 @@ +``platypush.backend.stt.picovoice`` +=================================== + +.. automodule:: platypush.backend.stt.picovoice + :members: diff --git a/docs/source/platypush/backend/stt.rst b/docs/source/platypush/backend/stt.rst new file mode 100644 index 000000000..10d7e8203 --- /dev/null +++ b/docs/source/platypush/backend/stt.rst @@ -0,0 +1,5 @@ +``platypush.backend.stt`` +========================= + +.. automodule:: platypush.backend.stt + :members: diff --git a/docs/source/platypush/plugins/stt.picovoice.rst b/docs/source/platypush/plugins/stt.picovoice.rst new file mode 100644 index 000000000..593b8e5fc --- /dev/null +++ b/docs/source/platypush/plugins/stt.picovoice.rst @@ -0,0 +1,5 @@ +``platypush.plugins.stt.picovoice`` +=================================== + +.. automodule:: platypush.plugins.stt.picovoice + :members: diff --git a/docs/source/platypush/plugins/stt.rst b/docs/source/platypush/plugins/stt.rst new file mode 100644 index 000000000..7dfa3fcb3 --- /dev/null +++ b/docs/source/platypush/plugins/stt.rst @@ -0,0 +1,5 @@ +``platypush.plugins.stt`` +========================= + +.. 
automodule:: platypush.plugins.stt + :members: diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst index 137f74fc3..6bcafd08f 100644 --- a/docs/source/plugins.rst +++ b/docs/source/plugins.rst @@ -90,7 +90,9 @@ Plugins platypush/plugins/serial.rst platypush/plugins/shell.rst platypush/plugins/sound.rst + platypush/plugins/stt.rst platypush/plugins/stt.deepspeech.rst + platypush/plugins/stt.picovoice.rst platypush/plugins/switch.rst platypush/plugins/switch.switchbot.rst platypush/plugins/switch.tplink.rst diff --git a/docs/source/responses.rst b/docs/source/responses.rst index 0bbca1980..bcdce47a8 100644 --- a/docs/source/responses.rst +++ b/docs/source/responses.rst @@ -6,7 +6,6 @@ Responses :maxdepth: 2 :caption: Responses: - platypush/responses/.rst platypush/responses/bluetooth.rst platypush/responses/camera.rst platypush/responses/camera.android.rst diff --git a/platypush/backend/stt/__init__.py b/platypush/backend/stt/__init__.py index e69de29bb..624c2b72f 100644 --- a/platypush/backend/stt/__init__.py +++ b/platypush/backend/stt/__init__.py @@ -0,0 +1,40 @@ +import time + +from platypush.backend import Backend +from platypush.context import get_plugin +from platypush.plugins.stt import SttPlugin + + +class SttBackend(Backend): + """ + Base class for speech-to-text backends. + """ + + def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs): + """ + :param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of + :class:`platypush.plugins.stt.SttPlugin`. + :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin + (default: 5 seconds). 
+ """ + super().__init__(*args, **kwargs) + self.plugin_name = plugin_name + self.retry_sleep = retry_sleep + + def run(self): + super().run() + self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__)) + + while not self.should_stop(): + try: + plugin: SttPlugin = get_plugin(self.plugin_name) + with plugin: + # noinspection PyProtectedMember + plugin._detection_thread.join() + except Exception as e: + self.logger.exception(e) + self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep)) + time.sleep(self.retry_sleep) + + +# vim:sw=4:ts=4:et: diff --git a/platypush/backend/stt/deepspeech.py b/platypush/backend/stt/deepspeech.py index 1a149be47..de1eed210 100644 --- a/platypush/backend/stt/deepspeech.py +++ b/platypush/backend/stt/deepspeech.py @@ -1,11 +1,7 @@ -import time - -from platypush.backend import Backend -from platypush.context import get_plugin -from platypush.plugins.stt.deepspeech import SttDeepspeechPlugin +from platypush.backend.stt import SttBackend -class SttDeepspeechBackend(Backend): +class SttDeepspeechBackend(SttBackend): """ Backend for the Mozilla Deepspeech speech-to-text engine plugin. Set this plugin to ``enabled`` if you want to run the speech-to-text engine continuously instead of programmatically using @@ -18,30 +14,8 @@ class SttDeepspeechBackend(Backend): """ - def __init__(self, retry_sleep: float = 5.0, *args, **kwargs): - """ - :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin - (default: 5 seconds). 
- """ - super().__init__(*args, **kwargs) - self.retry_sleep = retry_sleep - - def run(self): - super().run() - self.logger.info('Starting Mozilla Deepspeech speech-to-text backend') - - while not self.should_stop(): - try: - plugin: SttDeepspeechPlugin = get_plugin('stt.deepspeech') - with plugin: - # noinspection PyProtectedMember - plugin._detection_thread.join() - except Exception as e: - self.logger.exception(e) - self.logger.warning('Deepspeech backend encountered an unexpected error, retrying in {} seconds'. - format(self.retry_sleep)) - - time.sleep(self.retry_sleep) + def __init__(self, *args, **kwargs): + super().__init__('stt.deepspeech', *args, **kwargs) # vim:sw=4:ts=4:et: diff --git a/platypush/backend/stt/picovoice.py b/platypush/backend/stt/picovoice.py new file mode 100644 index 000000000..f39f2552b --- /dev/null +++ b/platypush/backend/stt/picovoice.py @@ -0,0 +1,21 @@ +from platypush.backend.stt import SttBackend + + +class SttPicovoiceBackend(SttBackend): + """ + Backend for the PicoVoice speech-to-text engine plugin. Set this plugin to ``enabled`` if you + want to run the speech-to-text engine continuously instead of programmatically using + ``start_detection`` and ``stop_detection``. + + Requires: + + - The :class:`platypush.plugins.stt.picovoice.SttPicovoicePlugin` plugin configured and its dependencies + installed. 
+ + """ + + def __init__(self, *args, **kwargs): + super().__init__('stt.picovoice', *args, **kwargs) + + +# vim:sw=4:ts=4:et: diff --git a/platypush/config/__init__.py b/platypush/config/__init__.py index e6c5227f1..3bbcabc87 100644 --- a/platypush/config/__init__.py +++ b/platypush/config/__init__.py @@ -138,6 +138,9 @@ class Config(object): with open(cfgfile, 'r') as fp: file_config = yaml.safe_load(fp) + if not file_config: + return config + for section in file_config: if section == 'include': include_files = file_config[section] \ diff --git a/platypush/plugins/stt/__init__.py b/platypush/plugins/stt/__init__.py index f4978248b..bca339f8c 100644 --- a/platypush/plugins/stt/__init__.py +++ b/platypush/plugins/stt/__init__.py @@ -35,7 +35,7 @@ class SttPlugin(ABC, Plugin): input_device: Optional[Union[int, str]] = None, hotword: Optional[str] = None, hotwords: Optional[List[str]] = None, - conversation_timeout: Optional[float] = None, + conversation_timeout: Optional[float] = 10.0, block_duration: float = 1.0): """ :param input_device: PortAudio device index or name that will be used for recording speech (default: default @@ -49,7 +49,7 @@ class SttPlugin(ABC, Plugin): the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent` instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks here to run any logic depending on the detected speech - it can emulate a kind of - "OK, Google. Turn on the lights" interaction without using an external assistant. + "OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds). :param block_duration: Duration of the acquired audio blocks (default: 1 second). 
""" @@ -67,6 +67,7 @@ class SttPlugin(ABC, Plugin): self._recording_thread: Optional[threading.Thread] = None self._detection_thread: Optional[threading.Thread] = None self._audio_queue: Optional[queue.Queue] = None + self._current_text = '' def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int: """ @@ -99,13 +100,13 @@ class SttPlugin(ABC, Plugin): """ speech = speech.strip() - if self._conversation_event.is_set(): - event = ConversationDetectedEvent(speech=speech) - elif speech in self.hotwords: + if speech in self.hotwords: event = HotwordDetectedEvent(hotword=speech) if self.conversation_timeout: self._conversation_event.set() threading.Timer(self.conversation_timeout, lambda: self._conversation_event.clear()).start() + elif self._conversation_event.is_set(): + event = ConversationDetectedEvent(speech=speech) else: event = SpeechDetectedEvent(speech=speech) @@ -122,35 +123,68 @@ class SttPlugin(ABC, Plugin): """ return frames - def on_detection_started(self): + def on_detection_started(self) -> None: """ Method called when the ``detection_thread`` starts. Initialize your context variables and models here if required. """ pass - def on_detection_ended(self): + def on_detection_ended(self) -> None: """ Method called when the ``detection_thread`` stops. Clean up your context variables and models here. """ pass + def before_recording(self) -> None: + """ + Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the + recording thread starts. + """ + pass + + def on_recording_started(self) -> None: + """ + Method called after the ``recording_thread`` opens the audio device. Put here any logic that you may want to + run after the recording starts. + """ + pass + + def on_recording_ended(self) -> None: + """ + Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio + device is closed. 
+ """ + pass + @abstractmethod - def detect_audio(self, frames) -> str: + def detect_speech(self, frames) -> str: """ Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented by the derived classes. :param frames: Audio frames, as returned by ``convert_frames``. - :return: Detected text, as a string. + :return: Detected text, as a string. Returns an empty string if no text has been detected. """ raise NotImplementedError + def process_text(self, text: str) -> None: + if (not text and self._current_text) or (text and text == self._current_text): + self.on_speech_detected(self._current_text) + self._current_text = '' + else: + if text: + if not self._current_text: + get_bus().post(SpeechStartedEvent()) + self.logger.info('Intermediate speech results: [{}]'.format(text)) + + self._current_text = text + def detection_thread(self) -> None: """ This thread reads frames from ``_audio_queue``, performs the speech-to-text detection and calls """ - current_text = '' + self._current_text = '' self.logger.debug('Detection thread started') self.on_detection_started() @@ -162,41 +196,40 @@ class SttPlugin(ABC, Plugin): self.logger.warning('Error while feeding audio to the model: {}'.format(str(e))) continue - text = self.detect_audio(frames) - if text == current_text: - if current_text: - self.on_speech_detected(current_text) - - current_text = '' - else: - if not current_text: - get_bus().post(SpeechStartedEvent()) - - self.logger.info('Intermediate speech results: [{}]'.format(text)) - current_text = text + text = self.detect_speech(frames).strip() + self.process_text(text) self.on_detection_ended() self.logger.debug('Detection thread terminated') - def recording_thread(self, block_duration: float, input_device: Optional[str] = None) -> None: + def recording_thread(self, block_duration: Optional[float] = None, block_size: Optional[int] = None, + input_device: Optional[str] = None) -> None: """ Recording thread. 
It reads raw frames from the audio device and dispatches them to ``detection_thread``. - :param block_duration: Audio blocks duration. + :param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``. + :param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``. :param input_device: Input device """ + assert (block_duration or block_size) and not (block_duration and block_size), \ + 'Please specify either block_duration or block_size' + + if not block_size: + block_size = int(self.rate * self.channels * block_duration) + + self.before_recording() self.logger.debug('Recording thread started') device = self._get_input_device(input_device) - blocksize = int(self.rate * self.channels * block_duration) self._input_stream = sd.InputStream(samplerate=self.rate, device=device, channels=self.channels, dtype='int16', latency=0, - blocksize=blocksize) + blocksize=block_size) self._input_stream.start() + self.on_recording_started() get_bus().post(SpeechDetectionStartedEvent()) while self._input_stream: try: - frames = self._input_stream.read(self.rate)[0] + frames = self._input_stream.read(block_size)[0] except Exception as e: self.logger.warning('Error while reading from the audio input: {}'.format(str(e))) continue @@ -204,6 +237,7 @@ class SttPlugin(ABC, Plugin): self._audio_queue.put(frames) get_bus().post(SpeechDetectionStoppedEvent()) + self.on_recording_ended() self.logger.debug('Recording thread terminated') @abstractmethod diff --git a/platypush/plugins/stt/deepspeech.py b/platypush/plugins/stt/deepspeech.py index f50bd1db3..10432567c 100644 --- a/platypush/plugins/stt/deepspeech.py +++ b/platypush/plugins/stt/deepspeech.py @@ -116,7 +116,7 @@ class SttDeepspeechPlugin(SttPlugin): self._model.finishStream() self._context = None - def detect_audio(self, frames) -> str: + def detect_speech(self, frames) -> str: model = self._get_model() context = self._get_context() model.feedAudioContent(context, 
frames) diff --git a/platypush/plugins/stt/picovoice.py b/platypush/plugins/stt/picovoice.py new file mode 100644 index 000000000..04388b165 --- /dev/null +++ b/platypush/plugins/stt/picovoice.py @@ -0,0 +1,103 @@ +import os +import struct +from typing import Optional, List + +from platypush.message.response.stt import SpeechDetectedResponse +from platypush.plugins import action +from platypush.plugins.stt import SttPlugin + + +class SttPicovoicePlugin(SttPlugin): + """ + This plugin performs speech-to-text and speech detection using the + `PicoVoice <https://github.com/Picovoice>`_ speech-to-text integrations. + + Requires: + + * **pvporcupine** (``pip install pvporcupine``) for hotword detection. + + """ + + def __init__(self, + library_path: Optional[str] = None, + model_file_path: Optional[str] = None, + keyword_file_paths: Optional[List[str]] = None, + sensitivity: float = 0.5, + sensitivities: Optional[List[float]] = None, + *args, **kwargs): + from pvporcupine import Porcupine + from pvporcupine.resources.util.python.util import LIBRARY_PATH, MODEL_FILE_PATH, KEYWORD_FILE_PATHS + super().__init__(*args, **kwargs) + + self.hotwords = list(self.hotwords) + self._hotword_engine: Optional[Porcupine] = None + self._library_path = os.path.abspath(os.path.expanduser(library_path or LIBRARY_PATH)) + self._model_file_path = os.path.abspath(os.path.expanduser(model_file_path or MODEL_FILE_PATH)) + + if not keyword_file_paths: + hotwords = KEYWORD_FILE_PATHS + assert all(hotword in hotwords for hotword in self.hotwords), \ + 'Not all the hotwords could be found. 
Available hotwords: {}'.format(list(hotwords.keys())) + + self._keyword_file_paths = [os.path.abspath(os.path.expanduser(hotwords[hotword])) + for hotword in self.hotwords] + else: + self._keyword_file_paths = [ + os.path.abspath(os.path.expanduser(p)) + for p in keyword_file_paths + ] + + self._sensitivities = [] + if sensitivities: + assert len(self._keyword_file_paths) == len(sensitivities), \ + 'Please specify as many sensitivities as the number of configured hotwords' + + self._sensitivities = sensitivities + else: + self._sensitivities = [sensitivity] * len(self._keyword_file_paths) + + def convert_frames(self, frames: bytes) -> tuple: + assert self._hotword_engine, 'The hotword engine is not running' + return struct.unpack_from("h" * self._hotword_engine.frame_length, frames) + + def on_detection_ended(self) -> None: + if self._hotword_engine: + self._hotword_engine.delete() + self._hotword_engine = None + + def detect_speech(self, frames: tuple) -> str: + index = self._hotword_engine.process(frames) + if index < 0: + return '' + + if index is True: + index = 0 + return self.hotwords[index] + + @action + def detect(self, audio_file: str) -> SpeechDetectedResponse: + """ + Perform speech-to-text analysis on an audio file. + + :param audio_file: Path to the audio file. 
+ """ + pass + + def recording_thread(self, input_device: Optional[str] = None, *args, **kwargs) -> None: + assert self._hotword_engine, 'The hotword engine has not yet been initialized' + super().recording_thread(block_size=self._hotword_engine.frame_length, input_device=input_device) + + @action + def start_detection(self, *args, **kwargs) -> None: + from pvporcupine import Porcupine + self._hotword_engine = Porcupine( + library_path=self._library_path, + model_file_path=self._model_file_path, + keyword_file_paths=self._keyword_file_paths, + sensitivities=self._sensitivities) + + self.rate = self._hotword_engine.sample_rate + super().start_detection(*args, **kwargs) + + +# vim:sw=4:ts=4:et: diff --git a/requirements.txt b/requirements.txt index cdd720ca7..eabe15ec1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -230,7 +230,10 @@ croniter # Support for Z-Wave # python-openzwave -# Support for DeepSpeech +# Support for Mozilla DeepSpeech speech-to-text engine # deepspeech # numpy # sounddevice + +# Support for PicoVoice speech-to-text engine +# pvporcupine diff --git a/setup.py b/setup.py index 928385785..5aeb0b552 100755 --- a/setup.py +++ b/setup.py @@ -283,7 +283,9 @@ setup( 'zigbee': ['paho-mqtt'], # Support for Z-Wave 'zwave': ['python-openzwave'], - # Support for DeepSpeech + # Support for Mozilla DeepSpeech speech-to-text engine 'deepspeech': ['deepspeech', 'numpy','sounddevice'], + # Support for PicoVoice speech-to-text engine + 'picovoice': ['pvporcupine'], }, )