From a5c08ed3e4839a440b8a88bc3d7c00521992e505 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello <blacklight86@gmail.com>
Date: Sat, 7 Mar 2020 02:00:35 +0100
Subject: [PATCH] Added PicoVoice plugin with support for hotwords [see #130]

---
 docs/source/backends.rst                      |   2 +
 docs/source/conf.py                           |   1 +
 .../platypush/backend/stt.picovoice.rst       |   5 +
 docs/source/platypush/backend/stt.rst         |   5 +
 .../platypush/plugins/stt.picovoice.rst       |   5 +
 docs/source/platypush/plugins/stt.rst         |   5 +
 docs/source/plugins.rst                       |   2 +
 docs/source/responses.rst                     |   1 -
 platypush/backend/stt/__init__.py             |  40 +++++++
 platypush/backend/stt/deepspeech.py           |  34 +-----
 platypush/backend/stt/picovoice.py            |  21 ++++
 platypush/config/__init__.py                  |   3 +
 platypush/plugins/stt/__init__.py             |  88 ++++++++++-----
 platypush/plugins/stt/deepspeech.py           |   2 +-
 platypush/plugins/stt/picovoice.py            | 103 ++++++++++++++++++
 requirements.txt                              |   5 +-
 setup.py                                      |   4 +-
 17 files changed, 265 insertions(+), 61 deletions(-)
 create mode 100644 docs/source/platypush/backend/stt.picovoice.rst
 create mode 100644 docs/source/platypush/backend/stt.rst
 create mode 100644 docs/source/platypush/plugins/stt.picovoice.rst
 create mode 100644 docs/source/platypush/plugins/stt.rst
 create mode 100644 platypush/backend/stt/picovoice.py
 create mode 100644 platypush/plugins/stt/picovoice.py

diff --git a/docs/source/backends.rst b/docs/source/backends.rst
index 866748e4..2328d9e1 100644
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@@ -53,7 +53,9 @@ Backends
     platypush/backend/sensor.mcp3008.rst
     platypush/backend/sensor.motion.pwm3901.rst
     platypush/backend/sensor.serial.rst
+    platypush/backend/stt.rst
     platypush/backend/stt.deepspeech.rst
+    platypush/backend/stt.picovoice.rst
     platypush/backend/tcp.rst
     platypush/backend/todoist.rst
     platypush/backend/travisci.rst
diff --git a/docs/source/conf.py b/docs/source/conf.py
index a7cb379e..9f805107 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -242,6 +242,7 @@ autodoc_mock_imports = ['googlesamples.assistant.grpc.audio_helpers',
                         'openzwave',
                         'deepspeech',
                         'wave',
+                        'pvporcupine',
                         ]
 
 sys.path.insert(0, os.path.abspath('../..'))
diff --git a/docs/source/platypush/backend/stt.picovoice.rst b/docs/source/platypush/backend/stt.picovoice.rst
new file mode 100644
index 00000000..20f7e34c
--- /dev/null
+++ b/docs/source/platypush/backend/stt.picovoice.rst
@@ -0,0 +1,5 @@
+``platypush.backend.stt.picovoice``
+===================================
+
+.. automodule:: platypush.backend.stt.picovoice
+    :members:
diff --git a/docs/source/platypush/backend/stt.rst b/docs/source/platypush/backend/stt.rst
new file mode 100644
index 00000000..10d7e820
--- /dev/null
+++ b/docs/source/platypush/backend/stt.rst
@@ -0,0 +1,5 @@
+``platypush.backend.stt``
+=========================
+
+.. automodule:: platypush.backend.stt
+    :members:
diff --git a/docs/source/platypush/plugins/stt.picovoice.rst b/docs/source/platypush/plugins/stt.picovoice.rst
new file mode 100644
index 00000000..593b8e5f
--- /dev/null
+++ b/docs/source/platypush/plugins/stt.picovoice.rst
@@ -0,0 +1,5 @@
+``platypush.plugins.stt.picovoice``
+===================================
+
+.. automodule:: platypush.plugins.stt.picovoice
+    :members:
diff --git a/docs/source/platypush/plugins/stt.rst b/docs/source/platypush/plugins/stt.rst
new file mode 100644
index 00000000..7dfa3fcb
--- /dev/null
+++ b/docs/source/platypush/plugins/stt.rst
@@ -0,0 +1,5 @@
+``platypush.plugins.stt``
+=========================
+
+.. automodule:: platypush.plugins.stt
+    :members:
diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst
index 137f74fc..6bcafd08 100644
--- a/docs/source/plugins.rst
+++ b/docs/source/plugins.rst
@@ -90,7 +90,9 @@ Plugins
     platypush/plugins/serial.rst
     platypush/plugins/shell.rst
     platypush/plugins/sound.rst
+    platypush/plugins/stt.rst
     platypush/plugins/stt.deepspeech.rst
+    platypush/plugins/stt.picovoice.rst
     platypush/plugins/switch.rst
     platypush/plugins/switch.switchbot.rst
     platypush/plugins/switch.tplink.rst
diff --git a/docs/source/responses.rst b/docs/source/responses.rst
index 0bbca198..bcdce47a 100644
--- a/docs/source/responses.rst
+++ b/docs/source/responses.rst
@@ -6,7 +6,6 @@ Responses
     :maxdepth: 2
     :caption: Responses:
 
-    platypush/responses/.rst
     platypush/responses/bluetooth.rst
     platypush/responses/camera.rst
     platypush/responses/camera.android.rst
diff --git a/platypush/backend/stt/__init__.py b/platypush/backend/stt/__init__.py
index e69de29b..624c2b72 100644
--- a/platypush/backend/stt/__init__.py
+++ b/platypush/backend/stt/__init__.py
@@ -0,0 +1,40 @@
+import time
+
+from platypush.backend import Backend
+from platypush.context import get_plugin
+from platypush.plugins.stt import SttPlugin
+
+
+class SttBackend(Backend):
+    """
+    Base class for speech-to-text backends.
+    """
+
+    def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs):
+        """
+        :param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of
+            :class:`platypush.plugins.stt.SttPlugin`.
+        :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
+            (default: 5 seconds).
+        """
+        super().__init__(*args, **kwargs)
+        self.plugin_name = plugin_name
+        self.retry_sleep = retry_sleep
+
+    def run(self):
+        super().run()
+        self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__))
+
+        while not self.should_stop():
+            try:
+                plugin: SttPlugin = get_plugin(self.plugin_name)
+                with plugin:
+                    # noinspection PyProtectedMember
+                    plugin._detection_thread.join()
+            except Exception as e:
+                self.logger.exception(e)
+                self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep))
+                time.sleep(self.retry_sleep)
+
+
+# vim:sw=4:ts=4:et:
diff --git a/platypush/backend/stt/deepspeech.py b/platypush/backend/stt/deepspeech.py
index 1a149be4..de1eed21 100644
--- a/platypush/backend/stt/deepspeech.py
+++ b/platypush/backend/stt/deepspeech.py
@@ -1,11 +1,7 @@
-import time
-
-from platypush.backend import Backend
-from platypush.context import get_plugin
-from platypush.plugins.stt.deepspeech import SttDeepspeechPlugin
+from platypush.backend.stt import SttBackend
 
 
-class SttDeepspeechBackend(Backend):
+class SttDeepspeechBackend(SttBackend):
     """
     Backend for the Mozilla Deepspeech speech-to-text engine plugin. Set this plugin to ``enabled`` if you
     want to run the speech-to-text engine continuously instead of programmatically using
@@ -18,30 +14,8 @@ class SttDeepspeechBackend(Backend):
 
     """
 
-    def __init__(self, retry_sleep: float = 5.0, *args, **kwargs):
-        """
-        :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
-            (default: 5 seconds).
-        """
-        super().__init__(*args, **kwargs)
-        self.retry_sleep = retry_sleep
-
-    def run(self):
-        super().run()
-        self.logger.info('Starting Mozilla Deepspeech speech-to-text backend')
-
-        while not self.should_stop():
-            try:
-                plugin: SttDeepspeechPlugin = get_plugin('stt.deepspeech')
-                with plugin:
-                    # noinspection PyProtectedMember
-                    plugin._detection_thread.join()
-            except Exception as e:
-                self.logger.exception(e)
-                self.logger.warning('Deepspeech backend encountered an unexpected error, retrying in {} seconds'.
-                                    format(self.retry_sleep))
-
-                time.sleep(self.retry_sleep)
+    def __init__(self, *args, **kwargs):
+        super().__init__('stt.deepspeech', *args, **kwargs)
 
 
 # vim:sw=4:ts=4:et:
diff --git a/platypush/backend/stt/picovoice.py b/platypush/backend/stt/picovoice.py
new file mode 100644
index 00000000..f39f2552
--- /dev/null
+++ b/platypush/backend/stt/picovoice.py
@@ -0,0 +1,21 @@
+from platypush.backend.stt import SttBackend
+
+
+class SttPicovoiceBackend(SttBackend):
+    """
+    Backend for the PicoVoice speech-to-text engine plugin. Set this plugin to ``enabled`` if you
+    want to run the speech-to-text engine continuously instead of programmatically using
+    ``start_detection`` and ``stop_detection``.
+
+    Requires:
+
+        - The :class:`platypush.plugins.stt.picovoice.SttPicovoicePlugin` plugin configured and its dependencies
+          installed.
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__('stt.picovoice', *args, **kwargs)
+
+
+# vim:sw=4:ts=4:et:
diff --git a/platypush/config/__init__.py b/platypush/config/__init__.py
index e6c5227f..3bbcabc8 100644
--- a/platypush/config/__init__.py
+++ b/platypush/config/__init__.py
@@ -138,6 +138,9 @@ class Config(object):
         with open(cfgfile, 'r') as fp:
             file_config = yaml.safe_load(fp)
 
+        if not file_config:
+            return config
+
         for section in file_config:
             if section == 'include':
                 include_files = file_config[section] \
diff --git a/platypush/plugins/stt/__init__.py b/platypush/plugins/stt/__init__.py
index f4978248..bca339f8 100644
--- a/platypush/plugins/stt/__init__.py
+++ b/platypush/plugins/stt/__init__.py
@@ -35,7 +35,7 @@ class SttPlugin(ABC, Plugin):
                  input_device: Optional[Union[int, str]] = None,
                  hotword: Optional[str] = None,
                  hotwords: Optional[List[str]] = None,
-                 conversation_timeout: Optional[float] = None,
+                 conversation_timeout: Optional[float] = 10.0,
                  block_duration: float = 1.0):
         """
         :param input_device: PortAudio device index or name that will be used for recording speech (default: default
@@ -49,7 +49,7 @@ class SttPlugin(ABC, Plugin):
             the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
             instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
             here to run any logic depending on the detected speech - it can emulate a kind of
-            "OK, Google. Turn on the lights" interaction without using an external assistant.
+            "OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds).
         :param block_duration: Duration of the acquired audio blocks (default: 1 second).
         """
 
@@ -67,6 +67,7 @@ class SttPlugin(ABC, Plugin):
         self._recording_thread: Optional[threading.Thread] = None
         self._detection_thread: Optional[threading.Thread] = None
         self._audio_queue: Optional[queue.Queue] = None
+        self._current_text = ''
 
     def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
         """
@@ -99,13 +100,13 @@ class SttPlugin(ABC, Plugin):
         """
         speech = speech.strip()
 
-        if self._conversation_event.is_set():
-            event = ConversationDetectedEvent(speech=speech)
-        elif speech in self.hotwords:
+        if speech in self.hotwords:
             event = HotwordDetectedEvent(hotword=speech)
             if self.conversation_timeout:
                 self._conversation_event.set()
                 threading.Timer(self.conversation_timeout, lambda: self._conversation_event.clear()).start()
+        elif self._conversation_event.is_set():
+            event = ConversationDetectedEvent(speech=speech)
         else:
             event = SpeechDetectedEvent(speech=speech)
 
@@ -122,35 +123,68 @@ class SttPlugin(ABC, Plugin):
         """
         return frames
 
-    def on_detection_started(self):
+    def on_detection_started(self) -> None:
         """
         Method called when the ``detection_thread`` starts. Initialize your context variables and models here if
         required.
         """
         pass
 
-    def on_detection_ended(self):
+    def on_detection_ended(self) -> None:
         """
         Method called when the ``detection_thread`` stops. Clean up your context variables and models here.
         """
         pass
 
+    def before_recording(self) -> None:
+        """
+        Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the
+        recording thread starts.
+        """
+        pass
+
+    def on_recording_started(self) -> None:
+        """
+        Method called after the ``recording_thread`` opens the audio device. Put here any logic that you may want to
+        run after the recording starts.
+        """
+        pass
+
+    def on_recording_ended(self) -> None:
+        """
+        Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio
+        device is closed.
+        """
+        pass
+
     @abstractmethod
-    def detect_audio(self, frames) -> str:
+    def detect_speech(self, frames) -> str:
         """
         Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented
         by the derived classes.
 
         :param frames: Audio frames, as returned by ``convert_frames``.
-        :return: Detected text, as a string.
+        :return: Detected text, as a string. Returns an empty string if no text has been detected.
         """
         raise NotImplementedError
 
+    def process_text(self, text: str) -> None:
+        if (not text and self._current_text) or (text and text == self._current_text):
+            self.on_speech_detected(self._current_text)
+            self._current_text = ''
+        else:
+            if text:
+                if not self._current_text:
+                    get_bus().post(SpeechStartedEvent())
+                self.logger.info('Intermediate speech results: [{}]'.format(text))
+
+            self._current_text = text
+
     def detection_thread(self) -> None:
         """
         This thread reads frames from ``_audio_queue``, performs the speech-to-text detection and calls
         """
-        current_text = ''
+        self._current_text = ''
         self.logger.debug('Detection thread started')
         self.on_detection_started()
 
@@ -162,41 +196,40 @@ class SttPlugin(ABC, Plugin):
                 self.logger.warning('Error while feeding audio to the model: {}'.format(str(e)))
                 continue
 
-            text = self.detect_audio(frames)
-            if text == current_text:
-                if current_text:
-                    self.on_speech_detected(current_text)
-
-                current_text = ''
-            else:
-                if not current_text:
-                    get_bus().post(SpeechStartedEvent())
-
-                self.logger.info('Intermediate speech results: [{}]'.format(text))
-                current_text = text
+            text = self.detect_speech(frames).strip()
+            self.process_text(text)
 
         self.on_detection_ended()
         self.logger.debug('Detection thread terminated')
 
-    def recording_thread(self, block_duration: float, input_device: Optional[str] = None) -> None:
+    def recording_thread(self, block_duration: Optional[float] = None, block_size: Optional[int] = None,
+                         input_device: Optional[str] = None) -> None:
         """
         Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.
 
-        :param block_duration: Audio blocks duration.
+        :param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``.
+        :param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``.
         :param input_device: Input device
         """
+        assert (block_duration or block_size) and not (block_duration and block_size), \
+            'Please specify either block_duration or block_size'
+
+        if not block_size:
+            block_size = int(self.rate * self.channels * block_duration)
+
+        self.before_recording()
         self.logger.debug('Recording thread started')
         device = self._get_input_device(input_device)
-        blocksize = int(self.rate * self.channels * block_duration)
         self._input_stream = sd.InputStream(samplerate=self.rate, device=device,
                                             channels=self.channels, dtype='int16', latency=0,
-                                            blocksize=blocksize)
+                                            blocksize=block_size)
         self._input_stream.start()
+        self.on_recording_started()
         get_bus().post(SpeechDetectionStartedEvent())
 
         while self._input_stream:
             try:
-                frames = self._input_stream.read(self.rate)[0]
+                frames = self._input_stream.read(block_size)[0]
             except Exception as e:
                 self.logger.warning('Error while reading from the audio input: {}'.format(str(e)))
                 continue
@@ -204,6 +237,7 @@ class SttPlugin(ABC, Plugin):
             self._audio_queue.put(frames)
 
         get_bus().post(SpeechDetectionStoppedEvent())
+        self.on_recording_ended()
         self.logger.debug('Recording thread terminated')
 
     @abstractmethod
diff --git a/platypush/plugins/stt/deepspeech.py b/platypush/plugins/stt/deepspeech.py
index f50bd1db..10432567 100644
--- a/platypush/plugins/stt/deepspeech.py
+++ b/platypush/plugins/stt/deepspeech.py
@@ -116,7 +116,7 @@ class SttDeepspeechPlugin(SttPlugin):
             self._model.finishStream()
         self._context = None
 
-    def detect_audio(self, frames) -> str:
+    def detect_speech(self, frames) -> str:
         model = self._get_model()
         context = self._get_context()
         model.feedAudioContent(context, frames)
diff --git a/platypush/plugins/stt/picovoice.py b/platypush/plugins/stt/picovoice.py
new file mode 100644
index 00000000..04388b16
--- /dev/null
+++ b/platypush/plugins/stt/picovoice.py
@@ -0,0 +1,103 @@
+import os
+import struct
+from typing import Optional, List
+
+from platypush.message.response.stt import SpeechDetectedResponse
+from platypush.plugins import action
+from platypush.plugins.stt import SttPlugin
+
+
+class SttPicovoicePlugin(SttPlugin):
+    """
+    This plugin performs speech-to-text and speech detection using the
+    `PicoVoice <https://github.com/Picovoice>`_ speech-to-text integrations.
+
+    Requires:
+
+        * **pvporcupine** (``pip install pvporcupine``) for hotword detection.
+
+    """
+
+    def __init__(self,
+                 library_path: Optional[str] = None,
+                 model_file_path: Optional[str] = None,
+                 keyword_file_paths: Optional[List[str]] = None,
+                 sensitivity: float = 0.5,
+                 sensitivities: Optional[List[float]] = None,
+                 *args, **kwargs):
+        from pvporcupine import Porcupine
+        from pvporcupine.resources.util.python.util import LIBRARY_PATH, MODEL_FILE_PATH, KEYWORD_FILE_PATHS
+        super().__init__(*args, **kwargs)
+
+        self.hotwords = list(self.hotwords)
+        self._hotword_engine: Optional[Porcupine] = None
+        self._library_path = os.path.abspath(os.path.expanduser(library_path or LIBRARY_PATH))
+        self._model_file_path = os.path.abspath(os.path.expanduser(model_file_path or MODEL_FILE_PATH))
+
+        if not keyword_file_paths:
+            hotwords = KEYWORD_FILE_PATHS
+            assert all(hotword in hotwords for hotword in self.hotwords), \
+                'Not all the hotwords could be found. Available hotwords: {}'.format(list(hotwords.keys()))
+
+            self._keyword_file_paths = [os.path.abspath(os.path.expanduser(hotwords[hotword]))
+                                        for hotword in self.hotwords]
+        else:
+            self._keyword_file_paths = [
+                os.path.abspath(os.path.expanduser(p))
+                for p in keyword_file_paths
+            ]
+
+        self._sensitivities = []
+        if sensitivities:
+            assert len(self._keyword_file_paths) == len(sensitivities), \
+                'Please specify as many sensitivities as the number of configured hotwords'
+
+            self._sensitivities = sensitivities
+        else:
+            self._sensitivities = [sensitivity] * len(self._keyword_file_paths)
+
+    def convert_frames(self, frames: bytes) -> tuple:
+        assert self._hotword_engine, 'The hotword engine is not running'
+        return struct.unpack_from("h" * self._hotword_engine.frame_length, frames)
+
+    def on_detection_ended(self) -> None:
+        if self._hotword_engine:
+            self._hotword_engine.delete()
+        self._hotword_engine = None
+
+    def detect_speech(self, frames: tuple) -> str:
+        index = self._hotword_engine.process(frames)
+        # process() may return a bool instead of an index; False must not fall through to hotwords[0].
+        if index is False or index < 0:
+            return ''
+        if index is True:
+            index = 0
+        return self.hotwords[index]
+
+    @action
+    def detect(self, audio_file: str) -> SpeechDetectedResponse:
+        """
+        Perform speech-to-text analysis on an audio file.
+
+        :param audio_file: Path to the audio file.
+        """
+        pass
+
+    def recording_thread(self, input_device: Optional[str] = None, *args, **kwargs) -> None:
+        assert self._hotword_engine, 'The hotword engine has not yet been initialized'
+        super().recording_thread(block_size=self._hotword_engine.frame_length, input_device=input_device)
+
+    @action
+    def start_detection(self, *args, **kwargs) -> None:
+        from pvporcupine import Porcupine
+        self._hotword_engine = Porcupine(
+            library_path=self._library_path,
+            model_file_path=self._model_file_path,
+            keyword_file_paths=self._keyword_file_paths,
+            sensitivities=self._sensitivities)
+
+        self.rate = self._hotword_engine.sample_rate
+        super().start_detection(*args, **kwargs)
+
+
+# vim:sw=4:ts=4:et:
diff --git a/requirements.txt b/requirements.txt
index cdd720ca..eabe15ec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -230,7 +230,10 @@ croniter
 # Support for Z-Wave
 # python-openzwave
 
-# Support for DeepSpeech
+# Support for Mozilla DeepSpeech speech-to-text engine
 # deepspeech
 # numpy
 # sounddevice
+
+# Support for PicoVoice speech-to-text engine
+# pvporcupine
diff --git a/setup.py b/setup.py
index 92838578..5aeb0b55 100755
--- a/setup.py
+++ b/setup.py
@@ -283,7 +283,9 @@ setup(
         'zigbee': ['paho-mqtt'],
         # Support for Z-Wave
         'zwave': ['python-openzwave'],
-        # Support for DeepSpeech
+        # Support for Mozilla DeepSpeech speech-to-text engine
         'deepspeech': ['deepspeech', 'numpy','sounddevice'],
+        # Support for PicoVoice speech-to-text engine
+        'picovoice': ['pvporcupine'],
     },
 )