diff --git a/platypush/backend/stt/__init__.py b/platypush/backend/stt/__init__.py
deleted file mode 100644
index 624c2b72..00000000
--- a/platypush/backend/stt/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import time
-
-from platypush.backend import Backend
-from platypush.context import get_plugin
-from platypush.plugins.stt import SttPlugin
-
-
-class SttBackend(Backend):
-    """
-    Base class for speech-to-text backends.
-    """
-
-    def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs):
-        """
-        :param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of
-            :class:`platypush.plugins.stt.SttPlugin`.
-        :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
-            (default: 5 seconds).
-        """
-        super().__init__(*args, **kwargs)
-        self.plugin_name = plugin_name
-        self.retry_sleep = retry_sleep
-
-    def run(self):
-        super().run()
-        self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__))
-
-        while not self.should_stop():
-            try:
-                plugin: SttPlugin = get_plugin(self.plugin_name)
-                with plugin:
-                    # noinspection PyProtectedMember
-                    plugin._detection_thread.join()
-            except Exception as e:
-                self.logger.exception(e)
-                self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep))
-                time.sleep(self.retry_sleep)
-
-
-# vim:sw=4:ts=4:et:
diff --git a/platypush/backend/stt/picovoice/__init__.py b/platypush/backend/stt/picovoice/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/platypush/backend/stt/picovoice/hotword/__init__.py b/platypush/backend/stt/picovoice/hotword/__init__.py
deleted file mode 100644
index 9dc6ae63..00000000
--- a/platypush/backend/stt/picovoice/hotword/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from platypush.backend.stt import SttBackend
-
-
-class SttPicovoiceHotwordBackend(SttBackend):
-    """
-    Backend for the PicoVoice hotword detection plugin. Enable this backend if you want to run the
-    hotword engine continuously instead of programmatically using ``start_detection`` and
-    ``stop_detection``.
-
-    Requires:
-
-        - The :class:`platypush.plugins.stt.picovoice.hotword.SttPicovoiceHotwordPlugin` plugin configured and its
-          dependencies installed.
-
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__('stt.picovoice.hotword', *args, **kwargs)
-
-
-# vim:sw=4:ts=4:et:
diff --git a/platypush/backend/stt/picovoice/hotword/manifest.yaml b/platypush/backend/stt/picovoice/hotword/manifest.yaml
deleted file mode 100644
index 0527afca..00000000
--- a/platypush/backend/stt/picovoice/hotword/manifest.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-manifest:
-  events: {}
-  install:
-    pip: []
-  package: platypush.backend.stt.picovoice.hotword
-  type: backend
diff --git a/platypush/backend/stt/picovoice/speech/__init__.py b/platypush/backend/stt/picovoice/speech/__init__.py
deleted file mode 100644
index 28a4b0b1..00000000
--- a/platypush/backend/stt/picovoice/speech/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from platypush.backend.stt import SttBackend
-
-
-class SttPicovoiceSpeechBackend(SttBackend):
-    """
-    Backend for the PicoVoice speech detection plugin. Enable this backend if you want to run the
-    speech engine continuously instead of programmatically using ``start_detection`` and
-    ``stop_detection``.
-
-    Requires:
-
-        - The :class:`platypush.plugins.stt.picovoice.speech.SttPicovoiceSpeechPlugin` plugin configured and its
-          dependencies installed.
-
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__('stt.picovoice.speech', *args, **kwargs)
-
-
-# vim:sw=4:ts=4:et:
diff --git a/platypush/backend/stt/picovoice/speech/manifest.yaml b/platypush/backend/stt/picovoice/speech/manifest.yaml
deleted file mode 100644
index fc68a467..00000000
--- a/platypush/backend/stt/picovoice/speech/manifest.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-manifest:
-  events: {}
-  install:
-    pip: []
-  package: platypush.backend.stt.picovoice.speech
-  type: backend
diff --git a/platypush/plugins/stt/__init__.py b/platypush/plugins/stt/__init__.py
deleted file mode 100644
index 1df2ae45..00000000
--- a/platypush/plugins/stt/__init__.py
+++ /dev/null
@@ -1,336 +0,0 @@
-import queue
-import threading
-from abc import ABC, abstractmethod
-from typing import Optional, Union, List
-
-import sounddevice as sd
-
-from platypush.context import get_bus
-from platypush.message.event.stt import (
-    SpeechDetectionStartedEvent,
-    SpeechDetectionStoppedEvent,
-    SpeechStartedEvent,
-    SpeechDetectedEvent,
-    HotwordDetectedEvent,
-    ConversationDetectedEvent,
-)
-from platypush.message.response.stt import SpeechDetectedResponse
-from platypush.plugins import Plugin, action
-
-
-class SttPlugin(ABC, Plugin):
-    """
-    Abstract class for speech-to-text plugins.
-    """
-
-    _thread_stop_timeout = 10.0
-    rate = 16000
-    channels = 1
-
-    def __init__(
-        self,
-        input_device: Optional[Union[int, str]] = None,
-        hotword: Optional[str] = None,
-        hotwords: Optional[List[str]] = None,
-        conversation_timeout: Optional[float] = 10.0,
-        block_duration: float = 1.0,
-    ):
-        """
-        :param input_device: PortAudio device index or name that will be used for recording speech (default: default
-            system audio input device).
-        :param hotword: When this word is detected, the plugin will trigger a
-            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
-            :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
-            assistants.
-        :param hotwords: Use a list of hotwords instead of a single one.
-        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
-            the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
-            instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
-            here to run any logic depending on the detected speech - it can emulate a kind of
-            "OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds).
-        :param block_duration: Duration of the acquired audio blocks (default: 1 second).
-        """
-
-        super().__init__()
-        self.input_device = input_device
-        self.conversation_timeout = conversation_timeout
-        self.block_duration = block_duration
-
-        self.hotwords = set(hotwords or [])
-        if hotword:
-            self.hotwords = {hotword}
-
-        self._conversation_event = threading.Event()
-        self._input_stream: Optional[sd.InputStream] = None
-        self._recording_thread: Optional[threading.Thread] = None
-        self._detection_thread: Optional[threading.Thread] = None
-        self._audio_queue: Optional[queue.Queue] = None
-        self._current_text = ''
-
-    def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
-        """
-        Get the index of the input device by index or name.
-
-        :param device: Device index or name. If None is set then the function will return the index of the
-            default audio input device.
-        :return: Index of the audio input device.
-        """
-        if not device:
-            device = self.input_device
-        if not device:
-            return sd.query_hostapis()[0].get('default_input_device')
-
-        if isinstance(device, int):
-            assert device <= len(sd.query_devices())
-            return device
-
-        for i, dev in enumerate(sd.query_devices()):
-            if dev['name'] == device:
-                return i
-
-        raise AssertionError('Device {} not found'.format(device))
-
-    def on_speech_detected(self, speech: str) -> None:
-        """
-        Hook called when speech is detected. Triggers the right event depending on the current context.
-
-        :param speech: Detected speech.
-        """
-        speech = speech.strip()
-
-        if speech in self.hotwords:
-            event = HotwordDetectedEvent(hotword=speech)
-            if self.conversation_timeout:
-                self._conversation_event.set()
-                threading.Timer(
-                    self.conversation_timeout, lambda: self._conversation_event.clear()
-                ).start()
-        elif self._conversation_event.is_set():
-            event = ConversationDetectedEvent(speech=speech)
-        else:
-            event = SpeechDetectedEvent(speech=speech)
-
-        get_bus().post(event)
-
-    @staticmethod
-    def convert_frames(frames: bytes) -> bytes:
-        """
-        Conversion method for raw audio frames. It just returns the input frames as bytes. Override it if required
-        by your logic.
-
-        :param frames: Input audio frames, as bytes.
-        :return: The audio frames as passed on the input. Override if required.
-        """
-        return frames
-
-    def on_detection_started(self) -> None:
-        """
-        Method called when the ``detection_thread`` starts. Initialize your context variables and models here if
-        required.
-        """
-        pass
-
-    def on_detection_ended(self) -> None:
-        """
-        Method called when the ``detection_thread`` stops. Clean up your context variables and models here.
-        """
-        pass
-
-    def before_recording(self) -> None:
-        """
-        Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the
-        recording thread starts.
-        """
-        pass
-
-    def on_recording_started(self) -> None:
-        """
-        Method called after the ``recording_thread`` opens the audio device. Put here any logic that you may want to
-        run after the recording starts.
-        """
-        pass
-
-    def on_recording_ended(self) -> None:
-        """
-        Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio
-        device is closed.
-        """
-        pass
-
-    @abstractmethod
-    def detect_speech(self, frames) -> str:
-        """
-        Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented
-        by the derived classes.
-
-        :param frames: Audio frames, as returned by ``convert_frames``.
-        :return: Detected text, as a string. Returns an empty string if no text has been detected.
-        """
-        raise NotImplementedError
-
-    def process_text(self, text: str) -> None:
-        if (not text and self._current_text) or (text and text == self._current_text):
-            self.on_speech_detected(self._current_text)
-            self._current_text = ''
-        else:
-            if text:
-                if not self._current_text:
-                    get_bus().post(SpeechStartedEvent())
-                self.logger.info('Intermediate speech results: [{}]'.format(text))
-
-            self._current_text = text
-
-    def detection_thread(self) -> None:
-        """
-        This thread reads frames from ``_audio_queue``, performs the speech-to-text detection and passes the result to ``process_text``.
-        """
-        self._current_text = ''
-        self.logger.debug('Detection thread started')
-        self.on_detection_started()
-
-        while self._audio_queue:
-            try:
-                frames = self._audio_queue.get()
-                frames = self.convert_frames(frames)
-            except Exception as e:
-                self.logger.warning(
-                    'Error while feeding audio to the model: {}'.format(str(e))
-                )
-                continue
-
-            text = self.detect_speech(frames).strip()
-            self.process_text(text)
-
-        self.on_detection_ended()
-        self.logger.debug('Detection thread terminated')
-
-    def recording_thread(
-        self,
-        block_duration: Optional[float] = None,
-        block_size: Optional[int] = None,
-        input_device: Optional[str] = None,
-    ) -> None:
-        """
-        Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.
-
-        :param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``.
-        :param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``.
-        :param input_device: Audio input device name/index (default: the configured ``input_device``).
-        """
-        assert (block_duration or block_size) and not (
-            block_duration and block_size
-        ), 'Please specify either block_duration or block_size'
-
-        if not block_size:
-            block_size = int(self.rate * self.channels * block_duration)
-
-        self.before_recording()
-        self.logger.debug('Recording thread started')
-        device = self._get_input_device(input_device)
-        self._input_stream = sd.InputStream(
-            samplerate=self.rate,
-            device=device,
-            channels=self.channels,
-            dtype='int16',
-            latency=0,
-            blocksize=block_size,
-        )
-        self._input_stream.start()
-        self.on_recording_started()
-        get_bus().post(SpeechDetectionStartedEvent())
-
-        while self._input_stream:
-            try:
-                frames = self._input_stream.read(block_size)[0]
-            except Exception as e:
-                self.logger.warning(
-                    'Error while reading from the audio input: {}'.format(str(e))
-                )
-                continue
-
-            self._audio_queue.put(frames)
-
-        get_bus().post(SpeechDetectionStoppedEvent())
-        self.on_recording_ended()
-        self.logger.debug('Recording thread terminated')
-
-    @abstractmethod
-    @action
-    def detect(self, audio_file: str) -> SpeechDetectedResponse:
-        """
-        Perform speech-to-text analysis on an audio file. Must be implemented by the derived classes.
-
-        :param audio_file: Path to the audio file.
-        """
-        raise NotImplementedError
-
-    def __enter__(self):
-        """
-        Context manager enter. Starts detection and returns self.
-        """
-        self.start_detection()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """
-        Context manager exit. Stops detection.
-        """
-        self.stop_detection()
-
-    @action
-    def start_detection(
-        self,
-        input_device: Optional[str] = None,
-        seconds: Optional[float] = None,
-        block_duration: Optional[float] = None,
-    ) -> None:
-        """
-        Start the speech detection engine.
-
-        :param input_device: Audio input device name/index override.
-        :param seconds: If set, the detection engine will stop after this many seconds; otherwise it will
-            keep running until ``stop_detection`` is called or the application stops.
-        :param block_duration: ``block_duration`` override.
-        """
-        assert (
-            not self._input_stream and not self._recording_thread
-        ), 'Speech detection is already running'
-        block_duration = block_duration or self.block_duration
-        input_device = input_device if input_device is not None else self.input_device
-        self._audio_queue = queue.Queue()
-        self._recording_thread = threading.Thread(
-            target=lambda: self.recording_thread(
-                block_duration=block_duration, input_device=input_device
-            )
-        )
-
-        self._recording_thread.start()
-        self._detection_thread = threading.Thread(
-            target=lambda: self.detection_thread()
-        )
-        self._detection_thread.start()
-
-        if seconds:
-            threading.Timer(seconds, lambda: self.stop_detection()).start()
-
-    @action
-    def stop_detection(self) -> None:
-        """
-        Stop the speech detection engine.
-        """
-        assert self._input_stream, 'Speech detection is not running'
-        self._input_stream.stop(ignore_errors=True)
-        self._input_stream.close(ignore_errors=True)
-        self._input_stream = None
-
-        if self._recording_thread:
-            self._recording_thread.join(timeout=self._thread_stop_timeout)
-            self._recording_thread = None
-
-        self._audio_queue = None
-        if self._detection_thread:
-            self._detection_thread.join(timeout=self._thread_stop_timeout)
-            self._detection_thread = None
-
-
-# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/stt/picovoice/__init__.py b/platypush/plugins/stt/picovoice/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/platypush/plugins/stt/picovoice/hotword/__init__.py b/platypush/plugins/stt/picovoice/hotword/__init__.py
deleted file mode 100644
index 5c776783..00000000
--- a/platypush/plugins/stt/picovoice/hotword/__init__.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import os
-import struct
-from typing import Optional, List
-
-from platypush.message.response.stt import SpeechDetectedResponse
-from platypush.plugins import action
-from platypush.plugins.stt import SttPlugin
-
-
-class SttPicovoiceHotwordPlugin(SttPlugin):
-    """
-    This plugin performs hotword detection using `PicoVoice <https://picovoice.ai/>`_.
-    """
-
-    def __init__(
-        self,
-        library_path: Optional[str] = None,
-        model_file_path: Optional[str] = None,
-        keyword_file_paths: Optional[List[str]] = None,
-        sensitivity: float = 0.5,
-        sensitivities: Optional[List[float]] = None,
-        *args,
-        **kwargs
-    ):
-        from pvporcupine import Porcupine
-        from pvporcupine.resources.util.python.util import (
-            LIBRARY_PATH,
-            MODEL_FILE_PATH,
-            KEYWORD_FILE_PATHS,
-        )
-
-        super().__init__(*args, **kwargs)
-
-        self.hotwords = list(self.hotwords)
-        self._hotword_engine: Optional[Porcupine] = None
-        self._library_path = os.path.abspath(
-            os.path.expanduser(library_path or LIBRARY_PATH)
-        )
-        self._model_file_path = os.path.abspath(
-            os.path.expanduser(model_file_path or MODEL_FILE_PATH)
-        )
-
-        if not keyword_file_paths:
-            hotwords = KEYWORD_FILE_PATHS
-            assert all(
-                hotword in hotwords for hotword in self.hotwords
-            ), 'Not all the hotwords could be found. Available hotwords: {}'.format(
-                list(hotwords.keys())
-            )
-
-            self._keyword_file_paths = [
-                os.path.abspath(os.path.expanduser(hotwords[hotword]))
-                for hotword in self.hotwords
-            ]
-        else:
-            self._keyword_file_paths = [
-                os.path.abspath(os.path.expanduser(p)) for p in keyword_file_paths
-            ]
-
-        self._sensitivities = []
-        if sensitivities:
-            assert len(self._keyword_file_paths) == len(
-                sensitivities
-            ), 'Please specify as many sensitivities as the number of configured hotwords'
-
-            self._sensitivities = sensitivities
-        else:
-            self._sensitivities = [sensitivity] * len(self._keyword_file_paths)
-
-    def convert_frames(self, frames: bytes) -> tuple:
-        assert self._hotword_engine, 'The hotword engine is not running'
-        return struct.unpack_from("h" * self._hotword_engine.frame_length, frames)
-
-    def on_detection_ended(self) -> None:
-        if self._hotword_engine:
-            self._hotword_engine.delete()
-        self._hotword_engine = None
-
-    def detect_speech(self, frames: tuple) -> str:
-        index = self._hotword_engine.process(frames)
-        if index < 0:
-            return ''
-
-        if index is True:
-            index = 0
-        return self.hotwords[index]
-
-    @action
-    def detect(self, audio_file: str) -> SpeechDetectedResponse:
-        """
-        Perform speech-to-text analysis on an audio file.
-
-        :param audio_file: Path to the audio file.
-        """
-        pass
-
-    def recording_thread(
-        self, input_device: Optional[str] = None, *args, **kwargs
-    ) -> None:
-        assert self._hotword_engine, 'The hotword engine has not yet been initialized'
-        super().recording_thread(
-            block_size=self._hotword_engine.frame_length, input_device=input_device
-        )
-
-    @action
-    def start_detection(self, *args, **kwargs) -> None:
-        from pvporcupine import Porcupine
-
-        self._hotword_engine = Porcupine(
-            library_path=self._library_path,
-            model_file_path=self._model_file_path,
-            keyword_file_paths=self._keyword_file_paths,
-            sensitivities=self._sensitivities,
-        )
-
-        self.rate = self._hotword_engine.sample_rate
-        super().start_detection(*args, **kwargs)
-
-
-# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/stt/picovoice/hotword/manifest.yaml b/platypush/plugins/stt/picovoice/hotword/manifest.yaml
deleted file mode 100644
index f8e9d210..00000000
--- a/platypush/plugins/stt/picovoice/hotword/manifest.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-manifest:
-  events: {}
-  install:
-    pip:
-      - pvporcupine
-  package: platypush.plugins.stt.picovoice.hotword
-  type: plugin
diff --git a/platypush/plugins/stt/picovoice/speech/__init__.py b/platypush/plugins/stt/picovoice/speech/__init__.py
deleted file mode 100644
index 4043ec53..00000000
--- a/platypush/plugins/stt/picovoice/speech/__init__.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import inspect
-import os
-import platform
-import struct
-import threading
-from typing import Optional
-
-from platypush.message.event.stt import SpeechStartedEvent
-
-from platypush.context import get_bus
-from platypush.message.response.stt import SpeechDetectedResponse
-from platypush.plugins import action
-from platypush.plugins.stt import SttPlugin
-
-
-class SttPicovoiceSpeechPlugin(SttPlugin):
-    """
-    This plugin performs speech detection using `PicoVoice <https://picovoice.ai/>`_.
-    NOTE: The PicoVoice product used for real-time speech-to-text (Cheetah) can be used freely for
-    personal applications on x86_64 Linux. Other architectures and operating systems require a commercial license.
-    You can ask for a license `here `_.
-    """
-
-    def __init__(
-        self,
-        library_path: Optional[str] = None,
-        acoustic_model_path: Optional[str] = None,
-        language_model_path: Optional[str] = None,
-        license_path: Optional[str] = None,
-        end_of_speech_timeout: int = 1,
-        *args,
-        **kwargs
-    ):
-        """
-        :param library_path: Path to the Cheetah binary library for your OS
-            (default: ``CHEETAH_INSTALL_DIR/lib/OS/ARCH/libpv_cheetah.EXT``).
-        :param acoustic_model_path: Path to the acoustic speech model
-            (default: ``CHEETAH_INSTALL_DIR/lib/common/acoustic_model.pv``).
-        :param language_model_path: Path to the language model
-            (default: ``CHEETAH_INSTALL_DIR/lib/common/language_model.pv``).
-        :param license_path: Path to your PicoVoice license
-            (default: ``CHEETAH_INSTALL_DIR/resources/license/cheetah_eval_linux_public.lic``).
-        :param end_of_speech_timeout: Number of seconds of silence during speech recognition before considering
-            a phrase over (default: 1).
-        """
-        from pvcheetah import Cheetah
-
-        super().__init__(*args, **kwargs)
-
-        self._basedir = os.path.abspath(
-            os.path.join(inspect.getfile(Cheetah), '..', '..', '..')
-        )
-        if not library_path:
-            library_path = self._get_library_path()
-        if not language_model_path:
-            language_model_path = os.path.join(
-                self._basedir, 'lib', 'common', 'language_model.pv'
-            )
-        if not acoustic_model_path:
-            acoustic_model_path = os.path.join(
-                self._basedir, 'lib', 'common', 'acoustic_model.pv'
-            )
-        if not license_path:
-            license_path = os.path.join(
-                self._basedir, 'resources', 'license', 'cheetah_eval_linux_public.lic'
-            )
-
-        self._library_path = library_path
-        self._language_model_path = language_model_path
-        self._acoustic_model_path = acoustic_model_path
-        self._license_path = license_path
-        self._end_of_speech_timeout = end_of_speech_timeout
-        self._stt_engine: Optional[Cheetah] = None
-        self._speech_in_progress = threading.Event()
-
-    def _get_library_path(self) -> str:
-        path = os.path.join(
-            self._basedir, 'lib', platform.system().lower(), platform.machine()
-        )
-        return os.path.join(
-            path, [f for f in os.listdir(path) if f.startswith('libpv_cheetah.')][0]
-        )
-
-    def convert_frames(self, frames: bytes) -> tuple:
-        assert self._stt_engine, 'The speech engine is not running'
-        return struct.unpack_from("h" * self._stt_engine.frame_length, frames)
-
-    def on_detection_ended(self) -> None:
-        if self._stt_engine:
-            self._stt_engine.delete()
-        self._stt_engine = None
-
-    def detect_speech(self, frames: tuple) -> str:
-        text, is_endpoint = self._stt_engine.process(frames)
-        text = text.strip()
-
-        if text:
-            if not self._speech_in_progress.is_set():
-                self._speech_in_progress.set()
-                get_bus().post(SpeechStartedEvent())
-
-            self._current_text += ' ' + text.strip()
-
-        if is_endpoint:
-            text = self._stt_engine.flush().strip().strip()
-            if text:
-                self._current_text += ' ' + text
-
-            self._speech_in_progress.clear()
-            if self._current_text:
-                self.on_speech_detected(self._current_text)
-
-            self._current_text = ''
-
-        return self._current_text
-
-    def process_text(self, text: str) -> None:
-        pass
-
-    @action
-    def detect(self, audio_file: str) -> SpeechDetectedResponse:
-        """
-        Perform speech-to-text analysis on an audio file.
-
-        :param audio_file: Path to the audio file.
-        """
-        pass
-
-    def recording_thread(
-        self, input_device: Optional[str] = None, *args, **kwargs
-    ) -> None:
-        assert self._stt_engine, 'The speech engine has not yet been initialized'
-        super().recording_thread(
-            block_size=self._stt_engine.frame_length, input_device=input_device
-        )
-
-    @action
-    def start_detection(self, *args, **kwargs) -> None:
-        from pvcheetah import Cheetah
-
-        self._stt_engine = Cheetah(
-            library_path=self._library_path,
-            acoustic_model_path=self._acoustic_model_path,
-            language_model_path=self._language_model_path,
-            license_path=self._license_path,
-            endpoint_duration_sec=self._end_of_speech_timeout,
-        )
-
-        self.rate = self._stt_engine.sample_rate
-        self._speech_in_progress.clear()
-        super().start_detection(*args, **kwargs)
-
-
-# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/stt/picovoice/speech/manifest.yaml b/platypush/plugins/stt/picovoice/speech/manifest.yaml
deleted file mode 100644
index 0e7a01a8..00000000
--- a/platypush/plugins/stt/picovoice/speech/manifest.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-manifest:
-  events: {}
-  install:
-    pip:
-      - cheetah
-  package: platypush.plugins.stt.picovoice.speech
-  type: plugin
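
For context, these backends were thin wrappers around the programmatic API used by the deleted ``SttBackend.run()`` loop above: resolve the configured STT plugin and drive its ``start_detection``/``stop_detection`` lifecycle. A minimal usage sketch, reconstructed from the removed code (the plugin name is only an example and assumes the plugin was configured):

from platypush.context import get_plugin

# Resolve a configured speech-to-text plugin by its registered name,
# e.g. the (now removed) PicoVoice hotword plugin.
plugin = get_plugin('stt.picovoice.hotword')

# SttPlugin was a context manager: __enter__() calls start_detection() and
# __exit__() calls stop_detection(), which is what SttBackend relied on.
with plugin:
    # Block until the detection thread terminates, as SttBackend.run() did
    # (note that _detection_thread is a private member of the plugin).
    plugin._detection_thread.join()

# Alternatively, detection could be run for a fixed amount of time instead of
# using the context manager:
# plugin.start_detection(seconds=10)  # stop_detection() is scheduled after 10 seconds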