From f7517eb321545b888d234b22ee2afb5b48492f54 Mon Sep 17 00:00:00 2001 From: Fabio Manganiello Date: Mon, 8 Apr 2024 01:54:26 +0200 Subject: [PATCH] [WIP] Added speech detection logic over Cheetah. --- docs/source/backends.rst | 2 - .../backend/stt.picovoice.hotword.rst | 5 - .../backend/stt.picovoice.speech.rst | 5 - docs/source/platypush/plugins/picovoice.rst | 5 + .../plugins/stt.picovoice.hotword.rst | 5 - .../plugins/stt.picovoice.speech.rst | 5 - docs/source/plugins.rst | 3 +- platypush/plugins/picovoice/__init__.py | 63 +++++- platypush/plugins/picovoice/_assistant.py | 198 ++++++++++++++---- platypush/plugins/picovoice/_context.py | 43 ++++ platypush/plugins/picovoice/_recorder.py | 3 +- 11 files changed, 263 insertions(+), 74 deletions(-) delete mode 100644 docs/source/platypush/backend/stt.picovoice.hotword.rst delete mode 100644 docs/source/platypush/backend/stt.picovoice.speech.rst create mode 100644 docs/source/platypush/plugins/picovoice.rst delete mode 100644 docs/source/platypush/plugins/stt.picovoice.hotword.rst delete mode 100644 docs/source/platypush/plugins/stt.picovoice.speech.rst create mode 100644 platypush/plugins/picovoice/_context.py diff --git a/docs/source/backends.rst b/docs/source/backends.rst index 2a43daeec..4171e8825 100644 --- a/docs/source/backends.rst +++ b/docs/source/backends.rst @@ -10,6 +10,4 @@ Backends platypush/backend/midi.rst platypush/backend/nodered.rst platypush/backend/redis.rst - platypush/backend/stt.picovoice.hotword.rst - platypush/backend/stt.picovoice.speech.rst platypush/backend/tcp.rst diff --git a/docs/source/platypush/backend/stt.picovoice.hotword.rst b/docs/source/platypush/backend/stt.picovoice.hotword.rst deleted file mode 100644 index 858386889..000000000 --- a/docs/source/platypush/backend/stt.picovoice.hotword.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.hotword`` -=========================================== - -.. 
automodule:: platypush.backend.stt.picovoice.hotword - :members: diff --git a/docs/source/platypush/backend/stt.picovoice.speech.rst b/docs/source/platypush/backend/stt.picovoice.speech.rst deleted file mode 100644 index 8b5809662..000000000 --- a/docs/source/platypush/backend/stt.picovoice.speech.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.speech`` -========================================== - -.. automodule:: platypush.backend.stt.picovoice.speech - :members: diff --git a/docs/source/platypush/plugins/picovoice.rst b/docs/source/platypush/plugins/picovoice.rst new file mode 100644 index 000000000..f1f8acded --- /dev/null +++ b/docs/source/platypush/plugins/picovoice.rst @@ -0,0 +1,5 @@ +``picovoice`` +============= + +.. automodule:: platypush.plugins.picovoice + :members: diff --git a/docs/source/platypush/plugins/stt.picovoice.hotword.rst b/docs/source/platypush/plugins/stt.picovoice.hotword.rst deleted file mode 100644 index 11eb37dd5..000000000 --- a/docs/source/platypush/plugins/stt.picovoice.hotword.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.hotword`` -=========================================== - -.. automodule:: platypush.plugins.stt.picovoice.hotword - :members: diff --git a/docs/source/platypush/plugins/stt.picovoice.speech.rst b/docs/source/platypush/plugins/stt.picovoice.speech.rst deleted file mode 100644 index 890c904cc..000000000 --- a/docs/source/platypush/plugins/stt.picovoice.speech.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.speech`` -========================================== - -.. 
automodule:: platypush.plugins.stt.picovoice.speech - :members: diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst index 5e583f5e5..783cb841e 100644 --- a/docs/source/plugins.rst +++ b/docs/source/plugins.rst @@ -95,6 +95,7 @@ Plugins platypush/plugins/nmap.rst platypush/plugins/ntfy.rst platypush/plugins/otp.rst + platypush/plugins/picovoice.rst platypush/plugins/pihole.rst platypush/plugins/ping.rst platypush/plugins/printer.cups.rst @@ -119,8 +120,6 @@ Plugins platypush/plugins/smartthings.rst platypush/plugins/sound.rst platypush/plugins/ssh.rst - platypush/plugins/stt.picovoice.hotword.rst - platypush/plugins/stt.picovoice.speech.rst platypush/plugins/sun.rst platypush/plugins/switch.tplink.rst platypush/plugins/switch.wemo.rst diff --git a/platypush/plugins/picovoice/__init__.py b/platypush/plugins/picovoice/__init__.py index a861f66bd..c1e55570f 100644 --- a/platypush/plugins/picovoice/__init__.py +++ b/platypush/plugins/picovoice/__init__.py @@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin): keywords: Optional[Sequence[str]] = None, keyword_paths: Optional[Sequence[str]] = None, keyword_model_path: Optional[str] = None, + speech_model_path: Optional[str] = None, + endpoint_duration: Optional[float] = 0.5, + enable_automatic_punctuation: bool = False, + start_conversation_on_hotword: bool = True, + audio_queue_size: int = 100, + conversation_timeout: Optional[float] = 5.0, **kwargs, ): """ :param access_key: Your Picovoice access key. You can get it by signing up at the `Picovoice console `. :param hotword_enabled: Enable the wake-word engine (default: True). - .. note:: The wake-word engine requires you to add Porcupine to the - products available in your Picovoice account. + **Note**: The wake-word engine requires you to add Porcupine to the + products available in your Picovoice account. :param stt_enabled: Enable the speech-to-text engine (default: True). - .. 
note:: The speech-to-text engine requires you to add Cheetah to - the products available in your Picovoice account. + **Note**: The speech-to-text engine requires you to add Cheetah to + the products available in your Picovoice account. :param intent_enabled: Enable the intent recognition engine (default: False). - .. note:: The intent recognition engine requires you to add Rhino - to the products available in your Picovoice account. + **Note**: The intent recognition engine requires you to add Rhino + to the products available in your Picovoice account. :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok - google``...). Either ``keywords`` or ``keyword_paths`` must be - provided if the wake-word engine is enabled. This list can include - any of the default Picovoice keywords (available on the `Picovoice - repository + google``...). This is required if the wake-word engine is enabled. + See the `Picovoice repository `_). + for a list of the stock keywords available. If you have a custom + model, you can pass its path to the ``keyword_paths`` parameter and + its filename (without the path and the platform extension) here. :param keyword_paths: List of paths to the keyword files to listen for. Custom keyword files can be created using the `Picovoice console `_ and downloaded from the @@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin): for its language. Model files are available for all the supported languages through the `Picovoice repository `_. + :param speech_model_path: Path to the speech model file. If you are + using a language other than English, you can provide the path to the + model file for that language. Model files are available for all the + supported languages through the `Picovoice repository + `_. + :param endpoint_duration: If set, the assistant will stop listening when + no speech is detected for the specified duration (in seconds) after + the end of an utterance. 
+ :param enable_automatic_punctuation: Enable automatic punctuation + insertion. + :param start_conversation_on_hotword: If set to True (default), a speech + detection session will be started when the hotword is detected. If + set to False, you may want to start the conversation programmatically + by calling the :meth:`.start_conversation` method instead, or run any + custom hotword detection logic. This can be particularly useful + when you want to run the assistant in a push-to-talk mode, or when you + want different hotwords to trigger conversations with different models + or languages. + :param audio_queue_size: Maximum number of audio frames to hold in the + processing queue. You may want to increase this value if you are + running this integration on a slow device and/or the logs report + audio frame drops too often. Keep in mind that increasing this value + will increase the memory usage of the integration. Also, a higher + value may result in higher accuracy at the cost of higher latency. + :param conversation_timeout: Maximum time to wait for some speech to be + detected after the hotword is detected. If no speech is detected + within this time, the conversation will time out and the plugin will + go back into hotword detection mode, if the mode is enabled. Default: + 5 seconds. 
""" super().__init__(**kwargs) self._assistant_args = { @@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin): 'keywords': keywords, 'keyword_paths': keyword_paths, 'keyword_model_path': keyword_model_path, + 'speech_model_path': speech_model_path, + 'endpoint_duration': endpoint_duration, + 'enable_automatic_punctuation': enable_automatic_punctuation, + 'start_conversation_on_hotword': start_conversation_on_hotword, + 'audio_queue_size': audio_queue_size, + 'conversation_timeout': conversation_timeout, } @action @@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin): try: for event in assistant: if event: + event.args['assistant'] = 'picovoice' get_bus().post(event) except KeyboardInterrupt: break diff --git a/platypush/plugins/picovoice/_assistant.py b/platypush/plugins/picovoice/_assistant.py index 5181aa572..27a727129 100644 --- a/platypush/plugins/picovoice/_assistant.py +++ b/platypush/plugins/picovoice/_assistant.py @@ -1,6 +1,6 @@ import logging import os -from threading import Event +from threading import Event, RLock from time import time from typing import Any, Dict, Optional, Sequence @@ -9,9 +9,18 @@ import pvleopard import pvporcupine import pvrhino -from platypush.message.event.assistant import HotwordDetectedEvent +from platypush.context import get_bus +from platypush.message.event.assistant import ( + ConversationStartEvent, + ConversationEndEvent, + ConversationTimeoutEvent, + HotwordDetectedEvent, + SpeechRecognizedEvent, +) +from ._context import SpeechDetectionContext from ._recorder import AudioRecorder +from ._state import AssistantState class Assistant: @@ -30,10 +39,16 @@ class Assistant: keyword_paths: Optional[Sequence[str]] = None, keyword_model_path: Optional[str] = None, frame_expiration: float = 3.0, # Don't process audio frames older than this + speech_model_path: Optional[str] = None, + endpoint_duration: Optional[float] = None, + enable_automatic_punctuation: bool = False, + 
start_conversation_on_hotword: bool = False, + audio_queue_size: int = 100, + conversation_timeout: Optional[float] = None, ): - self.logger = logging.getLogger(__name__) self._access_key = access_key self._stop_event = stop_event + self.logger = logging.getLogger(__name__) self.hotword_enabled = hotword_enabled self.stt_enabled = stt_enabled self.intent_enabled = intent_enabled @@ -41,9 +56,23 @@ class Assistant: self.keyword_paths = None self.keyword_model_path = None self.frame_expiration = frame_expiration + self.speech_model_path = speech_model_path + self.endpoint_duration = endpoint_duration + self.enable_automatic_punctuation = enable_automatic_punctuation + self.start_conversation_on_hotword = start_conversation_on_hotword + self.audio_queue_size = audio_queue_size + self._recorder = None + self._state = AssistantState.IDLE + self._state_lock = RLock() + self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout) if hotword_enabled: + if not keywords: + raise ValueError( + 'You need to provide a list of keywords if the wake-word engine is enabled' + ) + if keyword_paths: keyword_paths = [os.path.expanduser(path) for path in keyword_paths] missing_paths = [ @@ -74,46 +103,89 @@ class Assistant: def wait_stop(self): self._stop_event.wait() - def _create_porcupine(self): - if not self.hotword_enabled: - return None + @property + def state(self) -> AssistantState: + with self._state_lock: + return self._state - args: Dict[str, Any] = {'access_key': self._access_key} - if not (self.keywords or self.keyword_paths): - raise ValueError( - 'You need to provide either a list of keywords or a list of ' - 'keyword paths if the wake-word engine is enabled' - ) + @state.setter + def state(self, state: AssistantState): + with self._state_lock: + prev_state = self._state + self._state = state + new_state = self.state - if self.keywords: - args['keywords'] = self.keywords - if self.keyword_paths: - args['keyword_paths'] = self.keyword_paths - if 
self.keyword_model_path: - args['model_path'] = self.keyword_model_path + if prev_state == new_state: + return - return pvporcupine.create(**args) + if prev_state == AssistantState.DETECTING_SPEECH: + self._speech_ctx.stop() + self._post_event(ConversationEndEvent()) + elif new_state == AssistantState.DETECTING_SPEECH: + self._speech_ctx.start() + self._post_event(ConversationStartEvent()) @property def porcupine(self) -> Optional[pvporcupine.Porcupine]: + if not self.hotword_enabled: + return None + if not self._porcupine: - self._porcupine = self._create_porcupine() + args: Dict[str, Any] = {'access_key': self._access_key} + if self.keywords: + args['keywords'] = self.keywords + if self.keyword_paths: + args['keyword_paths'] = self.keyword_paths + if self.keyword_model_path: + args['model_path'] = self.keyword_model_path + + self._porcupine = pvporcupine.create(**args) return self._porcupine + @property + def cheetah(self) -> Optional[pvcheetah.Cheetah]: + if not self.stt_enabled: + return None + + if not self._cheetah: + args: Dict[str, Any] = {'access_key': self._access_key} + if self.speech_model_path: + args['model_path'] = self.speech_model_path + if self.endpoint_duration: + args['endpoint_duration_sec'] = self.endpoint_duration + if self.enable_automatic_punctuation: + args['enable_automatic_punctuation'] = self.enable_automatic_punctuation + + self._cheetah = pvcheetah.create(**args) + + return self._cheetah + def __enter__(self): + if self.should_stop(): + return self + if self._recorder: self.logger.info('A recording stream already exists') - elif self.porcupine: + elif self.porcupine or self.cheetah: + sample_rate = (self.porcupine or self.cheetah).sample_rate # type: ignore + frame_length = (self.porcupine or self.cheetah).frame_length # type: ignore + self._recorder = AudioRecorder( stop_event=self._stop_event, - sample_rate=self.porcupine.sample_rate, - frame_size=self.porcupine.frame_length, + sample_rate=sample_rate, + frame_size=frame_length, + 
queue_size=self.audio_queue_size, channels=1, ) self._recorder.__enter__() + if self.porcupine: + self.state = AssistantState.DETECTING_HOTWORD + else: + self.state = AssistantState.DETECTING_SPEECH + return self def __exit__(self, *_): @@ -121,6 +193,8 @@ class Assistant: self._recorder.__exit__(*_) self._recorder = None + self.state = AssistantState.IDLE + if self._cheetah: self._cheetah.delete() self._cheetah = None @@ -146,26 +220,74 @@ class Assistant: raise StopIteration while not (self.should_stop() or has_data): - if self.porcupine: # TODO also check current state - data = self._recorder.read() - if data is None: - continue + data = self._recorder.read() + if data is None: + continue - frame, t = data - if time() - t > self.frame_expiration: - self.logger.info( - 'Skipping audio frame older than %ss', self.frame_expiration - ) - continue # The audio frame is too old + frame, t = data + if time() - t > self.frame_expiration: + self.logger.info( + 'Skipping audio frame older than %ss', self.frame_expiration + ) + continue # The audio frame is too old - keyword_index = self.porcupine.process(frame) - if keyword_index is None: - continue # No keyword detected + if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD: + return self._process_hotword(frame) - if keyword_index >= 0 and self.keywords: - return HotwordDetectedEvent(hotword=self.keywords[keyword_index]) + if self.cheetah and self.state == AssistantState.DETECTING_SPEECH: + return self._process_speech(frame) raise StopIteration + def _post_event(self, event): + if event: + event.args['assistant'] = 'picovoice' + get_bus().post(event) + + def _process_hotword(self, frame): + if not self.porcupine: + return None + + keyword_index = self.porcupine.process(frame) + if keyword_index is None: + return None # No keyword detected + + if keyword_index >= 0 and self.keywords: + if self.start_conversation_on_hotword: + self.state = AssistantState.DETECTING_SPEECH + + return 
HotwordDetectedEvent(hotword=self.keywords[keyword_index]) + + return None + + def _process_speech(self, frame): + if not self.cheetah: + return None + + event = None + ( + self._speech_ctx.partial_transcript, + self._speech_ctx.is_final, + ) = self.cheetah.process(frame) + + if self._speech_ctx.partial_transcript: + self.logger.info( + 'Partial transcript: %s, is_final: %s', + self._speech_ctx.partial_transcript, + self._speech_ctx.is_final, + ) + + if self._speech_ctx.is_final or self._speech_ctx.timed_out: + event = ( + ConversationTimeoutEvent() + if self._speech_ctx.timed_out + else SpeechRecognizedEvent(phrase=self.cheetah.flush()) + ) + + if self.porcupine: + self.state = AssistantState.DETECTING_HOTWORD + + return event + # vim:sw=4:ts=4:et: diff --git a/platypush/plugins/picovoice/_context.py b/platypush/plugins/picovoice/_context.py new file mode 100644 index 000000000..cb7546105 --- /dev/null +++ b/platypush/plugins/picovoice/_context.py @@ -0,0 +1,43 @@ +from dataclasses import dataclass +from time import time +from typing import Optional + + +@dataclass +class SpeechDetectionContext: + """ + Context of the speech detection process. 
+ """ + + partial_transcript: str = '' + is_final: bool = False + timeout: Optional[float] = None + t_start: Optional[float] = None + t_end: Optional[float] = None + + def start(self): + self.reset() + self.t_start = time() + + def stop(self): + self.reset() + self.t_end = time() + + def reset(self): + self.partial_transcript = '' + self.is_final = False + self.t_start = None + self.t_end = None + + @property + def timed_out(self): + return ( + not self.partial_transcript + and not self.is_final + and self.timeout + and self.t_start + and time() - self.t_start > self.timeout + ) + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/picovoice/_recorder.py b/platypush/plugins/picovoice/_recorder.py index 9df81e7c9..e0c23a8e9 100644 --- a/platypush/plugins/picovoice/_recorder.py +++ b/platypush/plugins/picovoice/_recorder.py @@ -26,7 +26,7 @@ class AudioRecorder: frame_size: int, channels: int, dtype: str = 'int16', - queue_size: int = 20, + queue_size: int = 100, ): self.logger = getLogger(__name__) self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size) @@ -48,7 +48,6 @@ class AudioRecorder: def __exit__(self, *_): self.stop() - # self.stream.close() def _audio_callback(self, indata, *_): if self.should_stop():