diff --git a/docs/source/backends.rst b/docs/source/backends.rst index 2a43daee..4171e882 100644 --- a/docs/source/backends.rst +++ b/docs/source/backends.rst @@ -10,6 +10,4 @@ Backends platypush/backend/midi.rst platypush/backend/nodered.rst platypush/backend/redis.rst - platypush/backend/stt.picovoice.hotword.rst - platypush/backend/stt.picovoice.speech.rst platypush/backend/tcp.rst diff --git a/docs/source/platypush/backend/stt.picovoice.hotword.rst b/docs/source/platypush/backend/stt.picovoice.hotword.rst deleted file mode 100644 index 85838688..00000000 --- a/docs/source/platypush/backend/stt.picovoice.hotword.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.hotword`` -=========================================== - -.. automodule:: platypush.backend.stt.picovoice.hotword - :members: diff --git a/docs/source/platypush/backend/stt.picovoice.speech.rst b/docs/source/platypush/backend/stt.picovoice.speech.rst deleted file mode 100644 index 8b580966..00000000 --- a/docs/source/platypush/backend/stt.picovoice.speech.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.speech`` -========================================== - -.. automodule:: platypush.backend.stt.picovoice.speech - :members: diff --git a/docs/source/platypush/plugins/assistant.picovoice.rst b/docs/source/platypush/plugins/assistant.picovoice.rst new file mode 100644 index 00000000..33f39e98 --- /dev/null +++ b/docs/source/platypush/plugins/assistant.picovoice.rst @@ -0,0 +1,5 @@ +``assistant.picovoice`` +======================= + +.. automodule:: platypush.plugins.assistant.picovoice + :members: diff --git a/docs/source/platypush/plugins/openai.rst b/docs/source/platypush/plugins/openai.rst new file mode 100644 index 00000000..0407965d --- /dev/null +++ b/docs/source/platypush/plugins/openai.rst @@ -0,0 +1,5 @@ +``openai`` +========== + +.. automodule:: platypush.plugins.openai + :members: diff --git a/docs/source/platypush/plugins/stt.picovoice.hotword.rst b/docs/source/platypush/plugins/stt.picovoice.hotword.rst deleted file mode 100644 index 11eb37dd..00000000 --- a/docs/source/platypush/plugins/stt.picovoice.hotword.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.hotword`` -=========================================== - -.. automodule:: platypush.plugins.stt.picovoice.hotword - :members: diff --git a/docs/source/platypush/plugins/stt.picovoice.speech.rst b/docs/source/platypush/plugins/stt.picovoice.speech.rst deleted file mode 100644 index 890c904c..00000000 --- a/docs/source/platypush/plugins/stt.picovoice.speech.rst +++ /dev/null @@ -1,5 +0,0 @@ -``stt.picovoice.speech`` -========================================== - -.. automodule:: platypush.plugins.stt.picovoice.speech - :members: diff --git a/docs/source/platypush/plugins/tts.picovoice.rst b/docs/source/platypush/plugins/tts.picovoice.rst new file mode 100644 index 00000000..afc4def6 --- /dev/null +++ b/docs/source/platypush/plugins/tts.picovoice.rst @@ -0,0 +1,5 @@ +``tts.picovoice`` +================= + +.. 
automodule:: platypush.plugins.tts.picovoice + :members: diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst index 5e583f5e..3aa3a51e 100644 --- a/docs/source/plugins.rst +++ b/docs/source/plugins.rst @@ -11,6 +11,7 @@ Plugins platypush/plugins/application.rst platypush/plugins/arduino.rst platypush/plugins/assistant.google.rst + platypush/plugins/assistant.picovoice.rst platypush/plugins/autoremote.rst platypush/plugins/bluetooth.rst platypush/plugins/calendar.rst @@ -94,6 +95,7 @@ Plugins platypush/plugins/ngrok.rst platypush/plugins/nmap.rst platypush/plugins/ntfy.rst + platypush/plugins/openai.rst platypush/plugins/otp.rst platypush/plugins/pihole.rst platypush/plugins/ping.rst @@ -119,8 +121,6 @@ Plugins platypush/plugins/smartthings.rst platypush/plugins/sound.rst platypush/plugins/ssh.rst - platypush/plugins/stt.picovoice.hotword.rst - platypush/plugins/stt.picovoice.speech.rst platypush/plugins/sun.rst platypush/plugins/switch.tplink.rst platypush/plugins/switch.wemo.rst @@ -135,6 +135,7 @@ Plugins platypush/plugins/tts.rst platypush/plugins/tts.google.rst platypush/plugins/tts.mimic3.rst + platypush/plugins/tts.picovoice.rst platypush/plugins/tv.samsung.ws.rst platypush/plugins/twilio.rst platypush/plugins/udp.rst diff --git a/platypush/__init__.py b/platypush/__init__.py index 26fcfb19..c962a5d1 100644 --- a/platypush/__init__.py +++ b/platypush/__init__.py @@ -7,10 +7,13 @@ Platypush from .app import Application from .config import Config -from .context import get_backend, get_bus, get_plugin +from .context import Variable, get_backend, get_bus, get_plugin +from .cron import cron +from .event.hook import hook from .message.event import Event from .message.request import Request from .message.response import Response +from .procedure import procedure from .runner import main from .utils import run @@ -19,14 +22,18 @@ __author__ = 'Fabio Manganiello ' __version__ = '0.50.3' __all__ = [ 'Application', + 'Variable', 'Config', 'Event', 'Request', 'Response', + 'cron', 'get_backend', 'get_bus', 'get_plugin', + 'hook', 'main', + 'procedure', 'run', ] diff --git a/platypush/backend/stt/__init__.py b/platypush/backend/stt/__init__.py deleted file mode 100644 index 624c2b72..00000000 --- a/platypush/backend/stt/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -import time - -from platypush.backend import Backend -from platypush.context import get_plugin -from platypush.plugins.stt import SttPlugin - - -class SttBackend(Backend): - """ - Base class for speech-to-text backends. - """ - - def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs): - """ - :param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of - :class:`platypush.plugins.stt.SttPlugin`. - :param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin - (default: 5 seconds). 
- """ - super().__init__(*args, **kwargs) - self.plugin_name = plugin_name - self.retry_sleep = retry_sleep - - def run(self): - super().run() - self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__)) - - while not self.should_stop(): - try: - plugin: SttPlugin = get_plugin(self.plugin_name) - with plugin: - # noinspection PyProtectedMember - plugin._detection_thread.join() - except Exception as e: - self.logger.exception(e) - self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep)) - time.sleep(self.retry_sleep) - - -# vim:sw=4:ts=4:et: diff --git a/platypush/backend/stt/picovoice/__init__.py b/platypush/backend/stt/picovoice/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/platypush/backend/stt/picovoice/hotword/__init__.py b/platypush/backend/stt/picovoice/hotword/__init__.py deleted file mode 100644 index 9dc6ae63..00000000 --- a/platypush/backend/stt/picovoice/hotword/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from platypush.backend.stt import SttBackend - - -class SttPicovoiceHotwordBackend(SttBackend): - """ - Backend for the PicoVoice hotword detection plugin. Set this plugin to ``enabled`` if you - want to run the hotword engine continuously instead of programmatically using - ``start_detection`` and ``stop_detection``. - - Requires: - - - The :class:`platypush.plugins.stt.deepspeech.SttPicovoiceHotwordPlugin` plugin configured and its dependencies - installed. - - """ - - def __init__(self, *args, **kwargs): - super().__init__('stt.picovoice.hotword', *args, **kwargs) - - -# vim:sw=4:ts=4:et: diff --git a/platypush/backend/stt/picovoice/hotword/manifest.yaml b/platypush/backend/stt/picovoice/hotword/manifest.yaml deleted file mode 100644 index 0527afca..00000000 --- a/platypush/backend/stt/picovoice/hotword/manifest.yaml +++ /dev/null @@ -1,6 +0,0 @@ -manifest: - events: {} - install: - pip: [] - package: platypush.backend.stt.picovoice.hotword - type: backend diff --git a/platypush/backend/stt/picovoice/speech/__init__.py b/platypush/backend/stt/picovoice/speech/__init__.py deleted file mode 100644 index 28a4b0b1..00000000 --- a/platypush/backend/stt/picovoice/speech/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from platypush.backend.stt import SttBackend - - -class SttPicovoiceSpeechBackend(SttBackend): - """ - Backend for the PicoVoice speech detection plugin. Set this plugin to ``enabled`` if you - want to run the speech engine continuously instead of programmatically using - ``start_detection`` and ``stop_detection``. - - Requires: - - - The :class:`platypush.plugins.stt.deepspeech.SttPicovoiceSpeechPlugin` plugin configured and its dependencies - installed. 
- - """ - - def __init__(self, *args, **kwargs): - super().__init__('stt.picovoice.speech', *args, **kwargs) - - -# vim:sw=4:ts=4:et: diff --git a/platypush/backend/stt/picovoice/speech/manifest.yaml b/platypush/backend/stt/picovoice/speech/manifest.yaml deleted file mode 100644 index fc68a467..00000000 --- a/platypush/backend/stt/picovoice/speech/manifest.yaml +++ /dev/null @@ -1,6 +0,0 @@ -manifest: - events: {} - install: - pip: [] - package: platypush.backend.stt.picovoice.speech - type: backend diff --git a/platypush/commands/_stream.py b/platypush/commands/_stream.py index 3e480c11..47f59547 100644 --- a/platypush/commands/_stream.py +++ b/platypush/commands/_stream.py @@ -1,4 +1,4 @@ -from multiprocessing import RLock, Queue +from multiprocessing import RLock, Queue, active_children import os from queue import Empty import socket @@ -57,6 +57,25 @@ class CommandStream(ControllableProcess): self.reset() return super().close() + def _term_or_kill(self, kill: bool = False, visited=None) -> None: + func = 'kill' if kill else 'terminate' + visited = visited or set() + visited.add(id(self)) + + for child in active_children(): + child_id = id(child) + if child_id not in visited: + visited.add(child_id) + getattr(child, func)() + + getattr(super(), func)() + + def terminate(self, visited=None) -> None: + self._term_or_kill(kill=False, visited=visited) + + def kill(self) -> None: + self._term_or_kill(kill=True) + def __enter__(self) -> "CommandStream": self.reset() sock = self._sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) diff --git a/platypush/event/processor/__init__.py b/platypush/event/processor/__init__.py index 24f97074..8df62f61 100644 --- a/platypush/event/processor/__init__.py +++ b/platypush/event/processor/__init__.py @@ -1,4 +1,5 @@ import sys +from typing import Callable, List, Union from ..hook import EventHook @@ -20,10 +21,23 @@ class EventProcessor: if hooks is None: hooks = Config.get_event_hooks() - self.hooks = [] + self._hooks_by_name = {} + self._hooks_by_value_id = {} for name, hook in hooks.items(): - h = EventHook.build(name=name, hook=hook) - self.hooks.append(h) + self.add_hook(name, hook) + + @property + def hooks(self) -> List[EventHook]: + return list(self._hooks_by_name.values()) + + def add_hook(self, name: str, desc: Union[dict, Callable]): + hook_id = id(desc) + if hook_id in self._hooks_by_value_id: + return # Don't add the same hook twice + + hook = EventHook.build(name=name, hook=desc) + self._hooks_by_name[name] = hook + self._hooks_by_value_id[hook_id] = hook @staticmethod def notify_web_clients(event): diff --git a/platypush/message/event/__init__.py b/platypush/message/event/__init__.py index f6bcf3be..465b981a 100644 --- a/platypush/message/event/__init__.py +++ b/platypush/message/event/__init__.py @@ -257,26 +257,29 @@ class Event(Message): return result + def as_dict(self): + """ + Converts the event into a dictionary + """ + args = copy.deepcopy(self.args) + flatten(args) + return { + 'type': 'event', + 'target': self.target, + 'origin': self.origin if hasattr(self, 'origin') else None, + 'id': self.id if hasattr(self, 'id') else None, + '_timestamp': self.timestamp, + 'args': {'type': self.type, **args}, + } + def __str__(self): """ Overrides the str() operator and converts the message into a UTF-8 JSON string """ - args = copy.deepcopy(self.args) flatten(args) - - return json.dumps( - { - 'type': 'event', - 'target': self.target, - 'origin': self.origin if hasattr(self, 'origin') else None, - 'id': self.id if hasattr(self, 'id') 
else None, - '_timestamp': self.timestamp, - 'args': {'type': self.type, **args}, - }, - cls=self.Encoder, - ) + return json.dumps(self.as_dict(), cls=self.Encoder) @dataclass diff --git a/platypush/message/event/assistant/__init__.py b/platypush/message/event/assistant/__init__.py index 364eace3..942e2c1e 100644 --- a/platypush/message/event/assistant/__init__.py +++ b/platypush/message/event/assistant/__init__.py @@ -1,27 +1,53 @@ import re import sys -from typing import Optional +from typing import Any, Mapping, Optional, Union from platypush.context import get_plugin -from platypush.message.event import Event +from platypush.message.event import Event, EventMatchResult +from platypush.plugins.assistant import AssistantPlugin +from platypush.utils import get_plugin_name_by_class class AssistantEvent(Event): """Base class for assistant events""" - def __init__(self, *args, assistant: Optional[str] = None, **kwargs): + def __init__( + self, *args, assistant: Optional[Union[str, AssistantPlugin]] = None, **kwargs + ): """ :param assistant: Name of the assistant plugin that triggered the event. """ - super().__init__(*args, assistant=assistant, **kwargs) + assistant = assistant or kwargs.get('assistant') + if assistant: + assistant = ( + assistant + if isinstance(assistant, str) + else get_plugin_name_by_class(assistant.__class__) + ) + + kwargs['_assistant'] = assistant + + super().__init__(*args, **kwargs) @property - def _assistant(self): - return ( - get_plugin(self.args.get('assistant')) - if self.args.get('assistant') - else None - ) + def assistant(self) -> Optional[AssistantPlugin]: + assistant = self.args.get('_assistant') + if not assistant: + return None + + return get_plugin(assistant) + + def as_dict(self): + evt_dict = super().as_dict() + evt_args = {**evt_dict['args']} + assistant = evt_args.pop('_assistant', None) + if assistant: + evt_args['assistant'] = assistant + + return { + **evt_dict, + 'args': evt_args, + } class ConversationStartEvent(AssistantEvent): @@ -43,7 +69,6 @@ class ConversationEndEvent(AssistantEvent): :param with_follow_on_turn: Set to true if the conversation expects a user follow-up, false otherwise """ - super().__init__(*args, with_follow_on_turn=with_follow_on_turn, **kwargs) @@ -56,17 +81,46 @@ class ConversationTimeoutEvent(ConversationEndEvent): super().__init__(*args, **kwargs) -class ResponseEvent(ConversationEndEvent): +class ResponseEvent(AssistantEvent): """ Event triggered when a response is processed by the assistant """ - def __init__(self, *args, response_text: str, **kwargs): + def __init__( + self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs + ): """ :param response_text: Response text processed by the assistant + :param with_follow_on_turn: Set to true if the conversation expects a + user follow-up, false otherwise """ + super().__init__( + *args, + response_text=response_text, + with_follow_on_turn=with_follow_on_turn, + **kwargs, + ) - super().__init__(*args, response_text=response_text, **kwargs) + +class ResponseEndEvent(ConversationEndEvent): + """ + Event triggered when a response has been rendered on the assistant. + """ + + def __init__( + self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs + ): + """ + :param response_text: Response text rendered on the assistant. + :param with_follow_on_turn: Set to true if the conversation expects a + user follow-up, false otherwise. 
+ """ + super().__init__( + *args, + response_text=response_text, + with_follow_on_turn=with_follow_on_turn, + **kwargs, + ) class NoResponseEvent(ConversationEndEvent): @@ -95,8 +149,8 @@ class SpeechRecognizedEvent(AssistantEvent): """ result = super().matches_condition(condition) - if result.is_match and self._assistant and 'phrase' in condition.args: - self._assistant.stop_conversation() + if result.is_match and self.assistant and 'phrase' in condition.args: + self.assistant.stop_conversation() return result @@ -125,6 +179,7 @@ class SpeechRecognizedEvent(AssistantEvent): result.score = sys.maxsize return result + parsed_args = {} event_tokens = re.split(r'\s+', event_args.get(argname, '').strip().lower()) condition_tokens = re.split(r'\s+', condition_value.strip().lower()) @@ -147,11 +202,11 @@ class SpeechRecognizedEvent(AssistantEvent): m = re.match(r'[^\\]*\${(.+?)}', condition_token) if m: argname = m.group(1) - if argname not in result.parsed_args: - result.parsed_args[argname] = event_token + if argname not in parsed_args: + parsed_args[argname] = event_token result.score += 1.0 else: - result.parsed_args[argname] += ' ' + event_token + parsed_args[argname] += ' ' + event_token if (len(condition_tokens) == 1 and len(event_tokens) == 1) or ( len(event_tokens) > 1 @@ -169,6 +224,45 @@ class SpeechRecognizedEvent(AssistantEvent): # It's a match if all the tokens in the condition string have been satisfied result.is_match = len(condition_tokens) == 0 + if result.is_match: + result.parsed_args = parsed_args + + return result + + +class IntentRecognizedEvent(AssistantEvent): + """ + Event triggered when an intent is matched by a speech command. + """ + + def __init__( + self, *args, intent: str, slots: Optional[Mapping[str, Any]] = None, **kwargs + ): + """ + :param intent: The intent that has been matched. + :param slots: The slots extracted from the intent, as a key-value mapping. + """ + super().__init__(*args, intent=intent, slots=slots or {}, **kwargs) + + def _matches_argument( + self, argname, condition_value, event_args, result: EventMatchResult + ): + if argname != 'slots': + return super()._matches_argument( + argname, condition_value, event_args, result + ) + + event_slots = set(event_args.get(argname, {}).items()) + slots = set(self.args.get(argname, {}).items()) + + # All the slots in the condition must be in the event + if slots.difference(event_slots) == 0: + result.is_match = True + result.score += 1 + else: + result.is_match = False + result.score = 0 + return result diff --git a/platypush/plugins/__init__.py b/platypush/plugins/__init__.py index 4692aed8..a1ed1eb0 100644 --- a/platypush/plugins/__init__.py +++ b/platypush/plugins/__init__.py @@ -122,6 +122,12 @@ class Plugin(EventGenerator, ExtensionWithManifest): # lgtm [py/missing-call-to assert entities, 'entities plugin not initialized' return entities + def __str__(self): + """ + :return: The qualified name of the plugin. 
+ """ + return get_plugin_name_by_class(self.__class__) + def run(self, method, *args, **kwargs): assert ( method in self.registered_actions diff --git a/platypush/plugins/assistant/__init__.py b/platypush/plugins/assistant/__init__.py index bb5b25e4..22ddb622 100644 --- a/platypush/plugins/assistant/__init__.py +++ b/platypush/plugins/assistant/__init__.py @@ -8,23 +8,7 @@ from typing import Any, Collection, Dict, Optional, Type from platypush.context import get_bus, get_plugin from platypush.entities.assistants import Assistant from platypush.entities.managers.assistants import AssistantEntityManager -from platypush.message.event.assistant import ( - AssistantEvent, - ConversationStartEvent, - ConversationEndEvent, - ConversationTimeoutEvent, - ResponseEvent, - NoResponseEvent, - SpeechRecognizedEvent, - AlarmStartedEvent, - AlarmEndEvent, - TimerStartedEvent, - TimerEndEvent, - AlertStartedEvent, - AlertEndEvent, - MicMutedEvent, - MicUnmutedEvent, -) +from platypush.message.event import Event as AppEvent from platypush.plugins import Plugin, action from platypush.utils import get_plugin_name_by_class @@ -181,6 +165,17 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC): self.publish_entities([self]) return asdict(self._state) + @action + def render_response(self, text: str, *_, **__): + """ + Render a response text as audio over the configured TTS plugin. + + :param text: Text to render. + """ + self._on_response_render_start(text) + self._render_response(text) + self._on_response_render_end() + def _get_tts_plugin(self): if not self.tts_plugin: return None @@ -200,11 +195,13 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC): audio.play(self._conversation_start_sound) - def _send_event(self, event_type: Type[AssistantEvent], **kwargs): + def _send_event(self, event_type: Type[AppEvent], **kwargs): self.publish_entities([self]) get_bus().post(event_type(assistant=self._plugin_name, **kwargs)) def _on_conversation_start(self): + from platypush.message.event.assistant import ConversationStartEvent + self._last_response = None self._last_query = None self._conversation_running.set() @@ -212,63 +209,105 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC): self._play_conversation_start_sound() def _on_conversation_end(self): + from platypush.message.event.assistant import ConversationEndEvent + self._conversation_running.clear() self._send_event(ConversationEndEvent) def _on_conversation_timeout(self): + from platypush.message.event.assistant import ConversationTimeoutEvent + self._last_response = None self._last_query = None self._conversation_running.clear() self._send_event(ConversationTimeoutEvent) def _on_no_response(self): + from platypush.message.event.assistant import NoResponseEvent + self._last_response = None self._conversation_running.clear() self._send_event(NoResponseEvent) - def _on_reponse_rendered(self, text: Optional[str]): + def _on_response_render_start(self, text: Optional[str]): + from platypush.message.event.assistant import ResponseEvent + self._last_response = text self._send_event(ResponseEvent, response_text=text) - tts = self._get_tts_plugin() + def _render_response(self, text: Optional[str]): + tts = self._get_tts_plugin() if tts and text: self.stop_conversation() tts.say(text=text, **self.tts_plugin_args) + def _on_response_render_end(self): + from platypush.message.event.assistant import ResponseEndEvent + + self._send_event(ResponseEndEvent, response_text=self._last_response) + + def _on_hotword_detected(self, hotword: 
Optional[str]): + from platypush.message.event.assistant import HotwordDetectedEvent + + self._send_event(HotwordDetectedEvent, hotword=hotword) + def _on_speech_recognized(self, phrase: Optional[str]): + from platypush.message.event.assistant import SpeechRecognizedEvent + phrase = (phrase or '').lower().strip() self._last_query = phrase self._send_event(SpeechRecognizedEvent, phrase=phrase) + def _on_intent_matched(self, intent: str, slots: Optional[Dict[str, Any]] = None): + from platypush.message.event.assistant import IntentRecognizedEvent + + self._send_event(IntentRecognizedEvent, intent=intent, slots=slots) + def _on_alarm_start(self): + from platypush.message.event.assistant import AlarmStartedEvent + self._cur_alert_type = AlertType.ALARM self._send_event(AlarmStartedEvent) def _on_alarm_end(self): + from platypush.message.event.assistant import AlarmEndEvent + self._cur_alert_type = None self._send_event(AlarmEndEvent) def _on_timer_start(self): + from platypush.message.event.assistant import TimerStartedEvent + self._cur_alert_type = AlertType.TIMER self._send_event(TimerStartedEvent) def _on_timer_end(self): + from platypush.message.event.assistant import TimerEndEvent + self._cur_alert_type = None self._send_event(TimerEndEvent) def _on_alert_start(self): + from platypush.message.event.assistant import AlertStartedEvent + self._cur_alert_type = AlertType.ALERT self._send_event(AlertStartedEvent) def _on_alert_end(self): + from platypush.message.event.assistant import AlertEndEvent + self._cur_alert_type = None self._send_event(AlertEndEvent) def _on_mute(self): + from platypush.message.event.assistant import MicMutedEvent + self._is_muted = True self._send_event(MicMutedEvent) def _on_unmute(self): + from platypush.message.event.assistant import MicUnmutedEvent + self._is_muted = False self._send_event(MicUnmutedEvent) diff --git a/platypush/plugins/assistant/picovoice/__init__.py b/platypush/plugins/assistant/picovoice/__init__.py new file mode 100644 index 00000000..3e36e35c --- /dev/null +++ b/platypush/plugins/assistant/picovoice/__init__.py @@ -0,0 +1,740 @@ +import os +from typing import Optional, Sequence + +from platypush.context import get_plugin +from platypush.plugins import RunnablePlugin, action +from platypush.plugins.assistant import AssistantPlugin +from platypush.plugins.tts.picovoice import TtsPicovoicePlugin + +from ._assistant import Assistant +from ._state import AssistantState + + +# pylint: disable=too-many-ancestors +class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin): + r""" + A voice assistant that runs on your device, based on the `Picovoice + `_ engine. + + Picovoice is a suite of on-device voice technologies that include: + + * **Porcupine**: wake-word engine, if you want the device to listen for + a specific wake word in order to start the assistant. + + * **Cheetah**: speech-to-text engine, if you want your voice + interactions to be transcribed into free text - either + programmatically or when triggered by the wake word. Or: + + * **Rhino**: intent recognition engine, if you want to extract *intents* + out of your voice commands - for instance, the phrase "set the living + room temperature to 20 degrees" could be mapped to the intent with the + following parameters: ``intent``: ``set_temperature``, ``room``: + ``living_room``, ``temperature``: ``20``. + + * **Leopard**: speech-to-text engine aimed at offline transcription of + audio files rather than real-time transcription. 
+ + * **Orca**: text-to-speech engine, if you want to create your custom + logic to respond to the user's voice commands and render the responses as + audio. + + This plugin is a wrapper around the Picovoice engine that allows you to + run your custom voice-based conversational flows on your device. + + Getting a Picovoice account and access key + ------------------------------------------- + + You can get your personal access key by signing up at the `Picovoice + console `_. You may be asked to submit a + reason for using the service (feel free to mention a personal Platypush + integration), and you will receive your personal access key. + + If prompted to select the products you want to use, make sure to select + the ones from the Picovoice suite that you want to use with this plugin. + + + Hotword detection + ----------------- + + The hotword detection engine is based on `Porcupine + `_. + + If enabled through the ``hotword_enabled`` parameter (default: True), the + assistant will listen for a specific wake word before starting the + speech-to-text or intent recognition engines. You can specify custom models + for your hotword (e.g. on the same device you may use "Alexa" to trigger the + speech-to-text engine in English, "Computer" to trigger the speech-to-text + engine in Italian, and "Ok Google" to trigger the intent recognition engine). + + You can also create your custom hotword models using the `Porcupine console + `_. + + If ``hotword_enabled`` is set to True, you must also specify the + ``keywords`` parameter with the list of keywords that you want to listen + for, and optionally the ``keyword_paths`` parameter with the paths to + any custom hotword models that you want to use. If ``hotword_enabled`` is + set to False, then the assistant won't start listening for speech after the + plugin is started, and you will need to programmatically start the + conversation by calling the :meth:`.start_conversation` action, or trigger + it from the UI. + + When a wake-word is detected, the assistant will emit a + :class:`platypush.message.event.assistant.HotwordDetectedEvent` event that + you can use to build your custom logic. For example: + + .. code-block:: python + + import time + + from platypush import hook, run + from platypush.message.event.assistant import HotwordDetectedEvent + + # Turn on a light for 5 seconds when the hotword "Alexa" is detected + @hook(HotwordDetectedEvent, hotword='Alexa') + def on_hotword_detected(event: HotwordDetectedEvent, **context): + run("light.hue.on", lights=["Living Room"]) + time.sleep(5) + run("light.hue.off", lights=["Living Room"]) + + By default, the assistant will start listening for speech after the hotword + if either ``stt_enabled`` or ``intent_model_path`` are set. If you don't + want the assistant to start listening for speech after the hotword is + detected (for example because you want to build your custom response flows, + or trigger the speech detection using different models depending on the + hotword that is used, or because you just want to detect hotwords but not + speech), then you can also set the ``start_conversation_on_hotword`` + parameter to ``False``. If that is the case, then you can programmatically + start the conversation by calling the :meth:`.start_conversation` method in + your event hooks: + + ..
code-block:: python + + from platypush import hook, run + from platypush.message.event.assistant import HotwordDetectedEvent + + # Start a conversation using the Italian language model when the + # "Buongiorno" hotword is detected + @hook(HotwordDetectedEvent, hotword='Buongiorno') + def on_it_hotword_detected(event: HotwordDetectedEvent, **context): + event.assistant.start_conversation(model_file='path/to/it.pv') + + Speech-to-text + -------------- + + The speech-to-text engine is based on `Cheetah + `_. + + If enabled through the ``stt_enabled`` parameter (default: True), the + assistant will transcribe the voice commands into text when a conversation + is started either programmatically through :meth:`.start_conversation` or + when the hotword is detected. + + It will emit a + :class:`platypush.message.event.assistant.SpeechRecognizedEvent` when some + speech is detected, and you can hook to that event to build your custom + logic: + + .. code-block:: python + + from platypush import hook, run + from platypush.message.event.assistant import SpeechRecognizedEvent + + # Turn on a light when the phrase "turn on the lights" is detected. + # Note that we can leverage regex-based pattern matching to be more + # flexible when matching the phrases. For example, the following hook + # will be matched when the user says "turn on the lights", "turn on + # lights", "lights on", "lights on please", "turn on light" etc. + @hook(SpeechRecognizedEvent, phrase='turn on (the)? lights?') + def on_turn_on_lights(event: SpeechRecognizedEvent, **context): + run("light.hue.on") + + You can also leverage context extraction through the ``${}`` syntax on the + hook to extract specific tokens from the event that can be passed to your + event hook. For example: + + .. code-block:: python + + from platypush import hook, run + from platypush.message.event.assistant import SpeechRecognizedEvent + + @hook(SpeechRecognizedEvent, phrase='play ${title} by ${artist}') + def on_play_track_command( + event: SpeechRecognizedEvent, title: str, artist: str, **context + ): + results = run( + "music.mopidy.search", + filter={"title": title, "artist": artist} + ) + + if not results: + event.assistant.render_response(f"Couldn't find {title} by {artist}") + return + + run("music.mopidy.play", resource=results[0]["uri"]) + + Speech-to-intent + ---------------- + + The intent recognition engine is based on `Rhino + `_. + + *Intents* are snippets of unstructured transcribed speech that can be + matched to structured actions. + + Unlike with hotword and speech-to-text detection, you need to provide a + custom model for intent detection. You can create your custom model using + the `Rhino console `_. + + When an intent is detected, the assistant will emit a + :class:`platypush.message.event.assistant.IntentRecognizedEvent` that can + be listened. + + For example, you can train a model to control groups of smart lights by + defining the following slots on the Rhino console: + + - ``device_state``: The new state of the device (e.g. with ``on`` or + ``off`` as supported values) + + - ``room``: The name of the room associated to the group of lights to + be controlled (e.g. 
``living room``, ``kitchen``, ``bedroom``) + + You can then define a ``lights_ctrl`` intent with the following expressions: + + - "turn ``$device_state:state`` the lights" + - "turn ``$device_state:state`` the ``$room:room`` lights" + - "turn the lights ``$device_state:state``" + - "turn the ``$room:room`` lights ``$device_state:state``" + - "turn ``$room:room`` lights ``$device_state:state``" + + This intent will match any of the following phrases: + + - "*turn on the lights*" + - "*turn off the lights*" + - "*turn the lights on*" + - "*turn the lights off*" + - "*turn on the living room lights*" + - "*turn off the living room lights*" + - "*turn the living room lights on*" + - "*turn the living room lights off*" + + And it will extract any slots that are matched in the phrases in the + :class:`platypush.message.event.assistant.IntentRecognizedEvent` event. + + Train the model, download the context file, and pass the path on the + ``intent_model_path`` parameter. + + You can then register a hook to listen to a specific intent: + + .. code-block:: python + + from platypush import hook, run + from platypush.message.event.assistant import IntentRecognizedEvent + + @hook(IntentRecognizedEvent, intent='lights_ctrl', slots={'state': 'on'}) + def on_turn_on_lights(event: IntentRecognizedEvent, **context): + room = event.slots.get('room') + if room: + run("light.hue.on", groups=[room]) + else: + run("light.hue.on") + + Note that if both ``stt_enabled`` and ``intent_model_path`` are set, then + both the speech-to-text and intent recognition engines will run in parallel + when a conversation is started. + + The intent engine is usually faster, as it has a smaller set of intents to + match and doesn't have to run a full speech-to-text transcription. This means that, + if an utterance matches both a speech-to-text phrase and an intent, the + :class:`platypush.message.event.assistant.IntentRecognizedEvent` event is emitted + (and not :class:`platypush.message.event.assistant.SpeechRecognizedEvent`). + + This may not be always the case though. So it may be a good practice to + also provide a fallback + :class:`platypush.message.event.assistant.SpeechRecognizedEvent` hook to + catch the text if the speech is not recognized as an intent: + + .. code-block:: python + + from platypush import hook, run + from platypush.message.event.assistant import SpeechRecognizedEvent + + @hook(SpeechRecognizedEvent, phrase='turn ${state} (the)? ${room} lights?') + def on_turn_on_lights(event: SpeechRecognizedEvent, phrase, room, **context): + if room: + run("light.hue.on", groups=[room]) + else: + run("light.hue.on") + + Text-to-speech + -------------- + + The text-to-speech engine is based on `Orca + `_. + + It is not directly implemented by this plugin, but the implementation is + provided in the :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin` + plugin. + + You can however leverage the :meth:`.render_response` action to render some + text as speech in response to a user command, and that in turn will leverage + the PicoVoice TTS plugin to render the response. + + For example, the following snippet provides a hook that: + + - Listens for + :class:`platypush.message.event.assistant.SpeechRecognizedEvent`. + + - Matches the phrase against a list of predefined commands that + shouldn't require a response. + + - Has a fallback logic that leverages the + :class:`platypush.plugins.openai.OpenaiPlugin` to generate a response + for the given text and renders it as speech. 
+ + - Has a logic for follow-on turns if the response from ChatGPT is a question. + + .. code-block:: python + + import re + from collections import defaultdict + from datetime import datetime as dt, timedelta + from dateutil.parser import isoparse + from logging import getLogger + + from platypush import hook, run + from platypush.message.event.assistant import ( + SpeechRecognizedEvent, + ResponseEndEvent, + ) + + logger = getLogger(__name__) + + def play_music(*_, **__): + run("music.mopidy.play") + + def stop_music(*_, **__): + run("music.mopidy.stop") + + def ai_assist(event: SpeechRecognizedEvent, **__): + response = run("openai.get_response", prompt=event.phrase) + if not response: + return + + run("assistant.picovoice.render_response", text=response) + + # List of commands to match, as pairs of regex patterns and the + # corresponding actions + hooks = ( + (re.compile(r"play (the)?music", re.IGNORECASE), play_music), + (re.compile(r"stop (the)?music", re.IGNORECASE), stop_music), + # Fallback to the AI assistant + (re.compile(r".*"), ai_assist), + ) + + @hook(SpeechRecognizedEvent) + def on_speech_recognized(event, **kwargs): + for pattern, command in hooks: + if pattern.search(event.phrase): + logger.info("Running voice command %s", command.__name__) + command(event, **kwargs) + break + + @hook(ResponseEndEvent) + def on_response_end(event: ResponseEndEvent, **__): + # Check if the response is a question and start a follow-on turn if so. + # Note that the ``openai`` plugin by default is configured to keep + # the past interaction in a context window of ~10 minutes, so you + # can follow up like in a real conversation. + if event.assistant and event.response_text and event.response_text.endswith("?"): + event.assistant.start_conversation() + + """ + + def __init__( + self, + access_key: str, + hotword_enabled: bool = True, + stt_enabled: bool = True, + keywords: Optional[Sequence[str]] = None, + keyword_paths: Optional[Sequence[str]] = None, + keyword_model_path: Optional[str] = None, + speech_model_path: Optional[str] = None, + intent_model_path: Optional[str] = None, + endpoint_duration: Optional[float] = 0.5, + enable_automatic_punctuation: bool = False, + start_conversation_on_hotword: bool = True, + audio_queue_size: int = 100, + conversation_timeout: Optional[float] = 7.5, + muted: bool = False, + **kwargs, + ): + """ + :param access_key: Your Picovoice access key. You can get it by signing + up at the `Picovoice console `. + :param hotword_enabled: Enable the wake-word engine (default: True). + **Note**: The wake-word engine requires you to add Porcupine to the + products available in your Picovoice account. + :param stt_enabled: Enable the speech-to-text engine (default: True). + **Note**: The speech-to-text engine requires you to add Cheetah to + the products available in your Picovoice account. + :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok + google``...). This is required if the wake-word engine is enabled. + See the `Porcupine keywords repository + `_). + for a list of the stock keywords available. If you have a custom + model, you can pass its path to the ``keyword_paths`` parameter and + its filename (without the path and the platform extension) here. + :param keyword_paths: List of paths to the keyword files to listen for. + Custom keyword files can be created using the `Porcupine console + `_ and downloaded from the + console itself. 
+ :param keyword_model_path: If you are using a keyword file in a + non-English language, you can provide the path to the model file + for its language. Model files are available for all the supported + languages through the `Porcupine lib repository + `_. + :param speech_model_path: Path to the speech model file. If you are + using a language other than English, you can provide the path to the + model file for that language. Model files are available for all the + supported languages through the `Cheetah repository + `_. + You can also use the `Speech console + `_ + to train your custom models. You can use a base model and fine-tune + it by boosting the detection of your own words and phrases and edit + the phonetic representation of the words you want to detect. + :param intent_model_path: Path to the Rhino context model. This is + required if you want to use the intent recognition engine through + Rhino. The context model is a file that contains a list of intents + that can be recognized by the engine. An intent is an action or a + class of actions that the assistant can recognize, and it can + contain an optional number of slots to model context variables - + e.g. temperature, lights group, location, device state etc. + You can create your own context model using the `Rhino console + `_. For example, you can define a + context file to control smart home devices by defining the + following slots: + + - ``device_type``: The device to control (e.g. lights, music) + - ``device_state``: The target state of the device (e.g. on, + off) + - ``location``: The location of the device (e.g. living + room, kitchen, bedroom) + - ``media_type``: The type of media to play (e.g. music, video) + - ``media_state``: The state of the media (e.g. play, pause, + stop) + + You can then define the following intents: + + - ``device_ctrl``: Control a device state. Supported phrases: + - "turn ``$device_state:state`` the ``$location:location`` + ``$device_type:device``" + - "turn ``$device_state:state`` the ``$device_type:device``" + + - ``media_ctrl``: Control media state. Supported phrases: + - "``$media_state:state`` the ``$media_type:media``" + - "``$media_state:state`` the ``$media_type:media`` in the + ``$location:location``" + + Then a phrase like "turn on the lights in the living room" would + trigger a + :class:`platypush.message.event.assistant.IntentRecognizedEvent` with: + + .. code-block:: json + + { + "intent": "device_ctrl", + "slots": { + "type": "lights", + "state": "on", + "location": "living room" + } + } + + **Note**: The intent recognition engine requires you to add Rhino + to the products available in your Picovoice account. + :param endpoint_duration: If set, the assistant will stop listening when + no speech is detected for the specified duration (in seconds) after + the end of an utterance. + :param enable_automatic_punctuation: Enable automatic punctuation + insertion. + :param start_conversation_on_hotword: If set to True (default), a speech + detection session will be started when the hotword is detected. If + set to False, you may want to start the conversation programmatically + by calling the :meth:`.start_conversation` method instead, or run any + custom logic hotword detection logic. This can be particularly useful + when you want to run the assistant in a push-to-talk mode, or when you + want different hotwords to trigger conversations with different models + or languages. + :param audio_queue_size: Maximum number of audio frames to hold in the + processing queue. 
You may want to increase this value if you are + running this integration on a slow device and/or the logs report + audio frame drops too often. Keep in mind that increasing this value + will increase the memory usage of the integration. Also, a higher + value may result in higher accuracy at the cost of higher latency. + :param conversation_timeout: Maximum time to wait for some speech to be + detected after the hotword is detected. If no speech is detected + within this time, the conversation will time out and the plugin will + go back into hotword detection mode, if the mode is enabled. Default: + 7.5 seconds. + :param muted: Set to True to start the assistant in a muted state. You will + need to call the :meth:`.unmute` method to start the assistant listening + for commands, or programmatically call the :meth:`.start_conversation` + to start a conversation. + """ + super().__init__(**kwargs) + self._assistant = None + self._assistant_args = { + 'stop_event': self._should_stop, + 'access_key': access_key, + 'hotword_enabled': hotword_enabled, + 'stt_enabled': stt_enabled, + 'keywords': keywords, + 'keyword_paths': ( + os.path.expanduser(keyword_path) + for keyword_path in (keyword_paths or []) + ), + 'keyword_model_path': ( + os.path.expanduser(keyword_model_path) if keyword_model_path else None + ), + 'speech_model_path': ( + os.path.expanduser(speech_model_path) if speech_model_path else None + ), + 'intent_model_path': ( + os.path.expanduser(intent_model_path) if intent_model_path else None + ), + 'endpoint_duration': endpoint_duration, + 'enable_automatic_punctuation': enable_automatic_punctuation, + 'start_conversation_on_hotword': ( + start_conversation_on_hotword + if (intent_model_path or stt_enabled) + else False + ), + 'audio_queue_size': audio_queue_size, + 'conversation_timeout': conversation_timeout, + 'muted': muted, + 'on_conversation_start': self._on_conversation_start, + 'on_conversation_end': self._on_conversation_end, + 'on_conversation_timeout': self._on_conversation_timeout, + 'on_speech_recognized': self._on_speech_recognized, + 'on_intent_matched': self._on_intent_matched, + 'on_hotword_detected': self._on_hotword_detected, + } + + @property + def tts(self) -> TtsPicovoicePlugin: + p = get_plugin('tts.picovoice') + assert p, 'Picovoice TTS plugin not configured/found' + return p + + def _get_tts_plugin(self) -> TtsPicovoicePlugin: + return self.tts + + def _on_response_render_start(self, text: Optional[str]): + if self._assistant: + self._assistant.set_responding(True) + return super()._on_response_render_start(text) + + def _on_response_render_end(self): + if self._assistant: + self._assistant.set_responding(False) + + return super()._on_response_render_end() + + @action + def start_conversation(self, *_, model_file: Optional[str] = None, **__): + """ + Programmatically start a conversation with the assistant. + + :param model_file: Override the model file to be used to detect speech + in this conversation. If not set, the configured + ``speech_model_path`` will be used. 
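A minimal invocation sketch, assuming the plugin is registered as ``assistant.picovoice`` (the model path below is only a placeholder):

.. code-block:: python

    from platypush import run

    # Start a conversation with the default speech model
    run("assistant.picovoice.start_conversation")

    # Or override the Cheetah model for this conversation only, e.g. to
    # recognize speech in another language
    run(
        "assistant.picovoice.start_conversation",
        model_file="~/models/cheetah/it.pv",  # placeholder path
    )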
+ """ + if not self._assistant: + self.logger.warning('Assistant not initialized') + return + + if not model_file: + model_file = self._assistant_args['speech_model_path'] + if model_file: + model_file = os.path.expanduser(model_file) + + self._assistant.override_speech_model(model_file) + self._assistant.state = AssistantState.DETECTING_SPEECH + + @action + def stop_conversation(self, *_, **__): + """ + Programmatically stop a running conversation with the assistant + """ + if not self._assistant: + self.logger.warning('Assistant not initialized') + return + + self._assistant.override_speech_model(None) + + if self._assistant.hotword_enabled: + self._assistant.state = AssistantState.DETECTING_HOTWORD + else: + self._assistant.state = AssistantState.IDLE + + @action + def say(self, text: str, *args, **kwargs): + """ + Proxy to + :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin.say` to + render some text as speech through the Picovoice TTS engine. + + Extra arguments to + :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin.say` can be + passed over ``args`` and ``kwargs``. + + :param text: Text to be rendered as speech. + """ + return self.tts.say(text, *args, **kwargs) + + @action + def transcribe(self, audio_file: str, *_, model_file: Optional[str] = None, **__): + """ + Transcribe an audio file to text using the `Leopard + `_ engine. + + :param text: Text to be transcribed. + :param model_file: Override the model file to be used to detect speech + in this conversation. If not set, the configured + ``speech_model_path`` will be used. + :return: dict + + .. code-block:: json + + { + "transcription": "This is a test", + "words": [ + { + "word": "this", + "start": 0.06400000303983688, + "end": 0.19200000166893005, + "confidence": 0.9626294374465942 + }, + { + "word": "is", + "start": 0.2879999876022339, + "end": 0.35199999809265137, + "confidence": 0.9781675934791565 + }, + { + "word": "a", + "start": 0.41600000858306885, + "end": 0.41600000858306885, + "confidence": 0.9764975309371948 + }, + { + "word": "test", + "start": 0.5120000243186951, + "end": 0.8320000171661377, + "confidence": 0.9511580467224121 + } + ] + } + + """ + import pvleopard + + audio_file = os.path.expanduser(audio_file) + if not model_file: + model_file = self._assistant_args['speech_model_path'] + if model_file: + model_file = os.path.expanduser(model_file) + + leopard = pvleopard.create( + access_key=self._assistant_args['access_key'], model_path=model_file + ) + + transcript, words = leopard.process_file(audio_file) + + try: + return { + 'transcription': transcript, + 'words': [ + { + 'word': word.word, + 'start': word.start_sec, + 'end': word.end_sec, + 'confidence': word.confidence, + } + for word in words + ], + } + finally: + leopard.delete() + + @action + def mute(self, *_, **__): + """ + Mute the microphone. Alias for :meth:`.set_mic_mute` with + ``muted=True``. + """ + return self.set_mic_mute(muted=True) + + @action + def unmute(self, *_, **__): + """ + Unmute the microphone. Alias for :meth:`.set_mic_mute` with + ``muted=False``. + """ + return self.set_mic_mute(muted=False) + + @action + def set_mic_mute(self, muted: bool): + """ + Programmatically mute/unmute the microphone. + + :param muted: Set to True or False. + """ + if self._assistant: + self._assistant.set_mic_mute(muted) + + super()._on_mute_changed(muted) + + @action + def toggle_mute(self, *_, **__): + """ + Toggle the mic mute state. 
+ """ + return self.set_mic_mute(not self._is_muted) + + @action + def send_text_query(self, *_, query: str, **__): + """ + Send a text query to the assistant. + + This is equivalent to saying something to the assistant. + + :param query: Query to be sent. + """ + self._on_speech_recognized(query) + + def main(self): + while not self.should_stop(): + self.logger.info('Starting Picovoice assistant') + with Assistant(**self._assistant_args) as self._assistant: + try: + for event in self._assistant: + if event is not None: + self.logger.debug('Dequeued assistant event: %s', event) + except KeyboardInterrupt: + break + except Exception as e: + self.logger.error('Picovoice assistant error: %s', e, exc_info=True) + self.wait_stop(5) + + def stop(self): + try: + self.stop_conversation() + except RuntimeError: + pass + + super().stop() + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/assistant/picovoice/_assistant.py b/platypush/plugins/assistant/picovoice/_assistant.py new file mode 100644 index 00000000..9fcf66d0 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_assistant.py @@ -0,0 +1,424 @@ +import logging +import os +from queue import Full, Queue +from threading import Event, RLock, Thread +from time import time +from typing import Any, Dict, Optional, Sequence + +import pvporcupine + +from platypush.context import get_plugin +from platypush.message.event.assistant import ( + AssistantEvent, + ConversationTimeoutEvent, + HotwordDetectedEvent, + IntentRecognizedEvent, + SpeechRecognizedEvent, +) +from platypush.plugins.tts.picovoice import TtsPicovoicePlugin + +from ._recorder import AudioRecorder +from ._speech import SpeechProcessor +from ._state import AssistantState + + +class Assistant(Thread): + """ + A facade class that wraps the Picovoice engines under an assistant API. 
+ """ + + @staticmethod + def _default_callback(*_, **__): + pass + + def __init__( + self, + access_key: str, + stop_event: Event, + hotword_enabled: bool = True, + stt_enabled: bool = True, + keywords: Optional[Sequence[str]] = None, + keyword_paths: Optional[Sequence[str]] = None, + keyword_model_path: Optional[str] = None, + frame_expiration: float = 3.0, # Don't process audio frames older than this + speech_model_path: Optional[str] = None, + intent_model_path: Optional[str] = None, + endpoint_duration: Optional[float] = None, + enable_automatic_punctuation: bool = False, + start_conversation_on_hotword: bool = False, + audio_queue_size: int = 100, + muted: bool = False, + conversation_timeout: Optional[float] = None, + on_conversation_start=_default_callback, + on_conversation_end=_default_callback, + on_conversation_timeout=_default_callback, + on_speech_recognized=_default_callback, + on_intent_matched=_default_callback, + on_hotword_detected=_default_callback, + ): + super().__init__(name='picovoice:Assistant') + + self._access_key = access_key + self._stop_event = stop_event + self.logger = logging.getLogger(__name__) + self.hotword_enabled = hotword_enabled + self.stt_enabled = stt_enabled + self.keywords = list(keywords or []) + self.keyword_paths = None + self.keyword_model_path = None + self.frame_expiration = frame_expiration + self.endpoint_duration = endpoint_duration + self.enable_automatic_punctuation = enable_automatic_punctuation + self.start_conversation_on_hotword = start_conversation_on_hotword + self.audio_queue_size = audio_queue_size + self._responding = Event() + self._muted = muted + self._speech_model_path = speech_model_path + self._speech_model_path_override = None + self._intent_model_path = intent_model_path + self._intent_model_path_override = None + self._in_ctx = False + + self._speech_processor = SpeechProcessor( + stop_event=stop_event, + stt_enabled=stt_enabled, + intent_enabled=self.intent_enabled, + conversation_timeout=conversation_timeout, + model_path=speech_model_path, + get_cheetah_args=self._get_speech_engine_args, + get_rhino_args=self._get_intent_engine_args, + ) + + self._on_conversation_start = on_conversation_start + self._on_conversation_end = on_conversation_end + self._on_conversation_timeout = on_conversation_timeout + self._on_speech_recognized = on_speech_recognized + self._on_intent_matched = on_intent_matched + self._on_hotword_detected = on_hotword_detected + + self._recorder = None + self._state = AssistantState.IDLE + self._state_lock = RLock() + self._evt_queue = Queue(maxsize=100) + + if hotword_enabled: + if not keywords: + raise ValueError( + 'You need to provide a list of keywords if the wake-word engine is enabled' + ) + + if keyword_paths: + keyword_paths = [os.path.expanduser(path) for path in keyword_paths] + missing_paths = [ + path for path in keyword_paths if not os.path.isfile(path) + ] + if missing_paths: + raise FileNotFoundError(f'Keyword files not found: {missing_paths}') + + self.keyword_paths = keyword_paths + + if keyword_model_path: + keyword_model_path = os.path.expanduser(keyword_model_path) + if not os.path.isfile(keyword_model_path): + raise FileNotFoundError( + f'Keyword model file not found: {keyword_model_path}' + ) + + self.keyword_model_path = keyword_model_path + + self._porcupine: Optional[pvporcupine.Porcupine] = None + + @property + def intent_enabled(self) -> bool: + return self.intent_model_path is not None + + @property + def is_responding(self) -> bool: + return self._responding.is_set() + 
+ @property + def speech_model_path(self) -> Optional[str]: + return self._speech_model_path_override or self._speech_model_path + + @property + def intent_model_path(self) -> Optional[str]: + return self._intent_model_path_override or self._intent_model_path + + @property + def tts(self) -> TtsPicovoicePlugin: + p = get_plugin('tts.picovoice') + assert p, 'Picovoice TTS plugin not configured/found' + return p + + def set_responding(self, responding: bool): + if responding: + self._responding.set() + else: + self._responding.clear() + + def should_stop(self): + return self._stop_event.is_set() + + def wait_stop(self): + self._stop_event.wait() + + @property + def state(self) -> AssistantState: + with self._state_lock: + return self._state + + @state.setter + def state(self, state: AssistantState): + with self._state_lock: + prev_state = self._state + self._state = state + new_state = self.state + + if prev_state == new_state: + return + + self.logger.info('Assistant state transition: %s -> %s', prev_state, new_state) + if prev_state == AssistantState.DETECTING_SPEECH: + self.tts.stop() + self._speech_model_path_override = None + self._intent_model_path_override = None + self._speech_processor.on_conversation_end() + self._on_conversation_end() + elif new_state == AssistantState.DETECTING_SPEECH: + self._speech_processor.on_conversation_start() + self._on_conversation_start() + + if new_state == AssistantState.DETECTING_HOTWORD: + self.tts.stop() + self._speech_processor.on_conversation_reset() + + # Put a null event on the event queue to unblock next_event + self._evt_queue.put(None) + + @property + def porcupine(self) -> Optional[pvporcupine.Porcupine]: + if not self.hotword_enabled: + return None + + if not self._porcupine: + args: Dict[str, Any] = {'access_key': self._access_key} + if self.keywords: + args['keywords'] = self.keywords + if self.keyword_paths: + args['keyword_paths'] = self.keyword_paths + if self.keyword_model_path: + args['model_path'] = self.keyword_model_path + + self._porcupine = pvporcupine.create(**args) + + return self._porcupine + + def _get_speech_engine_args(self) -> dict: + args: Dict[str, Any] = {'access_key': self._access_key} + if self.speech_model_path: + args['model_path'] = self.speech_model_path + if self.endpoint_duration: + args['endpoint_duration_sec'] = self.endpoint_duration + if self.enable_automatic_punctuation: + args['enable_automatic_punctuation'] = self.enable_automatic_punctuation + + return args + + def _get_intent_engine_args(self) -> dict: + args: Dict[str, Any] = {'access_key': self._access_key} + args['context_path'] = self.intent_model_path + if self.endpoint_duration: + args['endpoint_duration_sec'] = self.endpoint_duration + if self.enable_automatic_punctuation: + args['enable_automatic_punctuation'] = self.enable_automatic_punctuation + + return args + + def __enter__(self): + """ + Get the assistant ready to start processing audio frames. 
+ """ + if self.should_stop(): + return self + + assert not self.is_alive(), 'The assistant is already running' + self._in_ctx = True + + if self._recorder: + self.logger.info('A recording stream already exists') + elif self.hotword_enabled or self.stt_enabled or self.intent_enabled: + sample_rate = (self.porcupine or self._speech_processor).sample_rate + frame_length = (self.porcupine or self._speech_processor).frame_length + self._recorder = AudioRecorder( + stop_event=self._stop_event, + sample_rate=sample_rate, + frame_size=frame_length, + queue_size=self.audio_queue_size, + paused=self._muted, + channels=1, + ) + + self._speech_processor.__enter__() + self._recorder.__enter__() + + if self.porcupine: + self.state = AssistantState.DETECTING_HOTWORD + else: + self.state = AssistantState.DETECTING_SPEECH + + self.start() + return self + + def __exit__(self, *_): + """ + Stop the assistant and release all resources. + """ + self._in_ctx = False + if self._recorder: + self._recorder.__exit__(*_) + self._recorder = None + + self.state = AssistantState.IDLE + + if self._porcupine: + self._porcupine.delete() + self._porcupine = None + + self._speech_processor.__exit__(*_) + + def __iter__(self): + """ + Iterate over processed assistant events. + """ + return self + + def __next__(self): + """ + Process the next audio frame and return the corresponding event. + """ + if self.should_stop() or not self._recorder: + raise StopIteration + + if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD: + return self._evt_queue.get() + + evt = None + if ( + self._speech_processor.enabled + and self.state == AssistantState.DETECTING_SPEECH + ): + evt = self._speech_processor.next_event() + + if isinstance(evt, SpeechRecognizedEvent): + self._on_speech_recognized(phrase=evt.args['phrase']) + if isinstance(evt, IntentRecognizedEvent): + self._on_intent_matched( + intent=evt.args['intent'], slots=evt.args.get('slots', {}) + ) + if isinstance(evt, ConversationTimeoutEvent): + self._on_conversation_timeout() + + if evt: + self._speech_processor.reset() + + if ( + evt + and self.state == AssistantState.DETECTING_SPEECH + and self.hotword_enabled + ): + self.state = AssistantState.DETECTING_HOTWORD + + return evt + + def mute(self): + self._muted = True + if self._recorder: + self._recorder.pause() + + def unmute(self): + self._muted = False + if self._recorder: + self._recorder.resume() + + def set_mic_mute(self, mute: bool): + if mute: + self.mute() + else: + self.unmute() + + def toggle_mic_mute(self): + if self._muted: + self.unmute() + else: + self.mute() + + def _process_hotword(self, frame) -> Optional[HotwordDetectedEvent]: + if not self.porcupine: + return None + + keyword_index = self.porcupine.process(frame) + if keyword_index is None: + return None # No keyword detected + + if keyword_index >= 0 and self.keywords: + if self.start_conversation_on_hotword: + self.state = AssistantState.DETECTING_SPEECH + + self.tts.stop() # Stop any ongoing TTS when the hotword is detected + self._on_hotword_detected(hotword=self.keywords[keyword_index]) + return HotwordDetectedEvent(hotword=self.keywords[keyword_index]) + + return None + + def override_speech_model(self, model_path: Optional[str]): + self._speech_model_path_override = model_path + + def override_intent_model(self, model_path: Optional[str]): + self._intent_model_path_override = model_path + + def _put_event(self, evt: AssistantEvent): + try: + self._evt_queue.put_nowait(evt) + except Full: + self.logger.warning('The assistant event 
queue is full') + + def run(self): + assert ( + self._in_ctx + ), 'The assistant can only be started through a context manager' + + super().run() + + while not self.should_stop() and self._recorder: + self._recorder.wait_start() + if self.should_stop(): + break + + data = self._recorder.read() + if data is None: + continue + + frame, t = data + if time() - t > self.frame_expiration: + self.logger.info( + 'Skipping audio frame older than %ss', self.frame_expiration + ) + continue # The audio frame is too old + + if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD: + evt = self._process_hotword(frame) + if evt: + self._put_event(evt) + + continue + + if ( + self._speech_processor.enabled + and self.state == AssistantState.DETECTING_SPEECH + ): + self._speech_processor.process(frame, block=False) + + self.logger.info('Assistant stopped') + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/assistant/picovoice/_context.py b/platypush/plugins/assistant/picovoice/_context.py new file mode 100644 index 00000000..1a85bb24 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_context.py @@ -0,0 +1,51 @@ +from dataclasses import dataclass +from time import time +from typing import Optional + +from ._intent import Intent + + +@dataclass +class ConversationContext: + """ + Context of the conversation process. + """ + + transcript: str = '' + is_final: bool = False + intent: Optional[Intent] = None + timeout: Optional[float] = None + t_start: Optional[float] = None + + def start(self): + self.reset() + self.t_start = time() + + def reset(self): + self.transcript = '' + self.intent = None + self.is_final = False + self.t_start = None + + @property + def timed_out(self): + return ( + ( + (not self.transcript and not self.is_final) + or (not self.intent and not self.is_final) + ) + and self.timeout + and self.t_start + and time() - self.t_start > self.timeout + ) or ( + ( + (self.transcript and not self.is_final) + or (self.intent and not self.is_final) + ) + and self.timeout + and self.t_start + and time() - self.t_start > self.timeout * 2 + ) + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/assistant/picovoice/_intent.py b/platypush/plugins/assistant/picovoice/_intent.py new file mode 100644 index 00000000..427a52d1 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_intent.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass, field + + +@dataclass +class Intent: + """ + Speech intent data class. + """ + + name: str + slots: dict = field(default_factory=dict) diff --git a/platypush/plugins/assistant/picovoice/_recorder.py b/platypush/plugins/assistant/picovoice/_recorder.py new file mode 100644 index 00000000..523806be --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_recorder.py @@ -0,0 +1,191 @@ +from collections import namedtuple +from dataclasses import dataclass, field +from logging import getLogger +from queue import Full, Queue +from threading import Event, RLock +from time import time +from typing import Optional + +import sounddevice as sd + +from platypush.utils import wait_for_either + + +AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp']) + + +@dataclass +class PauseState: + """ + Data class to hold the boilerplate (state + synchronization events) for the + audio recorder pause API. 
+ """ + + _paused_event: Event = field(default_factory=Event) + _recording_event: Event = field(default_factory=Event) + _state_lock: RLock = field(default_factory=RLock) + + @property + def paused(self): + with self._state_lock: + return self._paused_event.is_set() + + def pause(self): + """ + Pause the audio recorder. + """ + with self._state_lock: + self._paused_event.set() + self._recording_event.clear() + + def resume(self): + """ + Resume the audio recorder. + """ + with self._state_lock: + self._paused_event.clear() + self._recording_event.set() + + def toggle(self): + """ + Toggle the audio recorder pause state. + """ + with self._state_lock: + if self.paused: + self.resume() + else: + self.pause() + + def wait_paused(self, timeout: Optional[float] = None): + """ + Wait until the audio recorder is paused. + """ + self._paused_event.wait(timeout=timeout) + + def wait_recording(self, timeout: Optional[float] = None): + """ + Wait until the audio recorder is resumed. + """ + self._recording_event.wait(timeout=timeout) + + +class AudioRecorder: + """ + Audio recorder component that uses the sounddevice library to record audio + from the microphone. + """ + + def __init__( + self, + stop_event: Event, + sample_rate: int, + frame_size: int, + channels: int, + paused: bool = False, + dtype: str = 'int16', + queue_size: int = 100, + ): + self.logger = getLogger(__name__) + self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size) + self.frame_size = frame_size + self._stop_event = Event() + self._upstream_stop_event = stop_event + self._paused_state = PauseState() + if paused: + self._paused_state.pause() + else: + self._paused_state.resume() + + self.stream = sd.InputStream( + samplerate=sample_rate, + channels=channels, + dtype=dtype, + blocksize=frame_size, + callback=self._audio_callback, + ) + + @property + def paused(self): + return self._paused_state.paused + + def __enter__(self): + """ + Start the audio stream. + """ + self._stop_event.clear() + self.stream.start() + return self + + def __exit__(self, *_): + """ + Stop the audio stream. + """ + self.stop() + + def _audio_callback(self, indata, *_): + if self.should_stop() or self.paused: + return + + try: + self._audio_queue.put_nowait(AudioFrame(indata.reshape(-1), time())) + except Full: + self.logger.warning('Audio queue is full, dropping audio frame') + + def read(self, timeout: Optional[float] = None): + """ + Read an audio frame from the queue. + + :param timeout: Timeout in seconds. If None, the method will block until + an audio frame is available. + :return: Audio frame or None if the timeout has expired. + """ + try: + return self._audio_queue.get(timeout=timeout) + except TimeoutError: + self.logger.debug('Audio queue is empty') + return None + + def stop(self): + """ + Stop the audio stream. + """ + self._stop_event.set() + self.stream.stop() + + def pause(self): + """ + Pause the audio stream. + """ + self._paused_state.pause() + + def resume(self): + """ + Resume the audio stream. + """ + self._paused_state.resume() + + def toggle(self): + """ + Toggle the audio stream pause state. + """ + self._paused_state.toggle() + + def should_stop(self): + return self._stop_event.is_set() or self._upstream_stop_event.is_set() + + def wait(self, timeout: Optional[float] = None): + """ + Wait until the audio stream is stopped. + """ + wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout) + + def wait_start(self, timeout: Optional[float] = None): + """ + Wait until the audio stream is started. 
+ """ + wait_for_either( + self._stop_event, + self._upstream_stop_event, + self._paused_state._recording_event, + timeout=timeout, + ) diff --git a/platypush/plugins/assistant/picovoice/_speech/__init__.py b/platypush/plugins/assistant/picovoice/_speech/__init__.py new file mode 100644 index 00000000..6318c0b6 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_speech/__init__.py @@ -0,0 +1,3 @@ +from ._processor import SpeechProcessor + +__all__ = ['SpeechProcessor'] diff --git a/platypush/plugins/assistant/picovoice/_speech/_base.py b/platypush/plugins/assistant/picovoice/_speech/_base.py new file mode 100644 index 00000000..8ac7c278 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_speech/_base.py @@ -0,0 +1,180 @@ +import logging +from abc import ABC, abstractmethod +from queue import Empty, Queue +from threading import Event, RLock, Thread, get_ident +from typing import Any, Optional, Sequence + +from platypush.message.event.assistant import AssistantEvent + +from .._context import ConversationContext + + +class BaseProcessor(ABC, Thread): + """ + Base speech processor class. It is implemented by the ``SttProcessor`` and + the ``IntentProcessor`` classes. + """ + + def __init__( + self, + *args, + stop_event: Event, + enabled: bool = True, + conversation_timeout: Optional[float] = None, + **kwargs, + ): + super().__init__(*args, name=f'picovoice:{self.__class__.__name__}', **kwargs) + + self.logger = logging.getLogger(self.name) + self._enabled = enabled + self._audio_queue = Queue() + self._stop_event = stop_event + self._ctx = ConversationContext(timeout=conversation_timeout) + self._event_queue = Queue() + # This event is set if the upstream processor is waiting for an event + # from this processor + self._event_wait = Event() + # This event is set when the processor is done with the audio + # processing and it's ready to accept a new audio frame + self._processing_done = Event() + self._processing_done.set() + self._state_lock = RLock() + + def should_stop(self) -> bool: + return self._stop_event.is_set() + + def wait_stop(self, timeout: Optional[float] = None) -> bool: + return self._stop_event.wait(timeout) + + def enqueue(self, audio: Sequence[int]): + if not self._enabled: + return + + self._event_wait.set() + self._processing_done.clear() + self._audio_queue.put_nowait(audio) + + def reset(self) -> Optional[Any]: + """ + Reset any pending context. + """ + if not self._enabled: + return + + with self._state_lock: + self._ctx.reset() + self._event_queue.queue.clear() + self._event_wait.clear() + self._processing_done.set() + + @property + def processing_done(self) -> Event: + return self._processing_done + + @property + @abstractmethod + def _model_path(self) -> Optional[str]: + """ + Return the model path. + """ + + @property + @abstractmethod + def sample_rate(self) -> int: + """ + :return: The sample rate wanted by Cheetah/Rhino. + """ + + @property + @abstractmethod + def frame_length(self) -> int: + """ + :return: The frame length wanted by Cheetah/Rhino. + """ + + def last_event(self) -> Optional[AssistantEvent]: + """ + :return: The latest event that was processed by the processor. + """ + with self._state_lock: + evt = None + try: + while True: + evt = self._event_queue.get_nowait() + except Empty: + pass + + if evt: + self._event_wait.clear() + + return evt + + @abstractmethod + def process(self, audio: Sequence[int]) -> Optional[AssistantEvent]: + """ + Process speech events from a raw audio input. 
+ """ + + def run(self): + super().run() + if not self._enabled: + self.wait_stop() + + self.reset() + self._processing_done.clear() + self.logger.info('Processor started: %s', self.name) + + while not self.should_stop(): + audio = self._audio_queue.get() + + # The thread is stopped when it receives a None object + if audio is None: + break + + # Don't process the audio if the upstream processor is not waiting + # for an event + if not self._event_wait.is_set(): + continue + + try: + self._processing_done.clear() + event = self.process(audio) + + if event: + self.logger.debug( + 'Dispatching event processed from %s: %s', self.name, event + ) + self._event_queue.put_nowait(event) + self._processing_done.set() + except Exception as e: + self.logger.error( + 'An error occurred while processing the audio on %s: %s', + self.name, + e, + exc_info=e, + ) + self.wait_stop(timeout=1) + self._processing_done.set() + continue + + self._processing_done.set() + self.reset() + self.logger.info('Processor stopped: %s', self.name) + + def stop(self): + if not self._enabled: + return + + self._audio_queue.put_nowait(None) + if self.is_alive() and self.ident != get_ident(): + self.logger.debug('Stopping %s', self.name) + self.join() + + def on_conversation_start(self): + self._ctx.start() + + def on_conversation_end(self): + self.reset() + + def on_conversation_reset(self): + self.reset() diff --git a/platypush/plugins/assistant/picovoice/_speech/_intent.py b/platypush/plugins/assistant/picovoice/_speech/_intent.py new file mode 100644 index 00000000..eb5767bb --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_speech/_intent.py @@ -0,0 +1,93 @@ +from typing import Callable, Optional, Sequence, Union + +import pvrhino + +from platypush.message.event.assistant import ( + ConversationTimeoutEvent, + IntentRecognizedEvent, +) + +from ._base import BaseProcessor + + +class IntentProcessor(BaseProcessor): + """ + Implementation of the speech-to-intent processor using the Picovoice Rhino + engine. + """ + + def __init__( + self, *args, get_rhino_args: Callable[[], dict] = lambda: {}, **kwargs + ): + super().__init__(*args, **kwargs) + self._get_rhino_args = get_rhino_args + # model_path -> Rhino instance cache + self._rhino = {} + + @property + def _model_path(self) -> Optional[str]: + return self._get_rhino_args().get('model_path') + + @property + def sample_rate(self) -> int: + return self._get_rhino().sample_rate + + @property + def frame_length(self) -> int: + return self._get_rhino().frame_length + + def _get_rhino(self) -> pvrhino.Rhino: + if not self._rhino.get(self._model_path): + self._rhino[self._model_path] = pvrhino.create(**self._get_rhino_args()) + + return self._rhino[self._model_path] + + def process( + self, audio: Sequence[int] + ) -> Optional[Union[IntentRecognizedEvent, ConversationTimeoutEvent]]: + """ + Process the audio and return an ``IntentRecognizedEvent`` if the intent was + understood, or a ``ConversationTimeoutEvent`` if the conversation timed + out, or ``None`` if the intent processing is not yet finalized. + """ + event = None + rhino = self._get_rhino() + self._ctx.is_final = rhino.process(audio) + + if self._ctx.is_final: + inference = rhino.get_inference() + self.logger.debug( + 'Intent detection finalized. 
Inference understood: %s', + inference.is_understood, + ) + + if inference.is_understood: + event = IntentRecognizedEvent( + intent=inference.intent, + slots=dict(inference.slots), + ) + + if not event and self._ctx.timed_out: + event = ConversationTimeoutEvent() + + return event + + def reset(self) -> None: + if not self._enabled: + return + + with self._state_lock: + self._get_rhino().reset() + super().reset() + + def stop(self): + if not self._enabled: + return + + super().stop() + + with self._state_lock: + objs = self._rhino.copy() + for key, obj in objs.items(): + obj.delete() + self._rhino.pop(key) diff --git a/platypush/plugins/assistant/picovoice/_speech/_processor.py b/platypush/plugins/assistant/picovoice/_speech/_processor.py new file mode 100644 index 00000000..edda995a --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_speech/_processor.py @@ -0,0 +1,208 @@ +import logging +from queue import Queue +from threading import Event +from typing import Callable, Optional, Sequence + +from platypush.message.event.assistant import AssistantEvent, ConversationTimeoutEvent +from platypush.utils import wait_for_either + +from ._intent import IntentProcessor +from ._stt import SttProcessor + + +class SpeechProcessor: + """ + Speech processor class that wraps the STT and Intent processors under the + same interface. + """ + + def __init__( + self, + stop_event: Event, + model_path: Optional[str] = None, + stt_enabled: bool = True, + intent_enabled: bool = False, + conversation_timeout: Optional[float] = None, + get_cheetah_args: Callable[[], dict] = lambda: {}, + get_rhino_args: Callable[[], dict] = lambda: {}, + ): + self.logger = logging.getLogger(self.__class__.__name__) + self._stt_enabled = stt_enabled + self._intent_enabled = intent_enabled + self._model_path = model_path + self._conversation_timeout = conversation_timeout + self._audio_queue = Queue() + self._stop_event = stop_event + self._get_cheetah_args = get_cheetah_args + self._get_rhino_args = get_rhino_args + + self._stt_processor = SttProcessor( + conversation_timeout=conversation_timeout, + stop_event=stop_event, + enabled=stt_enabled, + get_cheetah_args=get_cheetah_args, + ) + + self._intent_processor = IntentProcessor( + conversation_timeout=conversation_timeout, + stop_event=stop_event, + enabled=intent_enabled, + get_rhino_args=get_rhino_args, + ) + + @property + def enabled(self) -> bool: + """ + The processor is enabled if either the STT or the Intent processor are + enabled. 
+ """ + return self._stt_enabled or self._intent_enabled + + def should_stop(self) -> bool: + return self._stop_event.is_set() + + def next_event(self, timeout: Optional[float] = None) -> Optional[AssistantEvent]: + evt = None + + # Wait for either the STT or Intent processor to finish processing the audio + completed = wait_for_either( + self._stt_processor.processing_done, + self._intent_processor.processing_done, + self._stop_event, + timeout=timeout, + ) + + if not completed: + self.logger.warning('Timeout while waiting for the processors to finish') + + # Immediately return if the stop event is set + if self.should_stop(): + return evt + + with self._stt_processor._state_lock, self._intent_processor._state_lock: + # Priority to the intent processor event, if the processor is enabled + if self._intent_enabled: + evt = self._intent_processor.last_event() + + # If the intent processor didn't return any event, then return the STT + # processor event + if ( + not evt or isinstance(evt, ConversationTimeoutEvent) + ) and self._stt_enabled: + # self._stt_processor.processing_done.wait(timeout=timeout) + evt = self._stt_processor.last_event() + + if evt: + self._stt_processor.reset() + self._intent_processor.reset() + + return evt + + def process( + self, audio: Sequence[int], block: bool = True, timeout: Optional[float] = None + ) -> Optional[AssistantEvent]: + """ + Process an audio frame. + + The audio frame is enqueued to both the STT and Intent processors, if + enabled. The function waits for either processor to finish processing + the audio, and returns the event from the first processor that returns + a result. + + Priority is given to the Intent processor if enabled, otherwise the STT + processor is used. + """ + # Enqueue the audio to both the STT and Intent processors if enabled + if self._stt_enabled: + self._stt_processor.enqueue(audio) + + if self._intent_enabled: + self._intent_processor.enqueue(audio) + + if not block: + return None + + return self.next_event(timeout=timeout) + + def __enter__(self): + """ + Context manager entry point - it wraps :meth:`start`. + """ + self.start() + + def __exit__(self, *_, **__): + """ + Context manager exit point - it wraps :meth:`stop`. + """ + self.stop() + + def start(self): + """ + Start the STT and Intent processors. + """ + if self._stt_enabled: + self._stt_processor.start() + + if self._intent_enabled: + self._intent_processor.start() + + def stop(self): + """ + Stop the STT and Intent processors. + """ + self._stt_processor.stop() + self._intent_processor.stop() + + def on_conversation_start(self): + if self._stt_enabled: + self._stt_processor.on_conversation_start() + + if self._intent_enabled: + self._intent_processor.on_conversation_start() + + def on_conversation_end(self): + if self._stt_enabled: + self._stt_processor.on_conversation_end() + + if self._intent_enabled: + self._intent_processor.on_conversation_end() + + def on_conversation_reset(self): + if self._stt_enabled: + self._stt_processor.on_conversation_reset() + + if self._intent_enabled: + self._intent_processor.on_conversation_reset() + + def reset(self): + """ + Reset the state of the STT and Intent processors. + """ + self._stt_processor.reset() + self._intent_processor.reset() + + @property + def sample_rate(self) -> int: + """ + The sample rate of the audio frames. 
+ """ + if self._intent_enabled: + return self._intent_processor.sample_rate + + if self._stt_enabled: + return self._stt_processor.sample_rate + + raise ValueError('No processor enabled') + + @property + def frame_length(self) -> int: + """ + The frame length of the audio frames. + """ + if self._intent_enabled: + return self._intent_processor.frame_length + + if self._stt_enabled: + return self._stt_processor.frame_length + + raise ValueError('No processor enabled') diff --git a/platypush/plugins/assistant/picovoice/_speech/_stt.py b/platypush/plugins/assistant/picovoice/_speech/_stt.py new file mode 100644 index 00000000..cdd87d60 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_speech/_stt.py @@ -0,0 +1,110 @@ +from typing import Callable, Optional, Sequence, Union + +import pvcheetah + +from platypush.message.event.assistant import ( + ConversationTimeoutEvent, + SpeechRecognizedEvent, +) + +from ._base import BaseProcessor + + +class SttProcessor(BaseProcessor): + """ + Implementation of the speech-to-text processor using the Picovoice Cheetah + engine. + """ + + def __init__( + self, *args, get_cheetah_args: Callable[[], dict] = lambda: {}, **kwargs + ): + super().__init__(*args, **kwargs) + self._get_cheetah_args = get_cheetah_args + # model_path -> Cheetah instance cache + self._cheetah = {self._model_path: pvcheetah.create(**self._get_cheetah_args())} + + @property + def _model_path(self) -> Optional[str]: + return self._get_cheetah_args().get('model_path') + + @property + def sample_rate(self) -> int: + return self._get_cheetah().sample_rate + + @property + def frame_length(self) -> int: + return self._get_cheetah().frame_length + + def _get_cheetah(self) -> pvcheetah.Cheetah: + with self._state_lock: + if not self._cheetah.get(self._model_path): + self.logger.debug( + 'Creating Cheetah instance for model %s', self._model_path + ) + self._cheetah[self._model_path] = pvcheetah.create( + **self._get_cheetah_args() + ) + self.logger.debug( + 'Cheetah instance created for model %s', self._model_path + ) + + return self._cheetah[self._model_path] + + def process( + self, audio: Sequence[int] + ) -> Optional[Union[SpeechRecognizedEvent, ConversationTimeoutEvent]]: + event = None + cheetah = self._get_cheetah() + partial_transcript, self._ctx.is_final = cheetah.process(audio) + last_transcript = self._ctx.transcript + + # Concatenate the partial transcript to the context + if partial_transcript: + self._ctx.transcript += partial_transcript + self.logger.info( + 'Partial transcript: %s, is_final: %s', + self._ctx.transcript, + self._ctx.is_final, + ) + + # If the transcript is final or the conversation timed out, then + # process and return whatever is available in the context + if self._ctx.is_final or self._ctx.timed_out: + phrase = cheetah.flush() or '' + self._ctx.transcript += phrase + if self._ctx.transcript and self._ctx.transcript != last_transcript: + self.logger.debug('Processed STT transcript: %s', self._ctx.transcript) + last_transcript = self._ctx.transcript + + phrase = self._ctx.transcript + phrase = (phrase[:1].lower() + phrase[1:]).strip() + event = ( + SpeechRecognizedEvent(phrase=phrase) + if phrase + else ConversationTimeoutEvent() + ) + + self.reset() + + return event + + def reset(self): + if not self._enabled: + return + + with self._state_lock: + super().reset() + self._get_cheetah().flush() + + def stop(self): + if not self._enabled: + return + + super().stop() + + with self._state_lock: + objs = self._cheetah.copy() + for key, obj in objs.items(): + 
obj.delete() + self._cheetah.pop(key) diff --git a/platypush/plugins/assistant/picovoice/_state.py b/platypush/plugins/assistant/picovoice/_state.py new file mode 100644 index 00000000..e0eb7e71 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/_state.py @@ -0,0 +1,14 @@ +from enum import Enum + + +class AssistantState(Enum): + """ + Possible states of the assistant. + """ + + IDLE = 'idle' + DETECTING_HOTWORD = 'detecting_hotword' + DETECTING_SPEECH = 'detecting_speech' + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/assistant/picovoice/manifest.yaml b/platypush/plugins/assistant/picovoice/manifest.yaml new file mode 100644 index 00000000..73001200 --- /dev/null +++ b/platypush/plugins/assistant/picovoice/manifest.yaml @@ -0,0 +1,33 @@ +manifest: + package: platypush.plugins.assistant.picovoice + type: plugin + events: + - platypush.message.event.assistant.ConversationEndEvent + - platypush.message.event.assistant.ConversationStartEvent + - platypush.message.event.assistant.ConversationTimeoutEvent + - platypush.message.event.assistant.HotwordDetectedEvent + - platypush.message.event.assistant.IntentRecognizedEvent + - platypush.message.event.assistant.MicMutedEvent + - platypush.message.event.assistant.MicUnmutedEvent + - platypush.message.event.assistant.NoResponseEvent + - platypush.message.event.assistant.ResponseEndEvent + - platypush.message.event.assistant.ResponseEvent + - platypush.message.event.assistant.SpeechRecognizedEvent + install: + apk: + - ffmpeg + apt: + - ffmpeg + dnf: + - ffmpeg + pacman: + - ffmpeg + - python-sounddevice + pip: + - num2words # Temporary dependency + - pvcheetah + - pvleopard + - pvorca + - pvporcupine + - pvrhino + - sounddevice diff --git a/platypush/plugins/music/mopidy/__init__.py b/platypush/plugins/music/mopidy/__init__.py index 97744b63..9e9056ea 100644 --- a/platypush/plugins/music/mopidy/__init__.py +++ b/platypush/plugins/music/mopidy/__init__.py @@ -255,6 +255,8 @@ class MusicMopidyPlugin(RunnablePlugin): ret = self._add(resource, position=0) if not ret: self.logger.warning('Failed to add %s to the tracklist', resource) + elif isinstance(ret, list): + track_id = ret[0].get('tlid') elif isinstance(ret, dict): track_id = ret.get('tlid') elif position is not None: diff --git a/platypush/plugins/openai/__init__.py b/platypush/plugins/openai/__init__.py new file mode 100644 index 00000000..70b5538b --- /dev/null +++ b/platypush/plugins/openai/__init__.py @@ -0,0 +1,283 @@ +import os +from dataclasses import dataclass +from datetime import datetime as dt +from enum import Enum +from threading import RLock +from typing import Iterable, List, Optional + +import requests + +from platypush.plugins import Plugin, action + + +class ContextEntryRole(Enum): + """ + Roles for context entries. + """ + + ASSISTANT = "assistant" + SYSTEM = "system" + USER = "user" + + +@dataclass +class ContextEntry: + """ + A context entry. + """ + + timestamp: dt + role: ContextEntryRole + content: str + + @classmethod + def from_dict(cls, data: dict): + return cls( + timestamp=dt.fromisoformat(data.get("timestamp", dt.now().isoformat())), + role=ContextEntryRole(data["role"]), + content=data["content"], + ) + + def to_dict(self): + return { + "role": self.role.value, + "content": self.content, + } + + +class OpenaiPlugin(Plugin): + """ + Plugin to interact with OpenAI services. + + So far only ChatGPT is supported. + + Contexts + -------- + + The plugin also supports the implementation of custom assistant + *contexts*/environment. 
+ + Contexts can be used to: + + - Customize the model's behavior based on a set of inputs - going from + a generic "*You are a helpful assistant*" to a more specific "*You + are a Star Trek fan*", or "*You are a 16th century noble lady who + talks in Shakespearean English to her peers*". + - Pre-configure the model with a set of previous interactions in order + to either pre-load information that we expect the model to remember, + or to provide a set of previous interactions that the model can use + to generate responses that are consistent with the conversation + history. + + The plugin provides two types of contexts: + + - **Default context**: This is a set of context entries that are + provided at plugin initialization and that will be used to initialize + the model with a configuration or set of previous interactions that + will be remembered when generating all responses. + + - **Runtime context**: This is a set of context entries that can be + passed at runtime at :meth:`.get_response`. All the interactions + (both user prompts and assistant responses) that are processed + through :meth:`.get_response` will also be added to the runtime + context, and remembered for the next ``context_expiry`` seconds. This + allows you to generate responses that are consistent with the recent + conversation history. + + Each context entry is a dictionary with the following keys: + + - ``role``: The role of the message. Can be one of: + - ``system``: A system message provided to the model to set + up its initial state - e.g. "you are a helpful + assistant". + - ``user``: A user message, as provided by a previous (real + or synthetic) user interaction. + - ``assistant``: An assistant message, as provided by a + previous (real or synthetic) assistant response. + - ``content``: The content of the message. + + An example of context: + + .. code-block:: yaml + + context: + - role: system + content: > + You are a 16th century noble lady who talks in + Shakespearean English to her peers. + - role: user + content: What is a telephone? + - role: assistant + content: > + Pray tell, noble companion, a telephone is a device + of modern innovation that doth permit one to speak + with a distant acquaintance by means of magical pink + waves that do carry the sound of thine voice to the + ear of the listener. + + Given such context, if you call :meth:`.get_response` with a + prompt such as "*How does it work?*", the model may generate a + response such as "*Fair lady, to use a telephone, thou must first + lift the receiver and place it to thine ear. Then, thou must speak + into the mouthpiece as though conversing with a companion in + another room. The magical pink waves shall carry thy words to the + recipient, who shall hear them on their own device. 'Tis a wondrous + invention indeed!*". + + Note that the model will remember the previous interactions and + also generate responses, so you can ask it direct questions such as "How + does it work" while remembering what "it" is likely to mean. And it'll + provide responses which are in the same style initialized through the + ``system`` context. + """ + + def __init__( + self, + api_key: Optional[str], + model: str = "gpt-3.5-turbo", + timeout: float = 30, + context: Optional[Iterable[dict]] = None, + context_expiry: Optional[float] = 600, + max_tokens: int = 500, + **kwargs, + ): + """ + :param api_key: OpenAI API key. If not set, it will be read from the + ``OPENAI_API_KEY`` environment variable. + :param model: The model to use. Default: ``gpt-3.5-turbo``. 
+ :param timeout: Default timeout for API requests (default: 30 seconds). + :param max_tokens: Maximum number of tokens to generate in the response + (default: 500). + :param context: Default context to use for completions, as a list of + dictionaries with ``role`` and ``content`` keys. Default: None. + :param context_expiry: Default expiry time for the context in seconds. + After this time since the last interaction, the context will be + cleared. + + This means that any follow-up interactions happening within the + expiry window will remember the past prompts, but any interaction + that happens after the expiry window (calculated from the time of + the last interaction) will start fresh. + + Note that ``context_expiry`` is only applied to the runtime + context. The default context will never expire unless it's removed + from the plugin configuration. + + Set to 0 to disable context expiry - i.e. all messages stay in the + context until the plugin is restarted or the context is cleared + explicitly via :meth:`.clear_context`. Default: 600 seconds (10 + minutes). + """ + super().__init__(**kwargs) + api_key = api_key or os.getenv('OPENAI_API_KEY') + assert api_key, 'OpenAI API key not provided' + + self._api_key = api_key + self._context_lock = RLock() + self._runtime_context: List[ContextEntry] = [] + self._default_context = [ + ContextEntry.from_dict(entries) for entries in (context or []) + ] + + self.max_tokens = max_tokens + self.context_expiry = context_expiry + self.model = model + self.timeout = timeout + + def _rotate_context(self): + """ + Rotate the context by removing any entries older than the configured + ``context_expiry``. + """ + if not self.context_expiry: + return + + with self._context_lock: + now = dt.now() + self._runtime_context = [ + entry + for entry in self._runtime_context + if (now - entry.timestamp).total_seconds() < self.context_expiry + ] + + @action + def get_response( + self, + prompt: str, + model: Optional[str] = None, + context: Optional[Iterable[dict]] = None, + timeout: Optional[float] = None, + max_tokens: Optional[int] = None, + ) -> Optional[str]: + """ + Get completions for a given prompt using ChatGPT. + + :param prompt: The prompt/question to complete/answer. + :param model: Override the default model to use. + :param context: Extend the default context with these extra messages. + :param max_tokens: Override the default maximum number of tokens to + generate in the response. + :param timeout: Override the default timeout for the API request. + :return: The completion for the prompt - or, better, the message + associated with the highest scoring completion choice. 
+ """ + self._rotate_context() + context = [ + *(context or []), + { + "role": "user", + "content": prompt, + }, + ] + + resp = requests.post( + "https://api.openai.com/v1/chat/completions", + timeout=timeout or self.timeout, + headers={ + "Authorization": f"Bearer {self._api_key}", + "Content-Type": "application/json", + }, + json={ + "model": model or self.model, + "messages": [ + *( + entry.to_dict() + for entry in ( + *(self._default_context or []), + *self._runtime_context, + ) + ), + *context, + ], + "max_tokens": max_tokens or self.max_tokens, + }, + ) + + resp.raise_for_status() + self._update_context(*context) + choices = resp.json()["choices"] + self.logger.debug("OpenAI response: %s", resp.json()) + + if not choices: + return None + + msg = choices[0]["message"] + self._update_context(msg) + return msg["content"] + + def _update_context(self, *entries: dict): + """ + Update the context with a new entry. + """ + with self._context_lock: + for entry in entries: + self._runtime_context.append(ContextEntry.from_dict(entry)) + self._rotate_context() + + @action + def clear_context(self): + """ + Clear the runtime context. + """ + with self._context_lock: + self._runtime_context = [] diff --git a/platypush/plugins/openai/manifest.yaml b/platypush/plugins/openai/manifest.yaml new file mode 100644 index 00000000..6930e8b8 --- /dev/null +++ b/platypush/plugins/openai/manifest.yaml @@ -0,0 +1,3 @@ +manifest: + package: platypush.plugins.openai + type: plugin diff --git a/platypush/plugins/sound/_manager/_main.py b/platypush/plugins/sound/_manager/_main.py index 9a1d96cf..4dea95df 100644 --- a/platypush/plugins/sound/_manager/_main.py +++ b/platypush/plugins/sound/_manager/_main.py @@ -247,9 +247,11 @@ class AudioManager: wait_start = time() for audio_thread in streams_to_stop: audio_thread.join( - timeout=max(0, timeout - (time() - wait_start)) - if timeout is not None - else None + timeout=( + max(0, timeout - (time() - wait_start)) + if timeout is not None + else None + ) ) # Remove references diff --git a/platypush/plugins/stt/__init__.py b/platypush/plugins/stt/__init__.py deleted file mode 100644 index 1df2ae45..00000000 --- a/platypush/plugins/stt/__init__.py +++ /dev/null @@ -1,336 +0,0 @@ -import queue -import threading -from abc import ABC, abstractmethod -from typing import Optional, Union, List - -import sounddevice as sd - -from platypush.context import get_bus -from platypush.message.event.stt import ( - SpeechDetectionStartedEvent, - SpeechDetectionStoppedEvent, - SpeechStartedEvent, - SpeechDetectedEvent, - HotwordDetectedEvent, - ConversationDetectedEvent, -) -from platypush.message.response.stt import SpeechDetectedResponse -from platypush.plugins import Plugin, action - - -class SttPlugin(ABC, Plugin): - """ - Abstract class for speech-to-text plugins. - """ - - _thread_stop_timeout = 10.0 - rate = 16000 - channels = 1 - - def __init__( - self, - input_device: Optional[Union[int, str]] = None, - hotword: Optional[str] = None, - hotwords: Optional[List[str]] = None, - conversation_timeout: Optional[float] = 10.0, - block_duration: float = 1.0, - ): - """ - :param input_device: PortAudio device index or name that will be used for recording speech (default: default - system audio input device). - :param hotword: When this word is detected, the plugin will trigger a - :class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a - :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other - assistants. 
- :param hotwords: Use a list of hotwords instead of a single one. - :param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set, - the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent` - instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks - here to run any logic depending on the detected speech - it can emulate a kind of - "OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds). - :param block_duration: Duration of the acquired audio blocks (default: 1 second). - """ - - super().__init__() - self.input_device = input_device - self.conversation_timeout = conversation_timeout - self.block_duration = block_duration - - self.hotwords = set(hotwords or []) - if hotword: - self.hotwords = {hotword} - - self._conversation_event = threading.Event() - self._input_stream: Optional[sd.InputStream] = None - self._recording_thread: Optional[threading.Thread] = None - self._detection_thread: Optional[threading.Thread] = None - self._audio_queue: Optional[queue.Queue] = None - self._current_text = '' - - def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int: - """ - Get the index of the input device by index or name. - - :param device: Device index or name. If None is set then the function will return the index of the - default audio input device. - :return: Index of the audio input device. - """ - if not device: - device = self.input_device - if not device: - return sd.query_hostapis()[0].get('default_input_device') - - if isinstance(device, int): - assert device <= len(sd.query_devices()) - return device - - for i, dev in enumerate(sd.query_devices()): - if dev['name'] == device: - return i - - raise AssertionError('Device {} not found'.format(device)) - - def on_speech_detected(self, speech: str) -> None: - """ - Hook called when speech is detected. Triggers the right event depending on the current context. - - :param speech: Detected speech. - """ - speech = speech.strip() - - if speech in self.hotwords: - event = HotwordDetectedEvent(hotword=speech) - if self.conversation_timeout: - self._conversation_event.set() - threading.Timer( - self.conversation_timeout, lambda: self._conversation_event.clear() - ).start() - elif self._conversation_event.is_set(): - event = ConversationDetectedEvent(speech=speech) - else: - event = SpeechDetectedEvent(speech=speech) - - get_bus().post(event) - - @staticmethod - def convert_frames(frames: bytes) -> bytes: - """ - Conversion method for raw audio frames. It just returns the input frames as bytes. Override it if required - by your logic. - - :param frames: Input audio frames, as bytes. - :return: The audio frames as passed on the input. Override if required. - """ - return frames - - def on_detection_started(self) -> None: - """ - Method called when the ``detection_thread`` starts. Initialize your context variables and models here if - required. - """ - pass - - def on_detection_ended(self) -> None: - """ - Method called when the ``detection_thread`` stops. Clean up your context variables and models here. - """ - pass - - def before_recording(self) -> None: - """ - Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the - recording thread starts. - """ - pass - - def on_recording_started(self) -> None: - """ - Method called after the ``recording_thread`` opens the audio device. 
Put here any logic that you may want to - run after the recording starts. - """ - pass - - def on_recording_ended(self) -> None: - """ - Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio - device is closed. - """ - pass - - @abstractmethod - def detect_speech(self, frames) -> str: - """ - Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented - by the derived classes. - - :param frames: Audio frames, as returned by ``convert_frames``. - :return: Detected text, as a string. Returns an empty string if no text has been detected. - """ - raise NotImplementedError - - def process_text(self, text: str) -> None: - if (not text and self._current_text) or (text and text == self._current_text): - self.on_speech_detected(self._current_text) - self._current_text = '' - else: - if text: - if not self._current_text: - get_bus().post(SpeechStartedEvent()) - self.logger.info('Intermediate speech results: [{}]'.format(text)) - - self._current_text = text - - def detection_thread(self) -> None: - """ - This thread reads frames from ``_audio_queue``, performs the speech-to-text detection and calls - """ - self._current_text = '' - self.logger.debug('Detection thread started') - self.on_detection_started() - - while self._audio_queue: - try: - frames = self._audio_queue.get() - frames = self.convert_frames(frames) - except Exception as e: - self.logger.warning( - 'Error while feeding audio to the model: {}'.format(str(e)) - ) - continue - - text = self.detect_speech(frames).strip() - self.process_text(text) - - self.on_detection_ended() - self.logger.debug('Detection thread terminated') - - def recording_thread( - self, - block_duration: Optional[float] = None, - block_size: Optional[int] = None, - input_device: Optional[str] = None, - ) -> None: - """ - Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``. - - :param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``. - :param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``. - :param input_device: Input device - """ - assert (block_duration or block_size) and not ( - block_duration and block_size - ), 'Please specify either block_duration or block_size' - - if not block_size: - block_size = int(self.rate * self.channels * block_duration) - - self.before_recording() - self.logger.debug('Recording thread started') - device = self._get_input_device(input_device) - self._input_stream = sd.InputStream( - samplerate=self.rate, - device=device, - channels=self.channels, - dtype='int16', - latency=0, - blocksize=block_size, - ) - self._input_stream.start() - self.on_recording_started() - get_bus().post(SpeechDetectionStartedEvent()) - - while self._input_stream: - try: - frames = self._input_stream.read(block_size)[0] - except Exception as e: - self.logger.warning( - 'Error while reading from the audio input: {}'.format(str(e)) - ) - continue - - self._audio_queue.put(frames) - - get_bus().post(SpeechDetectionStoppedEvent()) - self.on_recording_ended() - self.logger.debug('Recording thread terminated') - - @abstractmethod - @action - def detect(self, audio_file: str) -> SpeechDetectedResponse: - """ - Perform speech-to-text analysis on an audio file. Must be implemented by the derived classes. - - :param audio_file: Path to the audio file. - """ - raise NotImplementedError - - def __enter__(self): - """ - Context manager enter. 
Starts detection and returns self. - """ - self.start_detection() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """ - Context manager exit. Stops detection. - """ - self.stop_detection() - - @action - def start_detection( - self, - input_device: Optional[str] = None, - seconds: Optional[float] = None, - block_duration: Optional[float] = None, - ) -> None: - """ - Start the speech detection engine. - - :param input_device: Audio input device name/index override - :param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll - start running until ``stop_detection`` is called or application stop. - :param block_duration: ``block_duration`` override. - """ - assert ( - not self._input_stream and not self._recording_thread - ), 'Speech detection is already running' - block_duration = block_duration or self.block_duration - input_device = input_device if input_device is not None else self.input_device - self._audio_queue = queue.Queue() - self._recording_thread = threading.Thread( - target=lambda: self.recording_thread( - block_duration=block_duration, input_device=input_device - ) - ) - - self._recording_thread.start() - self._detection_thread = threading.Thread( - target=lambda: self.detection_thread() - ) - self._detection_thread.start() - - if seconds: - threading.Timer(seconds, lambda: self.stop_detection()).start() - - @action - def stop_detection(self) -> None: - """ - Stop the speech detection engine. - """ - assert self._input_stream, 'Speech detection is not running' - self._input_stream.stop(ignore_errors=True) - self._input_stream.close(ignore_errors=True) - self._input_stream = None - - if self._recording_thread: - self._recording_thread.join(timeout=self._thread_stop_timeout) - self._recording_thread = None - - self._audio_queue = None - if self._detection_thread: - self._detection_thread.join(timeout=self._thread_stop_timeout) - self._detection_thread = None - - -# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/stt/picovoice/__init__.py b/platypush/plugins/stt/picovoice/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/platypush/plugins/stt/picovoice/hotword/__init__.py b/platypush/plugins/stt/picovoice/hotword/__init__.py deleted file mode 100644 index 5c776783..00000000 --- a/platypush/plugins/stt/picovoice/hotword/__init__.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import struct -from typing import Optional, List - -from platypush.message.response.stt import SpeechDetectedResponse -from platypush.plugins import action -from platypush.plugins.stt import SttPlugin - - -class SttPicovoiceHotwordPlugin(SttPlugin): - """ - This plugin performs hotword detection using `PicoVoice `_. 
- """ - - def __init__( - self, - library_path: Optional[str] = None, - model_file_path: Optional[str] = None, - keyword_file_paths: Optional[List[str]] = None, - sensitivity: float = 0.5, - sensitivities: Optional[List[float]] = None, - *args, - **kwargs - ): - from pvporcupine import Porcupine - from pvporcupine.resources.util.python.util import ( - LIBRARY_PATH, - MODEL_FILE_PATH, - KEYWORD_FILE_PATHS, - ) - - super().__init__(*args, **kwargs) - - self.hotwords = list(self.hotwords) - self._hotword_engine: Optional[Porcupine] = None - self._library_path = os.path.abspath( - os.path.expanduser(library_path or LIBRARY_PATH) - ) - self._model_file_path = os.path.abspath( - os.path.expanduser(model_file_path or MODEL_FILE_PATH) - ) - - if not keyword_file_paths: - hotwords = KEYWORD_FILE_PATHS - assert all( - hotword in hotwords for hotword in self.hotwords - ), 'Not all the hotwords could be found. Available hotwords: {}'.format( - list(hotwords.keys()) - ) - - self._keyword_file_paths = [ - os.path.abspath(os.path.expanduser(hotwords[hotword])) - for hotword in self.hotwords - ] - else: - self._keyword_file_paths = [ - os.path.abspath(os.path.expanduser(p)) for p in keyword_file_paths - ] - - self._sensitivities = [] - if sensitivities: - assert len(self._keyword_file_paths) == len( - sensitivities - ), 'Please specify as many sensitivities as the number of configured hotwords' - - self._sensitivities = sensitivities - else: - self._sensitivities = [sensitivity] * len(self._keyword_file_paths) - - def convert_frames(self, frames: bytes) -> tuple: - assert self._hotword_engine, 'The hotword engine is not running' - return struct.unpack_from("h" * self._hotword_engine.frame_length, frames) - - def on_detection_ended(self) -> None: - if self._hotword_engine: - self._hotword_engine.delete() - self._hotword_engine = None - - def detect_speech(self, frames: tuple) -> str: - index = self._hotword_engine.process(frames) - if index < 0: - return '' - - if index is True: - index = 0 - return self.hotwords[index] - - @action - def detect(self, audio_file: str) -> SpeechDetectedResponse: - """ - Perform speech-to-text analysis on an audio file. - - :param audio_file: Path to the audio file. 
- """ - pass - - def recording_thread( - self, input_device: Optional[str] = None, *args, **kwargs - ) -> None: - assert self._hotword_engine, 'The hotword engine has not yet been initialized' - super().recording_thread( - block_size=self._hotword_engine.frame_length, input_device=input_device - ) - - @action - def start_detection(self, *args, **kwargs) -> None: - from pvporcupine import Porcupine - - self._hotword_engine = Porcupine( - library_path=self._library_path, - model_file_path=self._model_file_path, - keyword_file_paths=self._keyword_file_paths, - sensitivities=self._sensitivities, - ) - - self.rate = self._hotword_engine.sample_rate - super().start_detection(*args, **kwargs) - - -# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/stt/picovoice/hotword/manifest.yaml b/platypush/plugins/stt/picovoice/hotword/manifest.yaml deleted file mode 100644 index f8e9d210..00000000 --- a/platypush/plugins/stt/picovoice/hotword/manifest.yaml +++ /dev/null @@ -1,7 +0,0 @@ -manifest: - events: {} - install: - pip: - - pvporcupine - package: platypush.plugins.stt.picovoice.hotword - type: plugin diff --git a/platypush/plugins/stt/picovoice/speech/__init__.py b/platypush/plugins/stt/picovoice/speech/__init__.py deleted file mode 100644 index 4043ec53..00000000 --- a/platypush/plugins/stt/picovoice/speech/__init__.py +++ /dev/null @@ -1,154 +0,0 @@ -import inspect -import os -import platform -import struct -import threading -from typing import Optional - -from platypush.message.event.stt import SpeechStartedEvent - -from platypush.context import get_bus -from platypush.message.response.stt import SpeechDetectedResponse -from platypush.plugins import action -from platypush.plugins.stt import SttPlugin - - -class SttPicovoiceSpeechPlugin(SttPlugin): - """ - This plugin performs speech detection using `PicoVoice `_. - NOTE: The PicoVoice product used for real-time speech-to-text (Cheetah) can be used freely for - personal applications on x86_64 Linux. Other architectures and operating systems require a commercial license. - You can ask for a license `here `_. - """ - - def __init__( - self, - library_path: Optional[str] = None, - acoustic_model_path: Optional[str] = None, - language_model_path: Optional[str] = None, - license_path: Optional[str] = None, - end_of_speech_timeout: int = 1, - *args, - **kwargs - ): - """ - :param library_path: Path to the Cheetah binary library for your OS - (default: ``CHEETAH_INSTALL_DIR/lib/OS/ARCH/libpv_cheetah.EXT``). - :param acoustic_model_path: Path to the acoustic speech model - (default: ``CHEETAH_INSTALL_DIR/lib/common/acoustic_model.pv``). - :param language_model_path: Path to the language model - (default: ``CHEETAH_INSTALL_DIR/lib/common/language_model.pv``). - :param license_path: Path to your PicoVoice license - (default: ``CHEETAH_INSTALL_DIR/resources/license/cheetah_eval_linux_public.lic``). - :param end_of_speech_timeout: Number of seconds of silence during speech recognition before considering - a phrase over (default: 1). 
- """ - from pvcheetah import Cheetah - - super().__init__(*args, **kwargs) - - self._basedir = os.path.abspath( - os.path.join(inspect.getfile(Cheetah), '..', '..', '..') - ) - if not library_path: - library_path = self._get_library_path() - if not language_model_path: - language_model_path = os.path.join( - self._basedir, 'lib', 'common', 'language_model.pv' - ) - if not acoustic_model_path: - acoustic_model_path = os.path.join( - self._basedir, 'lib', 'common', 'acoustic_model.pv' - ) - if not license_path: - license_path = os.path.join( - self._basedir, 'resources', 'license', 'cheetah_eval_linux_public.lic' - ) - - self._library_path = library_path - self._language_model_path = language_model_path - self._acoustic_model_path = acoustic_model_path - self._license_path = license_path - self._end_of_speech_timeout = end_of_speech_timeout - self._stt_engine: Optional[Cheetah] = None - self._speech_in_progress = threading.Event() - - def _get_library_path(self) -> str: - path = os.path.join( - self._basedir, 'lib', platform.system().lower(), platform.machine() - ) - return os.path.join( - path, [f for f in os.listdir(path) if f.startswith('libpv_cheetah.')][0] - ) - - def convert_frames(self, frames: bytes) -> tuple: - assert self._stt_engine, 'The speech engine is not running' - return struct.unpack_from("h" * self._stt_engine.frame_length, frames) - - def on_detection_ended(self) -> None: - if self._stt_engine: - self._stt_engine.delete() - self._stt_engine = None - - def detect_speech(self, frames: tuple) -> str: - text, is_endpoint = self._stt_engine.process(frames) - text = text.strip() - - if text: - if not self._speech_in_progress.is_set(): - self._speech_in_progress.set() - get_bus().post(SpeechStartedEvent()) - - self._current_text += ' ' + text.strip() - - if is_endpoint: - text = self._stt_engine.flush().strip().strip() - if text: - self._current_text += ' ' + text - - self._speech_in_progress.clear() - if self._current_text: - self.on_speech_detected(self._current_text) - - self._current_text = '' - - return self._current_text - - def process_text(self, text: str) -> None: - pass - - @action - def detect(self, audio_file: str) -> SpeechDetectedResponse: - """ - Perform speech-to-text analysis on an audio file. - - :param audio_file: Path to the audio file. 
- """ - pass - - def recording_thread( - self, input_device: Optional[str] = None, *args, **kwargs - ) -> None: - assert self._stt_engine, 'The hotword engine has not yet been initialized' - super().recording_thread( - block_size=self._stt_engine.frame_length, input_device=input_device - ) - - @action - def start_detection(self, *args, **kwargs) -> None: - from pvcheetah import Cheetah - - self._stt_engine = Cheetah( - library_path=self._library_path, - acoustic_model_path=self._acoustic_model_path, - language_model_path=self._language_model_path, - license_path=self._license_path, - endpoint_duration_sec=self._end_of_speech_timeout, - ) - - self.rate = self._stt_engine.sample_rate - self._speech_in_progress.clear() - super().start_detection(*args, **kwargs) - - -# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/stt/picovoice/speech/manifest.yaml b/platypush/plugins/stt/picovoice/speech/manifest.yaml deleted file mode 100644 index 0e7a01a8..00000000 --- a/platypush/plugins/stt/picovoice/speech/manifest.yaml +++ /dev/null @@ -1,7 +0,0 @@ -manifest: - events: {} - install: - pip: - - cheetah - package: platypush.plugins.stt.picovoice.speech - type: plugin diff --git a/platypush/plugins/tts/manifest.yaml b/platypush/plugins/tts/manifest.yaml index 12d184f7..e2ed8cc3 100644 --- a/platypush/plugins/tts/manifest.yaml +++ b/platypush/plugins/tts/manifest.yaml @@ -19,6 +19,7 @@ manifest: - python-numpy - python-sounddevice pip: + - num2words - numpy - sounddevice package: platypush.plugins.tts diff --git a/platypush/plugins/tts/picovoice/__init__.py b/platypush/plugins/tts/picovoice/__init__.py new file mode 100644 index 00000000..69c00c6c --- /dev/null +++ b/platypush/plugins/tts/picovoice/__init__.py @@ -0,0 +1,198 @@ +import logging +import os +import re +from threading import RLock +from typing import Optional + +import numpy as np +import pvorca +import sounddevice as sd + +from platypush.config import Config +from platypush.plugins import action +from platypush.plugins.tts import TtsPlugin + + +class TextConversionUtils: + """ + Utility class to convert text to a format that is supported by the Orca TTS + engine. + + This pre-processing step is necessary until the issue is fixed: + https://github.com/Picovoice/orca/issues/10. 
+ """ + + _logger = logging.getLogger(__name__) + _number_re = re.compile(r'(([0-9]+\.[0-9]+)|([0-9]+\,[0-9]+)|([0-9]+))') + _conversions_map = { + (re.compile(r'\s*[(){}\[\]<>]'), ', '), + (re.compile(r'[;]'), '.'), + (re.compile(r'[@#]'), ' at '), + (re.compile(r'[$]'), ' dollar '), + (re.compile(r'[%]'), ' percent '), + (re.compile(r'[&]'), ' and '), + (re.compile(r'[+]'), ' plus '), + (re.compile(r'[=]'), ' equals '), + (re.compile(r'[|]'), ' or '), + (re.compile(r'[~]'), ' tilde '), + (re.compile(r'[`]'), ''), + (re.compile(r'[*]'), ' star '), + (re.compile(r'[\\/]'), ' slash '), + (re.compile(r'[_]'), ' underscore '), + # Anything that isn't a letter or supported punctuation is replaced with a space + (re.compile(r'[^a-zA-Z,.:?!\-\'" ]'), ' '), + } + + @classmethod + def _convert_digits(cls, text: str) -> str: + try: + from num2words import num2words + except ImportError: + cls._logger.warning('num2words is not installed, skipping digit conversion') + return text + + while match := cls._number_re.search(text): + number = match.group(1) + text = text.replace(number, num2words(float(number.replace(',', '')))) + + return text + + @classmethod + def convert(cls, text: str) -> str: + text = cls._convert_digits(text) + + for pattern, replacement in TextConversionUtils._conversions_map: + text = pattern.sub(replacement, text) + + return text + + +class TtsPicovoicePlugin(TtsPlugin): + """ + This TTS plugin enables you to render text as audio using `Picovoice + `_'s (still experimental) `Orca TTS engine + `_. + + Take a look at + :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin` + for details on how to sign up for a Picovoice account and get the API key. + + Also note that using the TTS features requires you to select Orca from the + list of products available for your account on the `Picovoice console + `_. + """ + + def __init__( + self, + access_key: Optional[str] = None, + model_path: Optional[str] = None, + **kwargs, + ): + """ + :param access_key: Picovoice access key. If it's not specified here, + then it must be specified on the configuration of + :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`. + :param model_path: Path of the TTS model file (default: use the default + English model). 
+ """ + super().__init__(**kwargs) + if not access_key: + access_key = Config.get('assistant.picovoice', {}).get('access_key') + assert ( + access_key + ), 'No access key specified and no assistant.picovoice plugin found' + + self.model_path = model_path + self.access_key = access_key + if model_path: + model_path = os.path.expanduser(model_path) + + self._stream: Optional[sd.OutputStream] = None + self._stream_lock = RLock() + + def _play_audio(self, orca: pvorca.Orca, pcm: np.ndarray): + with self._stream_lock: + self.stop() + self._stream = sd.OutputStream( + samplerate=orca.sample_rate, + channels=1, + dtype='int16', + ) + + try: + self._stream.start() + self._stream.write(pcm) + except Exception as e: + self.logger.warning('Error playing audio: %s: %s', type(e), str(e)) + finally: + try: + self.stop() + self._stream.close() + except Exception as e: + self.logger.warning( + 'Error stopping audio stream: %s: %s', type(e), str(e) + ) + finally: + if self._stream: + self._stream = None + + def get_orca(self, model_path: Optional[str] = None): + if not model_path: + model_path = self.model_path + if model_path: + model_path = os.path.expanduser(model_path) + + return pvorca.create(access_key=self.access_key, model_path=model_path) + + @action + def say( + self, + text: str, + *_, + output_file: Optional[str] = None, + speech_rate: Optional[float] = None, + model_path: Optional[str] = None, + **__, + ): + """ + Say some text. + + :param text: Text to say. + :param output_file: If set, save the audio to the specified file. + Otherwise play it. + :param speech_rate: Speech rate (default: None). + :param model_path: Path of the TTS model file (default: use the default + configured model). + """ + # This is a temporary workaround until this issue is fixed: + # https://github.com/Picovoice/orca/issues/10. + text = TextConversionUtils.convert(text) + orca = self.get_orca(model_path=model_path) + + if output_file: + orca.synthesize_to_file( + text, os.path.expanduser(output_file), speech_rate=speech_rate + ) + return + + self._play_audio( + orca=orca, + pcm=np.array( + orca.synthesize(text, speech_rate=speech_rate), + dtype='int16', + ), + ) + + @action + def stop(self): + """ + Stop the currently playing audio. 
+ """ + with self._stream_lock: + if not self._stream: + return + + self._stream.stop() + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/tts/picovoice/manifest.yaml b/platypush/plugins/tts/picovoice/manifest.yaml new file mode 100644 index 00000000..5d35ee3b --- /dev/null +++ b/platypush/plugins/tts/picovoice/manifest.yaml @@ -0,0 +1,22 @@ +manifest: + events: {} + install: + apk: + - ffmpeg + - py3-numpy + apt: + - ffmpeg + - python3-numpy + dnf: + - ffmpeg + - python-numpy + pacman: + - ffmpeg + - python-numpy + - python-sounddevice + pip: + - numpy + - pvorca + - sounddevice + package: platypush.plugins.tts.picovoice + type: plugin diff --git a/platypush/utils/mock/modules.py b/platypush/utils/mock/modules.py index 6af74355..e83fd377 100644 --- a/platypush/utils/mock/modules.py +++ b/platypush/utils/mock/modules.py @@ -83,7 +83,10 @@ mock_imports = [ "pmw3901", "psutil", "pvcheetah", - "pvporcupine ", + "pvleopard", + "pvorca", + "pvporcupine", + "pvrhino", "pyHS100", "pyaudio", "pychromecast", diff --git a/platypush/utils/threads.py b/platypush/utils/threads.py index b48b32b4..228a3c77 100644 --- a/platypush/utils/threads.py +++ b/platypush/utils/threads.py @@ -24,8 +24,11 @@ def OrEvent(*events, cls: Type = threading.Event): or_event.clear() def _to_or(e, changed_callback: Callable[[], None]): - e._set = e.set - e._clear = e.clear + if not hasattr(e, "_set"): + e._set = e.set + if not hasattr(e, "_clear"): + e._clear = e.clear + e.changed = changed_callback e.set = lambda: _or_set(e) e.clear = lambda: _clear_or(e)