diff --git a/platypush/plugins/assistant/picovoice/__init__.py b/platypush/plugins/assistant/picovoice/__init__.py
index 71ff1a3ce1..3e36e35c8d 100644
--- a/platypush/plugins/assistant/picovoice/__init__.py
+++ b/platypush/plugins/assistant/picovoice/__init__.py
@@ -12,27 +12,18 @@ from ._state import AssistantState
 
 # pylint: disable=too-many-ancestors
 class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
-    """
+    r"""
     A voice assistant that runs on your device, based on the `Picovoice
     <https://picovoice.ai>`_ engine.
 
-    .. note:: You will need a PicoVoice account and a personal access key to
-        use this integration.
-
-        You can get your personal access key by signing up at the `Picovoice
-        console <https://console.picovoice.ai>`_. You may be asked to submit a
-        reason for using the service (feel free to mention a personal Platypush
-        integration), and you will receive your personal access key.
-
-        You may also be asked to select which products you want to use. The default
-        configuration of this plugin requires the following:
+    Picovoice is a suite of on-device voice technologies that include:
 
     * **Porcupine**: wake-word engine, if you want the device to listen for
      a specific wake word in order to start the assistant.
 
     * **Cheetah**: speech-to-text engine, if you want your voice
-      interactions to be transcribed into free text - either programmatically
-      or when triggered by the wake word. Or:
+      interactions to be transcribed into free text - either
+      programmatically or when triggered by the wake word. Or:
 
     * **Rhino**: intent recognition engine, if you want to extract *intents*
      out of your voice commands - for instance, the phrase "set the living
@@ -47,6 +38,316 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
     logic to respond to user's voice commands and render the responses as
     audio.
 
+    This plugin is a wrapper around the Picovoice engine that allows you to
+    run your custom voice-based conversational flows on your device.
+
+    Getting a Picovoice account and access key
+    ------------------------------------------
+
+    You can get your personal access key by signing up at the `Picovoice
+    console <https://console.picovoice.ai>`_. You may be asked to submit a
+    reason for using the service (feel free to mention a personal Platypush
+    integration), and you will receive your personal access key.
+
+    If prompted to select the products you want to use, make sure to select
+    the ones from the Picovoice suite that you want to use with this plugin.
+
+    Hotword detection
+    -----------------
+
+    The hotword detection engine is based on `Porcupine
+    <https://picovoice.ai/platform/porcupine/>`_.
+
+    If enabled through the ``hotword_enabled`` parameter (default: True), the
+    assistant will listen for a specific wake word before starting the
+    speech-to-text or intent recognition engines. You can specify custom
+    models for your hotword (e.g. on the same device you may use "Alexa" to
+    trigger the speech-to-text engine in English, "Computer" to trigger the
+    speech-to-text engine in Italian, and "Ok Google" to trigger the intent
+    recognition engine).
+
+    You can also create your custom hotword models using the `Porcupine
+    console <https://console.picovoice.ai/ppn>`_.
+
+    If ``hotword_enabled`` is set to True, you must also specify the
+    ``keywords`` parameter with the list of keywords that you want to listen
+    for, and optionally the ``keyword_paths`` parameter with the paths to any
+    custom hotword models that you want to use. If ``hotword_enabled`` is set
+    to False, then the assistant won't start listening for speech after the
+    plugin is started, and you will need to programmatically start the
+    conversation by calling the :meth:`.start_conversation` action, or
+    trigger it from the UI.
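+
+    For example, a minimal sketch of a programmatic trigger (e.g. from a
+    procedure or a cron hook), assuming the plugin is enabled under its
+    default ``assistant.picovoice`` name:
+
+    .. code-block:: python
+
+        from platypush import run
+
+        # Start listening for a command without waiting for the hotword...
+        run("assistant.picovoice.start_conversation")
+
+        # ...and stop the ongoing conversation programmatically if needed.
+        run("assistant.picovoice.stop_conversation")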
+
+    When a wake word is detected, the assistant will emit a
+    :class:`platypush.message.event.assistant.HotwordDetectedEvent` event
+    that you can use to build your custom logic. For example:
+
+    .. code-block:: python
+
+        import time
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import HotwordDetectedEvent
+
+        # Turn on a light for 5 seconds when the hotword "Alexa" is detected
+        @hook(HotwordDetectedEvent, hotword='Alexa')
+        def on_hotword_detected(event: HotwordDetectedEvent, **context):
+            run("light.hue.on", lights=["Living Room"])
+            time.sleep(5)
+            run("light.hue.off", lights=["Living Room"])
+
+    By default, the assistant will start listening for speech after the
+    hotword if either ``stt_enabled`` or ``intent_model_path`` is set. If you
+    don't want the assistant to start listening for speech after the hotword
+    is detected (for example because you want to build your custom response
+    flows, or trigger the speech detection using different models depending
+    on the hotword that is used, or because you just want to detect hotwords
+    but not speech), then you can set the ``start_conversation_on_hotword``
+    parameter to ``False``. If that is the case, then you can
+    programmatically start the conversation by calling the
+    :meth:`.start_conversation` method in your event hooks:
+
+    .. code-block:: python
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import HotwordDetectedEvent
+
+        # Start a conversation using the Italian language model when the
+        # "Buongiorno" hotword is detected
+        @hook(HotwordDetectedEvent, hotword='Buongiorno')
+        def on_it_hotword_detected(event: HotwordDetectedEvent, **context):
+            event.assistant.start_conversation(model_file='path/to/it.pv')
+
+    Speech-to-text
+    --------------
+
+    The speech-to-text engine is based on `Cheetah
+    <https://picovoice.ai/platform/cheetah/>`_.
+
+    If enabled through the ``stt_enabled`` parameter (default: True), the
+    assistant will transcribe the voice commands into text when a
+    conversation is started, either programmatically through
+    :meth:`.start_conversation` or when the hotword is detected.
+
+    It will emit a
+    :class:`platypush.message.event.assistant.SpeechRecognizedEvent` when
+    some speech is detected, and you can hook to that event to build your
+    custom logic:
+
+    .. code-block:: python
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import SpeechRecognizedEvent
+
+        # Turn on a light when the phrase "turn on the lights" is detected.
+        # Note that we can leverage regex-based pattern matching to be more
+        # flexible when matching the phrases. For example, the following hook
+        # will be matched when the user says "turn on the lights", "turn on
+        # lights", "lights on", "lights on please", "turn on light" etc.
+        @hook(SpeechRecognizedEvent, phrase='turn on (the)? lights?')
+        def on_turn_on_lights(event: SpeechRecognizedEvent, **context):
+            run("light.hue.on")
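+
+    The assistant handle attached to the event can also be used to render a
+    spoken response to a recognized phrase. A minimal sketch (the trigger
+    phrase is just an example):
+
+    .. code-block:: python
+
+        from datetime import datetime
+
+        from platypush import hook
+        from platypush.message.event.assistant import SpeechRecognizedEvent
+
+        # Reply to "what time is it" with a response rendered through the
+        # assistant's text-to-speech engine
+        @hook(SpeechRecognizedEvent, phrase='what time is it')
+        def on_time_request(event: SpeechRecognizedEvent, **context):
+            event.assistant.render_response(
+                "It is " + datetime.now().strftime("%H:%M")
+            )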
+
+    You can also leverage context extraction through the ``${}`` syntax on
+    the hook to extract specific tokens from the event that can be passed to
+    your event hook. For example:
+
+    .. code-block:: python
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import SpeechRecognizedEvent
+
+        @hook(SpeechRecognizedEvent, phrase='play ${title} by ${artist}')
+        def on_play_track_command(
+            event: SpeechRecognizedEvent, title: str, artist: str, **context
+        ):
+            results = run(
+                "music.mopidy.search",
+                filter={"title": title, "artist": artist}
+            )
+
+            if not results:
+                event.assistant.render_response(f"Couldn't find {title} by {artist}")
+                return
+
+            run("music.mopidy.play", resource=results[0]["uri"])
+
+    Speech-to-intent
+    ----------------
+
+    The intent recognition engine is based on `Rhino
+    <https://picovoice.ai/platform/rhino/>`_.
+
+    *Intents* are snippets of unstructured transcribed speech that can be
+    matched to structured actions.
+
+    Unlike with hotword and speech-to-text detection, you need to provide a
+    custom model for intent detection. You can create your custom model using
+    the `Rhino console <https://console.picovoice.ai/rhn>`_.
+
+    When an intent is detected, the assistant will emit a
+    :class:`platypush.message.event.assistant.IntentRecognizedEvent` that you
+    can listen to.
+
+    For example, you can train a model to control groups of smart lights by
+    defining the following slots on the Rhino console:
+
+    - ``device_state``: The new state of the device (e.g. with ``on`` or
+      ``off`` as supported values)
+
+    - ``room``: The name of the room associated to the group of lights to be
+      controlled (e.g. ``living room``, ``kitchen``, ``bedroom``)
+
+    You can then define a ``lights_ctrl`` intent with the following
+    expressions:
+
+    - "turn ``$device_state:state`` the lights"
+    - "turn ``$device_state:state`` the ``$room:room`` lights"
+    - "turn the lights ``$device_state:state``"
+    - "turn the ``$room:room`` lights ``$device_state:state``"
+    - "turn ``$room:room`` lights ``$device_state:state``"
+
+    This intent will match any of the following phrases:
+
+    - "*turn on the lights*"
+    - "*turn off the lights*"
+    - "*turn the lights on*"
+    - "*turn the lights off*"
+    - "*turn on the living room lights*"
+    - "*turn off the living room lights*"
+    - "*turn the living room lights on*"
+    - "*turn the living room lights off*"
+
+    And it will extract any slots that are matched in the phrases in the
+    :class:`platypush.message.event.assistant.IntentRecognizedEvent` event.
+
+    Train the model, download the context file, and pass its path via the
+    ``intent_model_path`` parameter.
+
+    You can then register a hook to listen to a specific intent:
+
+    .. code-block:: python
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import IntentRecognizedEvent
+
+        @hook(IntentRecognizedEvent, intent='lights_ctrl', slots={'state': 'on'})
+        def on_turn_on_lights(event: IntentRecognizedEvent, **context):
+            room = event.slots.get('room')
+            if room:
+                run("light.hue.on", groups=[room])
+            else:
+                run("light.hue.on")
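+
+    If you'd rather handle both states in a single hook, you can drop the
+    ``slots`` filter and dispatch on the matched slots yourself. A minimal
+    sketch, assuming the same ``lights_ctrl`` model as above:
+
+    .. code-block:: python
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import IntentRecognizedEvent
+
+        @hook(IntentRecognizedEvent, intent='lights_ctrl')
+        def on_lights_ctrl(event: IntentRecognizedEvent, **context):
+            # Slots that weren't matched in the utterance are simply missing
+            state = event.slots.get('state', 'on')
+            room = event.slots.get('room')
+            action = "light.hue.on" if state == "on" else "light.hue.off"
+
+            if room:
+                run(action, groups=[room])
+            else:
+                run(action)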
+
+    Note that if both ``stt_enabled`` and ``intent_model_path`` are set, then
+    both the speech-to-text and intent recognition engines will run in
+    parallel when a conversation is started.
+
+    The intent engine is usually faster, as it has a smaller set of intents
+    to match and doesn't have to run a full speech-to-text transcription.
+    This means that, if an utterance matches both a speech-to-text phrase and
+    an intent, the
+    :class:`platypush.message.event.assistant.IntentRecognizedEvent` event is
+    emitted (and not
+    :class:`platypush.message.event.assistant.SpeechRecognizedEvent`).
+
+    This may not always be the case though, so it may be a good practice to
+    also provide a fallback
+    :class:`platypush.message.event.assistant.SpeechRecognizedEvent` hook to
+    catch the text if the speech is not recognized as an intent:
+
+    .. code-block:: python
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import SpeechRecognizedEvent
+
+        @hook(SpeechRecognizedEvent, phrase='turn ${state} (the)? ${room} lights?')
+        def on_lights_fallback(event: SpeechRecognizedEvent, state: str, room: str, **context):
+            action = "light.hue.on" if state == "on" else "light.hue.off"
+            if room:
+                run(action, groups=[room])
+            else:
+                run(action)
+
+    Text-to-speech
+    --------------
+
+    The text-to-speech engine is based on `Orca
+    <https://picovoice.ai/platform/orca/>`_.
+
+    It is not directly implemented by this plugin, but the implementation is
+    provided in the
+    :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin` plugin.
+
+    You can, however, leverage the :meth:`.render_response` action to render
+    some text as speech in response to a user command, and that in turn will
+    leverage the Picovoice TTS plugin to render the response.
+
+    For example, the following snippet provides a hook that:
+
+    - Listens for
+      :class:`platypush.message.event.assistant.SpeechRecognizedEvent`.
+
+    - Matches the phrase against a list of predefined commands that
+      shouldn't require a response.
+
+    - Has fallback logic that leverages the
+      :class:`platypush.plugins.openai.OpenaiPlugin` to generate a response
+      for the given text and renders it as speech.
+
+    - Has logic for follow-on turns if the response from ChatGPT is a
+      question.
+
+    .. code-block:: python
+
+        import re
+        from logging import getLogger
+
+        from platypush import hook, run
+        from platypush.message.event.assistant import (
+            SpeechRecognizedEvent,
+            ResponseEndEvent,
+        )
+
+        logger = getLogger(__name__)
+
+        def play_music(*_, **__):
+            run("music.mopidy.play")
+
+        def stop_music(*_, **__):
+            run("music.mopidy.stop")
+
+        def ai_assist(event: SpeechRecognizedEvent, **__):
+            response = run("openai.get_response", prompt=event.phrase)
+            if not response:
+                return
+
+            run("assistant.picovoice.render_response", text=response)
+
+        # List of commands to match, as pairs of regex patterns and the
+        # corresponding actions
+        hooks = (
+            (re.compile(r"play (the )?music", re.IGNORECASE), play_music),
+            (re.compile(r"stop (the )?music", re.IGNORECASE), stop_music),
+            # Fallback to the AI assistant
+            (re.compile(r".*"), ai_assist),
+        )
+
+        @hook(SpeechRecognizedEvent)
+        def on_speech_recognized(event, **kwargs):
+            for pattern, command in hooks:
+                if pattern.search(event.phrase):
+                    logger.info("Running voice command %s", command.__name__)
+                    command(event, **kwargs)
+                    break
+
+        @hook(ResponseEndEvent)
+        def on_response_end(event: ResponseEndEvent, **__):
+            # Check if the response is a question and start a follow-on turn
+            # if so. Note that the ``openai`` plugin by default is configured
+            # to keep the past interactions in a context window of ~10
+            # minutes, so you can follow up like in a real conversation.
+            if (
+                event.assistant
+                and event.response_text
+                and event.response_text.endswith("?")
+            ):
+                event.assistant.start_conversation()
     """
 
     def __init__(