[#304] Rewrite Picovoice integrations #385

Merged
blacklight merged 36 commits from 304-new-picovoice-integration into master 2024-05-02 02:50:51 +02:00
49 changed files with 2838 additions and 805 deletions

View File

@ -10,6 +10,4 @@ Backends
platypush/backend/midi.rst
platypush/backend/nodered.rst
platypush/backend/redis.rst
platypush/backend/stt.picovoice.hotword.rst
platypush/backend/stt.picovoice.speech.rst
platypush/backend/tcp.rst

View File

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.backend.stt.picovoice.hotword
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.backend.stt.picovoice.speech
:members:

View File

@ -0,0 +1,5 @@
``assistant.picovoice``
=======================
.. automodule:: platypush.plugins.assistant.picovoice
:members:

View File

@ -0,0 +1,5 @@
``openai``
==========
.. automodule:: platypush.plugins.openai
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.plugins.stt.picovoice.hotword
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.plugins.stt.picovoice.speech
:members:

View File

@ -0,0 +1,5 @@
``tts.picovoice``
=================
.. automodule:: platypush.plugins.tts.picovoice
:members:

View File

@ -11,6 +11,7 @@ Plugins
platypush/plugins/application.rst
platypush/plugins/arduino.rst
platypush/plugins/assistant.google.rst
platypush/plugins/assistant.picovoice.rst
platypush/plugins/autoremote.rst
platypush/plugins/bluetooth.rst
platypush/plugins/calendar.rst
@ -94,6 +95,7 @@ Plugins
platypush/plugins/ngrok.rst
platypush/plugins/nmap.rst
platypush/plugins/ntfy.rst
platypush/plugins/openai.rst
platypush/plugins/otp.rst
platypush/plugins/pihole.rst
platypush/plugins/ping.rst
@ -119,8 +121,6 @@ Plugins
platypush/plugins/smartthings.rst
platypush/plugins/sound.rst
platypush/plugins/ssh.rst
platypush/plugins/stt.picovoice.hotword.rst
platypush/plugins/stt.picovoice.speech.rst
platypush/plugins/sun.rst
platypush/plugins/switch.tplink.rst
platypush/plugins/switch.wemo.rst
@ -135,6 +135,7 @@ Plugins
platypush/plugins/tts.rst
platypush/plugins/tts.google.rst
platypush/plugins/tts.mimic3.rst
platypush/plugins/tts.picovoice.rst
platypush/plugins/tv.samsung.ws.rst
platypush/plugins/twilio.rst
platypush/plugins/udp.rst

View File

@ -7,10 +7,13 @@ Platypush
from .app import Application
from .config import Config
from .context import get_backend, get_bus, get_plugin
from .context import Variable, get_backend, get_bus, get_plugin
from .cron import cron
from .event.hook import hook
from .message.event import Event
from .message.request import Request
from .message.response import Response
from .procedure import procedure
from .runner import main
from .utils import run
@ -19,14 +22,18 @@ __author__ = 'Fabio Manganiello <fabio@manganiello.tech>'
__version__ = '0.50.3'
__all__ = [
'Application',
'Variable',
'Config',
'Event',
'Request',
'Response',
'cron',
'get_backend',
'get_bus',
'get_plugin',
'hook',
'main',
'procedure',
'run',
]

View File

@ -1,40 +0,0 @@
import time
from platypush.backend import Backend
from platypush.context import get_plugin
from platypush.plugins.stt import SttPlugin
class SttBackend(Backend):
"""
Base class for speech-to-text backends.
"""
def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs):
"""
:param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of
:class:`platypush.plugins.stt.SttPlugin`.
:param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
(default: 5 seconds).
"""
super().__init__(*args, **kwargs)
self.plugin_name = plugin_name
self.retry_sleep = retry_sleep
def run(self):
super().run()
self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__))
while not self.should_stop():
try:
plugin: SttPlugin = get_plugin(self.plugin_name)
with plugin:
# noinspection PyProtectedMember
plugin._detection_thread.join()
except Exception as e:
self.logger.exception(e)
self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep))
time.sleep(self.retry_sleep)
# vim:sw=4:ts=4:et:

View File

@ -1,21 +0,0 @@
from platypush.backend.stt import SttBackend
class SttPicovoiceHotwordBackend(SttBackend):
"""
Backend for the PicoVoice hotword detection plugin. Set this plugin to ``enabled`` if you
want to run the hotword engine continuously instead of programmatically using
``start_detection`` and ``stop_detection``.
Requires:
- The :class:`platypush.plugins.stt.deepspeech.SttPicovoiceHotwordPlugin` plugin configured and its dependencies
installed.
"""
def __init__(self, *args, **kwargs):
super().__init__('stt.picovoice.hotword', *args, **kwargs)
# vim:sw=4:ts=4:et:

View File

@ -1,6 +0,0 @@
manifest:
events: {}
install:
pip: []
package: platypush.backend.stt.picovoice.hotword
type: backend

View File

@ -1,21 +0,0 @@
from platypush.backend.stt import SttBackend
class SttPicovoiceSpeechBackend(SttBackend):
"""
Backend for the PicoVoice speech detection plugin. Set this plugin to ``enabled`` if you
want to run the speech engine continuously instead of programmatically using
``start_detection`` and ``stop_detection``.
Requires:
- The :class:`platypush.plugins.stt.deepspeech.SttPicovoiceSpeechPlugin` plugin configured and its dependencies
installed.
"""
def __init__(self, *args, **kwargs):
super().__init__('stt.picovoice.speech', *args, **kwargs)
# vim:sw=4:ts=4:et:

View File

@ -1,6 +0,0 @@
manifest:
events: {}
install:
pip: []
package: platypush.backend.stt.picovoice.speech
type: backend

View File

@ -1,4 +1,4 @@
from multiprocessing import RLock, Queue
from multiprocessing import RLock, Queue, active_children
import os
from queue import Empty
import socket
@ -57,6 +57,25 @@ class CommandStream(ControllableProcess):
self.reset()
return super().close()
def _term_or_kill(self, kill: bool = False, visited=None) -> None:
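# Terminate (or kill) all active child processes first, tracking already-visited
# process ids so the same process isn't signalled twice, then apply the same
# signal to this process.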
func = 'kill' if kill else 'terminate'
visited = visited or set()
visited.add(id(self))
for child in active_children():
child_id = id(child)
if child_id not in visited:
visited.add(child_id)
getattr(child, func)()
getattr(super(), func)()
def terminate(self, visited=None) -> None:
self._term_or_kill(kill=False, visited=visited)
def kill(self) -> None:
self._term_or_kill(kill=True)
def __enter__(self) -> "CommandStream":
self.reset()
sock = self._sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)

View File

@ -1,4 +1,5 @@
import sys
from typing import Callable, List, Union
from ..hook import EventHook
@ -20,10 +21,23 @@ class EventProcessor:
if hooks is None:
hooks = Config.get_event_hooks()
self.hooks = []
self._hooks_by_name = {}
self._hooks_by_value_id = {}
for name, hook in hooks.items():
h = EventHook.build(name=name, hook=hook)
self.hooks.append(h)
self.add_hook(name, hook)
@property
def hooks(self) -> List[EventHook]:
return list(self._hooks_by_name.values())
def add_hook(self, name: str, desc: Union[dict, Callable]):
hook_id = id(desc)
if hook_id in self._hooks_by_value_id:
return # Don't add the same hook twice
hook = EventHook.build(name=name, hook=desc)
self._hooks_by_name[name] = hook
self._hooks_by_value_id[hook_id] = hook
@staticmethod
def notify_web_clients(event):

View File

@ -257,26 +257,29 @@ class Event(Message):
return result
def as_dict(self):
"""
Converts the event into a dictionary
"""
args = copy.deepcopy(self.args)
flatten(args)
return {
'type': 'event',
'target': self.target,
'origin': self.origin if hasattr(self, 'origin') else None,
'id': self.id if hasattr(self, 'id') else None,
'_timestamp': self.timestamp,
'args': {'type': self.type, **args},
}
def __str__(self):
"""
Overrides the str() operator and converts
the message into a UTF-8 JSON string
"""
args = copy.deepcopy(self.args)
flatten(args)
return json.dumps(
{
'type': 'event',
'target': self.target,
'origin': self.origin if hasattr(self, 'origin') else None,
'id': self.id if hasattr(self, 'id') else None,
'_timestamp': self.timestamp,
'args': {'type': self.type, **args},
},
cls=self.Encoder,
)
return json.dumps(self.as_dict(), cls=self.Encoder)
@dataclass

View File

@ -1,27 +1,53 @@
import re
import sys
from typing import Optional
from typing import Any, Mapping, Optional, Union
from platypush.context import get_plugin
from platypush.message.event import Event
from platypush.message.event import Event, EventMatchResult
from platypush.plugins.assistant import AssistantPlugin
from platypush.utils import get_plugin_name_by_class
class AssistantEvent(Event):
"""Base class for assistant events"""
def __init__(self, *args, assistant: Optional[str] = None, **kwargs):
def __init__(
self, *args, assistant: Optional[Union[str, AssistantPlugin]] = None, **kwargs
):
"""
:param assistant: Name of the assistant plugin that triggered the event.
"""
super().__init__(*args, assistant=assistant, **kwargs)
assistant = assistant or kwargs.get('assistant')
if assistant:
assistant = (
assistant
if isinstance(assistant, str)
else get_plugin_name_by_class(assistant.__class__)
)
kwargs['_assistant'] = assistant
super().__init__(*args, **kwargs)
@property
def _assistant(self):
return (
get_plugin(self.args.get('assistant'))
if self.args.get('assistant')
else None
)
def assistant(self) -> Optional[AssistantPlugin]:
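# Lazily resolve the assistant plugin instance from the plugin name stored in the event arguments.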
assistant = self.args.get('_assistant')
if not assistant:
return None
return get_plugin(assistant)
def as_dict(self):
evt_dict = super().as_dict()
evt_args = {**evt_dict['args']}
assistant = evt_args.pop('_assistant', None)
if assistant:
evt_args['assistant'] = assistant
return {
**evt_dict,
'args': evt_args,
}
class ConversationStartEvent(AssistantEvent):
@ -43,7 +69,6 @@ class ConversationEndEvent(AssistantEvent):
:param with_follow_on_turn: Set to true if the conversation expects a
user follow-up, false otherwise
"""
super().__init__(*args, with_follow_on_turn=with_follow_on_turn, **kwargs)
@ -56,17 +81,46 @@ class ConversationTimeoutEvent(ConversationEndEvent):
super().__init__(*args, **kwargs)
class ResponseEvent(ConversationEndEvent):
class ResponseEvent(AssistantEvent):
"""
Event triggered when a response is processed by the assistant
"""
def __init__(self, *args, response_text: str, **kwargs):
def __init__(
self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs
):
"""
:param response_text: Response text processed by the assistant
:param with_follow_on_turn: Set to true if the conversation expects a
user follow-up, false otherwise
"""
super().__init__(
*args,
response_text=response_text,
with_follow_on_turn=with_follow_on_turn,
**kwargs,
)
super().__init__(*args, response_text=response_text, **kwargs)
class ResponseEndEvent(ConversationEndEvent):
"""
Event triggered when a response has been rendered on the assistant.
"""
def __init__(
self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs
):
"""
:param response_text: Response text rendered on the assistant.
:param with_follow_on_turn: Set to true if the conversation expects a
user follow-up, false otherwise.
"""
super().__init__(
*args,
response_text=response_text,
with_follow_on_turn=with_follow_on_turn,
**kwargs,
)
class NoResponseEvent(ConversationEndEvent):
@ -95,8 +149,8 @@ class SpeechRecognizedEvent(AssistantEvent):
"""
result = super().matches_condition(condition)
if result.is_match and self._assistant and 'phrase' in condition.args:
self._assistant.stop_conversation()
if result.is_match and self.assistant and 'phrase' in condition.args:
self.assistant.stop_conversation()
return result
@ -125,6 +179,7 @@ class SpeechRecognizedEvent(AssistantEvent):
result.score = sys.maxsize
return result
parsed_args = {}
event_tokens = re.split(r'\s+', event_args.get(argname, '').strip().lower())
condition_tokens = re.split(r'\s+', condition_value.strip().lower())
@ -147,11 +202,11 @@ class SpeechRecognizedEvent(AssistantEvent):
m = re.match(r'[^\\]*\${(.+?)}', condition_token)
if m:
argname = m.group(1)
if argname not in result.parsed_args:
result.parsed_args[argname] = event_token
if argname not in parsed_args:
parsed_args[argname] = event_token
result.score += 1.0
else:
result.parsed_args[argname] += ' ' + event_token
parsed_args[argname] += ' ' + event_token
if (len(condition_tokens) == 1 and len(event_tokens) == 1) or (
len(event_tokens) > 1
@ -169,6 +224,45 @@ class SpeechRecognizedEvent(AssistantEvent):
# It's a match if all the tokens in the condition string have been satisfied
result.is_match = len(condition_tokens) == 0
if result.is_match:
result.parsed_args = parsed_args
return result
class IntentRecognizedEvent(AssistantEvent):
"""
Event triggered when an intent is matched by a speech command.
"""
def __init__(
self, *args, intent: str, slots: Optional[Mapping[str, Any]] = None, **kwargs
):
"""
:param intent: The intent that has been matched.
:param slots: The slots extracted from the intent, as a key-value mapping.
"""
super().__init__(*args, intent=intent, slots=slots or {}, **kwargs)
def _matches_argument(
self, argname, condition_value, event_args, result: EventMatchResult
):
if argname != 'slots':
return super()._matches_argument(
argname, condition_value, event_args, result
)
event_slots = set(event_args.get(argname, {}).items())
condition_slots = set((condition_value or {}).items())
# All the slots in the condition must be in the event
if not condition_slots.difference(event_slots):
result.is_match = True
result.score += 1
else:
result.is_match = False
result.score = 0
return result

View File

@ -122,6 +122,12 @@ class Plugin(EventGenerator, ExtensionWithManifest): # lgtm [py/missing-call-to
assert entities, 'entities plugin not initialized'
return entities
def __str__(self):
"""
:return: The qualified name of the plugin.
"""
return get_plugin_name_by_class(self.__class__)
def run(self, method, *args, **kwargs):
assert (
method in self.registered_actions

View File

@ -8,23 +8,7 @@ from typing import Any, Collection, Dict, Optional, Type
from platypush.context import get_bus, get_plugin
from platypush.entities.assistants import Assistant
from platypush.entities.managers.assistants import AssistantEntityManager
from platypush.message.event.assistant import (
AssistantEvent,
ConversationStartEvent,
ConversationEndEvent,
ConversationTimeoutEvent,
ResponseEvent,
NoResponseEvent,
SpeechRecognizedEvent,
AlarmStartedEvent,
AlarmEndEvent,
TimerStartedEvent,
TimerEndEvent,
AlertStartedEvent,
AlertEndEvent,
MicMutedEvent,
MicUnmutedEvent,
)
from platypush.message.event import Event as AppEvent
from platypush.plugins import Plugin, action
from platypush.utils import get_plugin_name_by_class
@ -181,6 +165,17 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
self.publish_entities([self])
return asdict(self._state)
@action
def render_response(self, text: str, *_, **__):
"""
Render a response text as audio over the configured TTS plugin.
:param text: Text to render.
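Example usage from an event hook (a sketch; the ``assistant.picovoice``
plugin name and the hook phrase are just illustrative assumptions):

.. code-block:: python

    from platypush import hook, run
    from platypush.message.event.assistant import SpeechRecognizedEvent

    @hook(SpeechRecognizedEvent, phrase='good morning')
    def on_good_morning(event: SpeechRecognizedEvent, **_):
        # Render a spoken reply on the assistant integration
        run("assistant.picovoice.render_response", text="Good morning to you too!")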
"""
self._on_response_render_start(text)
self._render_response(text)
self._on_response_render_end()
def _get_tts_plugin(self):
if not self.tts_plugin:
return None
@ -200,11 +195,13 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
audio.play(self._conversation_start_sound)
def _send_event(self, event_type: Type[AssistantEvent], **kwargs):
def _send_event(self, event_type: Type[AppEvent], **kwargs):
self.publish_entities([self])
get_bus().post(event_type(assistant=self._plugin_name, **kwargs))
def _on_conversation_start(self):
from platypush.message.event.assistant import ConversationStartEvent
self._last_response = None
self._last_query = None
self._conversation_running.set()
@ -212,63 +209,105 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
self._play_conversation_start_sound()
def _on_conversation_end(self):
from platypush.message.event.assistant import ConversationEndEvent
self._conversation_running.clear()
self._send_event(ConversationEndEvent)
def _on_conversation_timeout(self):
from platypush.message.event.assistant import ConversationTimeoutEvent
self._last_response = None
self._last_query = None
self._conversation_running.clear()
self._send_event(ConversationTimeoutEvent)
def _on_no_response(self):
from platypush.message.event.assistant import NoResponseEvent
self._last_response = None
self._conversation_running.clear()
self._send_event(NoResponseEvent)
def _on_reponse_rendered(self, text: Optional[str]):
def _on_response_render_start(self, text: Optional[str]):
from platypush.message.event.assistant import ResponseEvent
self._last_response = text
self._send_event(ResponseEvent, response_text=text)
tts = self._get_tts_plugin()
def _render_response(self, text: Optional[str]):
tts = self._get_tts_plugin()
if tts and text:
self.stop_conversation()
tts.say(text=text, **self.tts_plugin_args)
def _on_response_render_end(self):
from platypush.message.event.assistant import ResponseEndEvent
self._send_event(ResponseEndEvent, response_text=self._last_response)
def _on_hotword_detected(self, hotword: Optional[str]):
from platypush.message.event.assistant import HotwordDetectedEvent
self._send_event(HotwordDetectedEvent, hotword=hotword)
def _on_speech_recognized(self, phrase: Optional[str]):
from platypush.message.event.assistant import SpeechRecognizedEvent
phrase = (phrase or '').lower().strip()
self._last_query = phrase
self._send_event(SpeechRecognizedEvent, phrase=phrase)
def _on_intent_matched(self, intent: str, slots: Optional[Dict[str, Any]] = None):
from platypush.message.event.assistant import IntentRecognizedEvent
self._send_event(IntentRecognizedEvent, intent=intent, slots=slots)
def _on_alarm_start(self):
from platypush.message.event.assistant import AlarmStartedEvent
self._cur_alert_type = AlertType.ALARM
self._send_event(AlarmStartedEvent)
def _on_alarm_end(self):
from platypush.message.event.assistant import AlarmEndEvent
self._cur_alert_type = None
self._send_event(AlarmEndEvent)
def _on_timer_start(self):
from platypush.message.event.assistant import TimerStartedEvent
self._cur_alert_type = AlertType.TIMER
self._send_event(TimerStartedEvent)
def _on_timer_end(self):
from platypush.message.event.assistant import TimerEndEvent
self._cur_alert_type = None
self._send_event(TimerEndEvent)
def _on_alert_start(self):
from platypush.message.event.assistant import AlertStartedEvent
self._cur_alert_type = AlertType.ALERT
self._send_event(AlertStartedEvent)
def _on_alert_end(self):
from platypush.message.event.assistant import AlertEndEvent
self._cur_alert_type = None
self._send_event(AlertEndEvent)
def _on_mute(self):
from platypush.message.event.assistant import MicMutedEvent
self._is_muted = True
self._send_event(MicMutedEvent)
def _on_unmute(self):
from platypush.message.event.assistant import MicUnmutedEvent
self._is_muted = False
self._send_event(MicUnmutedEvent)

View File

@ -0,0 +1,740 @@
import os
from typing import Optional, Sequence
from platypush.context import get_plugin
from platypush.plugins import RunnablePlugin, action
from platypush.plugins.assistant import AssistantPlugin
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
from ._assistant import Assistant
from ._state import AssistantState
# pylint: disable=too-many-ancestors
class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
r"""
A voice assistant that runs on your device, based on the `Picovoice
<https://picovoice.ai/>`_ engine.
Picovoice is a suite of on-device voice technologies that include:
* **Porcupine**: wake-word engine, if you want the device to listen for
a specific wake word in order to start the assistant.
* **Cheetah**: speech-to-text engine, if you want your voice
interactions to be transcribed into free text - either
programmatically or when triggered by the wake word. Or:
* **Rhino**: intent recognition engine, if you want to extract *intents*
out of your voice commands - for instance, the phrase "set the living
room temperature to 20 degrees" could be mapped to the intent with the
following parameters: ``intent``: ``set_temperature``, ``room``:
``living_room``, ``temperature``: ``20``.
* **Leopard**: speech-to-text engine aimed at offline transcription of
audio files rather than real-time transcription.
* **Orca**: text-to-speech engine, if you want to create your custom
logic to respond to user's voice commands and render the responses as
audio.
This plugin is a wrapper around the Picovoice engine that allows you to
run your custom voice-based conversational flows on your device.
Getting a Picovoice account and access key
-------------------------------------------
You can get your personal access key by signing up at the `Picovoice
console <https://console.picovoice.ai/>`_. You may be asked to submit a
reason for using the service (feel free to mention a personal Platypush
integration), and you will receive your personal access key.
If prompted to select the products you want to use, make sure to select
the ones from the Picovoice suite that you want to use with this plugin.
Hotword detection
-----------------
The hotword detection engine is based on `Porcupine
<https://picovoice.ai/platform/porcupine/>`_.
If enabled through the ``hotword_enabled`` parameter (default: True), the
assistant will listen for a specific wake word before starting the
speech-to-text or intent recognition engines. You can specify custom models
for your hotword (e.g. on the same device you may use "Alexa" to trigger the
speech-to-text engine in English, "Computer" to trigger the speech-to-text
engine in Italian, and "Ok Google" to trigger the intent recognition engine).
You can also create your custom hotword models using the `Porcupine console
<https://console.picovoice.ai/ppn>`_.
If ``hotword_enabled`` is set to True, you must also specify the
``keywords`` parameter with the list of keywords that you want to listen
for, and optionally the ``keyword_paths`` parameter with the paths to
any custom hotword models that you want to use. If ``hotword_enabled`` is
set to False, then the assistant won't start listening for speech after the
plugin is started, and you will need to programmatically start the
conversation by calling the :meth:`.start_conversation` action, or trigger
it from the UI.
When a wake-word is detected, the assistant will emit a
:class:`platypush.message.event.assistant.HotwordDetectedEvent` event that
you can use to build your custom logic. For example:
.. code-block:: python
import time
from platypush import hook, run
from platypush.message.event.assistant import HotwordDetectedEvent
# Turn on a light for 5 seconds when the hotword "Alexa" is detected
@hook(HotwordDetectedEvent, hotword='Alexa')
def on_hotword_detected(event: HotwordDetectedEvent, **context):
run("light.hue.on", lights=["Living Room"])
time.sleep(5)
run("light.hue.off", lights=["Living Room"])
By default, the assistant will start listening for speech after the hotword
if either ``stt_enabled`` or ``intent_model_path`` is set. If you don't
want the assistant to start listening for speech after the hotword is
detected (for example because you want to build your custom response flows,
or trigger the speech detection using different models depending on the
hotword that is used, or because you just want to detect hotwords but not
speech), then you can also set the ``start_conversation_on_hotword``
parameter to ``False``. If that is the case, then you can programmatically
start the conversation by calling the :meth:`.start_conversation` method in
your event hooks:
.. code-block:: python
from platypush import hook, run
from platypush.message.event.assistant import HotwordDetectedEvent
# Start a conversation using the Italian language model when the
# "Buongiorno" hotword is detected
@hook(HotwordDetectedEvent, hotword='Buongiorno')
def on_it_hotword_detected(event: HotwordDetectedEvent, **context):
event.assistant.start_conversation(model_file='path/to/it.pv')
Speech-to-text
--------------
The speech-to-text engine is based on `Cheetah
<https://picovoice.ai/docs/cheetah/>`_.
If enabled through the ``stt_enabled`` parameter (default: True), the
assistant will transcribe the voice commands into text when a conversation
is started either programmatically through :meth:`.start_conversation` or
when the hotword is detected.
It will emit a
:class:`platypush.message.event.assistant.SpeechRecognizedEvent` when some
speech is detected, and you can hook to that event to build your custom
logic:
.. code-block:: python
from platypush import hook, run
from platypush.message.event.assistant import SpeechRecognizedEvent
# Turn on a light when the phrase "turn on the lights" is detected.
# Note that we can leverage regex-based pattern matching to be more
# flexible when matching the phrases. For example, the following hook
# will be matched when the user says "turn on the lights", "turn on
# lights", "lights on", "lights on please", "turn on light" etc.
@hook(SpeechRecognizedEvent, phrase='turn on (the)? lights?')
def on_turn_on_lights(event: SpeechRecognizedEvent, **context):
run("light.hue.on")
You can also leverage context extraction through the ``${}`` syntax on the
hook to extract specific tokens from the event that can be passed to your
event hook. For example:
.. code-block:: python
from platypush import hook, run
from platypush.message.event.assistant import SpeechRecognizedEvent
@hook(SpeechRecognizedEvent, phrase='play ${title} by ${artist}')
def on_play_track_command(
event: SpeechRecognizedEvent, title: str, artist: str, **context
):
results = run(
"music.mopidy.search",
filter={"title": title, "artist": artist}
)
if not results:
event.assistant.render_response(f"Couldn't find {title} by {artist}")
return
run("music.mopidy.play", resource=results[0]["uri"])
Speech-to-intent
----------------
The intent recognition engine is based on `Rhino
<https://picovoice.ai/docs/rhino/>`_.
*Intents* are snippets of unstructured transcribed speech that can be
matched to structured actions.
Unlike with hotword and speech-to-text detection, you need to provide a
custom model for intent detection. You can create your custom model using
the `Rhino console <https://console.picovoice.ai/rhn>`_.
When an intent is detected, the assistant will emit a
:class:`platypush.message.event.assistant.IntentRecognizedEvent` that can
be listened to.
For example, you can train a model to control groups of smart lights by
defining the following slots on the Rhino console:
- ``device_state``: The new state of the device (e.g. with ``on`` or
``off`` as supported values)
- ``room``: The name of the room associated with the group of lights to
be controlled (e.g. ``living room``, ``kitchen``, ``bedroom``)
You can then define a ``lights_ctrl`` intent with the following expressions:
- "turn ``$device_state:state`` the lights"
- "turn ``$device_state:state`` the ``$room:room`` lights"
- "turn the lights ``$device_state:state``"
- "turn the ``$room:room`` lights ``$device_state:state``"
- "turn ``$room:room`` lights ``$device_state:state``"
This intent will match any of the following phrases:
- "*turn on the lights*"
- "*turn off the lights*"
- "*turn the lights on*"
- "*turn the lights off*"
- "*turn on the living room lights*"
- "*turn off the living room lights*"
- "*turn the living room lights on*"
- "*turn the living room lights off*"
And it will extract any slots that are matched in the phrases in the
:class:`platypush.message.event.assistant.IntentRecognizedEvent` event.
Train the model, download the context file, and pass the path on the
``intent_model_path`` parameter.
You can then register a hook to listen to a specific intent:
.. code-block:: python
from platypush import hook, run
from platypush.message.event.assistant import IntentRecognizedEvent
@hook(IntentRecognizedEvent, intent='lights_ctrl', slots={'state': 'on'})
def on_turn_on_lights(event: IntentRecognizedEvent, **context):
room = event.slots.get('room')
if room:
run("light.hue.on", groups=[room])
else:
run("light.hue.on")
Note that if both ``stt_enabled`` and ``intent_model_path`` are set, then
both the speech-to-text and intent recognition engines will run in parallel
when a conversation is started.
The intent engine is usually faster, as it has a smaller set of intents to
match and doesn't have to run a full speech-to-text transcription. This means that,
if an utterance matches both a speech-to-text phrase and an intent, the
:class:`platypush.message.event.assistant.IntentRecognizedEvent` event is emitted
(and not :class:`platypush.message.event.assistant.SpeechRecognizedEvent`).
This may not always be the case though, so it is good practice to
also provide a fallback
:class:`platypush.message.event.assistant.SpeechRecognizedEvent` hook to
catch the text if the speech is not recognized as an intent:
.. code-block:: python
from platypush import hook, run
from platypush.message.event.assistant import SpeechRecognizedEvent
@hook(SpeechRecognizedEvent, phrase='turn ${state} (the)? ${room} lights?')
def on_turn_on_lights(event: SpeechRecognizedEvent, phrase, room, **context):
if room:
run("light.hue.on", groups=[room])
else:
run("light.hue.on")
Text-to-speech
--------------
The text-to-speech engine is based on `Orca
<https://picovoice.ai/docs/orca/>`_.
It is not directly implemented by this plugin, but the implementation is
provided in the :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin`
plugin.
You can however leverage the :meth:`.render_response` action to render some
text as speech in response to a user command, and that in turn will leverage
the Picovoice TTS plugin to render the response.
For example, the following snippet provides a hook that:
- Listens for
:class:`platypush.message.event.assistant.SpeechRecognizedEvent`.
- Matches the phrase against a list of predefined commands that
shouldn't require a response.
- Has a fallback logic that leverages the
:class:`platypush.plugins.openai.OpenaiPlugin` to generate a response
for the given text and renders it as speech.
- Has logic for follow-on turns if the response from ChatGPT is a question.
.. code-block:: python
import re
from collections import defaultdict
from datetime import datetime as dt, timedelta
from dateutil.parser import isoparse
from logging import getLogger
from platypush import hook, run
from platypush.message.event.assistant import (
SpeechRecognizedEvent,
ResponseEndEvent,
)
logger = getLogger(__name__)
def play_music(*_, **__):
run("music.mopidy.play")
def stop_music(*_, **__):
run("music.mopidy.stop")
def ai_assist(event: SpeechRecognizedEvent, **__):
response = run("openai.get_response", prompt=event.phrase)
if not response:
return
run("assistant.picovoice.render_response", text=response)
# List of commands to match, as pairs of regex patterns and the
# corresponding actions
hooks = (
(re.compile(r"play (the)?music", re.IGNORECASE), play_music),
(re.compile(r"stop (the)?music", re.IGNORECASE), stop_music),
# Fallback to the AI assistant
(re.compile(r".*"), ai_assist),
)
@hook(SpeechRecognizedEvent)
def on_speech_recognized(event, **kwargs):
for pattern, command in hooks:
if pattern.search(event.phrase):
logger.info("Running voice command %s", command.__name__)
command(event, **kwargs)
break
@hook(ResponseEndEvent)
def on_response_end(event: ResponseEndEvent, **__):
# Check if the response is a question and start a follow-on turn if so.
# Note that the ``openai`` plugin by default is configured to keep
# the past interaction in a context window of ~10 minutes, so you
# can follow up like in a real conversation.
if event.assistant and event.response_text and event.response_text.endswith("?"):
event.assistant.start_conversation()
"""
def __init__(
self,
access_key: str,
hotword_enabled: bool = True,
stt_enabled: bool = True,
keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None,
speech_model_path: Optional[str] = None,
intent_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = 0.5,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = True,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = 7.5,
muted: bool = False,
**kwargs,
):
"""
:param access_key: Your Picovoice access key. You can get it by signing
up at the `Picovoice console <https://console.picovoice.ai/>`_.
:param hotword_enabled: Enable the wake-word engine (default: True).
**Note**: The wake-word engine requires you to add Porcupine to the
products available in your Picovoice account.
:param stt_enabled: Enable the speech-to-text engine (default: True).
**Note**: The speech-to-text engine requires you to add Cheetah to
the products available in your Picovoice account.
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
google``...). This is required if the wake-word engine is enabled.
See the `Porcupine keywords repository
<https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_
for a list of the stock keywords available. If you have a custom
model, you can pass its path to the ``keyword_paths`` parameter and
its filename (without the path and the platform extension) here.
:param keyword_paths: List of paths to the keyword files to listen for.
Custom keyword files can be created using the `Porcupine console
<https://console.picovoice.ai/ppn>`_ and downloaded from the
console itself.
:param keyword_model_path: If you are using a keyword file in a
non-English language, you can provide the path to the model file
for its language. Model files are available for all the supported
languages through the `Porcupine lib repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param speech_model_path: Path to the speech model file. If you are
using a language other than English, you can provide the path to the
model file for that language. Model files are available for all the
supported languages through the `Cheetah repository
<https://github.com/Picovoice/cheetah/tree/master/lib/common>`_.
You can also use the `Speech console
<https://console.picovoice.ai/cat>`_
to train your custom models. You can use a base model and fine-tune
it by boosting the detection of your own words and phrases and edit
the phonetic representation of the words you want to detect.
:param intent_model_path: Path to the Rhino context model. This is
required if you want to use the intent recognition engine through
Rhino. The context model is a file that contains a list of intents
that can be recognized by the engine. An intent is an action or a
class of actions that the assistant can recognize, and it can
contain an optional number of slots to model context variables -
e.g. temperature, lights group, location, device state etc.
You can create your own context model using the `Rhino console
<https://console.picovoice.ai/rhn>`_. For example, you can define a
context file to control smart home devices by defining the
following slots:
- ``device_type``: The device to control (e.g. lights, music)
- ``device_state``: The target state of the device (e.g. on,
off)
- ``location``: The location of the device (e.g. living
room, kitchen, bedroom)
- ``media_type``: The type of media to play (e.g. music, video)
- ``media_state``: The state of the media (e.g. play, pause,
stop)
You can then define the following intents:
- ``device_ctrl``: Control a device state. Supported phrases:
- "turn ``$device_state:state`` the ``$location:location``
``$device_type:device``"
- "turn ``$device_state:state`` the ``$device_type:device``"
- ``media_ctrl``: Control media state. Supported phrases:
- "``$media_state:state`` the ``$media_type:media``"
- "``$media_state:state`` the ``$media_type:media`` in the
``$location:location``"
Then a phrase like "turn on the lights in the living room" would
trigger a
:class:`platypush.message.event.assistant.IntentRecognizedEvent` with:
.. code-block:: json
{
"intent": "device_ctrl",
"slots": {
"type": "lights",
"state": "on",
"location": "living room"
}
}
**Note**: The intent recognition engine requires you to add Rhino
to the products available in your Picovoice account.
:param endpoint_duration: If set, the assistant will stop listening when
no speech is detected for the specified duration (in seconds) after
the end of an utterance.
:param enable_automatic_punctuation: Enable automatic punctuation
insertion.
:param start_conversation_on_hotword: If set to True (default), a speech
detection session will be started when the hotword is detected. If
set to False, you may want to start the conversation programmatically
by calling the :meth:`.start_conversation` method instead, or run any
custom hotword detection logic. This can be particularly useful
when you want to run the assistant in a push-to-talk mode, or when you
want different hotwords to trigger conversations with different models
or languages.
:param audio_queue_size: Maximum number of audio frames to hold in the
processing queue. You may want to increase this value if you are
running this integration on a slow device and/or the logs report
audio frame drops too often. Keep in mind that increasing this value
will increase the memory usage of the integration. Also, a higher
value may result in higher accuracy at the cost of higher latency.
:param conversation_timeout: Maximum time to wait for some speech to be
detected after the hotword is detected. If no speech is detected
within this time, the conversation will time out and the plugin will
go back into hotword detection mode, if the mode is enabled. Default:
7.5 seconds.
:param muted: Set to True to start the assistant in a muted state. You will
need to call the :meth:`.unmute` method to start the assistant listening
for commands, or programmatically call the :meth:`.start_conversation` method
to start a conversation.
"""
super().__init__(**kwargs)
self._assistant = None
self._assistant_args = {
'stop_event': self._should_stop,
'access_key': access_key,
'hotword_enabled': hotword_enabled,
'stt_enabled': stt_enabled,
'keywords': keywords,
'keyword_paths': (
os.path.expanduser(keyword_path)
for keyword_path in (keyword_paths or [])
),
'keyword_model_path': (
os.path.expanduser(keyword_model_path) if keyword_model_path else None
),
'speech_model_path': (
os.path.expanduser(speech_model_path) if speech_model_path else None
),
'intent_model_path': (
os.path.expanduser(intent_model_path) if intent_model_path else None
),
'endpoint_duration': endpoint_duration,
'enable_automatic_punctuation': enable_automatic_punctuation,
'start_conversation_on_hotword': (
start_conversation_on_hotword
if (intent_model_path or stt_enabled)
else False
),
'audio_queue_size': audio_queue_size,
'conversation_timeout': conversation_timeout,
'muted': muted,
'on_conversation_start': self._on_conversation_start,
'on_conversation_end': self._on_conversation_end,
'on_conversation_timeout': self._on_conversation_timeout,
'on_speech_recognized': self._on_speech_recognized,
'on_intent_matched': self._on_intent_matched,
'on_hotword_detected': self._on_hotword_detected,
}
@property
def tts(self) -> TtsPicovoicePlugin:
p = get_plugin('tts.picovoice')
assert p, 'Picovoice TTS plugin not configured/found'
return p
def _get_tts_plugin(self) -> TtsPicovoicePlugin:
return self.tts
def _on_response_render_start(self, text: Optional[str]):
if self._assistant:
self._assistant.set_responding(True)
return super()._on_response_render_start(text)
def _on_response_render_end(self):
if self._assistant:
self._assistant.set_responding(False)
return super()._on_response_render_end()
@action
def start_conversation(self, *_, model_file: Optional[str] = None, **__):
"""
Programmatically start a conversation with the assistant.
:param model_file: Override the model file to be used to detect speech
in this conversation. If not set, the configured
``speech_model_path`` will be used.
"""
if not self._assistant:
self.logger.warning('Assistant not initialized')
return
if not model_file:
model_file = self._assistant_args['speech_model_path']
if model_file:
model_file = os.path.expanduser(model_file)
self._assistant.override_speech_model(model_file)
self._assistant.state = AssistantState.DETECTING_SPEECH
@action
def stop_conversation(self, *_, **__):
"""
Programmatically stop a running conversation with the assistant
"""
if not self._assistant:
self.logger.warning('Assistant not initialized')
return
self._assistant.override_speech_model(None)
if self._assistant.hotword_enabled:
self._assistant.state = AssistantState.DETECTING_HOTWORD
else:
self._assistant.state = AssistantState.IDLE
@action
def say(self, text: str, *args, **kwargs):
"""
Proxy to
:meth:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin.say` to
render some text as speech through the Picovoice TTS engine.
Extra arguments to
:meth:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin.say` can be
passed over ``args`` and ``kwargs``.
:param text: Text to be rendered as speech.
"""
return self.tts.say(text, *args, **kwargs)
@action
def transcribe(self, audio_file: str, *_, model_file: Optional[str] = None, **__):
"""
Transcribe an audio file to text using the `Leopard
<https://picovoice.ai/docs/leopard/>`_ engine.
:param audio_file: Path to the audio file to be transcribed.
:param model_file: Override the model file to be used for the transcription.
If not set, the configured
``speech_model_path`` will be used.
:return: dict
.. code-block:: json
{
"transcription": "This is a test",
"words": [
{
"word": "this",
"start": 0.06400000303983688,
"end": 0.19200000166893005,
"confidence": 0.9626294374465942
},
{
"word": "is",
"start": 0.2879999876022339,
"end": 0.35199999809265137,
"confidence": 0.9781675934791565
},
{
"word": "a",
"start": 0.41600000858306885,
"end": 0.41600000858306885,
"confidence": 0.9764975309371948
},
{
"word": "test",
"start": 0.5120000243186951,
"end": 0.8320000171661377,
"confidence": 0.9511580467224121
}
]
}
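Example invocation (a sketch; the audio file path is a placeholder):

.. code-block:: python

    from platypush import run

    result = run(
        "assistant.picovoice.transcribe",
        audio_file="~/recordings/sample.wav",
    )
    print(result["transcription"])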
"""
import pvleopard
audio_file = os.path.expanduser(audio_file)
if not model_file:
model_file = self._assistant_args['speech_model_path']
if model_file:
model_file = os.path.expanduser(model_file)
leopard = pvleopard.create(
access_key=self._assistant_args['access_key'], model_path=model_file
)
transcript, words = leopard.process_file(audio_file)
try:
return {
'transcription': transcript,
'words': [
{
'word': word.word,
'start': word.start_sec,
'end': word.end_sec,
'confidence': word.confidence,
}
for word in words
],
}
finally:
leopard.delete()
@action
def mute(self, *_, **__):
"""
Mute the microphone. Alias for :meth:`.set_mic_mute` with
``muted=True``.
"""
return self.set_mic_mute(muted=True)
@action
def unmute(self, *_, **__):
"""
Unmute the microphone. Alias for :meth:`.set_mic_mute` with
``muted=False``.
"""
return self.set_mic_mute(muted=False)
@action
def set_mic_mute(self, muted: bool):
"""
Programmatically mute/unmute the microphone.
:param muted: Set to True to mute the microphone, False to unmute it.
"""
if self._assistant:
self._assistant.set_mic_mute(muted)
super()._on_mute_changed(muted)
@action
def toggle_mute(self, *_, **__):
"""
Toggle the mic mute state.
"""
return self.set_mic_mute(not self._is_muted)
@action
def send_text_query(self, *_, query: str, **__):
"""
Send a text query to the assistant.
This is equivalent to saying something to the assistant.
:param query: Query to be sent.
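Example (a sketch):

.. code-block:: python

    from platypush import run

    run("assistant.picovoice.send_text_query", query="turn on the lights")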
"""
self._on_speech_recognized(query)
def main(self):
while not self.should_stop():
self.logger.info('Starting Picovoice assistant')
with Assistant(**self._assistant_args) as self._assistant:
try:
for event in self._assistant:
if event is not None:
self.logger.debug('Dequeued assistant event: %s', event)
except KeyboardInterrupt:
break
except Exception as e:
self.logger.error('Picovoice assistant error: %s', e, exc_info=True)
self.wait_stop(5)
def stop(self):
try:
self.stop_conversation()
except RuntimeError:
pass
super().stop()
# vim:sw=4:ts=4:et:

View File

@ -0,0 +1,424 @@
import logging
import os
from queue import Full, Queue
from threading import Event, RLock, Thread
from time import time
from typing import Any, Dict, Optional, Sequence
import pvporcupine
from platypush.context import get_plugin
from platypush.message.event.assistant import (
AssistantEvent,
ConversationTimeoutEvent,
HotwordDetectedEvent,
IntentRecognizedEvent,
SpeechRecognizedEvent,
)
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
from ._recorder import AudioRecorder
from ._speech import SpeechProcessor
from ._state import AssistantState
class Assistant(Thread):
"""
A facade class that wraps the Picovoice engines under an assistant API.
"""
@staticmethod
def _default_callback(*_, **__):
pass
def __init__(
self,
access_key: str,
stop_event: Event,
hotword_enabled: bool = True,
stt_enabled: bool = True,
keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None,
frame_expiration: float = 3.0, # Don't process audio frames older than this
speech_model_path: Optional[str] = None,
intent_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = None,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = False,
audio_queue_size: int = 100,
muted: bool = False,
conversation_timeout: Optional[float] = None,
on_conversation_start=_default_callback,
on_conversation_end=_default_callback,
on_conversation_timeout=_default_callback,
on_speech_recognized=_default_callback,
on_intent_matched=_default_callback,
on_hotword_detected=_default_callback,
):
super().__init__(name='picovoice:Assistant')
self._access_key = access_key
self._stop_event = stop_event
self.logger = logging.getLogger(__name__)
self.hotword_enabled = hotword_enabled
self.stt_enabled = stt_enabled
self.keywords = list(keywords or [])
self.keyword_paths = None
self.keyword_model_path = None
self.frame_expiration = frame_expiration
self.endpoint_duration = endpoint_duration
self.enable_automatic_punctuation = enable_automatic_punctuation
self.start_conversation_on_hotword = start_conversation_on_hotword
self.audio_queue_size = audio_queue_size
self._responding = Event()
self._muted = muted
self._speech_model_path = speech_model_path
self._speech_model_path_override = None
self._intent_model_path = intent_model_path
self._intent_model_path_override = None
self._in_ctx = False
self._speech_processor = SpeechProcessor(
stop_event=stop_event,
stt_enabled=stt_enabled,
intent_enabled=self.intent_enabled,
conversation_timeout=conversation_timeout,
model_path=speech_model_path,
get_cheetah_args=self._get_speech_engine_args,
get_rhino_args=self._get_intent_engine_args,
)
self._on_conversation_start = on_conversation_start
self._on_conversation_end = on_conversation_end
self._on_conversation_timeout = on_conversation_timeout
self._on_speech_recognized = on_speech_recognized
self._on_intent_matched = on_intent_matched
self._on_hotword_detected = on_hotword_detected
self._recorder = None
self._state = AssistantState.IDLE
self._state_lock = RLock()
self._evt_queue = Queue(maxsize=100)
if hotword_enabled:
if not keywords:
raise ValueError(
'You need to provide a list of keywords if the wake-word engine is enabled'
)
if keyword_paths:
keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
missing_paths = [
path for path in keyword_paths if not os.path.isfile(path)
]
if missing_paths:
raise FileNotFoundError(f'Keyword files not found: {missing_paths}')
self.keyword_paths = keyword_paths
if keyword_model_path:
keyword_model_path = os.path.expanduser(keyword_model_path)
if not os.path.isfile(keyword_model_path):
raise FileNotFoundError(
f'Keyword model file not found: {keyword_model_path}'
)
self.keyword_model_path = keyword_model_path
self._porcupine: Optional[pvporcupine.Porcupine] = None
@property
def intent_enabled(self) -> bool:
return self.intent_model_path is not None
@property
def is_responding(self) -> bool:
return self._responding.is_set()
@property
def speech_model_path(self) -> Optional[str]:
return self._speech_model_path_override or self._speech_model_path
@property
def intent_model_path(self) -> Optional[str]:
return self._intent_model_path_override or self._intent_model_path
@property
def tts(self) -> TtsPicovoicePlugin:
p = get_plugin('tts.picovoice')
assert p, 'Picovoice TTS plugin not configured/found'
return p
def set_responding(self, responding: bool):
if responding:
self._responding.set()
else:
self._responding.clear()
def should_stop(self):
return self._stop_event.is_set()
def wait_stop(self):
self._stop_event.wait()
@property
def state(self) -> AssistantState:
with self._state_lock:
return self._state
@state.setter
def state(self, state: AssistantState):
with self._state_lock:
prev_state = self._state
self._state = state
new_state = self.state
if prev_state == new_state:
return
self.logger.info('Assistant state transition: %s -> %s', prev_state, new_state)
if prev_state == AssistantState.DETECTING_SPEECH:
self.tts.stop()
self._speech_model_path_override = None
self._intent_model_path_override = None
self._speech_processor.on_conversation_end()
self._on_conversation_end()
elif new_state == AssistantState.DETECTING_SPEECH:
self._speech_processor.on_conversation_start()
self._on_conversation_start()
if new_state == AssistantState.DETECTING_HOTWORD:
self.tts.stop()
self._speech_processor.on_conversation_reset()
# Put a null event on the event queue to unblock next_event
self._evt_queue.put(None)
@property
def porcupine(self) -> Optional[pvporcupine.Porcupine]:
if not self.hotword_enabled:
return None
if not self._porcupine:
args: Dict[str, Any] = {'access_key': self._access_key}
if self.keywords:
args['keywords'] = self.keywords
if self.keyword_paths:
args['keyword_paths'] = self.keyword_paths
if self.keyword_model_path:
args['model_path'] = self.keyword_model_path
self._porcupine = pvporcupine.create(**args)
return self._porcupine
def _get_speech_engine_args(self) -> dict:
args: Dict[str, Any] = {'access_key': self._access_key}
if self.speech_model_path:
args['model_path'] = self.speech_model_path
if self.endpoint_duration:
args['endpoint_duration_sec'] = self.endpoint_duration
if self.enable_automatic_punctuation:
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
return args
def _get_intent_engine_args(self) -> dict:
args: Dict[str, Any] = {'access_key': self._access_key}
args['context_path'] = self.intent_model_path
if self.endpoint_duration:
args['endpoint_duration_sec'] = self.endpoint_duration
if self.enable_automatic_punctuation:
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
return args
def __enter__(self):
"""
Get the assistant ready to start processing audio frames.
"""
if self.should_stop():
return self
assert not self.is_alive(), 'The assistant is already running'
self._in_ctx = True
if self._recorder:
self.logger.info('A recording stream already exists')
elif self.hotword_enabled or self.stt_enabled or self.intent_enabled:
sample_rate = (self.porcupine or self._speech_processor).sample_rate
frame_length = (self.porcupine or self._speech_processor).frame_length
self._recorder = AudioRecorder(
stop_event=self._stop_event,
sample_rate=sample_rate,
frame_size=frame_length,
queue_size=self.audio_queue_size,
paused=self._muted,
channels=1,
)
self._speech_processor.__enter__()
self._recorder.__enter__()
if self.porcupine:
self.state = AssistantState.DETECTING_HOTWORD
else:
self.state = AssistantState.DETECTING_SPEECH
self.start()
return self
def __exit__(self, *_):
"""
Stop the assistant and release all resources.
"""
self._in_ctx = False
if self._recorder:
self._recorder.__exit__(*_)
self._recorder = None
self.state = AssistantState.IDLE
if self._porcupine:
self._porcupine.delete()
self._porcupine = None
self._speech_processor.__exit__(*_)
def __iter__(self):
"""
Iterate over processed assistant events.
"""
return self
def __next__(self):
"""
Process the next audio frame and return the corresponding event.
"""
if self.should_stop() or not self._recorder:
raise StopIteration
if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
return self._evt_queue.get()
evt = None
if (
self._speech_processor.enabled
and self.state == AssistantState.DETECTING_SPEECH
):
evt = self._speech_processor.next_event()
if isinstance(evt, SpeechRecognizedEvent):
self._on_speech_recognized(phrase=evt.args['phrase'])
if isinstance(evt, IntentRecognizedEvent):
self._on_intent_matched(
intent=evt.args['intent'], slots=evt.args.get('slots', {})
)
if isinstance(evt, ConversationTimeoutEvent):
self._on_conversation_timeout()
if evt:
self._speech_processor.reset()
if (
evt
and self.state == AssistantState.DETECTING_SPEECH
and self.hotword_enabled
):
self.state = AssistantState.DETECTING_HOTWORD
return evt
def mute(self):
self._muted = True
if self._recorder:
self._recorder.pause()
def unmute(self):
self._muted = False
if self._recorder:
self._recorder.resume()
def set_mic_mute(self, mute: bool):
if mute:
self.mute()
else:
self.unmute()
def toggle_mic_mute(self):
if self._muted:
self.unmute()
else:
self.mute()
def _process_hotword(self, frame) -> Optional[HotwordDetectedEvent]:
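# Feed the audio frame to Porcupine: a non-negative keyword index means one of the configured hotwords was detected.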
if not self.porcupine:
return None
keyword_index = self.porcupine.process(frame)
if keyword_index is None:
return None # No keyword detected
if keyword_index >= 0 and self.keywords:
if self.start_conversation_on_hotword:
self.state = AssistantState.DETECTING_SPEECH
self.tts.stop() # Stop any ongoing TTS when the hotword is detected
self._on_hotword_detected(hotword=self.keywords[keyword_index])
return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
return None
def override_speech_model(self, model_path: Optional[str]):
self._speech_model_path_override = model_path
def override_intent_model(self, model_path: Optional[str]):
self._intent_model_path_override = model_path
def _put_event(self, evt: AssistantEvent):
try:
self._evt_queue.put_nowait(evt)
except Full:
self.logger.warning('The assistant event queue is full')
def run(self):
assert (
self._in_ctx
), 'The assistant can only be started through a context manager'
super().run()
while not self.should_stop() and self._recorder:
self._recorder.wait_start()
if self.should_stop():
break
data = self._recorder.read()
if data is None:
continue
frame, t = data
if time() - t > self.frame_expiration:
self.logger.info(
'Skipping audio frame older than %ss', self.frame_expiration
)
continue # The audio frame is too old
if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
evt = self._process_hotword(frame)
if evt:
self._put_event(evt)
continue
if (
self._speech_processor.enabled
and self.state == AssistantState.DETECTING_SPEECH
):
self._speech_processor.process(frame, block=False)
self.logger.info('Assistant stopped')
# vim:sw=4:ts=4:et:

View File

@ -0,0 +1,51 @@
from dataclasses import dataclass
from time import time
from typing import Optional
from ._intent import Intent
@dataclass
class ConversationContext:
"""
Context of the conversation process.
"""
transcript: str = ''
is_final: bool = False
intent: Optional[Intent] = None
timeout: Optional[float] = None
t_start: Optional[float] = None
def start(self):
self.reset()
self.t_start = time()
def reset(self):
self.transcript = ''
self.intent = None
self.is_final = False
self.t_start = None
@property
def timed_out(self):
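# Timed out if, before any final result, the transcript or intent is still empty after
# ``timeout`` seconds, or if a partial (non-final) transcript/intent is still pending
# after twice that window.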
return (
(
(not self.transcript and not self.is_final)
or (not self.intent and not self.is_final)
)
and self.timeout
and self.t_start
and time() - self.t_start > self.timeout
) or (
(
(self.transcript and not self.is_final)
or (self.intent and not self.is_final)
)
and self.timeout
and self.t_start
and time() - self.t_start > self.timeout * 2
)
# vim:sw=4:ts=4:et:

View File

@ -0,0 +1,11 @@
from dataclasses import dataclass, field
@dataclass
class Intent:
"""
Speech intent data class.
"""
name: str
slots: dict = field(default_factory=dict)

View File

@ -0,0 +1,191 @@
from collections import namedtuple
from dataclasses import dataclass, field
from logging import getLogger
from queue import Empty, Full, Queue
from threading import Event, RLock
from time import time
from typing import Optional
import sounddevice as sd
from platypush.utils import wait_for_either
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
@dataclass
class PauseState:
"""
Data class to hold the boilerplate (state + synchronization events) for the
audio recorder pause API.
"""
_paused_event: Event = field(default_factory=Event)
_recording_event: Event = field(default_factory=Event)
_state_lock: RLock = field(default_factory=RLock)
@property
def paused(self):
with self._state_lock:
return self._paused_event.is_set()
def pause(self):
"""
Pause the audio recorder.
"""
with self._state_lock:
self._paused_event.set()
self._recording_event.clear()
def resume(self):
"""
Resume the audio recorder.
"""
with self._state_lock:
self._paused_event.clear()
self._recording_event.set()
def toggle(self):
"""
Toggle the audio recorder pause state.
"""
with self._state_lock:
if self.paused:
self.resume()
else:
self.pause()
def wait_paused(self, timeout: Optional[float] = None):
"""
Wait until the audio recorder is paused.
"""
self._paused_event.wait(timeout=timeout)
def wait_recording(self, timeout: Optional[float] = None):
"""
Wait until the audio recorder is resumed.
"""
self._recording_event.wait(timeout=timeout)
class AudioRecorder:
"""
Audio recorder component that uses the sounddevice library to record audio
from the microphone.
"""
def __init__(
self,
stop_event: Event,
sample_rate: int,
frame_size: int,
channels: int,
paused: bool = False,
dtype: str = 'int16',
queue_size: int = 100,
):
self.logger = getLogger(__name__)
self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
self.frame_size = frame_size
self._stop_event = Event()
self._upstream_stop_event = stop_event
self._paused_state = PauseState()
if paused:
self._paused_state.pause()
else:
self._paused_state.resume()
self.stream = sd.InputStream(
samplerate=sample_rate,
channels=channels,
dtype=dtype,
blocksize=frame_size,
callback=self._audio_callback,
)
@property
def paused(self):
return self._paused_state.paused
def __enter__(self):
"""
Start the audio stream.
"""
self._stop_event.clear()
self.stream.start()
return self
def __exit__(self, *_):
"""
Stop the audio stream.
"""
self.stop()
def _audio_callback(self, indata, *_):
if self.should_stop() or self.paused:
return
try:
self._audio_queue.put_nowait(AudioFrame(indata.reshape(-1), time()))
except Full:
self.logger.warning('Audio queue is full, dropping audio frame')
def read(self, timeout: Optional[float] = None):
"""
Read an audio frame from the queue.
:param timeout: Timeout in seconds. If None, the method will block until
an audio frame is available.
:return: Audio frame or None if the timeout has expired.
"""
try:
return self._audio_queue.get(timeout=timeout)
except Empty:
self.logger.debug('Audio queue is empty')
return None
def stop(self):
"""
Stop the audio stream.
"""
self._stop_event.set()
self.stream.stop()
def pause(self):
"""
Pause the audio stream.
"""
self._paused_state.pause()
def resume(self):
"""
Resume the audio stream.
"""
self._paused_state.resume()
def toggle(self):
"""
Toggle the audio stream pause state.
"""
self._paused_state.toggle()
def should_stop(self):
return self._stop_event.is_set() or self._upstream_stop_event.is_set()
def wait(self, timeout: Optional[float] = None):
"""
Wait until the audio stream is stopped.
"""
wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)
def wait_start(self, timeout: Optional[float] = None):
"""
Wait until the audio stream is started.
"""
wait_for_either(
self._stop_event,
self._upstream_stop_event,
self._paused_state._recording_event,
timeout=timeout,
)
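A minimal usage sketch of the recorder (illustration only; the 16 kHz sample rate and 512-sample frame size are simply the values typically requested by the Picovoice engines):

.. code-block:: python

    from threading import Event

    stop = Event()
    with AudioRecorder(
        stop_event=stop, sample_rate=16000, frame_size=512, channels=1
    ) as recorder:
        frame = recorder.read(timeout=1.0)  # AudioFrame(data, timestamp) or None
        if frame is not None:
            samples, captured_at = frame
        recorder.pause()                    # the callback drops frames while paused
        recorder.resume()
    stop.set()                              # also unblocks wait()/wait_start()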

View File

@ -0,0 +1,3 @@
from ._processor import SpeechProcessor
__all__ = ['SpeechProcessor']

View File

@ -0,0 +1,180 @@
import logging
from abc import ABC, abstractmethod
from queue import Empty, Queue
from threading import Event, RLock, Thread, get_ident
from typing import Any, Optional, Sequence
from platypush.message.event.assistant import AssistantEvent
from .._context import ConversationContext
class BaseProcessor(ABC, Thread):
"""
Base speech processor class. It is implemented by the ``SttProcessor`` and
the ``IntentProcessor`` classes.
"""
def __init__(
self,
*args,
stop_event: Event,
enabled: bool = True,
conversation_timeout: Optional[float] = None,
**kwargs,
):
super().__init__(*args, name=f'picovoice:{self.__class__.__name__}', **kwargs)
self.logger = logging.getLogger(self.name)
self._enabled = enabled
self._audio_queue = Queue()
self._stop_event = stop_event
self._ctx = ConversationContext(timeout=conversation_timeout)
self._event_queue = Queue()
# This event is set if the upstream processor is waiting for an event
# from this processor
self._event_wait = Event()
# This event is set when the processor is done with the audio
# processing and it's ready to accept a new audio frame
self._processing_done = Event()
self._processing_done.set()
self._state_lock = RLock()
def should_stop(self) -> bool:
return self._stop_event.is_set()
def wait_stop(self, timeout: Optional[float] = None) -> bool:
return self._stop_event.wait(timeout)
def enqueue(self, audio: Sequence[int]):
if not self._enabled:
return
self._event_wait.set()
self._processing_done.clear()
self._audio_queue.put_nowait(audio)
def reset(self) -> Optional[Any]:
"""
Reset any pending context.
"""
if not self._enabled:
return
with self._state_lock:
self._ctx.reset()
self._event_queue.queue.clear()
self._event_wait.clear()
self._processing_done.set()
@property
def processing_done(self) -> Event:
return self._processing_done
@property
@abstractmethod
def _model_path(self) -> Optional[str]:
"""
Return the model path.
"""
@property
@abstractmethod
def sample_rate(self) -> int:
"""
:return: The sample rate wanted by Cheetah/Rhino.
"""
@property
@abstractmethod
def frame_length(self) -> int:
"""
:return: The frame length wanted by Cheetah/Rhino.
"""
def last_event(self) -> Optional[AssistantEvent]:
"""
:return: The latest event that was processed by the processor.
"""
with self._state_lock:
evt = None
try:
while True:
evt = self._event_queue.get_nowait()
except Empty:
pass
if evt:
self._event_wait.clear()
return evt
@abstractmethod
def process(self, audio: Sequence[int]) -> Optional[AssistantEvent]:
"""
Process speech events from a raw audio input.
"""
def run(self):
super().run()
if not self._enabled:
self.wait_stop()
self.reset()
self._processing_done.clear()
self.logger.info('Processor started: %s', self.name)
while not self.should_stop():
audio = self._audio_queue.get()
# The thread is stopped when it receives a None object
if audio is None:
break
# Don't process the audio if the upstream processor is not waiting
# for an event
if not self._event_wait.is_set():
continue
try:
self._processing_done.clear()
event = self.process(audio)
if event:
self.logger.debug(
'Dispatching event processed from %s: %s', self.name, event
)
self._event_queue.put_nowait(event)
self._processing_done.set()
except Exception as e:
self.logger.error(
'An error occurred while processing the audio on %s: %s',
self.name,
e,
exc_info=e,
)
self.wait_stop(timeout=1)
self._processing_done.set()
continue
self._processing_done.set()
self.reset()
self.logger.info('Processor stopped: %s', self.name)
def stop(self):
if not self._enabled:
return
self._audio_queue.put_nowait(None)
if self.is_alive() and self.ident != get_ident():
self.logger.debug('Stopping %s', self.name)
self.join()
def on_conversation_start(self):
self._ctx.start()
def on_conversation_end(self):
self.reset()
def on_conversation_reset(self):
self.reset()
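To make the handshake concrete, here is a hypothetical toy subclass (illustration only - the real implementations are the Cheetah and Rhino processors below): frames go in through ``enqueue()``, ``process()`` runs on the worker thread, and the consumer collects the result via ``last_event()`` once ``processing_done`` is set.

.. code-block:: python

    from threading import Event
    from typing import Optional, Sequence

    from platypush.message.event.assistant import SpeechRecognizedEvent

    class EchoProcessor(BaseProcessor):
        _model_path = None      # no model file for this toy processor
        sample_rate = 16000
        frame_length = 512

        def process(self, audio: Sequence[int]) -> Optional[SpeechRecognizedEvent]:
            # Pretend that every frame finalizes a (fake) transcript
            return SpeechRecognizedEvent(phrase=f'{len(audio)} samples')

    stop = Event()
    proc = EchoProcessor(stop_event=stop)
    proc.start()
    proc.enqueue([0] * 512)
    proc.processing_done.wait(timeout=1.0)
    evt = proc.last_event()     # SpeechRecognizedEvent, or None if not ready yet
    proc.stop()
    stop.set()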

View File

@ -0,0 +1,93 @@
from typing import Callable, Optional, Sequence, Union
import pvrhino
from platypush.message.event.assistant import (
ConversationTimeoutEvent,
IntentRecognizedEvent,
)
from ._base import BaseProcessor
class IntentProcessor(BaseProcessor):
"""
Implementation of the speech-to-intent processor using the Picovoice Rhino
engine.
"""
def __init__(
self, *args, get_rhino_args: Callable[[], dict] = lambda: {}, **kwargs
):
super().__init__(*args, **kwargs)
self._get_rhino_args = get_rhino_args
# model_path -> Rhino instance cache
self._rhino = {}
@property
def _model_path(self) -> Optional[str]:
return self._get_rhino_args().get('model_path')
@property
def sample_rate(self) -> int:
return self._get_rhino().sample_rate
@property
def frame_length(self) -> int:
return self._get_rhino().frame_length
def _get_rhino(self) -> pvrhino.Rhino:
if not self._rhino.get(self._model_path):
self._rhino[self._model_path] = pvrhino.create(**self._get_rhino_args())
return self._rhino[self._model_path]
def process(
self, audio: Sequence[int]
) -> Optional[Union[IntentRecognizedEvent, ConversationTimeoutEvent]]:
"""
Process the audio and return an ``IntentRecognizedEvent`` if the intent was
understood, or a ``ConversationTimeoutEvent`` if the conversation timed
out, or ``None`` if the intent processing is not yet finalized.
"""
event = None
rhino = self._get_rhino()
self._ctx.is_final = rhino.process(audio)
if self._ctx.is_final:
inference = rhino.get_inference()
self.logger.debug(
'Intent detection finalized. Inference understood: %s',
inference.is_understood,
)
if inference.is_understood:
event = IntentRecognizedEvent(
intent=inference.intent,
slots=dict(inference.slots),
)
if not event and self._ctx.timed_out:
event = ConversationTimeoutEvent()
return event
def reset(self) -> None:
if not self._enabled:
return
with self._state_lock:
self._get_rhino().reset()
super().reset()
def stop(self):
if not self._enabled:
return
super().stop()
with self._state_lock:
objs = self._rhino.copy()
for key, obj in objs.items():
obj.delete()
self._rhino.pop(key)
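Downstream, ``IntentRecognizedEvent`` can be consumed from a user script like any other assistant event. A hypothetical hook (the intent name, the ``room`` slot and the ``light.hue.on`` call are placeholders - they depend entirely on the Rhino context trained on the Picovoice console):

.. code-block:: python

    from platypush import hook, run
    from platypush.message.event.assistant import IntentRecognizedEvent

    @hook(IntentRecognizedEvent, intent='turn_lights')
    def on_turn_lights(event, **_):
        # Slots are exposed on the event arguments as a plain dictionary
        room = event.args.get('slots', {}).get('room', 'living room')
        run('light.hue.on', groups=[room])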

View File

@ -0,0 +1,208 @@
import logging
from queue import Queue
from threading import Event
from typing import Callable, Optional, Sequence
from platypush.message.event.assistant import AssistantEvent, ConversationTimeoutEvent
from platypush.utils import wait_for_either
from ._intent import IntentProcessor
from ._stt import SttProcessor
class SpeechProcessor:
"""
Speech processor class that wraps the STT and Intent processors under the
same interface.
"""
def __init__(
self,
stop_event: Event,
model_path: Optional[str] = None,
stt_enabled: bool = True,
intent_enabled: bool = False,
conversation_timeout: Optional[float] = None,
get_cheetah_args: Callable[[], dict] = lambda: {},
get_rhino_args: Callable[[], dict] = lambda: {},
):
self.logger = logging.getLogger(self.__class__.__name__)
self._stt_enabled = stt_enabled
self._intent_enabled = intent_enabled
self._model_path = model_path
self._conversation_timeout = conversation_timeout
self._audio_queue = Queue()
self._stop_event = stop_event
self._get_cheetah_args = get_cheetah_args
self._get_rhino_args = get_rhino_args
self._stt_processor = SttProcessor(
conversation_timeout=conversation_timeout,
stop_event=stop_event,
enabled=stt_enabled,
get_cheetah_args=get_cheetah_args,
)
self._intent_processor = IntentProcessor(
conversation_timeout=conversation_timeout,
stop_event=stop_event,
enabled=intent_enabled,
get_rhino_args=get_rhino_args,
)
@property
def enabled(self) -> bool:
"""
The processor is enabled if either the STT or the Intent processor is
enabled.
"""
return self._stt_enabled or self._intent_enabled
def should_stop(self) -> bool:
return self._stop_event.is_set()
def next_event(self, timeout: Optional[float] = None) -> Optional[AssistantEvent]:
evt = None
# Wait for either the STT or Intent processor to finish processing the audio
completed = wait_for_either(
self._stt_processor.processing_done,
self._intent_processor.processing_done,
self._stop_event,
timeout=timeout,
)
if not completed:
self.logger.warning('Timeout while waiting for the processors to finish')
# Immediately return if the stop event is set
if self.should_stop():
return evt
with self._stt_processor._state_lock, self._intent_processor._state_lock:
# Priority to the intent processor event, if the processor is enabled
if self._intent_enabled:
evt = self._intent_processor.last_event()
# If the intent processor didn't return any event, then return the STT
# processor event
if (
not evt or isinstance(evt, ConversationTimeoutEvent)
) and self._stt_enabled:
# self._stt_processor.processing_done.wait(timeout=timeout)
evt = self._stt_processor.last_event()
if evt:
self._stt_processor.reset()
self._intent_processor.reset()
return evt
def process(
self, audio: Sequence[int], block: bool = True, timeout: Optional[float] = None
) -> Optional[AssistantEvent]:
"""
Process an audio frame.
The audio frame is enqueued to both the STT and Intent processors, if
enabled. The function waits for either processor to finish processing
the audio, and returns the event from the first processor that returns
a result.
Priority is given to the Intent processor if enabled, otherwise the STT
processor is used.
"""
# Enqueue the audio to both the STT and Intent processors if enabled
if self._stt_enabled:
self._stt_processor.enqueue(audio)
if self._intent_enabled:
self._intent_processor.enqueue(audio)
if not block:
return None
return self.next_event(timeout=timeout)
def __enter__(self):
"""
Context manager entry point - it wraps :meth:`start`.
"""
self.start()
def __exit__(self, *_, **__):
"""
Context manager exit point - it wraps :meth:`stop`.
"""
self.stop()
def start(self):
"""
Start the STT and Intent processors.
"""
if self._stt_enabled:
self._stt_processor.start()
if self._intent_enabled:
self._intent_processor.start()
def stop(self):
"""
Stop the STT and Intent processors.
"""
self._stt_processor.stop()
self._intent_processor.stop()
def on_conversation_start(self):
if self._stt_enabled:
self._stt_processor.on_conversation_start()
if self._intent_enabled:
self._intent_processor.on_conversation_start()
def on_conversation_end(self):
if self._stt_enabled:
self._stt_processor.on_conversation_end()
if self._intent_enabled:
self._intent_processor.on_conversation_end()
def on_conversation_reset(self):
if self._stt_enabled:
self._stt_processor.on_conversation_reset()
if self._intent_enabled:
self._intent_processor.on_conversation_reset()
def reset(self):
"""
Reset the state of the STT and Intent processors.
"""
self._stt_processor.reset()
self._intent_processor.reset()
@property
def sample_rate(self) -> int:
"""
The sample rate of the audio frames.
"""
if self._intent_enabled:
return self._intent_processor.sample_rate
if self._stt_enabled:
return self._stt_processor.sample_rate
raise ValueError('No processor enabled')
@property
def frame_length(self) -> int:
"""
The frame length of the audio frames.
"""
if self._intent_enabled:
return self._intent_processor.frame_length
if self._stt_enabled:
return self._stt_processor.frame_length
raise ValueError('No processor enabled')
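Putting the pieces together, a minimal wiring sketch of the recorder and the processor (illustration only: the Cheetah arguments are placeholders and a valid Picovoice access key is required for the engine to start). This is roughly the flow the plugin's ``run()`` loop follows while in the speech detection state:

.. code-block:: python

    from threading import Event

    stop = Event()
    processor = SpeechProcessor(
        stop_event=stop,
        stt_enabled=True,
        intent_enabled=False,
        conversation_timeout=7.5,
        get_cheetah_args=lambda: {'access_key': 'YOUR_PICOVOICE_ACCESS_KEY'},
    )

    with processor, AudioRecorder(
        stop_event=stop,
        sample_rate=processor.sample_rate,
        frame_size=processor.frame_length,
        channels=1,
    ) as recorder:
        processor.on_conversation_start()
        while not stop.is_set():
            frame = recorder.read(timeout=1.0)
            if frame is None:
                continue
            evt = processor.process(frame.data, block=True, timeout=5.0)
            if evt:
                print(evt)   # SpeechRecognizedEvent or ConversationTimeoutEvent
                break
        processor.on_conversation_end()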

View File

@ -0,0 +1,110 @@
from typing import Callable, Optional, Sequence, Union
import pvcheetah
from platypush.message.event.assistant import (
ConversationTimeoutEvent,
SpeechRecognizedEvent,
)
from ._base import BaseProcessor
class SttProcessor(BaseProcessor):
"""
Implementation of the speech-to-text processor using the Picovoice Cheetah
engine.
"""
def __init__(
self, *args, get_cheetah_args: Callable[[], dict] = lambda: {}, **kwargs
):
super().__init__(*args, **kwargs)
self._get_cheetah_args = get_cheetah_args
# model_path -> Cheetah instance cache
self._cheetah = {self._model_path: pvcheetah.create(**self._get_cheetah_args())}
@property
def _model_path(self) -> Optional[str]:
return self._get_cheetah_args().get('model_path')
@property
def sample_rate(self) -> int:
return self._get_cheetah().sample_rate
@property
def frame_length(self) -> int:
return self._get_cheetah().frame_length
def _get_cheetah(self) -> pvcheetah.Cheetah:
with self._state_lock:
if not self._cheetah.get(self._model_path):
self.logger.debug(
'Creating Cheetah instance for model %s', self._model_path
)
self._cheetah[self._model_path] = pvcheetah.create(
**self._get_cheetah_args()
)
self.logger.debug(
'Cheetah instance created for model %s', self._model_path
)
return self._cheetah[self._model_path]
def process(
self, audio: Sequence[int]
) -> Optional[Union[SpeechRecognizedEvent, ConversationTimeoutEvent]]:
event = None
cheetah = self._get_cheetah()
partial_transcript, self._ctx.is_final = cheetah.process(audio)
last_transcript = self._ctx.transcript
# Concatenate the partial transcript to the context
if partial_transcript:
self._ctx.transcript += partial_transcript
self.logger.info(
'Partial transcript: %s, is_final: %s',
self._ctx.transcript,
self._ctx.is_final,
)
# If the transcript is final or the conversation timed out, then
# process and return whatever is available in the context
if self._ctx.is_final or self._ctx.timed_out:
phrase = cheetah.flush() or ''
self._ctx.transcript += phrase
if self._ctx.transcript and self._ctx.transcript != last_transcript:
self.logger.debug('Processed STT transcript: %s', self._ctx.transcript)
last_transcript = self._ctx.transcript
phrase = self._ctx.transcript
phrase = (phrase[:1].lower() + phrase[1:]).strip()
event = (
SpeechRecognizedEvent(phrase=phrase)
if phrase
else ConversationTimeoutEvent()
)
self.reset()
return event
def reset(self):
if not self._enabled:
return
with self._state_lock:
super().reset()
self._get_cheetah().flush()
def stop(self):
if not self._enabled:
return
super().stop()
with self._state_lock:
objs = self._cheetah.copy()
for key, obj in objs.items():
obj.delete()
self._cheetah.pop(key)

View File

@ -0,0 +1,14 @@
from enum import Enum
class AssistantState(Enum):
"""
Possible states of the assistant.
"""
IDLE = 'idle'
DETECTING_HOTWORD = 'detecting_hotword'
DETECTING_SPEECH = 'detecting_speech'
# vim:sw=4:ts=4:et:

View File

@ -0,0 +1,33 @@
manifest:
package: platypush.plugins.assistant.picovoice
type: plugin
events:
- platypush.message.event.assistant.ConversationEndEvent
- platypush.message.event.assistant.ConversationStartEvent
- platypush.message.event.assistant.ConversationTimeoutEvent
- platypush.message.event.assistant.HotwordDetectedEvent
- platypush.message.event.assistant.IntentRecognizedEvent
- platypush.message.event.assistant.MicMutedEvent
- platypush.message.event.assistant.MicUnmutedEvent
- platypush.message.event.assistant.NoResponseEvent
- platypush.message.event.assistant.ResponseEndEvent
- platypush.message.event.assistant.ResponseEvent
- platypush.message.event.assistant.SpeechRecognizedEvent
install:
apk:
- ffmpeg
apt:
- ffmpeg
dnf:
- ffmpeg
pacman:
- ffmpeg
- python-sounddevice
pip:
- num2words # Temporary dependency
- pvcheetah
- pvleopard
- pvorca
- pvporcupine
- pvrhino
- sounddevice

View File

@ -255,6 +255,8 @@ class MusicMopidyPlugin(RunnablePlugin):
ret = self._add(resource, position=0)
if not ret:
self.logger.warning('Failed to add %s to the tracklist', resource)
elif isinstance(ret, list):
track_id = ret[0].get('tlid')
elif isinstance(ret, dict):
track_id = ret.get('tlid')
elif position is not None:

View File

@ -0,0 +1,283 @@
import os
from dataclasses import dataclass
from datetime import datetime as dt
from enum import Enum
from threading import RLock
from typing import Iterable, List, Optional
import requests
from platypush.plugins import Plugin, action
class ContextEntryRole(Enum):
"""
Roles for context entries.
"""
ASSISTANT = "assistant"
SYSTEM = "system"
USER = "user"
@dataclass
class ContextEntry:
"""
A context entry.
"""
timestamp: dt
role: ContextEntryRole
content: str
@classmethod
def from_dict(cls, data: dict):
return cls(
timestamp=dt.fromisoformat(data.get("timestamp", dt.now().isoformat())),
role=ContextEntryRole(data["role"]),
content=data["content"],
)
def to_dict(self):
return {
"role": self.role.value,
"content": self.content,
}
class OpenaiPlugin(Plugin):
"""
Plugin to interact with OpenAI services.
So far only ChatGPT is supported.
Contexts
--------
The plugin also supports the implementation of custom assistant
*contexts*/environment.
Contexts can be used to:
- Customize the model's behavior based on a set of inputs - going from
a generic "*You are a helpful assistant*" to a more specific "*You
are a Star Trek fan*", or "*You are a 16th century noble lady who
talks in Shakespearean English to her peers*".
- Pre-configure the model with a set of previous interactions in order
to either pre-load information that we expect the model to remember,
or to provide a set of previous interactions that the model can use
to generate responses that are consistent with the conversation
history.
The plugin provides two types of contexts:
- **Default context**: This is a set of context entries that are
provided at plugin initialization and that will be used to initialize
the model with a configuration or set of previous interactions that
will be remembered when generating all responses.
- **Runtime context**: This is a set of context entries that can be
passed at runtime at :meth:`.get_response`. All the interactions
(both user prompts and assistant responses) that are processed
through :meth:`.get_response` will also be added to the runtime
context, and remembered for the next ``context_expiry`` seconds. This
allows you to generate responses that are consistent with the recent
conversation history.
Each context entry is a dictionary with the following keys:
- ``role``: The role of the message. Can be one of:
- ``system``: A system message provided to the model to set
up its initial state - e.g. "you are a helpful
assistant".
- ``user``: A user message, as provided by a previous (real
or synthetic) user interaction.
- ``assistant``: An assistant message, as provided by a
previous (real or synthetic) assistant response.
- ``content``: The content of the message.
An example of context:
.. code-block:: yaml
context:
- role: system
content: >
You are a 16th century noble lady who talks in
Shakespearean English to her peers.
- role: user
content: What is a telephone?
- role: assistant
content: >
Pray tell, noble companion, a telephone is a device
of modern innovation that doth permit one to speak
with a distant acquaintance by means of magical pink
waves that do carry the sound of thine voice to the
ear of the listener.
Given such context, if you call :meth:`.get_response` with a
prompt such as "*How does it work?*", the model may generate a
response such as "*Fair lady, to use a telephone, thou must first
lift the receiver and place it to thine ear. Then, thou must speak
into the mouthpiece as though conversing with a companion in
another room. The magical pink waves shall carry thy words to the
recipient, who shall hear them on their own device. 'Tis a wondrous
invention indeed!*".
Note that the model will remember the previous interactions when
generating new responses, so you can ask direct follow-up questions such
as "*How does it work?*" and it will infer what "it" refers to. It will
also keep answering in the style initialized through the ``system``
context.
"""
def __init__(
self,
api_key: Optional[str],
model: str = "gpt-3.5-turbo",
timeout: float = 30,
context: Optional[Iterable[dict]] = None,
context_expiry: Optional[float] = 600,
max_tokens: int = 500,
**kwargs,
):
"""
:param api_key: OpenAI API key. If not set, it will be read from the
``OPENAI_API_KEY`` environment variable.
:param model: The model to use. Default: ``gpt-3.5-turbo``.
:param timeout: Default timeout for API requests (default: 30 seconds).
:param max_tokens: Maximum number of tokens to generate in the response
(default: 500).
:param context: Default context to use for completions, as a list of
dictionaries with ``role`` and ``content`` keys. Default: None.
:param context_expiry: Default expiry time for the context in seconds.
After this time since the last interaction, the context will be
cleared.
This means that any follow-up interactions happening within the
expiry window will remember the past prompts, but any interaction
that happens after the expiry window (calculated from the time of
the last interaction) will start fresh.
Note that ``context_expiry`` is only applied to the runtime
context. The default context will never expire unless it's removed
from the plugin configuration.
Set to 0 to disable context expiry - i.e. all messages stay in the
context until the plugin is restarted or the context is cleared
explicitly via :meth:`.clear_context`. Default: 600 seconds (10
minutes).
"""
super().__init__(**kwargs)
api_key = api_key or os.getenv('OPENAI_API_KEY')
assert api_key, 'OpenAI API key not provided'
self._api_key = api_key
self._context_lock = RLock()
self._runtime_context: List[ContextEntry] = []
self._default_context = [
ContextEntry.from_dict(entries) for entries in (context or [])
]
self.max_tokens = max_tokens
self.context_expiry = context_expiry
self.model = model
self.timeout = timeout
def _rotate_context(self):
"""
Rotate the context by removing any entries older than the configured
``context_expiry``.
"""
if not self.context_expiry:
return
with self._context_lock:
now = dt.now()
self._runtime_context = [
entry
for entry in self._runtime_context
if (now - entry.timestamp).total_seconds() < self.context_expiry
]
@action
def get_response(
self,
prompt: str,
model: Optional[str] = None,
context: Optional[Iterable[dict]] = None,
timeout: Optional[float] = None,
max_tokens: Optional[int] = None,
) -> Optional[str]:
"""
Get completions for a given prompt using ChatGPT.
:param prompt: The prompt/question to complete/answer.
:param model: Override the default model to use.
:param context: Extend the default context with these extra messages.
:param max_tokens: Override the default maximum number of tokens to
generate in the response.
:param timeout: Override the default timeout for the API request.
:return: The completion for the prompt - or, better, the message
associated with the highest scoring completion choice.
"""
self._rotate_context()
context = [
*(context or []),
{
"role": "user",
"content": prompt,
},
]
resp = requests.post(
"https://api.openai.com/v1/chat/completions",
timeout=timeout or self.timeout,
headers={
"Authorization": f"Bearer {self._api_key}",
"Content-Type": "application/json",
},
json={
"model": model or self.model,
"messages": [
*(
entry.to_dict()
for entry in (
*(self._default_context or []),
*self._runtime_context,
)
),
*context,
],
"max_tokens": max_tokens or self.max_tokens,
},
)
resp.raise_for_status()
self._update_context(*context)
choices = resp.json()["choices"]
self.logger.debug("OpenAI response: %s", resp.json())
if not choices:
return None
msg = choices[0]["message"]
self._update_context(msg)
return msg["content"]
def _update_context(self, *entries: dict):
"""
Update the context with a new entry.
"""
with self._context_lock:
for entry in entries:
self._runtime_context.append(ContextEntry.from_dict(entry))
self._rotate_context()
@action
def clear_context(self):
"""
Clear the runtime context.
"""
with self._context_lock:
self._runtime_context = []
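A typical way to use the plugin together with the new assistant integration is a user-script hook that forwards recognized speech to ChatGPT and reads the answer back through the Orca TTS plugin (illustration only; the plugin names follow the manifests in this changeset):

.. code-block:: python

    from platypush import hook, run
    from platypush.message.event.assistant import SpeechRecognizedEvent

    @hook(SpeechRecognizedEvent)
    def ask_chatgpt(event, **_):
        answer = run('openai.get_response', prompt=event.args.get('phrase', ''))
        if answer:
            run('tts.picovoice.say', text=answer)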

View File

@ -0,0 +1,3 @@
manifest:
package: platypush.plugins.openai
type: plugin

View File

@ -247,9 +247,11 @@ class AudioManager:
wait_start = time()
for audio_thread in streams_to_stop:
audio_thread.join(
timeout=max(0, timeout - (time() - wait_start))
if timeout is not None
else None
timeout=(
max(0, timeout - (time() - wait_start))
if timeout is not None
else None
)
)
# Remove references

View File

@ -1,336 +0,0 @@
import queue
import threading
from abc import ABC, abstractmethod
from typing import Optional, Union, List
import sounddevice as sd
from platypush.context import get_bus
from platypush.message.event.stt import (
SpeechDetectionStartedEvent,
SpeechDetectionStoppedEvent,
SpeechStartedEvent,
SpeechDetectedEvent,
HotwordDetectedEvent,
ConversationDetectedEvent,
)
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import Plugin, action
class SttPlugin(ABC, Plugin):
"""
Abstract class for speech-to-text plugins.
"""
_thread_stop_timeout = 10.0
rate = 16000
channels = 1
def __init__(
self,
input_device: Optional[Union[int, str]] = None,
hotword: Optional[str] = None,
hotwords: Optional[List[str]] = None,
conversation_timeout: Optional[float] = 10.0,
block_duration: float = 1.0,
):
"""
:param input_device: PortAudio device index or name that will be used for recording speech (default: default
system audio input device).
:param hotword: When this word is detected, the plugin will trigger a
:class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
:class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
assistants.
:param hotwords: Use a list of hotwords instead of a single one.
:param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
here to run any logic depending on the detected speech - it can emulate a kind of
"OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds).
:param block_duration: Duration of the acquired audio blocks (default: 1 second).
"""
super().__init__()
self.input_device = input_device
self.conversation_timeout = conversation_timeout
self.block_duration = block_duration
self.hotwords = set(hotwords or [])
if hotword:
self.hotwords = {hotword}
self._conversation_event = threading.Event()
self._input_stream: Optional[sd.InputStream] = None
self._recording_thread: Optional[threading.Thread] = None
self._detection_thread: Optional[threading.Thread] = None
self._audio_queue: Optional[queue.Queue] = None
self._current_text = ''
def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
"""
Get the index of the input device by index or name.
:param device: Device index or name. If None is set then the function will return the index of the
default audio input device.
:return: Index of the audio input device.
"""
if not device:
device = self.input_device
if not device:
return sd.query_hostapis()[0].get('default_input_device')
if isinstance(device, int):
assert device <= len(sd.query_devices())
return device
for i, dev in enumerate(sd.query_devices()):
if dev['name'] == device:
return i
raise AssertionError('Device {} not found'.format(device))
def on_speech_detected(self, speech: str) -> None:
"""
Hook called when speech is detected. Triggers the right event depending on the current context.
:param speech: Detected speech.
"""
speech = speech.strip()
if speech in self.hotwords:
event = HotwordDetectedEvent(hotword=speech)
if self.conversation_timeout:
self._conversation_event.set()
threading.Timer(
self.conversation_timeout, lambda: self._conversation_event.clear()
).start()
elif self._conversation_event.is_set():
event = ConversationDetectedEvent(speech=speech)
else:
event = SpeechDetectedEvent(speech=speech)
get_bus().post(event)
@staticmethod
def convert_frames(frames: bytes) -> bytes:
"""
Conversion method for raw audio frames. It just returns the input frames as bytes. Override it if required
by your logic.
:param frames: Input audio frames, as bytes.
:return: The audio frames as passed on the input. Override if required.
"""
return frames
def on_detection_started(self) -> None:
"""
Method called when the ``detection_thread`` starts. Initialize your context variables and models here if
required.
"""
pass
def on_detection_ended(self) -> None:
"""
Method called when the ``detection_thread`` stops. Clean up your context variables and models here.
"""
pass
def before_recording(self) -> None:
"""
Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the
recording thread starts.
"""
pass
def on_recording_started(self) -> None:
"""
Method called after the ``recording_thread`` opens the audio device. Put here any logic that you may want to
run after the recording starts.
"""
pass
def on_recording_ended(self) -> None:
"""
Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio
device is closed.
"""
pass
@abstractmethod
def detect_speech(self, frames) -> str:
"""
Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented
by the derived classes.
:param frames: Audio frames, as returned by ``convert_frames``.
:return: Detected text, as a string. Returns an empty string if no text has been detected.
"""
raise NotImplementedError
def process_text(self, text: str) -> None:
if (not text and self._current_text) or (text and text == self._current_text):
self.on_speech_detected(self._current_text)
self._current_text = ''
else:
if text:
if not self._current_text:
get_bus().post(SpeechStartedEvent())
self.logger.info('Intermediate speech results: [{}]'.format(text))
self._current_text = text
def detection_thread(self) -> None:
"""
This thread reads frames from ``_audio_queue``, performs the speech-to-text detection and calls ``process_text`` on the result.
"""
self._current_text = ''
self.logger.debug('Detection thread started')
self.on_detection_started()
while self._audio_queue:
try:
frames = self._audio_queue.get()
frames = self.convert_frames(frames)
except Exception as e:
self.logger.warning(
'Error while feeding audio to the model: {}'.format(str(e))
)
continue
text = self.detect_speech(frames).strip()
self.process_text(text)
self.on_detection_ended()
self.logger.debug('Detection thread terminated')
def recording_thread(
self,
block_duration: Optional[float] = None,
block_size: Optional[int] = None,
input_device: Optional[str] = None,
) -> None:
"""
Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.
:param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``.
:param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``.
:param input_device: Input device
"""
assert (block_duration or block_size) and not (
block_duration and block_size
), 'Please specify either block_duration or block_size'
if not block_size:
block_size = int(self.rate * self.channels * block_duration)
self.before_recording()
self.logger.debug('Recording thread started')
device = self._get_input_device(input_device)
self._input_stream = sd.InputStream(
samplerate=self.rate,
device=device,
channels=self.channels,
dtype='int16',
latency=0,
blocksize=block_size,
)
self._input_stream.start()
self.on_recording_started()
get_bus().post(SpeechDetectionStartedEvent())
while self._input_stream:
try:
frames = self._input_stream.read(block_size)[0]
except Exception as e:
self.logger.warning(
'Error while reading from the audio input: {}'.format(str(e))
)
continue
self._audio_queue.put(frames)
get_bus().post(SpeechDetectionStoppedEvent())
self.on_recording_ended()
self.logger.debug('Recording thread terminated')
@abstractmethod
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file. Must be implemented by the derived classes.
:param audio_file: Path to the audio file.
"""
raise NotImplementedError
def __enter__(self):
"""
Context manager enter. Starts detection and returns self.
"""
self.start_detection()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Context manager exit. Stops detection.
"""
self.stop_detection()
@action
def start_detection(
self,
input_device: Optional[str] = None,
seconds: Optional[float] = None,
block_duration: Optional[float] = None,
) -> None:
"""
Start the speech detection engine.
:param input_device: Audio input device name/index override
:param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll
start running until ``stop_detection`` is called or application stop.
:param block_duration: ``block_duration`` override.
"""
assert (
not self._input_stream and not self._recording_thread
), 'Speech detection is already running'
block_duration = block_duration or self.block_duration
input_device = input_device if input_device is not None else self.input_device
self._audio_queue = queue.Queue()
self._recording_thread = threading.Thread(
target=lambda: self.recording_thread(
block_duration=block_duration, input_device=input_device
)
)
self._recording_thread.start()
self._detection_thread = threading.Thread(
target=lambda: self.detection_thread()
)
self._detection_thread.start()
if seconds:
threading.Timer(seconds, lambda: self.stop_detection()).start()
@action
def stop_detection(self) -> None:
"""
Stop the speech detection engine.
"""
assert self._input_stream, 'Speech detection is not running'
self._input_stream.stop(ignore_errors=True)
self._input_stream.close(ignore_errors=True)
self._input_stream = None
if self._recording_thread:
self._recording_thread.join(timeout=self._thread_stop_timeout)
self._recording_thread = None
self._audio_queue = None
if self._detection_thread:
self._detection_thread.join(timeout=self._thread_stop_timeout)
self._detection_thread = None
# vim:sw=4:ts=4:et:

View File

@ -1,120 +0,0 @@
import os
import struct
from typing import Optional, List
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import action
from platypush.plugins.stt import SttPlugin
class SttPicovoiceHotwordPlugin(SttPlugin):
"""
This plugin performs hotword detection using `PicoVoice <https://github.com/Picovoice>`_.
"""
def __init__(
self,
library_path: Optional[str] = None,
model_file_path: Optional[str] = None,
keyword_file_paths: Optional[List[str]] = None,
sensitivity: float = 0.5,
sensitivities: Optional[List[float]] = None,
*args,
**kwargs
):
from pvporcupine import Porcupine
from pvporcupine.resources.util.python.util import (
LIBRARY_PATH,
MODEL_FILE_PATH,
KEYWORD_FILE_PATHS,
)
super().__init__(*args, **kwargs)
self.hotwords = list(self.hotwords)
self._hotword_engine: Optional[Porcupine] = None
self._library_path = os.path.abspath(
os.path.expanduser(library_path or LIBRARY_PATH)
)
self._model_file_path = os.path.abspath(
os.path.expanduser(model_file_path or MODEL_FILE_PATH)
)
if not keyword_file_paths:
hotwords = KEYWORD_FILE_PATHS
assert all(
hotword in hotwords for hotword in self.hotwords
), 'Not all the hotwords could be found. Available hotwords: {}'.format(
list(hotwords.keys())
)
self._keyword_file_paths = [
os.path.abspath(os.path.expanduser(hotwords[hotword]))
for hotword in self.hotwords
]
else:
self._keyword_file_paths = [
os.path.abspath(os.path.expanduser(p)) for p in keyword_file_paths
]
self._sensitivities = []
if sensitivities:
assert len(self._keyword_file_paths) == len(
sensitivities
), 'Please specify as many sensitivities as the number of configured hotwords'
self._sensitivities = sensitivities
else:
self._sensitivities = [sensitivity] * len(self._keyword_file_paths)
def convert_frames(self, frames: bytes) -> tuple:
assert self._hotword_engine, 'The hotword engine is not running'
return struct.unpack_from("h" * self._hotword_engine.frame_length, frames)
def on_detection_ended(self) -> None:
if self._hotword_engine:
self._hotword_engine.delete()
self._hotword_engine = None
def detect_speech(self, frames: tuple) -> str:
index = self._hotword_engine.process(frames)
if index < 0:
return ''
if index is True:
index = 0
return self.hotwords[index]
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file.
:param audio_file: Path to the audio file.
"""
pass
def recording_thread(
self, input_device: Optional[str] = None, *args, **kwargs
) -> None:
assert self._hotword_engine, 'The hotword engine has not yet been initialized'
super().recording_thread(
block_size=self._hotword_engine.frame_length, input_device=input_device
)
@action
def start_detection(self, *args, **kwargs) -> None:
from pvporcupine import Porcupine
self._hotword_engine = Porcupine(
library_path=self._library_path,
model_file_path=self._model_file_path,
keyword_file_paths=self._keyword_file_paths,
sensitivities=self._sensitivities,
)
self.rate = self._hotword_engine.sample_rate
super().start_detection(*args, **kwargs)
# vim:sw=4:ts=4:et:

View File

@ -1,7 +0,0 @@
manifest:
events: {}
install:
pip:
- pvporcupine
package: platypush.plugins.stt.picovoice.hotword
type: plugin

View File

@ -1,154 +0,0 @@
import inspect
import os
import platform
import struct
import threading
from typing import Optional
from platypush.message.event.stt import SpeechStartedEvent
from platypush.context import get_bus
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import action
from platypush.plugins.stt import SttPlugin
class SttPicovoiceSpeechPlugin(SttPlugin):
"""
This plugin performs speech detection using `PicoVoice <https://github.com/Picovoice>`_.
NOTE: The PicoVoice product used for real-time speech-to-text (Cheetah) can be used freely for
personal applications on x86_64 Linux. Other architectures and operating systems require a commercial license.
You can ask for a license `here <https://picovoice.ai/contact.html>`_.
"""
def __init__(
self,
library_path: Optional[str] = None,
acoustic_model_path: Optional[str] = None,
language_model_path: Optional[str] = None,
license_path: Optional[str] = None,
end_of_speech_timeout: int = 1,
*args,
**kwargs
):
"""
:param library_path: Path to the Cheetah binary library for your OS
(default: ``CHEETAH_INSTALL_DIR/lib/OS/ARCH/libpv_cheetah.EXT``).
:param acoustic_model_path: Path to the acoustic speech model
(default: ``CHEETAH_INSTALL_DIR/lib/common/acoustic_model.pv``).
:param language_model_path: Path to the language model
(default: ``CHEETAH_INSTALL_DIR/lib/common/language_model.pv``).
:param license_path: Path to your PicoVoice license
(default: ``CHEETAH_INSTALL_DIR/resources/license/cheetah_eval_linux_public.lic``).
:param end_of_speech_timeout: Number of seconds of silence during speech recognition before considering
a phrase over (default: 1).
"""
from pvcheetah import Cheetah
super().__init__(*args, **kwargs)
self._basedir = os.path.abspath(
os.path.join(inspect.getfile(Cheetah), '..', '..', '..')
)
if not library_path:
library_path = self._get_library_path()
if not language_model_path:
language_model_path = os.path.join(
self._basedir, 'lib', 'common', 'language_model.pv'
)
if not acoustic_model_path:
acoustic_model_path = os.path.join(
self._basedir, 'lib', 'common', 'acoustic_model.pv'
)
if not license_path:
license_path = os.path.join(
self._basedir, 'resources', 'license', 'cheetah_eval_linux_public.lic'
)
self._library_path = library_path
self._language_model_path = language_model_path
self._acoustic_model_path = acoustic_model_path
self._license_path = license_path
self._end_of_speech_timeout = end_of_speech_timeout
self._stt_engine: Optional[Cheetah] = None
self._speech_in_progress = threading.Event()
def _get_library_path(self) -> str:
path = os.path.join(
self._basedir, 'lib', platform.system().lower(), platform.machine()
)
return os.path.join(
path, [f for f in os.listdir(path) if f.startswith('libpv_cheetah.')][0]
)
def convert_frames(self, frames: bytes) -> tuple:
assert self._stt_engine, 'The speech engine is not running'
return struct.unpack_from("h" * self._stt_engine.frame_length, frames)
def on_detection_ended(self) -> None:
if self._stt_engine:
self._stt_engine.delete()
self._stt_engine = None
def detect_speech(self, frames: tuple) -> str:
text, is_endpoint = self._stt_engine.process(frames)
text = text.strip()
if text:
if not self._speech_in_progress.is_set():
self._speech_in_progress.set()
get_bus().post(SpeechStartedEvent())
self._current_text += ' ' + text.strip()
if is_endpoint:
text = self._stt_engine.flush().strip().strip()
if text:
self._current_text += ' ' + text
self._speech_in_progress.clear()
if self._current_text:
self.on_speech_detected(self._current_text)
self._current_text = ''
return self._current_text
def process_text(self, text: str) -> None:
pass
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file.
:param audio_file: Path to the audio file.
"""
pass
def recording_thread(
self, input_device: Optional[str] = None, *args, **kwargs
) -> None:
assert self._stt_engine, 'The hotword engine has not yet been initialized'
super().recording_thread(
block_size=self._stt_engine.frame_length, input_device=input_device
)
@action
def start_detection(self, *args, **kwargs) -> None:
from pvcheetah import Cheetah
self._stt_engine = Cheetah(
library_path=self._library_path,
acoustic_model_path=self._acoustic_model_path,
language_model_path=self._language_model_path,
license_path=self._license_path,
endpoint_duration_sec=self._end_of_speech_timeout,
)
self.rate = self._stt_engine.sample_rate
self._speech_in_progress.clear()
super().start_detection(*args, **kwargs)
# vim:sw=4:ts=4:et:

View File

@ -1,7 +0,0 @@
manifest:
events: {}
install:
pip:
- cheetah
package: platypush.plugins.stt.picovoice.speech
type: plugin

View File

@ -19,6 +19,7 @@ manifest:
- python-numpy
- python-sounddevice
pip:
- num2words
- numpy
- sounddevice
package: platypush.plugins.tts

View File

@ -0,0 +1,198 @@
import logging
import os
import re
from threading import RLock
from typing import Optional
import numpy as np
import pvorca
import sounddevice as sd
from platypush.config import Config
from platypush.plugins import action
from platypush.plugins.tts import TtsPlugin
class TextConversionUtils:
"""
Utility class to convert text to a format that is supported by the Orca TTS
engine.
This pre-processing step is necessary until the issue is fixed:
https://github.com/Picovoice/orca/issues/10.
"""
_logger = logging.getLogger(__name__)
_number_re = re.compile(r'(([0-9]+\.[0-9]+)|([0-9]+\,[0-9]+)|([0-9]+))')
_conversions_map = {
(re.compile(r'\s*[(){}\[\]<>]'), ', '),
(re.compile(r'[;]'), '.'),
(re.compile(r'[@#]'), ' at '),
(re.compile(r'[$]'), ' dollar '),
(re.compile(r'[%]'), ' percent '),
(re.compile(r'[&]'), ' and '),
(re.compile(r'[+]'), ' plus '),
(re.compile(r'[=]'), ' equals '),
(re.compile(r'[|]'), ' or '),
(re.compile(r'[~]'), ' tilde '),
(re.compile(r'[`]'), ''),
(re.compile(r'[*]'), ' star '),
(re.compile(r'[\\/]'), ' slash '),
(re.compile(r'[_]'), ' underscore '),
# Anything that isn't a letter or supported punctuation is replaced with a space
(re.compile(r'[^a-zA-Z,.:?!\-\'" ]'), ' '),
}
@classmethod
def _convert_digits(cls, text: str) -> str:
try:
from num2words import num2words
except ImportError:
cls._logger.warning('num2words is not installed, skipping digit conversion')
return text
while match := cls._number_re.search(text):
number = match.group(1)
text = text.replace(number, num2words(float(number.replace(',', ''))))
return text
@classmethod
def convert(cls, text: str) -> str:
text = cls._convert_digits(text)
for pattern, replacement in TextConversionUtils._conversions_map:
text = pattern.sub(replacement, text)
return text
class TtsPicovoicePlugin(TtsPlugin):
"""
This TTS plugin enables you to render text as audio using `Picovoice
<https://picovoice.ai>`_'s (still experimental) `Orca TTS engine
<https://github.com/Picovoice/orca>`_.
Take a look at
:class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`
for details on how to sign up for a Picovoice account and get the API key.
Also note that using the TTS features requires you to select Orca from the
list of products available for your account on the `Picovoice console
<https://console.picovoice.ai>`_.
"""
def __init__(
self,
access_key: Optional[str] = None,
model_path: Optional[str] = None,
**kwargs,
):
"""
:param access_key: Picovoice access key. If it's not specified here,
then it must be specified on the configuration of
:class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.
:param model_path: Path of the TTS model file (default: use the default
English model).
"""
super().__init__(**kwargs)
if not access_key:
access_key = Config.get('assistant.picovoice', {}).get('access_key')
assert (
access_key
), 'No access key specified and no assistant.picovoice plugin found'
self.model_path = model_path
self.access_key = access_key
if model_path:
model_path = os.path.expanduser(model_path)
self._stream: Optional[sd.OutputStream] = None
self._stream_lock = RLock()
def _play_audio(self, orca: pvorca.Orca, pcm: np.ndarray):
with self._stream_lock:
self.stop()
self._stream = sd.OutputStream(
samplerate=orca.sample_rate,
channels=1,
dtype='int16',
)
try:
self._stream.start()
self._stream.write(pcm)
except Exception as e:
self.logger.warning('Error playing audio: %s: %s', type(e), str(e))
finally:
try:
self.stop()
self._stream.close()
except Exception as e:
self.logger.warning(
'Error stopping audio stream: %s: %s', type(e), str(e)
)
finally:
if self._stream:
self._stream = None
def get_orca(self, model_path: Optional[str] = None):
if not model_path:
model_path = self.model_path
if model_path:
model_path = os.path.expanduser(model_path)
return pvorca.create(access_key=self.access_key, model_path=model_path)
@action
def say(
self,
text: str,
*_,
output_file: Optional[str] = None,
speech_rate: Optional[float] = None,
model_path: Optional[str] = None,
**__,
):
"""
Say some text.
:param text: Text to say.
:param output_file: If set, save the audio to the specified file.
Otherwise play it.
:param speech_rate: Speech rate (default: None).
:param model_path: Path of the TTS model file (default: use the default
configured model).
"""
# This is a temporary workaround until this issue is fixed:
# https://github.com/Picovoice/orca/issues/10.
text = TextConversionUtils.convert(text)
orca = self.get_orca(model_path=model_path)
if output_file:
orca.synthesize_to_file(
text, os.path.expanduser(output_file), speech_rate=speech_rate
)
return
self._play_audio(
orca=orca,
pcm=np.array(
orca.synthesize(text, speech_rate=speech_rate),
dtype='int16',
),
)
@action
def stop(self):
"""
Stop the currently playing audio.
"""
with self._stream_lock:
if not self._stream:
return
self._stream.stop()
# vim:sw=4:ts=4:et:
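Example action calls (illustration only; the output path is a placeholder):

.. code-block:: python

    from platypush import run

    # Render the phrase through Orca and play it on the default audio device
    run('tts.picovoice.say', text='The quick brown fox jumps over the lazy dog')

    # Or synthesize to a WAV file instead of playing it
    run(
        'tts.picovoice.say',
        text='Saved for later',
        output_file='/tmp/orca-test.wav',
        speech_rate=1.0,
    )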

View File

@ -0,0 +1,22 @@
manifest:
events: {}
install:
apk:
- ffmpeg
- py3-numpy
apt:
- ffmpeg
- python3-numpy
dnf:
- ffmpeg
- python-numpy
pacman:
- ffmpeg
- python-numpy
- python-sounddevice
pip:
- numpy
- pvorca
- sounddevice
package: platypush.plugins.tts.picovoice
type: plugin

View File

@ -83,7 +83,10 @@ mock_imports = [
"pmw3901",
"psutil",
"pvcheetah",
"pvporcupine ",
"pvleopard",
"pvorca",
"pvporcupine",
"pvrhino",
"pyHS100",
"pyaudio",
"pychromecast",

View File

@ -24,8 +24,11 @@ def OrEvent(*events, cls: Type = threading.Event):
or_event.clear()
def _to_or(e, changed_callback: Callable[[], None]):
e._set = e.set
e._clear = e.clear
if not hasattr(e, "_set"):
e._set = e.set
if not hasattr(e, "_clear"):
e._clear = e.clear
e.changed = changed_callback
e.set = lambda: _or_set(e)
e.clear = lambda: _clear_or(e)
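The ``hasattr`` guards matter because the same event may be wrapped more than once - for instance, the shared stop event is passed to ``wait_for_either()`` by several components in this changeset. Without the guards, a second wrap would store the first wrapper (rather than the original ``set``/``clear`` methods) into ``_set``/``_clear``. A small sketch of the double-wrap scenario, assuming ``wait_for_either`` builds on ``OrEvent``:

.. code-block:: python

    import threading

    from platypush.utils import wait_for_either

    stop = threading.Event()
    other = threading.Event()

    wait_for_either(stop, other, timeout=0.1)   # first wrap of `stop`
    wait_for_either(stop, other, timeout=0.1)   # re-wrap: must keep the original methods
    stop.set()                                  # still reaches threading.Event.set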