[#304] Rewrite Picovoice integrations #385
|
@ -10,6 +10,4 @@ Backends
|
|||
platypush/backend/midi.rst
|
||||
platypush/backend/nodered.rst
|
||||
platypush/backend/redis.rst
|
||||
platypush/backend/stt.picovoice.hotword.rst
|
||||
platypush/backend/stt.picovoice.speech.rst
|
||||
platypush/backend/tcp.rst
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.hotword``
|
||||
===========================================
|
||||
|
||||
.. automodule:: platypush.backend.stt.picovoice.hotword
|
||||
:members:
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.speech``
|
||||
==========================================
|
||||
|
||||
.. automodule:: platypush.backend.stt.picovoice.speech
|
||||
:members:
|
|
@ -0,0 +1,5 @@
|
|||
``assistant.picovoice``
|
||||
=======================
|
||||
|
||||
.. automodule:: platypush.plugins.assistant.picovoice
|
||||
:members:
|
|
@ -0,0 +1,5 @@
|
|||
``openai``
|
||||
==========
|
||||
|
||||
.. automodule:: platypush.plugins.openai
|
||||
:members:
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.hotword``
|
||||
===========================================
|
||||
|
||||
.. automodule:: platypush.plugins.stt.picovoice.hotword
|
||||
:members:
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.speech``
|
||||
==========================================
|
||||
|
||||
.. automodule:: platypush.plugins.stt.picovoice.speech
|
||||
:members:
|
|
@ -0,0 +1,5 @@
|
|||
``tts.picovoice``
|
||||
=================
|
||||
|
||||
.. automodule:: platypush.plugins.tts.picovoice
|
||||
:members:
|
|
@ -11,6 +11,7 @@ Plugins
|
|||
platypush/plugins/application.rst
|
||||
platypush/plugins/arduino.rst
|
||||
platypush/plugins/assistant.google.rst
|
||||
platypush/plugins/assistant.picovoice.rst
|
||||
platypush/plugins/autoremote.rst
|
||||
platypush/plugins/bluetooth.rst
|
||||
platypush/plugins/calendar.rst
|
||||
|
@ -94,6 +95,7 @@ Plugins
|
|||
platypush/plugins/ngrok.rst
|
||||
platypush/plugins/nmap.rst
|
||||
platypush/plugins/ntfy.rst
|
||||
platypush/plugins/openai.rst
|
||||
platypush/plugins/otp.rst
|
||||
platypush/plugins/pihole.rst
|
||||
platypush/plugins/ping.rst
|
||||
|
@ -119,8 +121,6 @@ Plugins
|
|||
platypush/plugins/smartthings.rst
|
||||
platypush/plugins/sound.rst
|
||||
platypush/plugins/ssh.rst
|
||||
platypush/plugins/stt.picovoice.hotword.rst
|
||||
platypush/plugins/stt.picovoice.speech.rst
|
||||
platypush/plugins/sun.rst
|
||||
platypush/plugins/switch.tplink.rst
|
||||
platypush/plugins/switch.wemo.rst
|
||||
|
@ -135,6 +135,7 @@ Plugins
|
|||
platypush/plugins/tts.rst
|
||||
platypush/plugins/tts.google.rst
|
||||
platypush/plugins/tts.mimic3.rst
|
||||
platypush/plugins/tts.picovoice.rst
|
||||
platypush/plugins/tv.samsung.ws.rst
|
||||
platypush/plugins/twilio.rst
|
||||
platypush/plugins/udp.rst
|
||||
|
|
|
@ -7,10 +7,13 @@ Platypush
|
|||
|
||||
from .app import Application
|
||||
from .config import Config
|
||||
from .context import get_backend, get_bus, get_plugin
|
||||
from .context import Variable, get_backend, get_bus, get_plugin
|
||||
from .cron import cron
|
||||
from .event.hook import hook
|
||||
from .message.event import Event
|
||||
from .message.request import Request
|
||||
from .message.response import Response
|
||||
from .procedure import procedure
|
||||
from .runner import main
|
||||
from .utils import run
|
||||
|
||||
|
@ -19,14 +22,18 @@ __author__ = 'Fabio Manganiello <fabio@manganiello.tech>'
|
|||
__version__ = '0.50.3'
|
||||
__all__ = [
|
||||
'Application',
|
||||
'Variable',
|
||||
'Config',
|
||||
'Event',
|
||||
'Request',
|
||||
'Response',
|
||||
'cron',
|
||||
'get_backend',
|
||||
'get_bus',
|
||||
'get_plugin',
|
||||
'hook',
|
||||
'main',
|
||||
'procedure',
|
||||
'run',
|
||||
]
|
||||
|
||||
|
|
|
@ -1,40 +0,0 @@
|
|||
import time
|
||||
|
||||
from platypush.backend import Backend
|
||||
from platypush.context import get_plugin
|
||||
from platypush.plugins.stt import SttPlugin
|
||||
|
||||
|
||||
class SttBackend(Backend):
|
||||
"""
|
||||
Base class for speech-to-text backends.
|
||||
"""
|
||||
|
||||
def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs):
|
||||
"""
|
||||
:param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of
|
||||
:class:`platypush.plugins.stt.SttPlugin`.
|
||||
:param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
|
||||
(default: 5 seconds).
|
||||
"""
|
||||
super().__init__(*args, **kwargs)
|
||||
self.plugin_name = plugin_name
|
||||
self.retry_sleep = retry_sleep
|
||||
|
||||
def run(self):
|
||||
super().run()
|
||||
self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__))
|
||||
|
||||
while not self.should_stop():
|
||||
try:
|
||||
plugin: SttPlugin = get_plugin(self.plugin_name)
|
||||
with plugin:
|
||||
# noinspection PyProtectedMember
|
||||
plugin._detection_thread.join()
|
||||
except Exception as e:
|
||||
self.logger.exception(e)
|
||||
self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep))
|
||||
time.sleep(self.retry_sleep)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -1,21 +0,0 @@
|
|||
from platypush.backend.stt import SttBackend
|
||||
|
||||
|
||||
class SttPicovoiceHotwordBackend(SttBackend):
|
||||
"""
|
||||
Backend for the PicoVoice hotword detection plugin. Set this plugin to ``enabled`` if you
|
||||
want to run the hotword engine continuously instead of programmatically using
|
||||
``start_detection`` and ``stop_detection``.
|
||||
|
||||
Requires:
|
||||
|
||||
- The :class:`platypush.plugins.stt.picovoice.hotword.SttPicovoiceHotwordPlugin` plugin configured and its dependencies
|
||||
installed.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__('stt.picovoice.hotword', *args, **kwargs)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -1,6 +0,0 @@
|
|||
manifest:
|
||||
events: {}
|
||||
install:
|
||||
pip: []
|
||||
package: platypush.backend.stt.picovoice.hotword
|
||||
type: backend
|
|
@ -1,21 +0,0 @@
|
|||
from platypush.backend.stt import SttBackend
|
||||
|
||||
|
||||
class SttPicovoiceSpeechBackend(SttBackend):
|
||||
"""
|
||||
Backend for the PicoVoice speech detection plugin. Set this plugin to ``enabled`` if you
|
||||
want to run the speech engine continuously instead of programmatically using
|
||||
``start_detection`` and ``stop_detection``.
|
||||
|
||||
Requires:
|
||||
|
||||
- The :class:`platypush.plugins.stt.picovoice.speech.SttPicovoiceSpeechPlugin` plugin configured and its dependencies
|
||||
installed.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__('stt.picovoice.speech', *args, **kwargs)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -1,6 +0,0 @@
|
|||
manifest:
|
||||
events: {}
|
||||
install:
|
||||
pip: []
|
||||
package: platypush.backend.stt.picovoice.speech
|
||||
type: backend
|
|
@ -1,4 +1,4 @@
|
|||
from multiprocessing import RLock, Queue
|
||||
from multiprocessing import RLock, Queue, active_children
|
||||
import os
|
||||
from queue import Empty
|
||||
import socket
|
||||
|
@ -57,6 +57,25 @@ class CommandStream(ControllableProcess):
|
|||
self.reset()
|
||||
return super().close()
|
||||
|
||||
def _term_or_kill(self, kill: bool = False, visited=None) -> None:
|
||||
func = 'kill' if kill else 'terminate'
|
||||
visited = visited or set()
|
||||
visited.add(id(self))
|
||||
|
||||
for child in active_children():
|
||||
child_id = id(child)
|
||||
if child_id not in visited:
|
||||
visited.add(child_id)
|
||||
getattr(child, func)()
|
||||
|
||||
getattr(super(), func)()
|
||||
|
||||
def terminate(self, visited=None) -> None:
|
||||
self._term_or_kill(kill=False, visited=visited)
|
||||
|
||||
def kill(self) -> None:
|
||||
self._term_or_kill(kill=True)
|
||||
|
||||
def __enter__(self) -> "CommandStream":
|
||||
self.reset()
|
||||
sock = self._sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import sys
|
||||
from typing import Callable, List, Union
|
||||
|
||||
from ..hook import EventHook
|
||||
|
||||
|
@ -20,10 +21,23 @@ class EventProcessor:
|
|||
if hooks is None:
|
||||
hooks = Config.get_event_hooks()
|
||||
|
||||
self.hooks = []
|
||||
self._hooks_by_name = {}
|
||||
self._hooks_by_value_id = {}
|
||||
for name, hook in hooks.items():
|
||||
h = EventHook.build(name=name, hook=hook)
|
||||
self.hooks.append(h)
|
||||
self.add_hook(name, hook)
|
||||
|
||||
@property
|
||||
def hooks(self) -> List[EventHook]:
|
||||
return list(self._hooks_by_name.values())
|
||||
|
||||
def add_hook(self, name: str, desc: Union[dict, Callable]):
|
||||
hook_id = id(desc)
|
||||
if hook_id in self._hooks_by_value_id:
|
||||
return # Don't add the same hook twice
|
||||
|
||||
hook = EventHook.build(name=name, hook=desc)
|
||||
self._hooks_by_name[name] = hook
|
||||
self._hooks_by_value_id[hook_id] = hook
|
||||
|
||||
@staticmethod
|
||||
def notify_web_clients(event):
|
||||
|
|
|
@ -257,26 +257,29 @@ class Event(Message):
|
|||
|
||||
return result
|
||||
|
||||
def as_dict(self):
|
||||
"""
|
||||
Converts the event into a dictionary
|
||||
"""
|
||||
args = copy.deepcopy(self.args)
|
||||
flatten(args)
|
||||
return {
|
||||
'type': 'event',
|
||||
'target': self.target,
|
||||
'origin': self.origin if hasattr(self, 'origin') else None,
|
||||
'id': self.id if hasattr(self, 'id') else None,
|
||||
'_timestamp': self.timestamp,
|
||||
'args': {'type': self.type, **args},
|
||||
}
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
Overrides the str() operator and converts
|
||||
the message into a UTF-8 JSON string
|
||||
"""
|
||||
|
||||
args = copy.deepcopy(self.args)
|
||||
flatten(args)
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
'type': 'event',
|
||||
'target': self.target,
|
||||
'origin': self.origin if hasattr(self, 'origin') else None,
|
||||
'id': self.id if hasattr(self, 'id') else None,
|
||||
'_timestamp': self.timestamp,
|
||||
'args': {'type': self.type, **args},
|
||||
},
|
||||
cls=self.Encoder,
|
||||
)
|
||||
return json.dumps(self.as_dict(), cls=self.Encoder)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -1,27 +1,53 @@
|
|||
import re
|
||||
import sys
|
||||
from typing import Optional
|
||||
from typing import Any, Mapping, Optional, Union
|
||||
|
||||
from platypush.context import get_plugin
|
||||
from platypush.message.event import Event
|
||||
from platypush.message.event import Event, EventMatchResult
|
||||
from platypush.plugins.assistant import AssistantPlugin
|
||||
from platypush.utils import get_plugin_name_by_class
|
||||
|
||||
|
||||
class AssistantEvent(Event):
|
||||
"""Base class for assistant events"""
|
||||
|
||||
def __init__(self, *args, assistant: Optional[str] = None, **kwargs):
|
||||
def __init__(
|
||||
self, *args, assistant: Optional[Union[str, AssistantPlugin]] = None, **kwargs
|
||||
):
|
||||
"""
|
||||
:param assistant: Name of the assistant plugin that triggered the event.
|
||||
"""
|
||||
super().__init__(*args, assistant=assistant, **kwargs)
|
||||
assistant = assistant or kwargs.get('assistant')
|
||||
if assistant:
|
||||
assistant = (
|
||||
assistant
|
||||
if isinstance(assistant, str)
|
||||
else get_plugin_name_by_class(assistant.__class__)
|
||||
)
|
||||
|
||||
kwargs['_assistant'] = assistant
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@property
|
||||
def _assistant(self):
|
||||
return (
|
||||
get_plugin(self.args.get('assistant'))
|
||||
if self.args.get('assistant')
|
||||
else None
|
||||
)
|
||||
def assistant(self) -> Optional[AssistantPlugin]:
|
||||
assistant = self.args.get('_assistant')
|
||||
if not assistant:
|
||||
return None
|
||||
|
||||
return get_plugin(assistant)
|
||||
|
||||
def as_dict(self):
|
||||
evt_dict = super().as_dict()
|
||||
evt_args = {**evt_dict['args']}
|
||||
assistant = evt_args.pop('_assistant', None)
|
||||
if assistant:
|
||||
evt_args['assistant'] = assistant
|
||||
|
||||
return {
|
||||
**evt_dict,
|
||||
'args': evt_args,
|
||||
}
|
||||
|
||||
|
||||
class ConversationStartEvent(AssistantEvent):
|
||||
|
@ -43,7 +69,6 @@ class ConversationEndEvent(AssistantEvent):
|
|||
:param with_follow_on_turn: Set to true if the conversation expects a
|
||||
user follow-up, false otherwise
|
||||
"""
|
||||
|
||||
super().__init__(*args, with_follow_on_turn=with_follow_on_turn, **kwargs)
|
||||
|
||||
|
||||
|
@ -56,17 +81,46 @@ class ConversationTimeoutEvent(ConversationEndEvent):
|
|||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class ResponseEvent(ConversationEndEvent):
|
||||
class ResponseEvent(AssistantEvent):
|
||||
"""
|
||||
Event triggered when a response is processed by the assistant
|
||||
"""
|
||||
|
||||
def __init__(self, *args, response_text: str, **kwargs):
|
||||
def __init__(
|
||||
self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs
|
||||
):
|
||||
"""
|
||||
:param response_text: Response text processed by the assistant
|
||||
:param with_follow_on_turn: Set to true if the conversation expects a
|
||||
user follow-up, false otherwise
|
||||
"""
|
||||
super().__init__(
|
||||
*args,
|
||||
response_text=response_text,
|
||||
with_follow_on_turn=with_follow_on_turn,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
super().__init__(*args, response_text=response_text, **kwargs)
|
||||
|
||||
class ResponseEndEvent(ConversationEndEvent):
|
||||
"""
|
||||
Event triggered when a response has been rendered on the assistant.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, *args, response_text: str, with_follow_on_turn: bool = False, **kwargs
|
||||
):
|
||||
"""
|
||||
:param response_text: Response text rendered on the assistant.
|
||||
:param with_follow_on_turn: Set to true if the conversation expects a
|
||||
user follow-up, false otherwise.
|
||||
"""
|
||||
super().__init__(
|
||||
*args,
|
||||
response_text=response_text,
|
||||
with_follow_on_turn=with_follow_on_turn,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class NoResponseEvent(ConversationEndEvent):
|
||||
|
@ -95,8 +149,8 @@ class SpeechRecognizedEvent(AssistantEvent):
|
|||
"""
|
||||
|
||||
result = super().matches_condition(condition)
|
||||
if result.is_match and self._assistant and 'phrase' in condition.args:
|
||||
self._assistant.stop_conversation()
|
||||
if result.is_match and self.assistant and 'phrase' in condition.args:
|
||||
self.assistant.stop_conversation()
|
||||
|
||||
return result
|
||||
|
||||
|
@ -125,6 +179,7 @@ class SpeechRecognizedEvent(AssistantEvent):
|
|||
result.score = sys.maxsize
|
||||
return result
|
||||
|
||||
parsed_args = {}
|
||||
event_tokens = re.split(r'\s+', event_args.get(argname, '').strip().lower())
|
||||
condition_tokens = re.split(r'\s+', condition_value.strip().lower())
|
||||
|
||||
|
@ -147,11 +202,11 @@ class SpeechRecognizedEvent(AssistantEvent):
|
|||
m = re.match(r'[^\\]*\${(.+?)}', condition_token)
|
||||
if m:
|
||||
argname = m.group(1)
|
||||
if argname not in result.parsed_args:
|
||||
result.parsed_args[argname] = event_token
|
||||
if argname not in parsed_args:
|
||||
parsed_args[argname] = event_token
|
||||
result.score += 1.0
|
||||
else:
|
||||
result.parsed_args[argname] += ' ' + event_token
|
||||
parsed_args[argname] += ' ' + event_token
|
||||
|
||||
if (len(condition_tokens) == 1 and len(event_tokens) == 1) or (
|
||||
len(event_tokens) > 1
|
||||
|
@ -169,6 +224,45 @@ class SpeechRecognizedEvent(AssistantEvent):
|
|||
|
||||
# It's a match if all the tokens in the condition string have been satisfied
|
||||
result.is_match = len(condition_tokens) == 0
|
||||
if result.is_match:
|
||||
result.parsed_args = parsed_args
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class IntentRecognizedEvent(AssistantEvent):
    """
    Event triggered when an intent is matched by a speech command.
    """

    def __init__(
        self, *args, intent: str, slots: Optional[Mapping[str, Any]] = None, **kwargs
    ):
        """
        :param intent: The intent that has been matched.
        :param slots: The slots extracted from the intent, as a key-value mapping.
        """
        super().__init__(*args, intent=intent, slots=slots or {}, **kwargs)

    def _matches_argument(
        self, argname, condition_value, event_args, result: EventMatchResult
    ):
        """
        Custom matching logic for the ``slots`` argument of an intent
        condition. Any other argument falls back to the default token-based
        matching of the parent class.

        :param argname: Name of the condition argument being matched.
        :param condition_value: Value of the condition argument.
        :param event_args: Arguments of the event being matched.
        :param result: Partial match result to be updated.
        :return: The updated ``EventMatchResult``.
        """
        if argname != 'slots':
            return super()._matches_argument(
                argname, condition_value, event_args, result
            )

        event_slots = set(event_args.get(argname, {}).items())
        # NOTE(review): the condition's slots are read from self.args rather
        # than from condition_value - confirm this is the intended source,
        # since event_args is usually the event's own args as well.
        slots = set(self.args.get(argname, {}).items())

        # All the slots in the condition must be present in the event.
        # Bug fix: the original compared a set against the integer 0
        # (``slots.difference(event_slots) == 0``), which is always False,
        # so the match branch was unreachable. Check for emptiness instead.
        if not slots.difference(event_slots):
            result.is_match = True
            result.score += 1
        else:
            result.is_match = False
            result.score = 0

        return result
|
||||
|
||||
|
||||
|
|
|
@ -122,6 +122,12 @@ class Plugin(EventGenerator, ExtensionWithManifest): # lgtm [py/missing-call-to
|
|||
assert entities, 'entities plugin not initialized'
|
||||
return entities
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
:return: The qualified name of the plugin.
|
||||
"""
|
||||
return get_plugin_name_by_class(self.__class__)
|
||||
|
||||
def run(self, method, *args, **kwargs):
|
||||
assert (
|
||||
method in self.registered_actions
|
||||
|
|
|
@ -8,23 +8,7 @@ from typing import Any, Collection, Dict, Optional, Type
|
|||
from platypush.context import get_bus, get_plugin
|
||||
from platypush.entities.assistants import Assistant
|
||||
from platypush.entities.managers.assistants import AssistantEntityManager
|
||||
from platypush.message.event.assistant import (
|
||||
AssistantEvent,
|
||||
ConversationStartEvent,
|
||||
ConversationEndEvent,
|
||||
ConversationTimeoutEvent,
|
||||
ResponseEvent,
|
||||
NoResponseEvent,
|
||||
SpeechRecognizedEvent,
|
||||
AlarmStartedEvent,
|
||||
AlarmEndEvent,
|
||||
TimerStartedEvent,
|
||||
TimerEndEvent,
|
||||
AlertStartedEvent,
|
||||
AlertEndEvent,
|
||||
MicMutedEvent,
|
||||
MicUnmutedEvent,
|
||||
)
|
||||
from platypush.message.event import Event as AppEvent
|
||||
from platypush.plugins import Plugin, action
|
||||
from platypush.utils import get_plugin_name_by_class
|
||||
|
||||
|
@ -181,6 +165,17 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
|
|||
self.publish_entities([self])
|
||||
return asdict(self._state)
|
||||
|
||||
@action
|
||||
def render_response(self, text: str, *_, **__):
|
||||
"""
|
||||
Render a response text as audio over the configured TTS plugin.
|
||||
|
||||
:param text: Text to render.
|
||||
"""
|
||||
self._on_response_render_start(text)
|
||||
self._render_response(text)
|
||||
self._on_response_render_end()
|
||||
|
||||
def _get_tts_plugin(self):
|
||||
if not self.tts_plugin:
|
||||
return None
|
||||
|
@ -200,11 +195,13 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
|
|||
|
||||
audio.play(self._conversation_start_sound)
|
||||
|
||||
def _send_event(self, event_type: Type[AssistantEvent], **kwargs):
|
||||
def _send_event(self, event_type: Type[AppEvent], **kwargs):
|
||||
self.publish_entities([self])
|
||||
get_bus().post(event_type(assistant=self._plugin_name, **kwargs))
|
||||
|
||||
def _on_conversation_start(self):
|
||||
from platypush.message.event.assistant import ConversationStartEvent
|
||||
|
||||
self._last_response = None
|
||||
self._last_query = None
|
||||
self._conversation_running.set()
|
||||
|
@ -212,63 +209,105 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
|
|||
self._play_conversation_start_sound()
|
||||
|
||||
def _on_conversation_end(self):
|
||||
from platypush.message.event.assistant import ConversationEndEvent
|
||||
|
||||
self._conversation_running.clear()
|
||||
self._send_event(ConversationEndEvent)
|
||||
|
||||
def _on_conversation_timeout(self):
|
||||
from platypush.message.event.assistant import ConversationTimeoutEvent
|
||||
|
||||
self._last_response = None
|
||||
self._last_query = None
|
||||
self._conversation_running.clear()
|
||||
self._send_event(ConversationTimeoutEvent)
|
||||
|
||||
def _on_no_response(self):
|
||||
from platypush.message.event.assistant import NoResponseEvent
|
||||
|
||||
self._last_response = None
|
||||
self._conversation_running.clear()
|
||||
self._send_event(NoResponseEvent)
|
||||
|
||||
def _on_reponse_rendered(self, text: Optional[str]):
|
||||
def _on_response_render_start(self, text: Optional[str]):
|
||||
from platypush.message.event.assistant import ResponseEvent
|
||||
|
||||
self._last_response = text
|
||||
self._send_event(ResponseEvent, response_text=text)
|
||||
tts = self._get_tts_plugin()
|
||||
|
||||
def _render_response(self, text: Optional[str]):
|
||||
tts = self._get_tts_plugin()
|
||||
if tts and text:
|
||||
self.stop_conversation()
|
||||
tts.say(text=text, **self.tts_plugin_args)
|
||||
|
||||
def _on_response_render_end(self):
|
||||
from platypush.message.event.assistant import ResponseEndEvent
|
||||
|
||||
self._send_event(ResponseEndEvent, response_text=self._last_response)
|
||||
|
||||
def _on_hotword_detected(self, hotword: Optional[str]):
|
||||
from platypush.message.event.assistant import HotwordDetectedEvent
|
||||
|
||||
self._send_event(HotwordDetectedEvent, hotword=hotword)
|
||||
|
||||
def _on_speech_recognized(self, phrase: Optional[str]):
|
||||
from platypush.message.event.assistant import SpeechRecognizedEvent
|
||||
|
||||
phrase = (phrase or '').lower().strip()
|
||||
self._last_query = phrase
|
||||
self._send_event(SpeechRecognizedEvent, phrase=phrase)
|
||||
|
||||
def _on_intent_matched(self, intent: str, slots: Optional[Dict[str, Any]] = None):
|
||||
from platypush.message.event.assistant import IntentRecognizedEvent
|
||||
|
||||
self._send_event(IntentRecognizedEvent, intent=intent, slots=slots)
|
||||
|
||||
def _on_alarm_start(self):
|
||||
from platypush.message.event.assistant import AlarmStartedEvent
|
||||
|
||||
self._cur_alert_type = AlertType.ALARM
|
||||
self._send_event(AlarmStartedEvent)
|
||||
|
||||
def _on_alarm_end(self):
|
||||
from platypush.message.event.assistant import AlarmEndEvent
|
||||
|
||||
self._cur_alert_type = None
|
||||
self._send_event(AlarmEndEvent)
|
||||
|
||||
def _on_timer_start(self):
|
||||
from platypush.message.event.assistant import TimerStartedEvent
|
||||
|
||||
self._cur_alert_type = AlertType.TIMER
|
||||
self._send_event(TimerStartedEvent)
|
||||
|
||||
def _on_timer_end(self):
|
||||
from platypush.message.event.assistant import TimerEndEvent
|
||||
|
||||
self._cur_alert_type = None
|
||||
self._send_event(TimerEndEvent)
|
||||
|
||||
def _on_alert_start(self):
|
||||
from platypush.message.event.assistant import AlertStartedEvent
|
||||
|
||||
self._cur_alert_type = AlertType.ALERT
|
||||
self._send_event(AlertStartedEvent)
|
||||
|
||||
def _on_alert_end(self):
|
||||
from platypush.message.event.assistant import AlertEndEvent
|
||||
|
||||
self._cur_alert_type = None
|
||||
self._send_event(AlertEndEvent)
|
||||
|
||||
def _on_mute(self):
|
||||
from platypush.message.event.assistant import MicMutedEvent
|
||||
|
||||
self._is_muted = True
|
||||
self._send_event(MicMutedEvent)
|
||||
|
||||
def _on_unmute(self):
|
||||
from platypush.message.event.assistant import MicUnmutedEvent
|
||||
|
||||
self._is_muted = False
|
||||
self._send_event(MicUnmutedEvent)
|
||||
|
||||
|
|
|
@ -0,0 +1,740 @@
|
|||
import os
|
||||
from typing import Optional, Sequence
|
||||
|
||||
from platypush.context import get_plugin
|
||||
from platypush.plugins import RunnablePlugin, action
|
||||
from platypush.plugins.assistant import AssistantPlugin
|
||||
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
|
||||
|
||||
from ._assistant import Assistant
|
||||
from ._state import AssistantState
|
||||
|
||||
|
||||
# pylint: disable=too-many-ancestors
|
||||
class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
||||
r"""
|
||||
A voice assistant that runs on your device, based on the `Picovoice
|
||||
<https://picovoice.ai/>`_ engine.
|
||||
|
||||
Picovoice is a suite of on-device voice technologies that include:
|
||||
|
||||
* **Porcupine**: wake-word engine, if you want the device to listen for
|
||||
a specific wake word in order to start the assistant.
|
||||
|
||||
* **Cheetah**: speech-to-text engine, if you want your voice
|
||||
interactions to be transcribed into free text - either
|
||||
programmatically or when triggered by the wake word. Or:
|
||||
|
||||
* **Rhino**: intent recognition engine, if you want to extract *intents*
|
||||
out of your voice commands - for instance, the phrase "set the living
|
||||
room temperature to 20 degrees" could be mapped to the intent with the
|
||||
following parameters: ``intent``: ``set_temperature``, ``room``:
|
||||
``living_room``, ``temperature``: ``20``.
|
||||
|
||||
* **Leopard**: speech-to-text engine aimed at offline transcription of
|
||||
audio files rather than real-time transcription.
|
||||
|
||||
* **Orca**: text-to-speech engine, if you want to create your custom
|
||||
logic to respond to user's voice commands and render the responses as
|
||||
audio.
|
||||
|
||||
This plugin is a wrapper around the Picovoice engine that allows you to
|
||||
run your custom voice-based conversational flows on your device.
|
||||
|
||||
Getting a Picovoice account and access key
|
||||
-------------------------------------------
|
||||
|
||||
You can get your personal access key by signing up at the `Picovoice
|
||||
console <https://console.picovoice.ai/>`_. You may be asked to submit a
|
||||
reason for using the service (feel free to mention a personal Platypush
|
||||
integration), and you will receive your personal access key.
|
||||
|
||||
If prompted to select the products you want to use, make sure to select
|
||||
the ones from the Picovoice suite that you want to use with this plugin.
|
||||
|
||||
|
||||
Hotword detection
|
||||
-----------------
|
||||
|
||||
The hotword detection engine is based on `Porcupine
|
||||
<https://picovoice.ai/platform/porcupine/>`_.
|
||||
|
||||
If enabled through the ``hotword_enabled`` parameter (default: True), the
|
||||
assistant will listen for a specific wake word before starting the
|
||||
speech-to-text or intent recognition engines. You can specify custom models
|
||||
for your hotword (e.g. on the same device you may use "Alexa" to trigger the
|
||||
speech-to-text engine in English, "Computer" to trigger the speech-to-text
|
||||
engine in Italian, and "Ok Google" to trigger the intent recognition engine).
|
||||
|
||||
You can also create your custom hotword models using the `Porcupine console
|
||||
<https://console.picovoice.ai/ppn>`_.
|
||||
|
||||
If ``hotword_enabled`` is set to True, you must also specify the
|
||||
``keywords`` parameter with the list of keywords that you want to listen
|
||||
for, and optionally the ``keyword_paths`` parameter with the paths to
|
||||
any custom hotword models that you want to use. If ``hotword_enabled`` is
|
||||
set to False, then the assistant won't start listening for speech after the
|
||||
plugin is started, and you will need to programmatically start the
|
||||
conversation by calling the :meth:`.start_conversation` action, or trigger
|
||||
it from the UI.
|
||||
|
||||
When a wake-word is detected, the assistant will emit a
|
||||
:class:`platypush.message.event.assistant.HotwordDetectedEvent` event that
|
||||
you can use to build your custom logic. For example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import time
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import HotwordDetectedEvent
|
||||
|
||||
# Turn on a light for 5 seconds when the hotword "Alexa" is detected
|
||||
@hook(HotwordDetectedEvent, hotword='Alexa')
|
||||
def on_hotword_detected(event: HotwordDetectedEvent, **context):
|
||||
run("light.hue.on", lights=["Living Room"])
|
||||
time.sleep(5)
|
||||
run("light.hue.off", lights=["Living Room"])
|
||||
|
||||
By default, the assistant will start listening for speech after the hotword
|
||||
if either ``stt_enabled`` or ``intent_model_path`` are set. If you don't
|
||||
want the assistant to start listening for speech after the hotword is
|
||||
detected (for example because you want to build your custom response flows,
|
||||
or trigger the speech detection using different models depending on the
|
||||
hotword that is used, or because you just want to detect hotwords but not
|
||||
speech), then you can also set the ``start_conversation_on_hotword``
|
||||
parameter to ``False``. If that is the case, then you can programmatically
|
||||
start the conversation by calling the :meth:`.start_conversation` method in
|
||||
your event hooks:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import HotwordDetectedEvent
|
||||
|
||||
# Start a conversation using the Italian language model when the
|
||||
# "Buongiorno" hotword is detected
|
||||
@hook(HotwordDetectedEvent, hotword='Buongiorno')
|
||||
def on_it_hotword_detected(event: HotwordDetectedEvent, **context):
|
||||
event.assistant.start_conversation(model_file='path/to/it.pv')
|
||||
|
||||
Speech-to-text
|
||||
--------------
|
||||
|
||||
The speech-to-text engine is based on `Cheetah
|
||||
<https://picovoice.ai/docs/cheetah/>`_.
|
||||
|
||||
If enabled through the ``stt_enabled`` parameter (default: True), the
|
||||
assistant will transcribe the voice commands into text when a conversation
|
||||
is started either programmatically through :meth:`.start_conversation` or
|
||||
when the hotword is detected.
|
||||
|
||||
It will emit a
|
||||
:class:`platypush.message.event.assistant.SpeechRecognizedEvent` when some
|
||||
speech is detected, and you can hook to that event to build your custom
|
||||
logic:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import SpeechRecognizedEvent
|
||||
|
||||
# Turn on a light when the phrase "turn on the lights" is detected.
|
||||
# Note that we can leverage regex-based pattern matching to be more
|
||||
# flexible when matching the phrases. For example, the following hook
|
||||
# will be matched when the user says "turn on the lights", "turn on
|
||||
# lights", "lights on", "lights on please", "turn on light" etc.
|
||||
@hook(SpeechRecognizedEvent, phrase='turn on (the)? lights?')
|
||||
def on_turn_on_lights(event: SpeechRecognizedEvent, **context):
|
||||
run("light.hue.on")
|
||||
|
||||
You can also leverage context extraction through the ``${}`` syntax on the
|
||||
hook to extract specific tokens from the event that can be passed to your
|
||||
event hook. For example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import SpeechRecognizedEvent
|
||||
|
||||
@hook(SpeechRecognizedEvent, phrase='play ${title} by ${artist}')
|
||||
def on_play_track_command(
|
||||
event: SpeechRecognizedEvent, title: str, artist: str, **context
|
||||
):
|
||||
results = run(
|
||||
"music.mopidy.search",
|
||||
filter={"title": title, "artist": artist}
|
||||
)
|
||||
|
||||
if not results:
|
||||
event.assistant.render_response(f"Couldn't find {title} by {artist}")
|
||||
return
|
||||
|
||||
run("music.mopidy.play", resource=results[0]["uri"])
|
||||
|
||||
Speech-to-intent
|
||||
----------------
|
||||
|
||||
The intent recognition engine is based on `Rhino
|
||||
<https://picovoice.ai/docs/rhino/>`_.
|
||||
|
||||
*Intents* are snippets of unstructured transcribed speech that can be
|
||||
matched to structured actions.
|
||||
|
||||
Unlike with hotword and speech-to-text detection, you need to provide a
|
||||
custom model for intent detection. You can create your custom model using
|
||||
the `Rhino console <https://console.picovoice.ai/rhn>`_.
|
||||
|
||||
When an intent is detected, the assistant will emit a
|
||||
:class:`platypush.message.event.assistant.IntentRecognizedEvent` that can
|
||||
be listened to.
|
||||
|
||||
For example, you can train a model to control groups of smart lights by
|
||||
defining the following slots on the Rhino console:
|
||||
|
||||
- ``device_state``: The new state of the device (e.g. with ``on`` or
|
||||
``off`` as supported values)
|
||||
|
||||
- ``room``: The name of the room associated to the group of lights to
|
||||
be controlled (e.g. ``living room``, ``kitchen``, ``bedroom``)
|
||||
|
||||
You can then define a ``lights_ctrl`` intent with the following expressions:
|
||||
|
||||
- "turn ``$device_state:state`` the lights"
|
||||
- "turn ``$device_state:state`` the ``$room:room`` lights"
|
||||
- "turn the lights ``$device_state:state``"
|
||||
- "turn the ``$room:room`` lights ``$device_state:state``"
|
||||
- "turn ``$room:room`` lights ``$device_state:state``"
|
||||
|
||||
This intent will match any of the following phrases:
|
||||
|
||||
- "*turn on the lights*"
|
||||
- "*turn off the lights*"
|
||||
- "*turn the lights on*"
|
||||
- "*turn the lights off*"
|
||||
- "*turn on the living room lights*"
|
||||
- "*turn off the living room lights*"
|
||||
- "*turn the living room lights on*"
|
||||
- "*turn the living room lights off*"
|
||||
|
||||
And it will extract any slots that are matched in the phrases in the
|
||||
:class:`platypush.message.event.assistant.IntentRecognizedEvent` event.
|
||||
|
||||
Train the model, download the context file, and pass the path on the
|
||||
``intent_model_path`` parameter.
|
||||
|
||||
You can then register a hook to listen to a specific intent:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import IntentRecognizedEvent
|
||||
|
||||
@hook(IntentRecognizedEvent, intent='lights_ctrl', slots={'state': 'on'})
|
||||
def on_turn_on_lights(event: IntentRecognizedEvent, **context):
|
||||
room = event.slots.get('room')
|
||||
if room:
|
||||
run("light.hue.on", groups=[room])
|
||||
else:
|
||||
run("light.hue.on")
|
||||
|
||||
Note that if both ``stt_enabled`` and ``intent_model_path`` are set, then
|
||||
both the speech-to-text and intent recognition engines will run in parallel
|
||||
when a conversation is started.
|
||||
|
||||
The intent engine is usually faster, as it has a smaller set of intents to
|
||||
match and doesn't have to run a full speech-to-text transcription. This means that,
|
||||
if an utterance matches both a speech-to-text phrase and an intent, the
|
||||
:class:`platypush.message.event.assistant.IntentRecognizedEvent` event is emitted
|
||||
(and not :class:`platypush.message.event.assistant.SpeechRecognizedEvent`).
|
||||
|
||||
This may not be always the case though. So it may be a good practice to
|
||||
also provide a fallback
|
||||
:class:`platypush.message.event.assistant.SpeechRecognizedEvent` hook to
|
||||
catch the text if the speech is not recognized as an intent:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import SpeechRecognizedEvent
|
||||
|
||||
@hook(SpeechRecognizedEvent, phrase='turn ${state} (the)? ${room} lights?')
|
||||
def on_turn_on_lights(event: SpeechRecognizedEvent, phrase, room, **context):
|
||||
if room:
|
||||
run("light.hue.on", groups=[room])
|
||||
else:
|
||||
run("light.hue.on")
|
||||
|
||||
Text-to-speech
|
||||
--------------
|
||||
|
||||
The text-to-speech engine is based on `Orca
|
||||
<https://picovoice.ai/docs/orca/>`_.
|
||||
|
||||
It is not directly implemented by this plugin, but the implementation is
|
||||
provided in the :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin`
|
||||
plugin.
|
||||
|
||||
You can however leverage the :meth:`.render_response` action to render some
|
||||
text as speech in response to a user command, and that in turn will leverage
|
||||
the PicoVoice TTS plugin to render the response.
|
||||
|
||||
For example, the following snippet provides a hook that:
|
||||
|
||||
- Listens for
|
||||
:class:`platypush.message.event.assistant.SpeechRecognizedEvent`.
|
||||
|
||||
- Matches the phrase against a list of predefined commands that
|
||||
shouldn't require a response.
|
||||
|
||||
- Has a fallback logic that leverages the
|
||||
:class:`platypush.plugins.openai.OpenaiPlugin` to generate a response
|
||||
for the given text and renders it as speech.
|
||||
|
||||
- Has a logic for follow-on turns if the response from ChatGPT is a question.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime as dt, timedelta
|
||||
from dateutil.parser import isoparse
|
||||
from logging import getLogger
|
||||
|
||||
from platypush import hook, run
|
||||
from platypush.message.event.assistant import (
|
||||
SpeechRecognizedEvent,
|
||||
ResponseEndEvent,
|
||||
)
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
def play_music(*_, **__):
|
||||
run("music.mopidy.play")
|
||||
|
||||
def stop_music(*_, **__):
|
||||
run("music.mopidy.stop")
|
||||
|
||||
def ai_assist(event: SpeechRecognizedEvent, **__):
|
||||
response = run("openai.get_response", prompt=event.phrase)
|
||||
if not response:
|
||||
return
|
||||
|
||||
run("assistant.picovoice.render_response", text=response)
|
||||
|
||||
# List of commands to match, as pairs of regex patterns and the
|
||||
# corresponding actions
|
||||
hooks = (
|
||||
(re.compile(r"play (the)?music", re.IGNORECASE), play_music),
|
||||
(re.compile(r"stop (the)?music", re.IGNORECASE), stop_music),
|
||||
# Fallback to the AI assistant
|
||||
(re.compile(r".*"), ai_assist),
|
||||
)
|
||||
|
||||
@hook(SpeechRecognizedEvent)
|
||||
def on_speech_recognized(event, **kwargs):
|
||||
for pattern, command in hooks:
|
||||
if pattern.search(event.phrase):
|
||||
logger.info("Running voice command %s", command.__name__)
|
||||
command(event, **kwargs)
|
||||
break
|
||||
|
||||
@hook(ResponseEndEvent)
|
||||
def on_response_end(event: ResponseEndEvent, **__):
|
||||
# Check if the response is a question and start a follow-on turn if so.
|
||||
# Note that the ``openai`` plugin by default is configured to keep
|
||||
# the past interaction in a context window of ~10 minutes, so you
|
||||
# can follow up like in a real conversation.
|
||||
if event.assistant and event.response_text and event.response_text.endswith("?"):
|
||||
event.assistant.start_conversation()
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
access_key: str,
|
||||
hotword_enabled: bool = True,
|
||||
stt_enabled: bool = True,
|
||||
keywords: Optional[Sequence[str]] = None,
|
||||
keyword_paths: Optional[Sequence[str]] = None,
|
||||
keyword_model_path: Optional[str] = None,
|
||||
speech_model_path: Optional[str] = None,
|
||||
intent_model_path: Optional[str] = None,
|
||||
endpoint_duration: Optional[float] = 0.5,
|
||||
enable_automatic_punctuation: bool = False,
|
||||
start_conversation_on_hotword: bool = True,
|
||||
audio_queue_size: int = 100,
|
||||
conversation_timeout: Optional[float] = 7.5,
|
||||
muted: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
:param access_key: Your Picovoice access key. You can get it by signing
|
||||
up at the `Picovoice console <https://console.picovoice.ai/>`.
|
||||
:param hotword_enabled: Enable the wake-word engine (default: True).
|
||||
**Note**: The wake-word engine requires you to add Porcupine to the
|
||||
products available in your Picovoice account.
|
||||
:param stt_enabled: Enable the speech-to-text engine (default: True).
|
||||
**Note**: The speech-to-text engine requires you to add Cheetah to
|
||||
the products available in your Picovoice account.
|
||||
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
|
||||
google``...). This is required if the wake-word engine is enabled.
|
||||
See the `Porcupine keywords repository
|
||||
<https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
|
||||
for a list of the stock keywords available. If you have a custom
|
||||
model, you can pass its path to the ``keyword_paths`` parameter and
|
||||
its filename (without the path and the platform extension) here.
|
||||
:param keyword_paths: List of paths to the keyword files to listen for.
|
||||
Custom keyword files can be created using the `Porcupine console
|
||||
<https://console.picovoice.ai/ppn>`_ and downloaded from the
|
||||
console itself.
|
||||
:param keyword_model_path: If you are using a keyword file in a
|
||||
non-English language, you can provide the path to the model file
|
||||
for its language. Model files are available for all the supported
|
||||
languages through the `Porcupine lib repository
|
||||
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
|
||||
:param speech_model_path: Path to the speech model file. If you are
|
||||
using a language other than English, you can provide the path to the
|
||||
model file for that language. Model files are available for all the
|
||||
supported languages through the `Cheetah repository
|
||||
<https://github.com/Picovoice/cheetah/tree/master/lib/common>`_.
|
||||
You can also use the `Speech console
|
||||
<https://console.picovoice.ai/cat>`_
|
||||
to train your custom models. You can use a base model and fine-tune
|
||||
it by boosting the detection of your own words and phrases and edit
|
||||
the phonetic representation of the words you want to detect.
|
||||
:param intent_model_path: Path to the Rhino context model. This is
|
||||
required if you want to use the intent recognition engine through
|
||||
Rhino. The context model is a file that contains a list of intents
|
||||
that can be recognized by the engine. An intent is an action or a
|
||||
class of actions that the assistant can recognize, and it can
|
||||
contain an optional number of slots to model context variables -
|
||||
e.g. temperature, lights group, location, device state etc.
|
||||
You can create your own context model using the `Rhino console
|
||||
<https://console.picovoice.ai/rhn>`_. For example, you can define a
|
||||
context file to control smart home devices by defining the
|
||||
following slots:
|
||||
|
||||
- ``device_type``: The device to control (e.g. lights, music)
|
||||
- ``device_state``: The target state of the device (e.g. on,
|
||||
off)
|
||||
- ``location``: The location of the device (e.g. living
|
||||
room, kitchen, bedroom)
|
||||
- ``media_type``: The type of media to play (e.g. music, video)
|
||||
- ``media_state``: The state of the media (e.g. play, pause,
|
||||
stop)
|
||||
|
||||
You can then define the following intents:
|
||||
|
||||
- ``device_ctrl``: Control a device state. Supported phrases:
|
||||
- "turn ``$device_state:state`` the ``$location:location``
|
||||
``$device_type:device``"
|
||||
- "turn ``$device_state:state`` the ``$device_type:device``"
|
||||
|
||||
- ``media_ctrl``: Control media state. Supported phrases:
|
||||
- "``$media_state:state`` the ``$media_type:media``"
|
||||
- "``$media_state:state`` the ``$media_type:media`` in the
|
||||
``$location:location``"
|
||||
|
||||
Then a phrase like "turn on the lights in the living room" would
|
||||
trigger a
|
||||
:class:`platypush.message.event.assistant.IntentRecognizedEvent` with:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{
|
||||
"intent": "device_ctrl",
|
||||
"slots": {
|
||||
"type": "lights",
|
||||
"state": "on",
|
||||
"location": "living room"
|
||||
}
|
||||
}
|
||||
|
||||
**Note**: The intent recognition engine requires you to add Rhino
|
||||
to the products available in your Picovoice account.
|
||||
:param endpoint_duration: If set, the assistant will stop listening when
|
||||
no speech is detected for the specified duration (in seconds) after
|
||||
the end of an utterance.
|
||||
:param enable_automatic_punctuation: Enable automatic punctuation
|
||||
insertion.
|
||||
:param start_conversation_on_hotword: If set to True (default), a speech
|
||||
detection session will be started when the hotword is detected. If
|
||||
set to False, you may want to start the conversation programmatically
|
||||
by calling the :meth:`.start_conversation` method instead, or run any
|
||||
custom logic hotword detection logic. This can be particularly useful
|
||||
when you want to run the assistant in a push-to-talk mode, or when you
|
||||
want different hotwords to trigger conversations with different models
|
||||
or languages.
|
||||
:param audio_queue_size: Maximum number of audio frames to hold in the
|
||||
processing queue. You may want to increase this value if you are
|
||||
running this integration on a slow device and/or the logs report
|
||||
audio frame drops too often. Keep in mind that increasing this value
|
||||
will increase the memory usage of the integration. Also, a higher
|
||||
value may result in higher accuracy at the cost of higher latency.
|
||||
:param conversation_timeout: Maximum time to wait for some speech to be
|
||||
detected after the hotword is detected. If no speech is detected
|
||||
within this time, the conversation will time out and the plugin will
|
||||
go back into hotword detection mode, if the mode is enabled. Default:
|
||||
7.5 seconds.
|
||||
:param muted: Set to True to start the assistant in a muted state. You will
|
||||
need to call the :meth:`.unmute` method to start the assistant listening
|
||||
for commands, or programmatically call the :meth:`.start_conversation`
|
||||
to start a conversation.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._assistant = None
|
||||
self._assistant_args = {
|
||||
'stop_event': self._should_stop,
|
||||
'access_key': access_key,
|
||||
'hotword_enabled': hotword_enabled,
|
||||
'stt_enabled': stt_enabled,
|
||||
'keywords': keywords,
|
||||
'keyword_paths': (
|
||||
os.path.expanduser(keyword_path)
|
||||
for keyword_path in (keyword_paths or [])
|
||||
),
|
||||
'keyword_model_path': (
|
||||
os.path.expanduser(keyword_model_path) if keyword_model_path else None
|
||||
),
|
||||
'speech_model_path': (
|
||||
os.path.expanduser(speech_model_path) if speech_model_path else None
|
||||
),
|
||||
'intent_model_path': (
|
||||
os.path.expanduser(intent_model_path) if intent_model_path else None
|
||||
),
|
||||
'endpoint_duration': endpoint_duration,
|
||||
'enable_automatic_punctuation': enable_automatic_punctuation,
|
||||
'start_conversation_on_hotword': (
|
||||
start_conversation_on_hotword
|
||||
if (intent_model_path or stt_enabled)
|
||||
else False
|
||||
),
|
||||
'audio_queue_size': audio_queue_size,
|
||||
'conversation_timeout': conversation_timeout,
|
||||
'muted': muted,
|
||||
'on_conversation_start': self._on_conversation_start,
|
||||
'on_conversation_end': self._on_conversation_end,
|
||||
'on_conversation_timeout': self._on_conversation_timeout,
|
||||
'on_speech_recognized': self._on_speech_recognized,
|
||||
'on_intent_matched': self._on_intent_matched,
|
||||
'on_hotword_detected': self._on_hotword_detected,
|
||||
}
|
||||
|
||||
@property
|
||||
def tts(self) -> TtsPicovoicePlugin:
|
||||
p = get_plugin('tts.picovoice')
|
||||
assert p, 'Picovoice TTS plugin not configured/found'
|
||||
return p
|
||||
|
||||
    def _get_tts_plugin(self) -> TtsPicovoicePlugin:
        # Hook used by the base assistant plugin to resolve the TTS engine
        # that renders the assistant responses.
        return self.tts
|
||||
|
||||
def _on_response_render_start(self, text: Optional[str]):
|
||||
if self._assistant:
|
||||
self._assistant.set_responding(True)
|
||||
return super()._on_response_render_start(text)
|
||||
|
||||
def _on_response_render_end(self):
|
||||
if self._assistant:
|
||||
self._assistant.set_responding(False)
|
||||
|
||||
return super()._on_response_render_end()
|
||||
|
||||
@action
|
||||
def start_conversation(self, *_, model_file: Optional[str] = None, **__):
|
||||
"""
|
||||
Programmatically start a conversation with the assistant.
|
||||
|
||||
:param model_file: Override the model file to be used to detect speech
|
||||
in this conversation. If not set, the configured
|
||||
``speech_model_path`` will be used.
|
||||
"""
|
||||
if not self._assistant:
|
||||
self.logger.warning('Assistant not initialized')
|
||||
return
|
||||
|
||||
if not model_file:
|
||||
model_file = self._assistant_args['speech_model_path']
|
||||
if model_file:
|
||||
model_file = os.path.expanduser(model_file)
|
||||
|
||||
self._assistant.override_speech_model(model_file)
|
||||
self._assistant.state = AssistantState.DETECTING_SPEECH
|
||||
|
||||
@action
|
||||
def stop_conversation(self, *_, **__):
|
||||
"""
|
||||
Programmatically stop a running conversation with the assistant
|
||||
"""
|
||||
if not self._assistant:
|
||||
self.logger.warning('Assistant not initialized')
|
||||
return
|
||||
|
||||
self._assistant.override_speech_model(None)
|
||||
|
||||
if self._assistant.hotword_enabled:
|
||||
self._assistant.state = AssistantState.DETECTING_HOTWORD
|
||||
else:
|
||||
self._assistant.state = AssistantState.IDLE
|
||||
|
||||
    @action
    def say(self, text: str, *args, **kwargs):
        """
        Proxy to
        :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin.say` to
        render some text as speech through the Picovoice TTS engine.

        Extra arguments to
        :class:`platypush.plugins.tts.picovoice.TtsPicovoicePlugin.say` can be
        passed over ``args`` and ``kwargs``.

        :param text: Text to be rendered as speech.
        """
        # Pure delegation to the tts.picovoice plugin.
        return self.tts.say(text, *args, **kwargs)
|
||||
|
||||
    @action
    def transcribe(self, audio_file: str, *_, model_file: Optional[str] = None, **__):
        """
        Transcribe an audio file to text using the `Leopard
        <https://picovoice.ai/docs/leopard/>`_ engine.

        :param audio_file: Path to the audio file to be transcribed.
        :param model_file: Override the model file to be used to detect speech
            in this conversation. If not set, the configured
            ``speech_model_path`` will be used.
        :return: dict

          .. code-block:: json

            {
              "transcription": "This is a test",
              "words": [
                {
                  "word": "this",
                  "start": 0.06400000303983688,
                  "end": 0.19200000166893005,
                  "confidence": 0.9626294374465942
                },
                {
                  "word": "is",
                  "start": 0.2879999876022339,
                  "end": 0.35199999809265137,
                  "confidence": 0.9781675934791565
                },
                {
                  "word": "a",
                  "start": 0.41600000858306885,
                  "end": 0.41600000858306885,
                  "confidence": 0.9764975309371948
                },
                {
                  "word": "test",
                  "start": 0.5120000243186951,
                  "end": 0.8320000171661377,
                  "confidence": 0.9511580467224121
                }
              ]
            }

        """
        # Lazy import: Leopard is only needed for offline transcription, so
        # the dependency is only loaded when this action is actually invoked.
        import pvleopard

        audio_file = os.path.expanduser(audio_file)
        if not model_file:
            model_file = self._assistant_args['speech_model_path']
        if model_file:
            model_file = os.path.expanduser(model_file)

        leopard = pvleopard.create(
            access_key=self._assistant_args['access_key'], model_path=model_file
        )

        transcript, words = leopard.process_file(audio_file)

        try:
            return {
                'transcription': transcript,
                'words': [
                    {
                        'word': word.word,
                        'start': word.start_sec,
                        'end': word.end_sec,
                        'confidence': word.confidence,
                    }
                    for word in words
                ],
            }
        finally:
            # Always release the Leopard engine resources, even if building
            # the response payload fails.
            leopard.delete()
|
||||
|
||||
    @action
    def mute(self, *_, **__):
        """
        Mute the microphone. Alias for :meth:`.set_mic_mute` with
        ``muted=True``.
        """
        # Thin convenience wrapper around set_mic_mute.
        return self.set_mic_mute(muted=True)
|
||||
|
||||
    @action
    def unmute(self, *_, **__):
        """
        Unmute the microphone. Alias for :meth:`.set_mic_mute` with
        ``muted=False``.
        """
        # Thin convenience wrapper around set_mic_mute.
        return self.set_mic_mute(muted=False)
|
||||
|
||||
@action
|
||||
def set_mic_mute(self, muted: bool):
|
||||
"""
|
||||
Programmatically mute/unmute the microphone.
|
||||
|
||||
:param muted: Set to True or False.
|
||||
"""
|
||||
if self._assistant:
|
||||
self._assistant.set_mic_mute(muted)
|
||||
|
||||
super()._on_mute_changed(muted)
|
||||
|
||||
    @action
    def toggle_mute(self, *_, **__):
        """
        Toggle the mic mute state.
        """
        # Flip the current mute flag tracked by the base plugin.
        return self.set_mic_mute(not self._is_muted)
|
||||
|
||||
    @action
    def send_text_query(self, *_, query: str, **__):
        """
        Send a text query to the assistant.

        This is equivalent to saying something to the assistant.

        :param query: Query to be sent.
        """
        # Feed the query through the same path used for recognized speech.
        self._on_speech_recognized(query)
|
||||
|
||||
    def main(self):
        # Main plugin loop: keep (re)creating the assistant until the plugin
        # is asked to stop. The Assistant is used as a context manager so its
        # audio resources are released on every restart.
        while not self.should_stop():
            self.logger.info('Starting Picovoice assistant')
            with Assistant(**self._assistant_args) as self._assistant:
                try:
                    # Drain assistant events; processing/callbacks happen
                    # inside the Assistant iterator itself.
                    for event in self._assistant:
                        if event is not None:
                            self.logger.debug('Dequeued assistant event: %s', event)
                except KeyboardInterrupt:
                    break
                except Exception as e:
                    self.logger.error('Picovoice assistant error: %s', e, exc_info=True)
                    # Back off before restarting to avoid tight crash loops.
                    self.wait_stop(5)
|
||||
|
||||
    def stop(self):
        # Terminate any active conversation before shutting the plugin down.
        try:
            self.stop_conversation()
        except RuntimeError:
            # The assistant may already be gone at shutdown time; best-effort.
            pass

        super().stop()
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -0,0 +1,424 @@
|
|||
import logging
|
||||
import os
|
||||
from queue import Full, Queue
|
||||
from threading import Event, RLock, Thread
|
||||
from time import time
|
||||
from typing import Any, Dict, Optional, Sequence
|
||||
|
||||
import pvporcupine
|
||||
|
||||
from platypush.context import get_plugin
|
||||
from platypush.message.event.assistant import (
|
||||
AssistantEvent,
|
||||
ConversationTimeoutEvent,
|
||||
HotwordDetectedEvent,
|
||||
IntentRecognizedEvent,
|
||||
SpeechRecognizedEvent,
|
||||
)
|
||||
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
|
||||
|
||||
from ._recorder import AudioRecorder
|
||||
from ._speech import SpeechProcessor
|
||||
from ._state import AssistantState
|
||||
|
||||
|
||||
class Assistant(Thread):
|
||||
"""
|
||||
A facade class that wraps the Picovoice engines under an assistant API.
|
||||
"""
|
||||
|
||||
    @staticmethod
    def _default_callback(*_, **__):
        # No-op default used for all the assistant event callbacks.
        pass
|
||||
|
||||
    def __init__(
        self,
        access_key: str,
        stop_event: Event,
        hotword_enabled: bool = True,
        stt_enabled: bool = True,
        keywords: Optional[Sequence[str]] = None,
        keyword_paths: Optional[Sequence[str]] = None,
        keyword_model_path: Optional[str] = None,
        frame_expiration: float = 3.0,  # Don't process audio frames older than this
        speech_model_path: Optional[str] = None,
        intent_model_path: Optional[str] = None,
        endpoint_duration: Optional[float] = None,
        enable_automatic_punctuation: bool = False,
        start_conversation_on_hotword: bool = False,
        audio_queue_size: int = 100,
        muted: bool = False,
        conversation_timeout: Optional[float] = None,
        on_conversation_start=_default_callback,
        on_conversation_end=_default_callback,
        on_conversation_timeout=_default_callback,
        on_speech_recognized=_default_callback,
        on_intent_matched=_default_callback,
        on_hotword_detected=_default_callback,
    ):
        """
        :param access_key: Picovoice access key.
        :param stop_event: Event used by the owning plugin to signal shutdown.
        :param hotword_enabled: Whether the Porcupine wake-word engine is on.
        :param stt_enabled: Whether the Cheetah speech-to-text engine is on.
        :param keywords: Stock keyword names to listen for (required when the
            wake-word engine is enabled).
        :param keyword_paths: Paths to custom keyword files.
        :param keyword_model_path: Optional non-English keyword model file.
        :param frame_expiration: Audio frames older than this (seconds) are
            dropped instead of processed.
        :param speech_model_path: Optional Cheetah speech model file.
        :param intent_model_path: Optional Rhino context file; configuring it
            implicitly enables intent recognition.
        :param endpoint_duration: Silence duration (seconds) marking the end
            of an utterance.
        :param enable_automatic_punctuation: Enable automatic punctuation.
        :param start_conversation_on_hotword: Start a speech detection session
            when a hotword is detected.
        :param audio_queue_size: Max audio frames buffered for processing.
        :param muted: Start with the audio recorder paused.
        :param conversation_timeout: Max seconds to wait for speech after a
            hotword before timing out.
        :param on_*: Event callbacks invoked by the processing loop; they
            default to no-ops.
        """
        super().__init__(name='picovoice:Assistant')

        self._access_key = access_key
        self._stop_event = stop_event
        self.logger = logging.getLogger(__name__)
        self.hotword_enabled = hotword_enabled
        self.stt_enabled = stt_enabled
        self.keywords = list(keywords or [])
        # keyword_paths/keyword_model_path are only populated below after
        # validation, and only when the wake-word engine is enabled.
        self.keyword_paths = None
        self.keyword_model_path = None
        self.frame_expiration = frame_expiration
        self.endpoint_duration = endpoint_duration
        self.enable_automatic_punctuation = enable_automatic_punctuation
        self.start_conversation_on_hotword = start_conversation_on_hotword
        self.audio_queue_size = audio_queue_size
        # Set while a TTS response is being rendered (see set_responding).
        self._responding = Event()
        self._muted = muted
        # *_override attributes hold per-conversation model overrides and take
        # precedence over the configured paths (see the properties below).
        self._speech_model_path = speech_model_path
        self._speech_model_path_override = None
        self._intent_model_path = intent_model_path
        self._intent_model_path_override = None
        # True while inside the __enter__/__exit__ context.
        self._in_ctx = False

        # Speech/intent processing is delegated to a dedicated processor; the
        # engine args are provided lazily via the getter callables so model
        # overrides are picked up when the engines are (re)created.
        self._speech_processor = SpeechProcessor(
            stop_event=stop_event,
            stt_enabled=stt_enabled,
            intent_enabled=self.intent_enabled,
            conversation_timeout=conversation_timeout,
            model_path=speech_model_path,
            get_cheetah_args=self._get_speech_engine_args,
            get_rhino_args=self._get_intent_engine_args,
        )

        self._on_conversation_start = on_conversation_start
        self._on_conversation_end = on_conversation_end
        self._on_conversation_timeout = on_conversation_timeout
        self._on_speech_recognized = on_speech_recognized
        self._on_intent_matched = on_intent_matched
        self._on_hotword_detected = on_hotword_detected

        self._recorder = None
        self._state = AssistantState.IDLE
        # Guards all state transitions (see the state property/setter).
        self._state_lock = RLock()
        # Queue of assistant events consumed by __next__.
        self._evt_queue = Queue(maxsize=100)

        if hotword_enabled:
            if not keywords:
                raise ValueError(
                    'You need to provide a list of keywords if the wake-word engine is enabled'
                )

            if keyword_paths:
                keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
                missing_paths = [
                    path for path in keyword_paths if not os.path.isfile(path)
                ]
                if missing_paths:
                    raise FileNotFoundError(f'Keyword files not found: {missing_paths}')

                self.keyword_paths = keyword_paths

            if keyword_model_path:
                keyword_model_path = os.path.expanduser(keyword_model_path)
                if not os.path.isfile(keyword_model_path):
                    raise FileNotFoundError(
                        f'Keyword model file not found: {keyword_model_path}'
                    )

                self.keyword_model_path = keyword_model_path

        # Porcupine engine is created lazily by the porcupine property.
        self._porcupine: Optional[pvporcupine.Porcupine] = None
|
||||
|
||||
    @property
    def intent_enabled(self) -> bool:
        # Intent recognition is implicitly enabled by configuring a Rhino
        # context model.
        return self.intent_model_path is not None
|
||||
|
||||
    @property
    def is_responding(self) -> bool:
        # True while a response is being rendered (see set_responding).
        return self._responding.is_set()
|
||||
|
||||
    @property
    def speech_model_path(self) -> Optional[str]:
        # A per-conversation override takes precedence over the configured model.
        return self._speech_model_path_override or self._speech_model_path
|
||||
|
||||
    @property
    def intent_model_path(self) -> Optional[str]:
        # A per-conversation override takes precedence over the configured model.
        return self._intent_model_path_override or self._intent_model_path
|
||||
|
||||
@property
|
||||
def tts(self) -> TtsPicovoicePlugin:
|
||||
p = get_plugin('tts.picovoice')
|
||||
assert p, 'Picovoice TTS plugin not configured/found'
|
||||
return p
|
||||
|
||||
def set_responding(self, responding: bool):
|
||||
if responding:
|
||||
self._responding.set()
|
||||
else:
|
||||
self._responding.clear()
|
||||
|
||||
    def should_stop(self):
        # True once the owning plugin has signalled shutdown.
        return self._stop_event.is_set()
|
||||
|
||||
    def wait_stop(self):
        # Block until the owning plugin signals shutdown.
        self._stop_event.wait()
|
||||
|
||||
    @property
    def state(self) -> AssistantState:
        # Read under the state lock so a transition in another thread is never
        # observed half-way through.
        with self._state_lock:
            return self._state
|
||||
|
||||
    @state.setter
    def state(self, state: AssistantState):
        # Swap the state under the lock; side effects run afterwards so they
        # don't execute while holding it (_state_lock is an RLock, so the
        # nested self.state read is safe).
        with self._state_lock:
            prev_state = self._state
            self._state = state
            new_state = self.state

        if prev_state == new_state:
            return

        self.logger.info('Assistant state transition: %s -> %s', prev_state, new_state)
        if prev_state == AssistantState.DETECTING_SPEECH:
            # Leaving a conversation: stop any TTS playback, drop the
            # per-conversation model overrides and fire the end callbacks.
            self.tts.stop()
            self._speech_model_path_override = None
            self._intent_model_path_override = None
            self._speech_processor.on_conversation_end()
            self._on_conversation_end()
        elif new_state == AssistantState.DETECTING_SPEECH:
            self._speech_processor.on_conversation_start()
            self._on_conversation_start()

        if new_state == AssistantState.DETECTING_HOTWORD:
            self.tts.stop()
            self._speech_processor.on_conversation_reset()

            # Put a null event on the event queue to unblock next_event
            self._evt_queue.put(None)
|
||||
|
||||
@property
|
||||
def porcupine(self) -> Optional[pvporcupine.Porcupine]:
|
||||
if not self.hotword_enabled:
|
||||
return None
|
||||
|
||||
if not self._porcupine:
|
||||
args: Dict[str, Any] = {'access_key': self._access_key}
|
||||
if self.keywords:
|
||||
args['keywords'] = self.keywords
|
||||
if self.keyword_paths:
|
||||
args['keyword_paths'] = self.keyword_paths
|
||||
if self.keyword_model_path:
|
||||
args['model_path'] = self.keyword_model_path
|
||||
|
||||
self._porcupine = pvporcupine.create(**args)
|
||||
|
||||
return self._porcupine
|
||||
|
||||
def _get_speech_engine_args(self) -> dict:
|
||||
args: Dict[str, Any] = {'access_key': self._access_key}
|
||||
if self.speech_model_path:
|
||||
args['model_path'] = self.speech_model_path
|
||||
if self.endpoint_duration:
|
||||
args['endpoint_duration_sec'] = self.endpoint_duration
|
||||
if self.enable_automatic_punctuation:
|
||||
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
|
||||
|
||||
return args
|
||||
|
||||
def _get_intent_engine_args(self) -> dict:
|
||||
args: Dict[str, Any] = {'access_key': self._access_key}
|
||||
args['context_path'] = self.intent_model_path
|
||||
if self.endpoint_duration:
|
||||
args['endpoint_duration_sec'] = self.endpoint_duration
|
||||
if self.enable_automatic_punctuation:
|
||||
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
|
||||
|
||||
return args
|
||||
|
||||
    def __enter__(self):
        """
        Get the assistant ready to start processing audio frames.
        """
        if self.should_stop():
            return self

        assert not self.is_alive(), 'The assistant is already running'
        self._in_ctx = True

        if self._recorder:
            self.logger.info('A recording stream already exists')
        elif self.hotword_enabled or self.stt_enabled or self.intent_enabled:
            # Porcupine dictates the audio format when the hotword engine is
            # on; otherwise fall back to the speech processor's requirements.
            sample_rate = (self.porcupine or self._speech_processor).sample_rate
            frame_length = (self.porcupine or self._speech_processor).frame_length
            self._recorder = AudioRecorder(
                stop_event=self._stop_event,
                sample_rate=sample_rate,
                frame_size=frame_length,
                queue_size=self.audio_queue_size,
                paused=self._muted,
                channels=1,
            )

            self._speech_processor.__enter__()
            self._recorder.__enter__()

        # Initial state: wait for the hotword when Porcupine is available,
        # otherwise go straight into speech detection.
        if self.porcupine:
            self.state = AssistantState.DETECTING_HOTWORD
        else:
            self.state = AssistantState.DETECTING_SPEECH

        self.start()
        return self
|
||||
|
||||
def __exit__(self, *_):
    """
    Stop the assistant and release all resources - audio recorder,
    Porcupine engine and speech processor.
    """
    self._in_ctx = False
    if self._recorder:
        self._recorder.__exit__(*_)
        self._recorder = None

    self.state = AssistantState.IDLE

    # Release the native Porcupine handle, if one was created
    if self._porcupine:
        self._porcupine.delete()
        self._porcupine = None

    self._speech_processor.__exit__(*_)
|
||||
|
||||
def __iter__(self):
    """
    Iterate over processed assistant events.

    The assistant is its own iterator - see :meth:`__next__`.
    """
    return self
|
||||
|
||||
def __next__(self):
    """
    Process the next audio frame and return the corresponding event.

    :raises StopIteration: If the assistant has been stopped or no
        recorder is available.
    """
    if self.should_stop() or not self._recorder:
        raise StopIteration

    # In hotword detection mode the events are produced by the run()
    # thread and pushed to the event queue; just wait for the next one
    if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
        return self._evt_queue.get()

    evt = None
    if (
        self._speech_processor.enabled
        and self.state == AssistantState.DETECTING_SPEECH
    ):
        evt = self._speech_processor.next_event()

    # Trigger the matching internal handler for the event type, if any
    if isinstance(evt, SpeechRecognizedEvent):
        self._on_speech_recognized(phrase=evt.args['phrase'])
    if isinstance(evt, IntentRecognizedEvent):
        self._on_intent_matched(
            intent=evt.args['intent'], slots=evt.args.get('slots', {})
        )
    if isinstance(evt, ConversationTimeoutEvent):
        self._on_conversation_timeout()

    if evt:
        self._speech_processor.reset()

    # Once a speech event has been handled, go back to hotword detection
    # (when a hotword engine is enabled)
    if (
        evt
        and self.state == AssistantState.DETECTING_SPEECH
        and self.hotword_enabled
    ):
        self.state = AssistantState.DETECTING_HOTWORD

    return evt
|
||||
|
||||
def mute(self):
    """Mute the microphone by pausing the underlying audio recorder."""
    self._muted = True
    if self._recorder:
        self._recorder.pause()
|
||||
|
||||
def unmute(self):
    """Unmute the microphone by resuming the underlying audio recorder."""
    self._muted = False
    if self._recorder:
        self._recorder.resume()
|
||||
|
||||
def set_mic_mute(self, mute: bool):
    """
    Set the microphone mute state.

    :param mute: True to mute the microphone, False to unmute it.
    """
    action = self.mute if mute else self.unmute
    action()
|
||||
|
||||
def toggle_mic_mute(self):
    """Invert the current mute state of the microphone."""
    action = self.unmute if self._muted else self.mute
    action()
|
||||
|
||||
def _process_hotword(self, frame) -> Optional[HotwordDetectedEvent]:
    """
    Feed an audio frame to the Porcupine hotword engine.

    :param frame: Raw audio frame, in the format expected by Porcupine.
    :return: A ``HotwordDetectedEvent`` if one of the configured keywords
        was detected in the frame, ``None`` otherwise.
    """
    if not self.porcupine:
        return None

    keyword_index = self.porcupine.process(frame)
    if keyword_index is None:
        return None  # No keyword detected

    # Porcupine returns the index of the matched keyword, negative when
    # nothing was detected
    if keyword_index >= 0 and self.keywords:
        if self.start_conversation_on_hotword:
            # Immediately switch to speech detection
            self.state = AssistantState.DETECTING_SPEECH

        self.tts.stop()  # Stop any ongoing TTS when the hotword is detected
        self._on_hotword_detected(hotword=self.keywords[keyword_index])
        return HotwordDetectedEvent(hotword=self.keywords[keyword_index])

    return None
|
||||
|
||||
def override_speech_model(self, model_path: Optional[str]):
    """Set (or clear, if ``None``) a runtime override for the STT model path."""
    self._speech_model_path_override = model_path
|
||||
|
||||
def override_intent_model(self, model_path: Optional[str]):
    """Set (or clear, if ``None``) a runtime override for the intent model path."""
    self._intent_model_path_override = model_path
|
||||
|
||||
def _put_event(self, evt: AssistantEvent):
    """
    Push an event to the internal event queue without blocking.

    If the queue is full the event is dropped and a warning is logged,
    so the producing thread never stalls on slow consumers.
    """
    try:
        self._evt_queue.put_nowait(evt)
    except Full:
        self.logger.warning('The assistant event queue is full')
|
||||
|
||||
def run(self):
    """
    Main loop of the assistant thread.

    It pulls audio frames from the recorder and dispatches them either to
    the hotword engine or to the speech (STT/intent) processor, depending
    on the current state. Stale frames are dropped.
    """
    assert (
        self._in_ctx
    ), 'The assistant can only be started through a context manager'

    super().run()

    while not self.should_stop() and self._recorder:
        # Block until the recorder is actually recording (i.e. not muted)
        self._recorder.wait_start()
        if self.should_stop():
            break

        data = self._recorder.read()
        if data is None:
            continue

        frame, t = data
        if time() - t > self.frame_expiration:
            self.logger.info(
                'Skipping audio frame older than %ss', self.frame_expiration
            )
            continue  # The audio frame is too old

        if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
            evt = self._process_hotword(frame)
            if evt:
                # Hotword events are consumed by __next__ via the queue
                self._put_event(evt)

            continue

        if (
            self._speech_processor.enabled
            and self.state == AssistantState.DETECTING_SPEECH
        ):
            # Non-blocking: the processor threads consume the frame and
            # the resulting events are picked up by __next__
            self._speech_processor.process(frame, block=False)

    self.logger.info('Assistant stopped')
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -0,0 +1,51 @@
|
|||
from dataclasses import dataclass
|
||||
from time import time
|
||||
from typing import Optional
|
||||
|
||||
from ._intent import Intent
|
||||
|
||||
|
||||
@dataclass
class ConversationContext:
    """
    Context of the conversation process.

    It tracks the partial transcript, whether the engine has finalized its
    result, the detected intent (if any), and the timing information used
    to detect conversation timeouts.
    """

    # Partial (or final) STT transcript accumulated so far
    transcript: str = ''
    # True if the underlying engine has finalized the current result
    is_final: bool = False
    # Intent detected by the intent engine, if any
    intent: Optional[Intent] = None
    # Conversation timeout, in seconds (None = no timeout)
    timeout: Optional[float] = None
    # Timestamp (epoch seconds) when the conversation started
    t_start: Optional[float] = None

    def start(self):
        """Reset the context and mark the conversation start time."""
        self.reset()
        self.t_start = time()

    def reset(self):
        """Clear any state accumulated for the current conversation."""
        self.transcript = ''
        self.intent = None
        self.is_final = False
        self.t_start = None

    @property
    def timed_out(self) -> bool:
        """
        Whether the conversation has timed out.

        Two timeout windows apply, both relative to ``t_start``:

        - ``timeout`` seconds when the result is still incomplete (the
          transcript or the intent is missing and not finalized);
        - ``2 * timeout`` seconds when a partial result (transcript or
          intent) is already available but not yet finalized - i.e. the
          user is probably still speaking, so we wait a bit longer.
        """
        # No timeout configured, no conversation running, or the result
        # has already been finalized: never timed out
        if self.is_final or not (self.timeout and self.t_start):
            return False

        elapsed = time() - self.t_start

        # Incomplete result: time out after the base window
        if (not self.transcript or not self.intent) and elapsed > self.timeout:
            return True

        # Partial result available: allow a doubled window
        return bool((self.transcript or self.intent) and elapsed > self.timeout * 2)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -0,0 +1,11 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class Intent:
    """
    Speech intent data class.

    Represents an intent inferred from a phrase by the intent engine,
    together with its parsed slots.
    """

    # Name of the matched intent
    name: str
    # Mapping of slot name -> slot value extracted from the phrase
    slots: dict = field(default_factory=dict)
|
|
@ -0,0 +1,191 @@
|
|||
from collections import namedtuple
from dataclasses import dataclass, field
from logging import getLogger
from queue import Empty, Full, Queue
from threading import Event, RLock
from time import time
from typing import Optional

import sounddevice as sd

from platypush.utils import wait_for_either
|
||||
|
||||
|
||||
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
|
||||
|
||||
|
||||
@dataclass
class PauseState:
    """
    Holder for the pause/resume state of the audio recorder, together with
    the synchronization primitives needed to wait on state transitions.
    """

    _paused_event: Event = field(default_factory=Event)
    _recording_event: Event = field(default_factory=Event)
    _state_lock: RLock = field(default_factory=RLock)

    @property
    def paused(self):
        """Whether the recorder is currently paused."""
        with self._state_lock:
            return self._paused_event.is_set()

    def _set_paused(self, paused: bool):
        # Flip both events under the lock, so waiters always observe a
        # consistent (paused, recording) pair
        with self._state_lock:
            if paused:
                self._paused_event.set()
                self._recording_event.clear()
            else:
                self._paused_event.clear()
                self._recording_event.set()

    def pause(self):
        """
        Pause the audio recorder.
        """
        self._set_paused(True)

    def resume(self):
        """
        Resume the audio recorder.
        """
        self._set_paused(False)

    def toggle(self):
        """
        Toggle the audio recorder pause state.
        """
        with self._state_lock:
            self._set_paused(not self.paused)

    def wait_paused(self, timeout: Optional[float] = None):
        """
        Wait until the audio recorder is paused.
        """
        self._paused_event.wait(timeout=timeout)

    def wait_recording(self, timeout: Optional[float] = None):
        """
        Wait until the audio recorder is resumed.
        """
        self._recording_event.wait(timeout=timeout)
|
||||
|
||||
|
||||
class AudioRecorder:
    """
    Audio recorder component that uses the sounddevice library to record audio
    from the microphone.

    Captured frames are pushed from the sounddevice callback thread to an
    internal bounded queue, from which consumers pull them via :meth:`read`.
    """

    def __init__(
        self,
        stop_event: Event,
        sample_rate: int,
        frame_size: int,
        channels: int,
        paused: bool = False,
        dtype: str = 'int16',
        queue_size: int = 100,
    ):
        """
        :param stop_event: Upstream stop event - when set, the recorder
            stops as well.
        :param sample_rate: Sample rate of the audio stream, in Hz.
        :param frame_size: Number of samples per audio frame.
        :param channels: Number of audio channels to record.
        :param paused: Whether the recorder should start paused.
        :param dtype: Data type of the audio samples.
        :param queue_size: Maximum number of buffered audio frames.
        """
        self.logger = getLogger(__name__)
        self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
        self.frame_size = frame_size
        self._stop_event = Event()
        self._upstream_stop_event = stop_event
        self._paused_state = PauseState()
        if paused:
            self._paused_state.pause()
        else:
            self._paused_state.resume()

        self.stream = sd.InputStream(
            samplerate=sample_rate,
            channels=channels,
            dtype=dtype,
            blocksize=frame_size,
            callback=self._audio_callback,
        )

    @property
    def paused(self):
        """Whether the recorder is currently paused."""
        return self._paused_state.paused

    def __enter__(self):
        """
        Start the audio stream.
        """
        self._stop_event.clear()
        self.stream.start()
        return self

    def __exit__(self, *_):
        """
        Stop the audio stream.
        """
        self.stop()

    def _audio_callback(self, indata, *_):
        # Runs on the sounddevice callback thread: drop frames while
        # stopped or paused, otherwise enqueue them without blocking
        if self.should_stop() or self.paused:
            return

        try:
            self._audio_queue.put_nowait(AudioFrame(indata.reshape(-1), time()))
        except Full:
            self.logger.warning('Audio queue is full, dropping audio frame')

    def read(self, timeout: Optional[float] = None):
        """
        Read an audio frame from the queue.

        :param timeout: Timeout in seconds. If None, the method will block until
            an audio frame is available.
        :return: Audio frame or None if the timeout has expired.
        """
        try:
            return self._audio_queue.get(timeout=timeout)
        except Empty:
            # NOTE: Queue.get raises queue.Empty (not TimeoutError) when the
            # timeout expires - catching TimeoutError here would let the
            # exception propagate to the caller instead of returning None
            self.logger.debug('Audio queue is empty')
            return None

    def stop(self):
        """
        Stop the audio stream.
        """
        self._stop_event.set()
        self.stream.stop()

    def pause(self):
        """
        Pause the audio stream.
        """
        self._paused_state.pause()

    def resume(self):
        """
        Resume the audio stream.
        """
        self._paused_state.resume()

    def toggle(self):
        """
        Toggle the audio stream pause state.
        """
        self._paused_state.toggle()

    def should_stop(self):
        """Whether either the local or the upstream stop event is set."""
        return self._stop_event.is_set() or self._upstream_stop_event.is_set()

    def wait(self, timeout: Optional[float] = None):
        """
        Wait until the audio stream is stopped.
        """
        wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)

    def wait_start(self, timeout: Optional[float] = None):
        """
        Wait until the audio stream is started (recording), or stopped.
        """
        wait_for_either(
            self._stop_event,
            self._upstream_stop_event,
            self._paused_state._recording_event,
            timeout=timeout,
        )
|
|
@ -0,0 +1,3 @@
|
|||
from ._processor import SpeechProcessor
|
||||
|
||||
__all__ = ['SpeechProcessor']
|
|
@ -0,0 +1,180 @@
|
|||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from queue import Empty, Queue
|
||||
from threading import Event, RLock, Thread, get_ident
|
||||
from typing import Any, Optional, Sequence
|
||||
|
||||
from platypush.message.event.assistant import AssistantEvent
|
||||
|
||||
from .._context import ConversationContext
|
||||
|
||||
|
||||
class BaseProcessor(ABC, Thread):
    """
    Base speech processor class. It is implemented by the ``SttProcessor`` and
    the ``IntentProcessor`` classes.

    Each processor runs on its own thread: audio frames are pushed through
    :meth:`enqueue` and the resulting events are retrieved through
    :meth:`last_event`.
    """

    def __init__(
        self,
        *args,
        stop_event: Event,
        enabled: bool = True,
        conversation_timeout: Optional[float] = None,
        **kwargs,
    ):
        """
        :param stop_event: Upstream stop event shared with the assistant.
        :param enabled: Whether this processor is active; when False, all
            the public methods are no-ops.
        :param conversation_timeout: Conversation timeout, in seconds,
            passed to the conversation context.
        """
        super().__init__(*args, name=f'picovoice:{self.__class__.__name__}', **kwargs)

        self.logger = logging.getLogger(self.name)
        self._enabled = enabled
        self._audio_queue = Queue()
        self._stop_event = stop_event
        self._ctx = ConversationContext(timeout=conversation_timeout)
        self._event_queue = Queue()
        # This event is set if the upstream processor is waiting for an event
        # from this processor
        self._event_wait = Event()
        # This event is set when the processor is done with the audio
        # processing and it's ready to accept a new audio frame
        self._processing_done = Event()
        self._processing_done.set()
        self._state_lock = RLock()

    def should_stop(self) -> bool:
        """Whether the upstream stop event has been set."""
        return self._stop_event.is_set()

    def wait_stop(self, timeout: Optional[float] = None) -> bool:
        """Wait for the upstream stop event, up to ``timeout`` seconds."""
        return self._stop_event.wait(timeout)

    def enqueue(self, audio: Sequence[int]):
        """
        Push an audio frame to the processor's queue (no-op if disabled).
        """
        if not self._enabled:
            return

        # Mark that a consumer is expecting an event for this frame
        self._event_wait.set()
        self._processing_done.clear()
        self._audio_queue.put_nowait(audio)

    def reset(self) -> Optional[Any]:
        """
        Reset any pending context.
        """
        if not self._enabled:
            return

        with self._state_lock:
            self._ctx.reset()
            self._event_queue.queue.clear()
            self._event_wait.clear()
            self._processing_done.set()

    @property
    def processing_done(self) -> Event:
        """Event set when the processor is ready for a new audio frame."""
        return self._processing_done

    @property
    @abstractmethod
    def _model_path(self) -> Optional[str]:
        """
        Return the model path.
        """

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        """
        :return: The sample rate wanted by Cheetah/Rhino.
        """

    @property
    @abstractmethod
    def frame_length(self) -> int:
        """
        :return: The frame length wanted by Cheetah/Rhino.
        """

    def last_event(self) -> Optional[AssistantEvent]:
        """
        :return: The latest event that was processed by the processor.
        """
        with self._state_lock:
            evt = None
            try:
                # Drain the queue, keeping only the most recent event
                while True:
                    evt = self._event_queue.get_nowait()
            except Empty:
                pass

            if evt:
                # The pending wait has been satisfied
                self._event_wait.clear()

        return evt

    @abstractmethod
    def process(self, audio: Sequence[int]) -> Optional[AssistantEvent]:
        """
        Process speech events from a raw audio input.
        """

    def run(self):
        """
        Processor main loop: pull audio frames from the queue, process
        them, and publish the resulting events to the event queue.
        """
        super().run()
        if not self._enabled:
            # Disabled processors just idle until the stop event is set
            self.wait_stop()

        self.reset()
        self._processing_done.clear()
        self.logger.info('Processor started: %s', self.name)

        while not self.should_stop():
            audio = self._audio_queue.get()

            # The thread is stopped when it receives a None object
            if audio is None:
                break

            # Don't process the audio if the upstream processor is not waiting
            # for an event
            if not self._event_wait.is_set():
                continue

            try:
                self._processing_done.clear()
                event = self.process(audio)

                if event:
                    self.logger.debug(
                        'Dispatching event processed from %s: %s', self.name, event
                    )
                    self._event_queue.put_nowait(event)
                self._processing_done.set()
            except Exception as e:
                self.logger.error(
                    'An error occurred while processing the audio on %s: %s',
                    self.name,
                    e,
                    exc_info=e,
                )
                # Back off briefly after an engine error before retrying
                self.wait_stop(timeout=1)
                self._processing_done.set()
                continue

        self._processing_done.set()
        self.reset()
        self.logger.info('Processor stopped: %s', self.name)

    def stop(self):
        """
        Stop the processor thread and wait for it to terminate (no-op if
        disabled, and never joins from the processor's own thread).
        """
        if not self._enabled:
            return

        # A None frame is the sentinel that terminates the run() loop
        self._audio_queue.put_nowait(None)
        if self.is_alive() and self.ident != get_ident():
            self.logger.debug('Stopping %s', self.name)
            self.join()

    def on_conversation_start(self):
        """Hook invoked when a new conversation starts."""
        self._ctx.start()

    def on_conversation_end(self):
        """Hook invoked when a conversation ends."""
        self.reset()

    def on_conversation_reset(self):
        """Hook invoked when a conversation is reset."""
        self.reset()
|
|
@ -0,0 +1,93 @@
|
|||
from typing import Callable, Optional, Sequence, Union
|
||||
|
||||
import pvrhino
|
||||
|
||||
from platypush.message.event.assistant import (
|
||||
ConversationTimeoutEvent,
|
||||
IntentRecognizedEvent,
|
||||
)
|
||||
|
||||
from ._base import BaseProcessor
|
||||
|
||||
|
||||
class IntentProcessor(BaseProcessor):
    """
    Implementation of the speech-to-intent processor using the Picovoice Rhino
    engine.
    """

    def __init__(
        self, *args, get_rhino_args: Callable[[], dict] = lambda: {}, **kwargs
    ):
        """
        :param get_rhino_args: Callable returning the keyword arguments used
            to create the ``pvrhino`` engine. It is evaluated lazily, so the
            configuration (e.g. the model path) can change between calls.
        """
        super().__init__(*args, **kwargs)
        self._get_rhino_args = get_rhino_args
        # model_path -> Rhino instance cache
        self._rhino = {}

    @property
    def _model_path(self) -> Optional[str]:
        # Path of the currently configured Rhino model, if any
        return self._get_rhino_args().get('model_path')

    @property
    def sample_rate(self) -> int:
        """Sample rate (in Hz) expected by the Rhino engine."""
        return self._get_rhino().sample_rate

    @property
    def frame_length(self) -> int:
        """Number of samples per audio frame expected by the Rhino engine."""
        return self._get_rhino().frame_length

    def _get_rhino(self) -> pvrhino.Rhino:
        # Lazily create (and cache) a Rhino instance for the current model
        if not self._rhino.get(self._model_path):
            self._rhino[self._model_path] = pvrhino.create(**self._get_rhino_args())

        return self._rhino[self._model_path]

    def process(
        self, audio: Sequence[int]
    ) -> Optional[Union[IntentRecognizedEvent, ConversationTimeoutEvent]]:
        """
        Process the audio and return an ``IntentRecognizedEvent`` if the intent was
        understood, or a ``ConversationTimeoutEvent`` if the conversation timed
        out, or ``None`` if the intent processing is not yet finalized.
        """
        event = None
        rhino = self._get_rhino()
        self._ctx.is_final = rhino.process(audio)

        if self._ctx.is_final:
            inference = rhino.get_inference()
            self.logger.debug(
                'Intent detection finalized. Inference understood: %s',
                inference.is_understood,
            )

            if inference.is_understood:
                event = IntentRecognizedEvent(
                    intent=inference.intent,
                    slots=dict(inference.slots),
                )

        if not event and self._ctx.timed_out:
            event = ConversationTimeoutEvent()

        return event

    def reset(self) -> None:
        """Reset both the Rhino engine and the base processor state."""
        if not self._enabled:
            return

        with self._state_lock:
            self._get_rhino().reset()
            super().reset()

    def stop(self):
        """Stop the processor and release all cached Rhino instances."""
        if not self._enabled:
            return

        super().stop()

        with self._state_lock:
            # Iterate over a copy since the cache is mutated while deleting
            objs = self._rhino.copy()
            for key, obj in objs.items():
                obj.delete()
                self._rhino.pop(key)
|
|
@ -0,0 +1,208 @@
|
|||
import logging
|
||||
from queue import Queue
|
||||
from threading import Event
|
||||
from typing import Callable, Optional, Sequence
|
||||
|
||||
from platypush.message.event.assistant import AssistantEvent, ConversationTimeoutEvent
|
||||
from platypush.utils import wait_for_either
|
||||
|
||||
from ._intent import IntentProcessor
|
||||
from ._stt import SttProcessor
|
||||
|
||||
|
||||
class SpeechProcessor:
    """
    Speech processor class that wraps the STT and Intent processors under the
    same interface.
    """

    def __init__(
        self,
        stop_event: Event,
        model_path: Optional[str] = None,
        stt_enabled: bool = True,
        intent_enabled: bool = False,
        conversation_timeout: Optional[float] = None,
        get_cheetah_args: Callable[[], dict] = lambda: {},
        get_rhino_args: Callable[[], dict] = lambda: {},
    ):
        """
        :param stop_event: Upstream stop event shared with the assistant.
        :param model_path: Optional path of the default speech model.
        :param stt_enabled: Whether the STT (Cheetah) processor is enabled.
        :param intent_enabled: Whether the intent (Rhino) processor is enabled.
        :param conversation_timeout: Conversation timeout, in seconds.
        :param get_cheetah_args: Callable returning the Cheetah engine arguments.
        :param get_rhino_args: Callable returning the Rhino engine arguments.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self._stt_enabled = stt_enabled
        self._intent_enabled = intent_enabled
        self._model_path = model_path
        self._conversation_timeout = conversation_timeout
        self._audio_queue = Queue()
        self._stop_event = stop_event
        self._get_cheetah_args = get_cheetah_args
        self._get_rhino_args = get_rhino_args

        self._stt_processor = SttProcessor(
            conversation_timeout=conversation_timeout,
            stop_event=stop_event,
            enabled=stt_enabled,
            get_cheetah_args=get_cheetah_args,
        )

        self._intent_processor = IntentProcessor(
            conversation_timeout=conversation_timeout,
            stop_event=stop_event,
            enabled=intent_enabled,
            get_rhino_args=get_rhino_args,
        )

    @property
    def enabled(self) -> bool:
        """
        The processor is enabled if either the STT or the Intent processor are
        enabled.
        """
        return self._stt_enabled or self._intent_enabled

    def should_stop(self) -> bool:
        """Whether the upstream stop event has been set."""
        return self._stop_event.is_set()

    def next_event(self, timeout: Optional[float] = None) -> Optional[AssistantEvent]:
        """
        Wait for the next event from either processor.

        :param timeout: Maximum time to wait for the processors, in seconds.
        :return: The next available event, with priority given to the
            intent processor, or ``None``.
        """
        evt = None

        # Wait for either the STT or Intent processor to finish processing the audio
        completed = wait_for_either(
            self._stt_processor.processing_done,
            self._intent_processor.processing_done,
            self._stop_event,
            timeout=timeout,
        )

        if not completed:
            self.logger.warning('Timeout while waiting for the processors to finish')

        # Immediately return if the stop event is set
        if self.should_stop():
            return evt

        # Hold both state locks so the two processors are read consistently
        with self._stt_processor._state_lock, self._intent_processor._state_lock:
            # Priority to the intent processor event, if the processor is enabled
            if self._intent_enabled:
                evt = self._intent_processor.last_event()

            # If the intent processor didn't return any event, then return the STT
            # processor event
            if (
                not evt or isinstance(evt, ConversationTimeoutEvent)
            ) and self._stt_enabled:
                evt = self._stt_processor.last_event()

            if evt:
                self._stt_processor.reset()
                self._intent_processor.reset()

        return evt

    def process(
        self, audio: Sequence[int], block: bool = True, timeout: Optional[float] = None
    ) -> Optional[AssistantEvent]:
        """
        Process an audio frame.

        The audio frame is enqueued to both the STT and Intent processors, if
        enabled. The function waits for either processor to finish processing
        the audio, and returns the event from the first processor that returns
        a result.

        Priority is given to the Intent processor if enabled, otherwise the STT
        processor is used.
        """
        # Enqueue the audio to both the STT and Intent processors if enabled
        if self._stt_enabled:
            self._stt_processor.enqueue(audio)

        if self._intent_enabled:
            self._intent_processor.enqueue(audio)

        if not block:
            return None

        return self.next_event(timeout=timeout)

    def __enter__(self):
        """
        Context manager entry point - it wraps :meth:`start`.
        """
        self.start()

    def __exit__(self, *_, **__):
        """
        Context manager exit point - it wraps :meth:`stop`.
        """
        self.stop()

    def start(self):
        """
        Start the STT and Intent processors.
        """
        if self._stt_enabled:
            self._stt_processor.start()

        if self._intent_enabled:
            self._intent_processor.start()

    def stop(self):
        """
        Stop the STT and Intent processors.
        """
        self._stt_processor.stop()
        self._intent_processor.stop()

    def on_conversation_start(self):
        """Propagate the conversation start to the enabled processors."""
        if self._stt_enabled:
            self._stt_processor.on_conversation_start()

        if self._intent_enabled:
            self._intent_processor.on_conversation_start()

    def on_conversation_end(self):
        """Propagate the conversation end to the enabled processors."""
        if self._stt_enabled:
            self._stt_processor.on_conversation_end()

        if self._intent_enabled:
            self._intent_processor.on_conversation_end()

    def on_conversation_reset(self):
        """Propagate the conversation reset to the enabled processors."""
        if self._stt_enabled:
            self._stt_processor.on_conversation_reset()

        if self._intent_enabled:
            self._intent_processor.on_conversation_reset()

    def reset(self):
        """
        Reset the state of the STT and Intent processors.
        """
        self._stt_processor.reset()
        self._intent_processor.reset()

    @property
    def sample_rate(self) -> int:
        """
        The sample rate of the audio frames.
        """
        if self._intent_enabled:
            return self._intent_processor.sample_rate

        if self._stt_enabled:
            return self._stt_processor.sample_rate

        raise ValueError('No processor enabled')

    @property
    def frame_length(self) -> int:
        """
        The frame length of the audio frames.
        """
        if self._intent_enabled:
            return self._intent_processor.frame_length

        if self._stt_enabled:
            return self._stt_processor.frame_length

        raise ValueError('No processor enabled')
|
|
@ -0,0 +1,110 @@
|
|||
from typing import Callable, Optional, Sequence, Union
|
||||
|
||||
import pvcheetah
|
||||
|
||||
from platypush.message.event.assistant import (
|
||||
ConversationTimeoutEvent,
|
||||
SpeechRecognizedEvent,
|
||||
)
|
||||
|
||||
from ._base import BaseProcessor
|
||||
|
||||
|
||||
class SttProcessor(BaseProcessor):
    """
    Implementation of the speech-to-text processor using the Picovoice Cheetah
    engine.
    """

    def __init__(
        self, *args, get_cheetah_args: Callable[[], dict] = lambda: {}, **kwargs
    ):
        """
        :param get_cheetah_args: Callable returning the keyword arguments used
            to create the ``pvcheetah`` engine. It is evaluated lazily, so the
            configuration (e.g. the model path) can change between calls.
        """
        super().__init__(*args, **kwargs)
        self._get_cheetah_args = get_cheetah_args
        # model_path -> Cheetah instance cache
        # An instance for the currently configured model is created eagerly
        self._cheetah = {self._model_path: pvcheetah.create(**self._get_cheetah_args())}

    @property
    def _model_path(self) -> Optional[str]:
        # Path of the currently configured Cheetah model, if any
        return self._get_cheetah_args().get('model_path')

    @property
    def sample_rate(self) -> int:
        """Sample rate (in Hz) expected by the Cheetah engine."""
        return self._get_cheetah().sample_rate

    @property
    def frame_length(self) -> int:
        """Number of samples per audio frame expected by the Cheetah engine."""
        return self._get_cheetah().frame_length

    def _get_cheetah(self) -> pvcheetah.Cheetah:
        # Lazily create (and cache) a Cheetah instance for the current model
        with self._state_lock:
            if not self._cheetah.get(self._model_path):
                self.logger.debug(
                    'Creating Cheetah instance for model %s', self._model_path
                )
                self._cheetah[self._model_path] = pvcheetah.create(
                    **self._get_cheetah_args()
                )
                self.logger.debug(
                    'Cheetah instance created for model %s', self._model_path
                )

            return self._cheetah[self._model_path]

    def process(
        self, audio: Sequence[int]
    ) -> Optional[Union[SpeechRecognizedEvent, ConversationTimeoutEvent]]:
        """
        Process an audio frame and return a ``SpeechRecognizedEvent`` once the
        transcript is finalized, a ``ConversationTimeoutEvent`` if the
        conversation timed out with no transcript, or ``None`` while the
        transcription is still in progress.
        """
        event = None
        cheetah = self._get_cheetah()
        partial_transcript, self._ctx.is_final = cheetah.process(audio)
        last_transcript = self._ctx.transcript

        # Concatenate the partial transcript to the context
        if partial_transcript:
            self._ctx.transcript += partial_transcript
            self.logger.info(
                'Partial transcript: %s, is_final: %s',
                self._ctx.transcript,
                self._ctx.is_final,
            )

        # If the transcript is final or the conversation timed out, then
        # process and return whatever is available in the context
        if self._ctx.is_final or self._ctx.timed_out:
            # flush() returns any text still buffered in the engine
            phrase = cheetah.flush() or ''
            self._ctx.transcript += phrase
            if self._ctx.transcript and self._ctx.transcript != last_transcript:
                self.logger.debug('Processed STT transcript: %s', self._ctx.transcript)
                last_transcript = self._ctx.transcript

            phrase = self._ctx.transcript
            # Lower-case the first letter and trim surrounding whitespace
            phrase = (phrase[:1].lower() + phrase[1:]).strip()
            event = (
                SpeechRecognizedEvent(phrase=phrase)
                if phrase
                else ConversationTimeoutEvent()
            )

            self.reset()

        return event

    def reset(self):
        """
        Reset the base processor state and flush the Cheetah engine,
        discarding any partial transcript still buffered in it.
        """
        if not self._enabled:
            return

        with self._state_lock:
            super().reset()
            self._get_cheetah().flush()

    def stop(self):
        """Stop the processor and release all cached Cheetah instances."""
        if not self._enabled:
            return

        super().stop()

        with self._state_lock:
            # Iterate over a copy since the cache is mutated while deleting
            objs = self._cheetah.copy()
            for key, obj in objs.items():
                obj.delete()
                self._cheetah.pop(key)
|
|
@ -0,0 +1,14 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class AssistantState(Enum):
    """
    Possible states of the assistant.
    """

    # The assistant is not processing audio
    IDLE = 'idle'
    # The assistant is waiting for the hotword
    DETECTING_HOTWORD = 'detecting_hotword'
    # The assistant is processing speech (STT and/or intent detection)
    DETECTING_SPEECH = 'detecting_speech'
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -0,0 +1,33 @@
|
|||
manifest:
|
||||
package: platypush.plugins.assistant.picovoice
|
||||
type: plugin
|
||||
events:
|
||||
- platypush.message.event.assistant.ConversationEndEvent
|
||||
- platypush.message.event.assistant.ConversationStartEvent
|
||||
- platypush.message.event.assistant.ConversationTimeoutEvent
|
||||
- platypush.message.event.assistant.HotwordDetectedEvent
|
||||
- platypush.message.event.assistant.IntentRecognizedEvent
|
||||
- platypush.message.event.assistant.MicMutedEvent
|
||||
- platypush.message.event.assistant.MicUnmutedEvent
|
||||
- platypush.message.event.assistant.NoResponseEvent
|
||||
- platypush.message.event.assistant.ResponseEndEvent
|
||||
- platypush.message.event.assistant.ResponseEvent
|
||||
- platypush.message.event.assistant.SpeechRecognizedEvent
|
||||
install:
|
||||
apk:
|
||||
- ffmpeg
|
||||
apt:
|
||||
- ffmpeg
|
||||
dnf:
|
||||
- ffmpeg
|
||||
pacman:
|
||||
- ffmpeg
|
||||
- python-sounddevice
|
||||
pip:
|
||||
- num2words # Temporary dependency
|
||||
- pvcheetah
|
||||
- pvleopard
|
||||
- pvorca
|
||||
- pvporcupine
|
||||
- pvrhino
|
||||
- sounddevice
|
|
@ -255,6 +255,8 @@ class MusicMopidyPlugin(RunnablePlugin):
|
|||
ret = self._add(resource, position=0)
|
||||
if not ret:
|
||||
self.logger.warning('Failed to add %s to the tracklist', resource)
|
||||
elif isinstance(ret, list):
|
||||
track_id = ret[0].get('tlid')
|
||||
elif isinstance(ret, dict):
|
||||
track_id = ret.get('tlid')
|
||||
elif position is not None:
|
||||
|
|
|
@ -0,0 +1,283 @@
|
|||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime as dt
|
||||
from enum import Enum
|
||||
from threading import RLock
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from platypush.plugins import Plugin, action
|
||||
|
||||
|
||||
class ContextEntryRole(Enum):
    """
    Roles for context entries.

    They mirror the ``role`` field of the OpenAI chat message format.
    """

    # A response generated by the model
    ASSISTANT = "assistant"
    # An instruction that configures the model's behavior
    SYSTEM = "system"
    # A prompt provided by the user
    USER = "user"
|
||||
|
||||
|
||||
@dataclass
class ContextEntry:
    """
    A context entry.

    One message of the conversation context - role plus content, in the
    shape used by the chat API - together with the timestamp used locally
    to track the entry's age.
    """

    # When the entry was created (used locally, not sent to the API)
    timestamp: dt
    # Who produced the message
    role: ContextEntryRole
    # Text content of the message
    content: str

    @classmethod
    def from_dict(cls, data: dict):
        """
        Build an entry from a dictionary with ``role`` and ``content`` keys.
        ``timestamp`` is optional (ISO-8601 string) and defaults to now.
        """
        return cls(
            timestamp=dt.fromisoformat(data.get("timestamp", dt.now().isoformat())),
            role=ContextEntryRole(data["role"]),
            content=data["content"],
        )

    def to_dict(self):
        """
        Serialize to the API message format. The timestamp is not included
        here - presumably it is only used locally for context expiry.
        """
        return {
            "role": self.role.value,
            "content": self.content,
        }
|
||||
|
||||
|
||||
class OpenaiPlugin(Plugin):
    """
    Plugin to interact with OpenAI services.

    So far only ChatGPT is supported.

    Contexts
    --------

    The plugin also supports the implementation of custom assistant
    *contexts*/environment.

    Contexts can be used to:

        - Customize the model's behavior based on a set of inputs - going from
          a generic "*You are a helpful assistant*" to a more specific "*You
          are a Star Trek fan*", or "*You are a 16th century noble lady who
          talks in Shakespearean English to her peers*".

        - Pre-configure the model with a set of previous interactions in order
          to either pre-load information that we expect the model to remember,
          or to provide a set of previous interactions that the model can use
          to generate responses that are consistent with the conversation
          history.

    The plugin provides two types of contexts:

        - **Default context**: This is a set of context entries that are
          provided at plugin initialization and that will be used to
          initialize the model with a configuration or set of previous
          interactions that will be remembered when generating all responses.

        - **Runtime context**: This is a set of context entries that can be
          passed at runtime at :meth:`.get_response`. All the interactions
          (both user prompts and assistant responses) that are processed
          through :meth:`.get_response` will also be added to the runtime
          context, and remembered for the next ``context_expiry`` seconds.
          This allows you to generate responses that are consistent with the
          recent conversation history.

    Each context entry is a dictionary with the following keys:

        - ``role``: The role of the message. Can be one of:

            - ``system``: A system message provided to the model to set up
              its initial state - e.g. "you are a helpful assistant".
            - ``user``: A user message, as provided by a previous (real or
              synthetic) user interaction.
            - ``assistant``: An assistant message, as provided by a previous
              (real or synthetic) assistant response.

        - ``content``: The content of the message.

    An example of context:

        .. code-block:: yaml

            context:
                - role: system
                  content: >
                      You are a 16th century noble lady who talks in
                      Shakespearean English to her peers.
                - role: user
                  content: What is a telephone?
                - role: assistant
                  content: >
                      Pray tell, noble companion, a telephone is a device
                      of modern innovation that doth permit one to speak
                      with a distant acquaintance by means of magical pink
                      waves that do carry the sound of thine voice to the
                      ear of the listener.

    Given such context, if you call :meth:`.get_response` with a prompt such
    as "*How does it work?*", the model may generate a response such as
    "*Fair lady, to use a telephone, thou must first lift the receiver and
    place it to thine ear. Then, thou must speak into the mouthpiece as
    though conversing with a companion in another room. The magical pink
    waves shall carry thy words to the recipient, who shall hear them on
    their own device. 'Tis a wondrous invention indeed!*".

    Note that the model will remember the previous interactions and also
    generate responses, so you can ask it direct questions such as "How does
    it work" while remembering what "it" is likely to mean. And it'll provide
    responses which are in the same style initialized through the ``system``
    context.
    """

    def __init__(
        self,
        api_key: Optional[str],
        model: str = "gpt-3.5-turbo",
        timeout: float = 30,
        context: Optional[Iterable[dict]] = None,
        context_expiry: Optional[float] = 600,
        max_tokens: int = 500,
        **kwargs,
    ):
        """
        :param api_key: OpenAI API key. If not set, it will be read from the
            ``OPENAI_API_KEY`` environment variable.
        :param model: The model to use. Default: ``gpt-3.5-turbo``.
        :param timeout: Default timeout for API requests (default: 30 seconds).
        :param max_tokens: Maximum number of tokens to generate in the
            response (default: 500).
        :param context: Default context to use for completions, as a list of
            dictionaries with ``role`` and ``content`` keys. Default: None.
        :param context_expiry: Default expiry time for the context in seconds.
            After this time since the last interaction, the context will be
            cleared.

            This means that any follow-up interactions happening within the
            expiry window will remember the past prompts, but any interaction
            that happens after the expiry window (calculated from the time of
            the last interaction) will start fresh.

            Note that ``context_expiry`` is only applied to the runtime
            context. The default context will never expire unless it's
            removed from the plugin configuration.

            Set to 0 to disable context expiry - i.e. all messages stay in
            the context until the plugin is restarted or the context is
            cleared explicitly via :meth:`.clear_context`. Default: 600
            seconds (10 minutes).
        """
        super().__init__(**kwargs)
        api_key = api_key or os.getenv('OPENAI_API_KEY')
        assert api_key, 'OpenAI API key not provided'

        self._api_key = api_key
        # RLock: _update_context calls _rotate_context while already holding it
        self._context_lock = RLock()
        self._runtime_context: List[ContextEntry] = []
        self._default_context = [
            ContextEntry.from_dict(entry) for entry in (context or [])
        ]

        self.max_tokens = max_tokens
        self.context_expiry = context_expiry
        self.model = model
        self.timeout = timeout

    def _rotate_context(self):
        """
        Rotate the context by removing any entries older than the configured
        ``context_expiry``.
        """
        if not self.context_expiry:
            return

        with self._context_lock:
            now = dt.now()
            self._runtime_context = [
                entry
                for entry in self._runtime_context
                if (now - entry.timestamp).total_seconds() < self.context_expiry
            ]

    @action
    def get_response(
        self,
        prompt: str,
        model: Optional[str] = None,
        context: Optional[Iterable[dict]] = None,
        timeout: Optional[float] = None,
        max_tokens: Optional[int] = None,
    ) -> Optional[str]:
        """
        Get completions for a given prompt using ChatGPT.

        :param prompt: The prompt/question to complete/answer.
        :param model: Override the default model to use.
        :param context: Extend the default context with these extra messages.
        :param max_tokens: Override the default maximum number of tokens to
            generate in the response.
        :param timeout: Override the default timeout for the API request.
        :return: The completion for the prompt - or, better, the message
            associated to the highest scoring completion choice.
        """
        self._rotate_context()
        # The user prompt is appended after any extra context passed at runtime
        context = [
            *(context or []),
            {
                "role": "user",
                "content": prompt,
            },
        ]

        resp = requests.post(
            "https://api.openai.com/v1/chat/completions",
            timeout=timeout or self.timeout,
            headers={
                "Authorization": f"Bearer {self._api_key}",
                "Content-Type": "application/json",
            },
            json={
                # Message order: default context, then runtime context, then
                # the messages for this call (ending with the user prompt)
                "model": model or self.model,
                "messages": [
                    *(
                        entry.to_dict()
                        for entry in (
                            *(self._default_context or []),
                            *self._runtime_context,
                        )
                    ),
                    *context,
                ],
                "max_tokens": max_tokens or self.max_tokens,
            },
        )

        resp.raise_for_status()
        # Only remember the new prompt(s) once the request has succeeded
        self._update_context(*context)
        # Parse the payload once and reuse it for both logging and extraction
        data = resp.json()
        self.logger.debug("OpenAI response: %s", data)
        choices = data["choices"]

        if not choices:
            return None

        msg = choices[0]["message"]
        # Remember the assistant's reply as part of the runtime context too
        self._update_context(msg)
        return msg["content"]

    def _update_context(self, *entries: dict):
        """
        Append the given entries to the runtime context and drop any expired
        ones.
        """
        with self._context_lock:
            for entry in entries:
                self._runtime_context.append(ContextEntry.from_dict(entry))
            self._rotate_context()

    @action
    def clear_context(self):
        """
        Clear the runtime context.
        """
        with self._context_lock:
            self._runtime_context = []
|
|
@ -0,0 +1,3 @@
|
|||
manifest:
|
||||
package: platypush.plugins.openai
|
||||
type: plugin
|
|
@ -247,9 +247,11 @@ class AudioManager:
|
|||
wait_start = time()
|
||||
for audio_thread in streams_to_stop:
|
||||
audio_thread.join(
|
||||
timeout=max(0, timeout - (time() - wait_start))
|
||||
if timeout is not None
|
||||
else None
|
||||
timeout=(
|
||||
max(0, timeout - (time() - wait_start))
|
||||
if timeout is not None
|
||||
else None
|
||||
)
|
||||
)
|
||||
|
||||
# Remove references
|
||||
|
|
|
@ -1,336 +0,0 @@
|
|||
import queue
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Union, List
|
||||
|
||||
import sounddevice as sd
|
||||
|
||||
from platypush.context import get_bus
|
||||
from platypush.message.event.stt import (
|
||||
SpeechDetectionStartedEvent,
|
||||
SpeechDetectionStoppedEvent,
|
||||
SpeechStartedEvent,
|
||||
SpeechDetectedEvent,
|
||||
HotwordDetectedEvent,
|
||||
ConversationDetectedEvent,
|
||||
)
|
||||
from platypush.message.response.stt import SpeechDetectedResponse
|
||||
from platypush.plugins import Plugin, action
|
||||
|
||||
|
||||
class SttPlugin(ABC, Plugin):
    """
    Abstract class for speech-to-text plugins.

    Concrete implementations must provide :meth:`.detect_speech` (streaming
    detection over captured audio frames) and :meth:`.detect` (one-shot
    detection on an audio file), and may override the lifecycle hooks
    (``before_recording``, ``on_recording_started``, ``on_detection_started``,
    etc.) to set up and tear down their models and resources.
    """

    # How long (in seconds) to wait for the worker threads to join on stop.
    _thread_stop_timeout = 10.0
    # Default sample rate (Hz) and channel count. Subclasses may override
    # ``rate`` at runtime (e.g. with the model's native sample rate).
    rate = 16000
    channels = 1

    def __init__(
        self,
        input_device: Optional[Union[int, str]] = None,
        hotword: Optional[str] = None,
        hotwords: Optional[List[str]] = None,
        conversation_timeout: Optional[float] = 10.0,
        block_duration: float = 1.0,
    ):
        """
        :param input_device: PortAudio device index or name that will be used
            for recording speech (default: default system audio input device).
        :param hotword: When this word is detected, the plugin will trigger a
            :class:`platypush.message.event.stt.HotwordDetectedEvent` instead
            of a :class:`platypush.message.event.stt.SpeechDetectedEvent`
            event. You can use these events for hooking other assistants.
        :param hotwords: Use a list of hotwords instead of a single one.
        :param conversation_timeout: If ``hotword`` or ``hotwords`` are set
            and ``conversation_timeout`` is set, the next speech detected
            event will trigger a
            :class:`platypush.message.event.stt.ConversationDetectedEvent`
            instead of a
            :class:`platypush.message.event.stt.SpeechDetectedEvent` event.
            You can hook custom hooks here to run any logic depending on the
            detected speech - it can emulate a kind of "OK, Google. Turn on
            the lights" interaction without using an external assistant
            (default: 10 seconds).
        :param block_duration: Duration of the acquired audio blocks
            (default: 1 second).
        """

        super().__init__()
        self.input_device = input_device
        self.conversation_timeout = conversation_timeout
        self.block_duration = block_duration

        # NOTE: ``hotword`` takes precedence over ``hotwords`` if both are set
        self.hotwords = set(hotwords or [])
        if hotword:
            self.hotwords = {hotword}

        # Set while a hotword-initiated conversation window is open
        self._conversation_event = threading.Event()
        self._input_stream: Optional[sd.InputStream] = None
        self._recording_thread: Optional[threading.Thread] = None
        self._detection_thread: Optional[threading.Thread] = None
        self._audio_queue: Optional[queue.Queue] = None
        self._current_text = ''

    def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
        """
        Get the index of the input device by index or name.

        :param device: Device index or name. If None is set then the function
            will return the index of the default audio input device.
        :return: Index of the audio input device.
        """
        # Compare against None explicitly: device index 0 is a valid device
        # and must not be mistaken for "no device specified".
        if device is None:
            device = self.input_device
        if device is None:
            return sd.query_hostapis()[0].get('default_input_device')

        if isinstance(device, int):
            # Valid indices are 0..len(devices)-1
            assert device < len(sd.query_devices())
            return device

        for i, dev in enumerate(sd.query_devices()):
            if dev['name'] == device:
                return i

        raise AssertionError('Device {} not found'.format(device))

    def on_speech_detected(self, speech: str) -> None:
        """
        Hook called when speech is detected. Triggers the right event
        depending on the current context.

        :param speech: Detected speech.
        """
        speech = speech.strip()

        if speech in self.hotwords:
            event = HotwordDetectedEvent(hotword=speech)
            if self.conversation_timeout:
                # Open the conversation window and schedule its closure
                self._conversation_event.set()
                threading.Timer(
                    self.conversation_timeout, lambda: self._conversation_event.clear()
                ).start()
        elif self._conversation_event.is_set():
            event = ConversationDetectedEvent(speech=speech)
        else:
            event = SpeechDetectedEvent(speech=speech)

        get_bus().post(event)

    @staticmethod
    def convert_frames(frames: bytes) -> bytes:
        """
        Conversion method for raw audio frames. It just returns the input
        frames as bytes. Override it if required by your logic.

        :param frames: Input audio frames, as bytes.
        :return: The audio frames as passed on the input. Override if required.
        """
        return frames

    def on_detection_started(self) -> None:
        """
        Method called when the ``detection_thread`` starts. Initialize your
        context variables and models here if required.
        """
        pass

    def on_detection_ended(self) -> None:
        """
        Method called when the ``detection_thread`` stops. Clean up your
        context variables and models here.
        """
        pass

    def before_recording(self) -> None:
        """
        Method called when the ``recording_thread`` starts. Put here any logic
        that you may want to run before the recording thread starts.
        """
        pass

    def on_recording_started(self) -> None:
        """
        Method called after the ``recording_thread`` opens the audio device.
        Put here any logic that you may want to run after the recording
        starts.
        """
        pass

    def on_recording_ended(self) -> None:
        """
        Method called when the ``recording_thread`` stops. Put here any logic
        that you want to run after the audio device is closed.
        """
        pass

    @abstractmethod
    def detect_speech(self, frames) -> str:
        """
        Method called within the ``detection_thread`` when new audio frames
        have been captured. Must be implemented by the derived classes.

        :param frames: Audio frames, as returned by ``convert_frames``.
        :return: Detected text, as a string. Returns an empty string if no
            text has been detected.
        """
        raise NotImplementedError

    def process_text(self, text: str) -> None:
        """
        Accumulate intermediate results and fire ``on_speech_detected`` once
        the detected text stabilizes (same text twice in a row) or goes
        silent after some speech was captured.
        """
        if (not text and self._current_text) or (text and text == self._current_text):
            self.on_speech_detected(self._current_text)
            self._current_text = ''
        else:
            if text:
                if not self._current_text:
                    get_bus().post(SpeechStartedEvent())
                self.logger.info('Intermediate speech results: [{}]'.format(text))

            self._current_text = text

    def detection_thread(self) -> None:
        """
        This thread reads frames from ``_audio_queue``, performs the
        speech-to-text detection and dispatches the result to
        ``process_text``.
        """
        self._current_text = ''
        self.logger.debug('Detection thread started')
        self.on_detection_started()

        while self._audio_queue:
            try:
                frames = self._audio_queue.get()
                if frames is None:
                    # Sentinel pushed by stop_detection() to unblock the
                    # blocking get() above; re-check the loop condition.
                    continue
                frames = self.convert_frames(frames)
            except Exception as e:
                self.logger.warning(
                    'Error while feeding audio to the model: {}'.format(str(e))
                )
                continue

            text = self.detect_speech(frames).strip()
            self.process_text(text)

        self.on_detection_ended()
        self.logger.debug('Detection thread terminated')

    def recording_thread(
        self,
        block_duration: Optional[float] = None,
        block_size: Optional[int] = None,
        input_device: Optional[str] = None,
    ) -> None:
        """
        Recording thread. It reads raw frames from the audio device and
        dispatches them to ``detection_thread``.

        :param block_duration: Audio blocks duration. Specify either
            ``block_duration`` or ``block_size``.
        :param block_size: Size of the audio blocks. Specify either
            ``block_duration`` or ``block_size``.
        :param input_device: Input device
        """
        assert (block_duration or block_size) and not (
            block_duration and block_size
        ), 'Please specify either block_duration or block_size'

        if not block_size:
            block_size = int(self.rate * self.channels * block_duration)

        self.before_recording()
        self.logger.debug('Recording thread started')
        device = self._get_input_device(input_device)
        self._input_stream = sd.InputStream(
            samplerate=self.rate,
            device=device,
            channels=self.channels,
            dtype='int16',
            latency=0,
            blocksize=block_size,
        )
        self._input_stream.start()
        self.on_recording_started()
        get_bus().post(SpeechDetectionStartedEvent())

        while self._input_stream:
            try:
                frames = self._input_stream.read(block_size)[0]
            except Exception as e:
                self.logger.warning(
                    'Error while reading from the audio input: {}'.format(str(e))
                )
                continue

            self._audio_queue.put(frames)

        get_bus().post(SpeechDetectionStoppedEvent())
        self.on_recording_ended()
        self.logger.debug('Recording thread terminated')

    @abstractmethod
    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file. Must be implemented
        by the derived classes.

        :param audio_file: Path to the audio file.
        """
        raise NotImplementedError

    def __enter__(self):
        """
        Context manager enter. Starts detection and returns self.
        """
        self.start_detection()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Context manager exit. Stops detection.
        """
        self.stop_detection()

    @action
    def start_detection(
        self,
        input_device: Optional[str] = None,
        seconds: Optional[float] = None,
        block_duration: Optional[float] = None,
    ) -> None:
        """
        Start the speech detection engine.

        :param input_device: Audio input device name/index override
        :param seconds: If set, then the detection engine will stop after this
            many seconds, otherwise it'll start running until
            ``stop_detection`` is called or application stop.
        :param block_duration: ``block_duration`` override.
        """
        assert (
            not self._input_stream and not self._recording_thread
        ), 'Speech detection is already running'
        block_duration = block_duration or self.block_duration
        input_device = input_device if input_device is not None else self.input_device
        self._audio_queue = queue.Queue()
        self._recording_thread = threading.Thread(
            target=lambda: self.recording_thread(
                block_duration=block_duration, input_device=input_device
            )
        )

        self._recording_thread.start()
        self._detection_thread = threading.Thread(
            target=lambda: self.detection_thread()
        )
        self._detection_thread.start()

        if seconds:
            threading.Timer(seconds, lambda: self.stop_detection()).start()

    @action
    def stop_detection(self) -> None:
        """
        Stop the speech detection engine.
        """
        assert self._input_stream, 'Speech detection is not running'
        self._input_stream.stop(ignore_errors=True)
        self._input_stream.close(ignore_errors=True)
        self._input_stream = None

        if self._recording_thread:
            self._recording_thread.join(timeout=self._thread_stop_timeout)
            self._recording_thread = None

        # Swap the queue out first, then push a sentinel so a detection
        # thread blocked in Queue.get() wakes up and exits its loop instead
        # of hanging forever.
        audio_queue, self._audio_queue = self._audio_queue, None
        if audio_queue:
            audio_queue.put(None)

        if self._detection_thread:
            self._detection_thread.join(timeout=self._thread_stop_timeout)
            self._detection_thread = None
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -1,120 +0,0 @@
|
|||
import os
|
||||
import struct
|
||||
from typing import Optional, List
|
||||
|
||||
from platypush.message.response.stt import SpeechDetectedResponse
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.stt import SttPlugin
|
||||
|
||||
|
||||
class SttPicovoiceHotwordPlugin(SttPlugin):
    """
    This plugin performs hotword detection using `PicoVoice <https://github.com/Picovoice>`_.
    """

    def __init__(
        self,
        library_path: Optional[str] = None,
        model_file_path: Optional[str] = None,
        keyword_file_paths: Optional[List[str]] = None,
        sensitivity: float = 0.5,
        sensitivities: Optional[List[float]] = None,
        *args,
        **kwargs
    ):
        """
        :param library_path: Path to the Porcupine binary library (default:
            the ``LIBRARY_PATH`` bundled with the ``pvporcupine`` package).
        :param model_file_path: Path to the Porcupine model file (default:
            the bundled ``MODEL_FILE_PATH``).
        :param keyword_file_paths: Paths to the keyword (hotword) files. If
            not given, the hotwords configured on the base plugin are looked
            up among the keyword files bundled with ``pvporcupine``.
        :param sensitivity: Default detection sensitivity applied to every
            hotword (default: 0.5).
        :param sensitivities: Per-hotword sensitivities; must match the
            number of configured hotwords if given.
        """
        from pvporcupine import Porcupine
        from pvporcupine.resources.util.python.util import (
            LIBRARY_PATH,
            MODEL_FILE_PATH,
            KEYWORD_FILE_PATHS,
        )

        super().__init__(*args, **kwargs)

        # The base class stores hotwords as a set; a list is needed here so
        # that Porcupine keyword indices can be mapped back to hotword names.
        self.hotwords = list(self.hotwords)
        self._hotword_engine: Optional[Porcupine] = None
        self._library_path = os.path.abspath(
            os.path.expanduser(library_path or LIBRARY_PATH)
        )
        self._model_file_path = os.path.abspath(
            os.path.expanduser(model_file_path or MODEL_FILE_PATH)
        )

        if not keyword_file_paths:
            # NOTE(review): KEYWORD_FILE_PATHS is used as a hotword->path
            # mapping here - confirm against the installed pvporcupine version.
            hotwords = KEYWORD_FILE_PATHS
            assert all(
                hotword in hotwords for hotword in self.hotwords
            ), 'Not all the hotwords could be found. Available hotwords: {}'.format(
                list(hotwords.keys())
            )

            self._keyword_file_paths = [
                os.path.abspath(os.path.expanduser(hotwords[hotword]))
                for hotword in self.hotwords
            ]
        else:
            self._keyword_file_paths = [
                os.path.abspath(os.path.expanduser(p)) for p in keyword_file_paths
            ]

        self._sensitivities = []
        if sensitivities:
            assert len(self._keyword_file_paths) == len(
                sensitivities
            ), 'Please specify as many sensitivities as the number of configured hotwords'

            self._sensitivities = sensitivities
        else:
            # Same default sensitivity for every configured keyword
            self._sensitivities = [sensitivity] * len(self._keyword_file_paths)

    def convert_frames(self, frames: bytes) -> tuple:
        """
        Unpack the raw audio bytes into the int16 sample tuple expected by
        the Porcupine engine.
        """
        assert self._hotword_engine, 'The hotword engine is not running'
        return struct.unpack_from("h" * self._hotword_engine.frame_length, frames)

    def on_detection_ended(self) -> None:
        """
        Release the Porcupine engine when the detection thread stops.
        """
        if self._hotword_engine:
            self._hotword_engine.delete()
        self._hotword_engine = None

    def detect_speech(self, frames: tuple) -> str:
        """
        Feed a block of frames to the engine and return the matched hotword,
        or an empty string if none was detected.
        """
        index = self._hotword_engine.process(frames)
        if index < 0:
            return ''

        # NOTE(review): presumably some engine versions return a bool for a
        # single-keyword match; map True to keyword index 0 - confirm against
        # the installed pvporcupine version.
        if index is True:
            index = 0
        return self.hotwords[index]

    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file.

        Not implemented for this plugin: it's a no-op that returns None.

        :param audio_file: Path to the audio file.
        """
        pass

    def recording_thread(
        self, input_device: Optional[str] = None, *args, **kwargs
    ) -> None:
        """
        Record using the engine's native frame length as the block size.
        """
        assert self._hotword_engine, 'The hotword engine has not yet been initialized'
        super().recording_thread(
            block_size=self._hotword_engine.frame_length, input_device=input_device
        )

    @action
    def start_detection(self, *args, **kwargs) -> None:
        """
        Initialize the Porcupine engine and start the detection threads.
        """
        from pvporcupine import Porcupine

        self._hotword_engine = Porcupine(
            library_path=self._library_path,
            model_file_path=self._model_file_path,
            keyword_file_paths=self._keyword_file_paths,
            sensitivities=self._sensitivities,
        )

        # Use the engine's native sample rate for recording
        self.rate = self._hotword_engine.sample_rate
        super().start_detection(*args, **kwargs)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -1,7 +0,0 @@
|
|||
manifest:
|
||||
events: {}
|
||||
install:
|
||||
pip:
|
||||
- pvporcupine
|
||||
package: platypush.plugins.stt.picovoice.hotword
|
||||
type: plugin
|
|
@ -1,154 +0,0 @@
|
|||
import inspect
|
||||
import os
|
||||
import platform
|
||||
import struct
|
||||
import threading
|
||||
from typing import Optional
|
||||
|
||||
from platypush.message.event.stt import SpeechStartedEvent
|
||||
|
||||
from platypush.context import get_bus
|
||||
from platypush.message.response.stt import SpeechDetectedResponse
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.stt import SttPlugin
|
||||
|
||||
|
||||
class SttPicovoiceSpeechPlugin(SttPlugin):
    """
    This plugin performs speech detection using `PicoVoice <https://github.com/Picovoice>`_.
    NOTE: The PicoVoice product used for real-time speech-to-text (Cheetah)
    can be used freely for personal applications on x86_64 Linux. Other
    architectures and operating systems require a commercial license. You can
    ask for a license `here <https://picovoice.ai/contact.html>`_.
    """

    def __init__(
        self,
        library_path: Optional[str] = None,
        acoustic_model_path: Optional[str] = None,
        language_model_path: Optional[str] = None,
        license_path: Optional[str] = None,
        end_of_speech_timeout: int = 1,
        *args,
        **kwargs
    ):
        """
        :param library_path: Path to the Cheetah binary library for your OS
            (default: ``CHEETAH_INSTALL_DIR/lib/OS/ARCH/libpv_cheetah.EXT``).
        :param acoustic_model_path: Path to the acoustic speech model
            (default: ``CHEETAH_INSTALL_DIR/lib/common/acoustic_model.pv``).
        :param language_model_path: Path to the language model
            (default: ``CHEETAH_INSTALL_DIR/lib/common/language_model.pv``).
        :param license_path: Path to your PicoVoice license
            (default: ``CHEETAH_INSTALL_DIR/resources/license/cheetah_eval_linux_public.lic``).
        :param end_of_speech_timeout: Number of seconds of silence during
            speech recognition before considering a phrase over (default: 1).
        """
        from pvcheetah import Cheetah

        super().__init__(*args, **kwargs)

        # Root of the pvcheetah package installation, used to resolve the
        # default library/model/license paths below.
        self._basedir = os.path.abspath(
            os.path.join(inspect.getfile(Cheetah), '..', '..', '..')
        )
        if not library_path:
            library_path = self._get_library_path()
        if not language_model_path:
            language_model_path = os.path.join(
                self._basedir, 'lib', 'common', 'language_model.pv'
            )
        if not acoustic_model_path:
            acoustic_model_path = os.path.join(
                self._basedir, 'lib', 'common', 'acoustic_model.pv'
            )
        if not license_path:
            license_path = os.path.join(
                self._basedir, 'resources', 'license', 'cheetah_eval_linux_public.lic'
            )

        self._library_path = library_path
        self._language_model_path = language_model_path
        self._acoustic_model_path = acoustic_model_path
        self._license_path = license_path
        self._end_of_speech_timeout = end_of_speech_timeout
        self._stt_engine: Optional[Cheetah] = None
        # Set between the first detected speech frame and the phrase endpoint
        self._speech_in_progress = threading.Event()

    def _get_library_path(self) -> str:
        """
        Return the platform-specific path of the bundled libpv_cheetah binary.
        """
        path = os.path.join(
            self._basedir, 'lib', platform.system().lower(), platform.machine()
        )
        return os.path.join(
            path, [f for f in os.listdir(path) if f.startswith('libpv_cheetah.')][0]
        )

    def convert_frames(self, frames: bytes) -> tuple:
        """
        Unpack the raw audio bytes into the int16 sample tuple expected by
        the Cheetah engine.
        """
        assert self._stt_engine, 'The speech engine is not running'
        return struct.unpack_from("h" * self._stt_engine.frame_length, frames)

    def on_detection_ended(self) -> None:
        """
        Release the Cheetah engine when the detection thread stops.
        """
        if self._stt_engine:
            self._stt_engine.delete()
        self._stt_engine = None

    def detect_speech(self, frames: tuple) -> str:
        """
        Feed a block of audio frames to the engine, accumulate the partial
        transcription, and dispatch the detected phrase once the engine
        reports an endpoint.
        """
        text, is_endpoint = self._stt_engine.process(frames)
        text = text.strip()

        if text:
            if not self._speech_in_progress.is_set():
                self._speech_in_progress.set()
                get_bus().post(SpeechStartedEvent())

            # ``text`` is already stripped above
            self._current_text += ' ' + text

        if is_endpoint:
            # Flush any transcription still buffered inside the engine
            text = self._stt_engine.flush().strip()
            if text:
                self._current_text += ' ' + text

            self._speech_in_progress.clear()
            if self._current_text:
                self.on_speech_detected(self._current_text)

            self._current_text = ''

        return self._current_text

    def process_text(self, text: str) -> None:
        """
        No-op: this plugin dispatches events directly from
        :meth:`.detect_speech` when the engine reports an endpoint.
        """
        pass

    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file.

        Not implemented for this plugin: it's a no-op that returns None.

        :param audio_file: Path to the audio file.
        """
        pass

    def recording_thread(
        self, input_device: Optional[str] = None, *args, **kwargs
    ) -> None:
        """
        Record using the engine's native frame length as the block size.
        """
        assert self._stt_engine, 'The STT engine has not yet been initialized'
        super().recording_thread(
            block_size=self._stt_engine.frame_length, input_device=input_device
        )

    @action
    def start_detection(self, *args, **kwargs) -> None:
        """
        Initialize the Cheetah engine and start the detection threads.
        """
        from pvcheetah import Cheetah

        self._stt_engine = Cheetah(
            library_path=self._library_path,
            acoustic_model_path=self._acoustic_model_path,
            language_model_path=self._language_model_path,
            license_path=self._license_path,
            endpoint_duration_sec=self._end_of_speech_timeout,
        )

        # Use the engine's native sample rate for recording
        self.rate = self._stt_engine.sample_rate
        self._speech_in_progress.clear()
        super().start_detection(*args, **kwargs)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -1,7 +0,0 @@
|
|||
manifest:
|
||||
events: {}
|
||||
install:
|
||||
pip:
|
||||
- pvcheetah
|
||||
package: platypush.plugins.stt.picovoice.speech
|
||||
type: plugin
|
|
@ -19,6 +19,7 @@ manifest:
|
|||
- python-numpy
|
||||
- python-sounddevice
|
||||
pip:
|
||||
- num2words
|
||||
- numpy
|
||||
- sounddevice
|
||||
package: platypush.plugins.tts
|
||||
|
|
|
@ -0,0 +1,198 @@
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
from threading import RLock
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pvorca
|
||||
import sounddevice as sd
|
||||
|
||||
from platypush.config import Config
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.tts import TtsPlugin
|
||||
|
||||
|
||||
class TextConversionUtils:
    """
    Utility class to convert text to a format that is supported by the Orca TTS
    engine.

    This pre-processing step is necessary until the issue is fixed:
    https://github.com/Picovoice/orca/issues/10.
    """

    _logger = logging.getLogger(__name__)
    # Matches decimal numbers, comma-grouped numbers, or plain digit runs.
    _number_re = re.compile(r'(([0-9]+\.[0-9]+)|([0-9]+\,[0-9]+)|([0-9]+))')
    # Ordered (pattern, replacement) pairs. Order matters: the catch-all rule
    # at the end (blank out any unsupported character) must run AFTER the
    # specific substitutions, otherwise e.g. '@' would be stripped before it
    # could be rendered as ' at '. A set literal (as previously used here)
    # applies the rules in arbitrary order.
    _conversions_map = (
        (re.compile(r'\s*[(){}\[\]<>]'), ', '),
        (re.compile(r'[;]'), '.'),
        (re.compile(r'[@#]'), ' at '),
        (re.compile(r'[$]'), ' dollar '),
        (re.compile(r'[%]'), ' percent '),
        (re.compile(r'[&]'), ' and '),
        (re.compile(r'[+]'), ' plus '),
        (re.compile(r'[=]'), ' equals '),
        (re.compile(r'[|]'), ' or '),
        (re.compile(r'[~]'), ' tilde '),
        (re.compile(r'[`]'), ''),
        (re.compile(r'[*]'), ' star '),
        (re.compile(r'[\\/]'), ' slash '),
        (re.compile(r'[_]'), ' underscore '),
        # Anything that isn't a letter or supported punctuation is replaced with a space
        (re.compile(r'[^a-zA-Z,.:?!\-\'" ]'), ' '),
    )

    @classmethod
    def _convert_digits(cls, text: str) -> str:
        """
        Spell out numbers in ``text`` using num2words.

        Commas inside numbers are treated as grouping separators and dropped
        before conversion. If num2words is not installed, the text is returned
        unchanged.
        """
        try:
            from num2words import num2words
        except ImportError:
            cls._logger.warning('num2words is not installed, skipping digit conversion')
            return text

        # Substitute each match in place. The previous implementation used
        # str.replace on the matched substring in a loop, which also rewrote
        # that digit sequence wherever else it occurred in the text (e.g.
        # converting the "10" inside "100").
        return cls._number_re.sub(
            lambda match: num2words(float(match.group(1).replace(',', ''))),
            text,
        )

    @classmethod
    def convert(cls, text: str) -> str:
        """
        Return ``text`` rewritten so that it only contains characters and
        words the Orca engine accepts.
        """
        text = cls._convert_digits(text)

        for pattern, replacement in cls._conversions_map:
            text = pattern.sub(replacement, text)

        return text
|
||||
|
||||
|
||||
class TtsPicovoicePlugin(TtsPlugin):
    """
    This TTS plugin enables you to render text as audio using `Picovoice
    <https://picovoice.ai>`_'s (still experimental) `Orca TTS engine
    <https://github.com/Picovoice/orca>`_.

    Take a look at
    :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`
    for details on how to sign up for a Picovoice account and get the API key.

    Also note that using the TTS features requires you to select Orca from the
    list of products available for your account on the `Picovoice console
    <https://console.picovoice.ai>`_.
    """

    def __init__(
        self,
        access_key: Optional[str] = None,
        model_path: Optional[str] = None,
        **kwargs,
    ):
        """
        :param access_key: Picovoice access key. If it's not specified here,
            then it must be specified on the configuration of
            :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.
        :param model_path: Path of the TTS model file (default: use the default
            English model).
        """
        super().__init__(**kwargs)
        if not access_key:
            # Fall back to the access key configured on the Picovoice
            # assistant plugin, if any.
            access_key = Config.get('assistant.picovoice', {}).get('access_key')
        assert (
            access_key
        ), 'No access key specified and no assistant.picovoice plugin found'

        # Fix: expand '~' in the stored model path. The previous code rebound
        # a local variable after self.model_path had already been assigned, so
        # the expansion was silently discarded. get_orca() also expands, which
        # is idempotent, so behavior for existing callers is unchanged.
        self.model_path = os.path.expanduser(model_path) if model_path else model_path
        self.access_key = access_key

        # Currently playing output stream; guarded by _stream_lock so that
        # concurrent say()/stop() calls don't race on it.
        self._stream: Optional[sd.OutputStream] = None
        self._stream_lock = RLock()

    def _play_audio(self, orca: pvorca.Orca, pcm: np.ndarray):
        """
        Play raw 16-bit mono PCM (as returned by Orca) on the default output
        device, stopping any previously playing stream first.
        """
        with self._stream_lock:
            self.stop()
            self._stream = sd.OutputStream(
                samplerate=orca.sample_rate,
                channels=1,
                dtype='int16',
            )

            try:
                self._stream.start()
                # Blocks until the whole buffer has been written to the device.
                self._stream.write(pcm)
            except Exception as e:
                self.logger.warning('Error playing audio: %s: %s', type(e), str(e))
            finally:
                # Best-effort cleanup: always try to stop/close and always
                # drop the stream reference, even if cleanup itself fails.
                try:
                    self.stop()
                    self._stream.close()
                except Exception as e:
                    self.logger.warning(
                        'Error stopping audio stream: %s: %s', type(e), str(e)
                    )
                finally:
                    if self._stream:
                        self._stream = None

    def get_orca(self, model_path: Optional[str] = None):
        """
        Create a new Orca engine instance.

        :param model_path: Optional model path overriding the one set on the
            plugin configuration.
        """
        if not model_path:
            model_path = self.model_path
        if model_path:
            model_path = os.path.expanduser(model_path)

        return pvorca.create(access_key=self.access_key, model_path=model_path)

    @action
    def say(
        self,
        text: str,
        *_,
        output_file: Optional[str] = None,
        speech_rate: Optional[float] = None,
        model_path: Optional[str] = None,
        **__,
    ):
        """
        Say some text.

        :param text: Text to say.
        :param output_file: If set, save the audio to the specified file.
            Otherwise play it.
        :param speech_rate: Speech rate (default: None).
        :param model_path: Path of the TTS model file (default: use the default
            configured model).
        """
        # This is a temporary workaround until this issue is fixed:
        # https://github.com/Picovoice/orca/issues/10.
        text = TextConversionUtils.convert(text)
        orca = self.get_orca(model_path=model_path)

        if output_file:
            orca.synthesize_to_file(
                text, os.path.expanduser(output_file), speech_rate=speech_rate
            )
            return

        self._play_audio(
            orca=orca,
            pcm=np.array(
                orca.synthesize(text, speech_rate=speech_rate),
                dtype='int16',
            ),
        )

    @action
    def stop(self):
        """
        Stop the currently playing audio.
        """
        with self._stream_lock:
            if not self._stream:
                return

            self._stream.stop()
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -0,0 +1,22 @@
|
|||
# Plugin manifest: declares emitted events, install-time dependencies and
# package metadata for the tts.picovoice plugin.
manifest:
  # This plugin emits no platform events.
  events: {}
  install:
    # System dependencies, keyed by package manager.
    apk:
      - ffmpeg
      - py3-numpy
    apt:
      - ffmpeg
      - python3-numpy
    dnf:
      - ffmpeg
      - python-numpy
    pacman:
      - ffmpeg
      - python-numpy
      - python-sounddevice
    # Python dependencies installed via pip.
    pip:
      - numpy
      - pvorca
      - sounddevice
  package: platypush.plugins.tts.picovoice
  type: plugin
|
|
@ -83,7 +83,10 @@ mock_imports = [
|
|||
"pmw3901",
|
||||
"psutil",
|
||||
"pvcheetah",
|
||||
"pvporcupine ",
|
||||
"pvleopard",
|
||||
"pvorca",
|
||||
"pvporcupine",
|
||||
"pvrhino",
|
||||
"pyHS100",
|
||||
"pyaudio",
|
||||
"pychromecast",
|
||||
|
|
|
@ -24,8 +24,11 @@ def OrEvent(*events, cls: Type = threading.Event):
|
|||
or_event.clear()
|
||||
|
||||
def _to_or(e, changed_callback: Callable[[], None]):
|
||||
e._set = e.set
|
||||
e._clear = e.clear
|
||||
if not hasattr(e, "_set"):
|
||||
e._set = e.set
|
||||
if not hasattr(e, "_clear"):
|
||||
e._clear = e.clear
|
||||
|
||||
e.changed = changed_callback
|
||||
e.set = lambda: _or_set(e)
|
||||
e.clear = lambda: _clear_or(e)
|
||||
|
|
Loading…
Reference in New Issue