Compare commits

...

7 Commits

Author SHA1 Message Date
Fabio Manganiello 2b287b569f
[assistant.picovoice] Conversation flow improvements.
- The `Responding` state should be modelled as an extra event/binary
  flag, not as an assistant state. The assistant may be listening for
  hotwords even while the `tts` plugin is responding, and we don't want
  the two states to interfere with each other - nor do we want to build
  a more complex state machine that also has to take concurrent states
  into account.

- Stop any response currently being rendered on the `tts` plugin when
  new hotword audio is detected. If e.g. I say "Ok Google", I should
  always be able to trigger the assistant and stop any concurrent audio
  process.

- A `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest
  audio frame results weren't marked as final but the speech detection
  window timed out. Cheetah's `is_final` detection seems to be quite
  buggy at times, and it may not properly detect the end of utterances,
  especially with non-native accents. The workaround is to flush
  whatever text is available (if at least some speech was detected) into
  a `SpeechRecognizedEvent` upon timeout - see the sketch below.
2024-04-13 20:01:21 +02:00
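
A minimal sketch of the flush-on-timeout workaround described in the last
point; the function name and signature are illustrative, the actual
implementation lives in `_assistant.py` (see the diff further down):

    from typing import Optional

    def close_utterance(
        transcript: str, flushed: Optional[str], is_final: bool, timed_out: bool
    ) -> Optional[str]:
        # Return the phrase to publish as a SpeechRecognizedEvent, or None if
        # the window closed without any recognized speech (in which case a
        # ConversationTimeoutEvent would be emitted instead).
        if not (is_final or timed_out):
            return None  # Keep listening

        # Even without an is_final marker, flush whatever Cheetah has buffered
        phrase = (transcript + (flushed or '')).strip()
        return phrase or None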
Fabio Manganiello 24e93ad160
Added more default imports under the `platypush` module root.
These objects can now also be imported in scripts through
`from platypush import <name>` (see the usage sketch below):

- `Variable`
- `cron`
- `hook`
- `procedure`
2024-04-10 23:33:48 +02:00
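
A short usage sketch of the new root-level imports in a user script; the
event type, hook body and variable name are only illustrative:

    from platypush import Variable, hook, procedure
    from platypush.message.event.assistant import SpeechRecognizedEvent

    @procedure
    def remember_phrase(phrase: str, **_):
        # Variable wraps Platypush's persistent key/value store
        Variable('last_phrase').set(phrase)

    @hook(SpeechRecognizedEvent)
    def on_speech(event, **_):
        remember_phrase(phrase=event.args.get('phrase', ''))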
Fabio Manganiello 3b73b22db9
[assistant.picovoice] More features.
- Added wiring between `assistant.picovoice` and `tts.picovoice`.

- Added `RESPONDING` status to the assistant.

- Added ability to override the default speech model upon
  `start_conversation` (see the usage sketch below).

- Better handling of conversation timeouts.

- Cache Cheetah objects in a `model -> object` map - at least the
  default model should be pre-loaded, since loading a model at runtime
  seems to take a while, and that could impact the ability to detect
  speech in the first seconds after a hotword is detected.
2024-04-10 22:26:45 +02:00
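
For example, a conversation can now be started with a language-specific
speech model (hedged sketch - the model path below is hypothetical):

    from platypush import run

    run(
        'assistant.picovoice.start_conversation',
        model_file='~/models/cheetah_params_it.pv',
    )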
Fabio Manganiello 9761cc2eef
Added `tts.picovoice` plugin. 2024-04-10 20:32:32 +02:00
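
Hedged usage sketch of the new plugin's `say` action (the text is
arbitrary):

    from platypush import run

    run('tts.picovoice.say', text='Hello from Platypush!')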
Fabio Manganiello 6bd20bfcf6
Added ffmpeg requirement for `assistant.picovoice`. 2024-04-10 20:31:38 +02:00
Fabio Manganiello 8702eaa25b
s/partial_transcript/transcript/g 2024-04-09 00:19:51 +02:00
Fabio Manganiello 6feb824c04
Refactored `AssistantEvent`.
`AssistantEvent.assistant` is now modelled as an opaque object that
behaves in the following way:

- The underlying plugin name is saved under `event.args['_assistant']`.

- `event.assistant` is a property that returns the assistant instance
  via `get_plugin`.

- `event.assistant` is reported as a string (plugin qualified name) upon
  event dump.

This allows event hooks to easily use `event.assistant` to interact with
the underlying assistant and modify the conversation flow, while event
hook conditions can still be modelled as simple equality checks between
strings (see the sketch below).
2024-04-09 00:15:51 +02:00
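
Hedged sketch of an event hook relying on the new behaviour - the phrase
in the condition is arbitrary:

    from platypush import hook
    from platypush.message.event.assistant import SpeechRecognizedEvent

    @hook(SpeechRecognizedEvent, phrase='stop listening')
    def on_stop_listening(event, **_):
        # event.assistant resolves the emitting plugin instance via get_plugin
        event.assistant.stop_conversation()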
11 changed files with 404 additions and 75 deletions

View File

@@ -7,10 +7,13 @@ Platypush
 from .app import Application
 from .config import Config
-from .context import get_backend, get_bus, get_plugin
+from .context import Variable, get_backend, get_bus, get_plugin
+from .cron import cron
+from .event.hook import hook
 from .message.event import Event
 from .message.request import Request
 from .message.response import Response
+from .procedure import procedure
 from .runner import main
 from .utils import run
@@ -19,14 +22,18 @@ __author__ = 'Fabio Manganiello <fabio@manganiello.tech>'
 __version__ = '0.50.3'

 __all__ = [
     'Application',
+    'Variable',
     'Config',
     'Event',
     'Request',
     'Response',
+    'cron',
     'get_backend',
     'get_bus',
     'get_plugin',
+    'hook',
     'main',
+    'procedure',
     'run',
 ]

View File

@@ -257,26 +257,29 @@ class Event(Message):
         return result

+    def as_dict(self):
+        """
+        Converts the event into a dictionary
+        """
+        args = copy.deepcopy(self.args)
+        flatten(args)
+        return {
+            'type': 'event',
+            'target': self.target,
+            'origin': self.origin if hasattr(self, 'origin') else None,
+            'id': self.id if hasattr(self, 'id') else None,
+            '_timestamp': self.timestamp,
+            'args': {'type': self.type, **args},
+        }
+
     def __str__(self):
         """
         Overrides the str() operator and converts
         the message into a UTF-8 JSON string
         """
         args = copy.deepcopy(self.args)
         flatten(args)
-        return json.dumps(
-            {
-                'type': 'event',
-                'target': self.target,
-                'origin': self.origin if hasattr(self, 'origin') else None,
-                'id': self.id if hasattr(self, 'id') else None,
-                '_timestamp': self.timestamp,
-                'args': {'type': self.type, **args},
-            },
-            cls=self.Encoder,
-        )
+        return json.dumps(self.as_dict(), cls=self.Encoder)


 @dataclass

View File

@@ -1,27 +1,53 @@
 import re
 import sys
-from typing import Optional
+from typing import Optional, Union

 from platypush.context import get_plugin
 from platypush.message.event import Event
+from platypush.plugins.assistant import AssistantPlugin
+from platypush.utils import get_plugin_name_by_class


 class AssistantEvent(Event):
     """Base class for assistant events"""

-    def __init__(self, *args, assistant: Optional[str] = None, **kwargs):
+    def __init__(
+        self, *args, assistant: Optional[Union[str, AssistantPlugin]] = None, **kwargs
+    ):
         """
         :param assistant: Name of the assistant plugin that triggered the event.
         """
-        super().__init__(*args, assistant=assistant, **kwargs)
+        assistant = assistant or kwargs.get('assistant')
+        if assistant:
+            assistant = (
+                assistant
+                if isinstance(assistant, str)
+                else get_plugin_name_by_class(assistant.__class__)
+            )
+
+            kwargs['_assistant'] = assistant
+
+        super().__init__(*args, **kwargs)

     @property
-    def _assistant(self):
-        return (
-            get_plugin(self.args.get('assistant'))
-            if self.args.get('assistant')
-            else None
-        )
+    def assistant(self) -> Optional[AssistantPlugin]:
+        assistant = self.args.get('_assistant')
+        if not assistant:
+            return None
+
+        return get_plugin(assistant)
+
+    def as_dict(self):
+        evt_dict = super().as_dict()
+        evt_args = {**evt_dict['args']}
+        assistant = evt_args.pop('_assistant', None)
+
+        if assistant:
+            evt_args['assistant'] = assistant
+
+        return {
+            **evt_dict,
+            'args': evt_args,
+        }


 class ConversationStartEvent(AssistantEvent):
@@ -95,8 +121,8 @@ class SpeechRecognizedEvent(AssistantEvent):
         """
         result = super().matches_condition(condition)
-        if result.is_match and self._assistant and 'phrase' in condition.args:
-            self._assistant.stop_conversation()
+        if result.is_match and self.assistant and 'phrase' in condition.args:
+            self.assistant.stop_conversation()

         return result

View File

@@ -122,6 +122,12 @@ class Plugin(EventGenerator, ExtensionWithManifest):  # lgtm [py/missing-call-to
         assert entities, 'entities plugin not initialized'
         return entities

+    def __str__(self):
+        """
+        :return: The qualified name of the plugin.
+        """
+        return get_plugin_name_by_class(self.__class__)
+
     def run(self, method, *args, **kwargs):
         assert (
             method in self.registered_actions

View File

@@ -8,24 +8,7 @@ from typing import Any, Collection, Dict, Optional, Type
 from platypush.context import get_bus, get_plugin
 from platypush.entities.assistants import Assistant
 from platypush.entities.managers.assistants import AssistantEntityManager
-from platypush.message.event.assistant import (
-    AlarmEndEvent,
-    AlarmStartedEvent,
-    AlertEndEvent,
-    AlertStartedEvent,
-    AssistantEvent,
-    ConversationEndEvent,
-    ConversationStartEvent,
-    ConversationTimeoutEvent,
-    HotwordDetectedEvent,
-    MicMutedEvent,
-    MicUnmutedEvent,
-    NoResponseEvent,
-    ResponseEvent,
-    SpeechRecognizedEvent,
-    TimerEndEvent,
-    TimerStartedEvent,
-)
+from platypush.message.event import Event as AppEvent
 from platypush.plugins import Plugin, action
 from platypush.utils import get_plugin_name_by_class
@@ -182,6 +165,17 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
         self.publish_entities([self])
         return asdict(self._state)

+    @action
+    def render_response(self, text: str, *_, **__):
+        """
+        Render a response text as audio over the configured TTS plugin.
+
+        :param text: Text to render.
+        """
+        self._on_response_render_start(text)
+        self._render_response(text)
+        self._on_response_render_end()
+
     def _get_tts_plugin(self):
         if not self.tts_plugin:
             return None
@@ -201,11 +195,13 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
             audio.play(self._conversation_start_sound)

-    def _send_event(self, event_type: Type[AssistantEvent], **kwargs):
+    def _send_event(self, event_type: Type[AppEvent], **kwargs):
         self.publish_entities([self])
         get_bus().post(event_type(assistant=self._plugin_name, **kwargs))

     def _on_conversation_start(self):
+        from platypush.message.event.assistant import ConversationStartEvent
+
         self._last_response = None
         self._last_query = None
         self._conversation_running.set()
@@ -213,66 +209,98 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
         self._play_conversation_start_sound()

     def _on_conversation_end(self):
+        from platypush.message.event.assistant import ConversationEndEvent
+
         self._conversation_running.clear()
         self._send_event(ConversationEndEvent)

     def _on_conversation_timeout(self):
+        from platypush.message.event.assistant import ConversationTimeoutEvent
+
         self._last_response = None
         self._last_query = None
         self._conversation_running.clear()
         self._send_event(ConversationTimeoutEvent)

     def _on_no_response(self):
+        from platypush.message.event.assistant import NoResponseEvent
+
         self._last_response = None
         self._conversation_running.clear()
         self._send_event(NoResponseEvent)

-    def _on_reponse_rendered(self, text: Optional[str]):
+    def _on_response_render_start(self, text: Optional[str]):
+        from platypush.message.event.assistant import ResponseEvent
+
         self._last_response = text
         self._send_event(ResponseEvent, response_text=text)
-        tts = self._get_tts_plugin()

+    def _render_response(self, text: Optional[str]):
+        tts = self._get_tts_plugin()
         if tts and text:
             self.stop_conversation()
             tts.say(text=text, **self.tts_plugin_args)

+    def _on_response_render_end(self):
+        pass
+
     def _on_hotword_detected(self, hotword: Optional[str]):
+        from platypush.message.event.assistant import HotwordDetectedEvent
+
         self._send_event(HotwordDetectedEvent, hotword=hotword)

     def _on_speech_recognized(self, phrase: Optional[str]):
+        from platypush.message.event.assistant import SpeechRecognizedEvent
+
         phrase = (phrase or '').lower().strip()
         self._last_query = phrase
         self._send_event(SpeechRecognizedEvent, phrase=phrase)

     def _on_alarm_start(self):
+        from platypush.message.event.assistant import AlarmStartedEvent
+
         self._cur_alert_type = AlertType.ALARM
         self._send_event(AlarmStartedEvent)

     def _on_alarm_end(self):
+        from platypush.message.event.assistant import AlarmEndEvent
+
         self._cur_alert_type = None
         self._send_event(AlarmEndEvent)

     def _on_timer_start(self):
+        from platypush.message.event.assistant import TimerStartedEvent
+
         self._cur_alert_type = AlertType.TIMER
         self._send_event(TimerStartedEvent)

     def _on_timer_end(self):
+        from platypush.message.event.assistant import TimerEndEvent
+
         self._cur_alert_type = None
         self._send_event(TimerEndEvent)

     def _on_alert_start(self):
+        from platypush.message.event.assistant import AlertStartedEvent
+
         self._cur_alert_type = AlertType.ALERT
         self._send_event(AlertStartedEvent)

     def _on_alert_end(self):
+        from platypush.message.event.assistant import AlertEndEvent
+
         self._cur_alert_type = None
         self._send_event(AlertEndEvent)

     def _on_mute(self):
+        from platypush.message.event.assistant import MicMutedEvent
+
         self._is_muted = True
         self._send_event(MicMutedEvent)

     def _on_unmute(self):
+        from platypush.message.event.assistant import MicUnmutedEvent
+
         self._is_muted = False
         self._send_event(MicUnmutedEvent)

View File

@@ -1,7 +1,10 @@
+import os
 from typing import Optional, Sequence

+from platypush.context import get_plugin
 from platypush.plugins import RunnablePlugin, action
 from platypush.plugins.assistant import AssistantPlugin
+from platypush.plugins.tts.picovoice import TtsPicovoicePlugin

 from ._assistant import Assistant
 from ._state import AssistantState
@@ -96,7 +99,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             using a language other than English, you can provide the path to the
             model file for that language. Model files are available for all the
             supported languages through the `Picovoice repository
-            <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
+            <https://github.com/Picovoice/cheetah/tree/master/lib/common>`_.
+            You can also use the `Picovoice console
+            <https://console.picovoice.ai/cat>`_
+            to train your custom models. You can use a base model and fine-tune
+            it by boosting the detection of your own words and phrases and edit
+            the phonetic representation of the words you want to detect.
         :param endpoint_duration: If set, the assistant will stop listening when
             no speech is detected for the specified duration (in seconds) after
             the end of an utterance.
@@ -146,15 +154,43 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             'on_hotword_detected': self._on_hotword_detected,
         }

+    @property
+    def tts(self) -> TtsPicovoicePlugin:
+        p = get_plugin('tts.picovoice')
+        assert p, 'Picovoice TTS plugin not configured/found'
+        return p
+
+    def _get_tts_plugin(self) -> TtsPicovoicePlugin:
+        return self.tts
+
+    def _on_response_render_start(self, text: Optional[str]):
+        if self._assistant:
+            self._assistant.set_responding(True)
+        return super()._on_response_render_start(text)
+
+    def _on_response_render_end(self):
+        if self._assistant:
+            self._assistant.set_responding(False)
+        return super()._on_response_render_end()
+
     @action
-    def start_conversation(self, *_, **__):
+    def start_conversation(self, *_, model_file: Optional[str] = None, **__):
         """
-        Programmatically start a conversation with the assistant
+        Programmatically start a conversation with the assistant.
+
+        :param model_file: Override the model file to be used to detect speech
+            in this conversation. If not set, the configured
+            ``speech_model_path`` will be used.
         """
         if not self._assistant:
             self.logger.warning('Assistant not initialized')
             return

+        if model_file:
+            model_file = os.path.expanduser(model_file)
+
+        self._assistant.override_speech_model(model_file)
         self._assistant.state = AssistantState.DETECTING_SPEECH

     @action
@@ -166,6 +202,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             self.logger.warning('Assistant not initialized')
             return

+        self._assistant.override_speech_model(None)
+
         if self._assistant.hotword_enabled:
             self._assistant.state = AssistantState.DETECTING_HOTWORD
         else:
@@ -215,7 +253,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         with Assistant(**self._assistant_args) as self._assistant:
             try:
                 for event in self._assistant:
-                    self.logger.debug('Picovoice assistant event: %s', event)
+                    if event is not None:
+                        self.logger.debug('Picovoice assistant event: %s', event)
             except KeyboardInterrupt:
                 break
             except Exception as e:

View File

@@ -9,11 +9,13 @@ import pvleopard
 import pvporcupine
 import pvrhino

+from platypush.context import get_plugin
 from platypush.message.event.assistant import (
     ConversationTimeoutEvent,
     HotwordDetectedEvent,
     SpeechRecognizedEvent,
 )
+from platypush.plugins.tts.picovoice import TtsPicovoicePlugin

 from ._context import ConversationContext
 from ._recorder import AudioRecorder
@@ -25,6 +27,7 @@ class Assistant:
     A facade class that wraps the Picovoice engines under an assistant API.
     """

+    @staticmethod
     def _default_callback(*_, **__):
         pass

@@ -60,12 +63,14 @@ class Assistant:
         self.keywords = list(keywords or [])
         self.keyword_paths = None
         self.keyword_model_path = None
+        self._responding = Event()
         self.frame_expiration = frame_expiration
-        self.speech_model_path = speech_model_path
         self.endpoint_duration = endpoint_duration
         self.enable_automatic_punctuation = enable_automatic_punctuation
         self.start_conversation_on_hotword = start_conversation_on_hotword
         self.audio_queue_size = audio_queue_size
+        self._speech_model_path = speech_model_path
+        self._speech_model_path_override = None

         self._on_conversation_start = on_conversation_start
         self._on_conversation_end = on_conversation_end
@@ -103,11 +108,32 @@ class Assistant:
             self.keyword_model_path = keyword_model_path

-        self._cheetah: Optional[pvcheetah.Cheetah] = None
+        # Model path -> model instance cache
+        self._cheetah = {}
         self._leopard: Optional[pvleopard.Leopard] = None
         self._porcupine: Optional[pvporcupine.Porcupine] = None
         self._rhino: Optional[pvrhino.Rhino] = None

+    @property
+    def is_responding(self):
+        return self._responding.is_set()
+
+    @property
+    def speech_model_path(self):
+        return self._speech_model_path_override or self._speech_model_path
+
+    @property
+    def tts(self) -> TtsPicovoicePlugin:
+        p = get_plugin('tts.picovoice')
+        assert p, 'Picovoice TTS plugin not configured/found'
+        return p
+
+    def set_responding(self, responding: bool):
+        if responding:
+            self._responding.set()
+        else:
+            self._responding.clear()
+
     def should_stop(self):
         return self._stop_event.is_set()

@@ -130,12 +156,18 @@ class Assistant:
             return

         if prev_state == AssistantState.DETECTING_SPEECH:
+            self.tts.stop()
             self._ctx.stop()
+            self._speech_model_path_override = None
             self._on_conversation_end()
         elif new_state == AssistantState.DETECTING_SPEECH:
             self._ctx.start()
             self._on_conversation_start()

+        if new_state == AssistantState.DETECTING_HOTWORD:
+            self.tts.stop()
+            self._ctx.reset()
+
     @property
     def porcupine(self) -> Optional[pvporcupine.Porcupine]:
         if not self.hotword_enabled:
@@ -159,7 +191,7 @@ class Assistant:
         if not self.stt_enabled:
             return None

-        if not self._cheetah:
+        if not self._cheetah.get(self.speech_model_path):
             args: Dict[str, Any] = {'access_key': self._access_key}
             if self.speech_model_path:
                 args['model_path'] = self.speech_model_path
@@ -168,20 +200,22 @@ class Assistant:
             if self.enable_automatic_punctuation:
                 args['enable_automatic_punctuation'] = self.enable_automatic_punctuation

-            self._cheetah = pvcheetah.create(**args)
+            self._cheetah[self.speech_model_path] = pvcheetah.create(**args)

-        return self._cheetah
+        return self._cheetah[self.speech_model_path]

     def __enter__(self):
+        """
+        Get the assistant ready to start processing audio frames.
+        """
         if self.should_stop():
             return self

         if self._recorder:
             self.logger.info('A recording stream already exists')
-        elif self.porcupine or self.cheetah:
+        elif self.hotword_enabled or self.stt_enabled:
             sample_rate = (self.porcupine or self.cheetah).sample_rate  # type: ignore
             frame_length = (self.porcupine or self.cheetah).frame_length  # type: ignore
             self._recorder = AudioRecorder(
                 stop_event=self._stop_event,
                 sample_rate=sample_rate,
@@ -190,6 +224,9 @@ class Assistant:
                 channels=1,
             )

+            if self.stt_enabled:
+                self._cheetah[self.speech_model_path] = self.cheetah
+
             self._recorder.__enter__()

         if self.porcupine:
@@ -200,15 +237,18 @@ class Assistant:
         return self

     def __exit__(self, *_):
+        """
+        Stop the assistant and release all resources.
+        """
         if self._recorder:
             self._recorder.__exit__(*_)
             self._recorder = None

         self.state = AssistantState.IDLE

-        if self._cheetah:
-            self._cheetah.delete()
-            self._cheetah = None
+        for model in [*self._cheetah.keys()]:
+            cheetah = self._cheetah.pop(model, None)
+            if cheetah:
+                cheetah.delete()

         if self._leopard:
             self._leopard.delete()
@@ -223,9 +263,15 @@ class Assistant:
             self._rhino = None

     def __iter__(self):
+        """
+        Iterate over processed assistant events.
+        """
         return self

     def __next__(self):
+        """
+        Process the next audio frame and return the corresponding event.
+        """
         has_data = False
         if self.should_stop() or not self._recorder:
             raise StopIteration
@@ -242,10 +288,10 @@ class Assistant:
                )
                continue  # The audio frame is too old

            if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
                return self._process_hotword(frame)

            if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
                return self._process_speech(frame)

        raise StopIteration
@@ -262,6 +308,7 @@ class Assistant:
             if self.start_conversation_on_hotword:
                 self.state = AssistantState.DETECTING_SPEECH

+            self.tts.stop()
             self._on_hotword_detected(hotword=self.keywords[keyword_index])
             return HotwordDetectedEvent(hotword=self.keywords[keyword_index])

@@ -275,23 +322,20 @@ class Assistant:
         partial_transcript, self._ctx.is_final = self.cheetah.process(frame)
         if partial_transcript:
-            self._ctx.partial_transcript += partial_transcript
+            self._ctx.transcript += partial_transcript
             self.logger.info(
                 'Partial transcript: %s, is_final: %s',
-                self._ctx.partial_transcript,
+                self._ctx.transcript,
                 self._ctx.is_final,
             )

         if self._ctx.is_final or self._ctx.timed_out:
-            phrase = ''
-            if self.cheetah:
-                phrase = self.cheetah.flush()
-
-            self._ctx.partial_transcript += phrase
-            phrase = self._ctx.partial_transcript
+            phrase = self.cheetah.flush() or ''
+            self._ctx.transcript += phrase
+            phrase = self._ctx.transcript
             phrase = phrase[:1].lower() + phrase[1:]

-            if self._ctx.is_final or phrase:
+            if phrase:
                 event = SpeechRecognizedEvent(phrase=phrase)
                 self._on_speech_recognized(phrase=phrase)
             else:
@@ -304,5 +348,8 @@ class Assistant:

         return event

+    def override_speech_model(self, model_path: Optional[str]):
+        self._speech_model_path_override = model_path
+

 # vim:sw=4:ts=4:et:

View File

@@ -9,7 +9,7 @@ class ConversationContext:
     Context of the conversation process.
     """

-    partial_transcript: str = ''
+    transcript: str = ''
     is_final: bool = False
     timeout: Optional[float] = None
     t_start: Optional[float] = None
@@ -24,7 +24,7 @@ class ConversationContext:
         self.t_end = time()

     def reset(self):
-        self.partial_transcript = ''
+        self.transcript = ''
         self.is_final = False
         self.t_start = None
         self.t_end = None
@@ -32,11 +32,17 @@ class ConversationContext:
     @property
     def timed_out(self):
         return (
-            not self.partial_transcript
+            not self.transcript
             and not self.is_final
             and self.timeout
             and self.t_start
             and time() - self.t_start > self.timeout
+        ) or (
+            self.transcript
+            and not self.is_final
+            and self.timeout
+            and self.t_start
+            and time() - self.t_start > self.timeout * 2
         )

View File

@@ -12,7 +12,14 @@ manifest:
       - platypush.message.event.assistant.ResponseEvent
       - platypush.message.event.assistant.SpeechRecognizedEvent
   install:
+    apk:
+      - ffmpeg
+    apt:
+      - ffmpeg
+    dnf:
+      - ffmpeg
     pacman:
+      - ffmpeg
       - python-sounddevice
     pip:
       - pvcheetah

View File

@@ -0,0 +1,138 @@
import os
from threading import RLock
from typing import Optional

import numpy as np
import pvorca
import sounddevice as sd

from platypush.config import Config
from platypush.plugins import action
from platypush.plugins.tts import TtsPlugin


class TtsPicovoicePlugin(TtsPlugin):
    """
    This TTS plugin enables you to render text as audio using `Picovoice
    <https://picovoice.ai>`_'s (still experimental) `Orca TTS engine
    <https://github.com/Picovoice/orca>`_.

    Take a look at
    :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`
    for details on how to sign up for a Picovoice account and get the API key.

    Also note that using the TTS features requires you to select Orca from the
    list of products available for your account on the `Picovoice console
    <https://console.picovoice.ai>`_.
    """

    def __init__(
        self,
        access_key: Optional[str] = None,
        model_path: Optional[str] = None,
        **kwargs,
    ):
        """
        :param access_key: Picovoice access key. If it's not specified here,
            then it must be specified on the configuration of
            :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.
        :param model_path: Path of the TTS model file (default: use the default
            English model).
        """
        super().__init__(**kwargs)

        if not access_key:
            access_key = Config.get('assistant.picovoice', {}).get('access_key')
            assert (
                access_key
            ), 'No access key specified and no assistant.picovoice plugin found'

        self.model_path = model_path
        self.access_key = access_key
        if model_path:
            model_path = os.path.expanduser(model_path)

        self._stream: Optional[sd.OutputStream] = None
        self._stream_lock = RLock()

    def _play_audio(self, orca: pvorca.Orca, pcm: np.ndarray):
        with self._stream_lock:
            self.stop()
            self._stream = sd.OutputStream(
                samplerate=orca.sample_rate,
                channels=1,
                dtype='int16',
            )

            try:
                self._stream.start()
                self._stream.write(pcm)
            except Exception as e:
                self.logger.warning('Error playing audio: %s: %s', type(e), str(e))
            finally:
                try:
                    self.stop()
                    self._stream.close()
                except Exception as e:
                    self.logger.warning(
                        'Error stopping audio stream: %s: %s', type(e), str(e)
                    )
                finally:
                    if self._stream:
                        self._stream = None

    def get_orca(self, model_path: Optional[str] = None):
        if not model_path:
            model_path = self.model_path
        if model_path:
            model_path = os.path.expanduser(model_path)

        return pvorca.create(access_key=self.access_key, model_path=model_path)

    @action
    def say(
        self,
        text: str,
        *_,
        output_file: Optional[str] = None,
        speech_rate: Optional[float] = None,
        model_path: Optional[str] = None,
        **__,
    ):
        """
        Say some text.

        :param text: Text to say.
        :param output_file: If set, save the audio to the specified file.
            Otherwise play it.
        :param speech_rate: Speech rate (default: None).
        :param model_path: Path of the TTS model file (default: use the default
            configured model).
        """
        orca = self.get_orca(model_path=model_path)
        if output_file:
            orca.synthesize_to_file(
                text, os.path.expanduser(output_file), speech_rate=speech_rate
            )
            return

        self._play_audio(
            orca=orca,
            pcm=np.array(
                orca.synthesize(text, speech_rate=speech_rate),
                dtype='int16',
            ),
        )

    @action
    def stop(self):
        """
        Stop the currently playing audio.
        """
        with self._stream_lock:
            if not self._stream:
                return

            self._stream.stop()


# vim:sw=4:ts=4:et:

View File

@@ -0,0 +1,22 @@
manifest:
  events: {}
  install:
    apk:
      - ffmpeg
      - py3-numpy
    apt:
      - ffmpeg
      - python3-numpy
    dnf:
      - ffmpeg
      - python-numpy
    pacman:
      - ffmpeg
      - python-numpy
      - python-sounddevice
    pip:
      - numpy
      - pvorca
      - sounddevice
  package: platypush.plugins.tts.picovoice
  type: plugin