# platypush/plugins/assistant/picovoice/_assistant.py
import logging
import os
from queue import Full, Queue
from threading import Event, RLock, Thread
from time import time
from typing import Any, Dict, Optional, Sequence

import pvporcupine

from platypush.context import get_plugin
from platypush.message.event.assistant import (
    AssistantEvent,
    ConversationTimeoutEvent,
    HotwordDetectedEvent,
    IntentRecognizedEvent,
    SpeechRecognizedEvent,
)
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin

from ._recorder import AudioRecorder
from ._speech import SpeechProcessor
from ._state import AssistantState


class Assistant(Thread):
    """
    A facade class that wraps the Picovoice engines under an assistant API.
    """

    @staticmethod
    def _default_callback(*_, **__):
        pass

    def __init__(
        self,
        access_key: str,
        stop_event: Event,
        hotword_enabled: bool = True,
        stt_enabled: bool = True,
        keywords: Optional[Sequence[str]] = None,
        keyword_paths: Optional[Sequence[str]] = None,
        keyword_model_path: Optional[str] = None,
        frame_expiration: float = 3.0,  # Don't process audio frames older than this
        speech_model_path: Optional[str] = None,
        intent_model_path: Optional[str] = None,
        endpoint_duration: Optional[float] = None,
        enable_automatic_punctuation: bool = False,
        start_conversation_on_hotword: bool = False,
        audio_queue_size: int = 100,
        muted: bool = False,
        conversation_timeout: Optional[float] = None,
        on_conversation_start=_default_callback,
        on_conversation_end=_default_callback,
        on_conversation_timeout=_default_callback,
        on_speech_recognized=_default_callback,
        on_intent_matched=_default_callback,
        on_hotword_detected=_default_callback,
    ):
        super().__init__(name='picovoice:Assistant')

        self._access_key = access_key
        self._stop_event = stop_event
        self.logger = logging.getLogger(__name__)
        self.hotword_enabled = hotword_enabled
        self.stt_enabled = stt_enabled
        self.keywords = list(keywords or [])
        self.keyword_paths = None
        self.keyword_model_path = None
        self.frame_expiration = frame_expiration
        self.endpoint_duration = endpoint_duration
        self.enable_automatic_punctuation = enable_automatic_punctuation
        self.start_conversation_on_hotword = start_conversation_on_hotword
        self.audio_queue_size = audio_queue_size
        self._responding = Event()
        self._muted = muted
        self._speech_model_path = speech_model_path
        self._speech_model_path_override = None
        self._intent_model_path = intent_model_path
        self._intent_model_path_override = None
        self._in_ctx = False

        self._speech_processor = SpeechProcessor(
            stop_event=stop_event,
            stt_enabled=stt_enabled,
            intent_enabled=self.intent_enabled,
            conversation_timeout=conversation_timeout,
            model_path=speech_model_path,
            get_cheetah_args=self._get_speech_engine_args,
            get_rhino_args=self._get_intent_engine_args,
        )

        self._on_conversation_start = on_conversation_start
        self._on_conversation_end = on_conversation_end
        self._on_conversation_timeout = on_conversation_timeout
        self._on_speech_recognized = on_speech_recognized
        self._on_intent_matched = on_intent_matched
        self._on_hotword_detected = on_hotword_detected
        self._recorder = None
        self._state = AssistantState.IDLE
        self._state_lock = RLock()
        self._evt_queue = Queue(maxsize=100)

        if hotword_enabled:
            if not keywords:
                raise ValueError(
                    'You need to provide a list of keywords if the wake-word engine is enabled'
                )

            if keyword_paths:
                keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
                missing_paths = [
                    path for path in keyword_paths if not os.path.isfile(path)
                ]
                if missing_paths:
                    raise FileNotFoundError(f'Keyword files not found: {missing_paths}')

                self.keyword_paths = keyword_paths

            if keyword_model_path:
                keyword_model_path = os.path.expanduser(keyword_model_path)
                if not os.path.isfile(keyword_model_path):
                    raise FileNotFoundError(
                        f'Keyword model file not found: {keyword_model_path}'
                    )

                self.keyword_model_path = keyword_model_path

        self._porcupine: Optional[pvporcupine.Porcupine] = None
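
    # Note: audio_queue_size bounds the recorder's audio frame queue (see
    # __enter__); the assistant's own event queue above is a separate queue
    # with a fixed size of 100.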

    @property
    def intent_enabled(self) -> bool:
        return self.intent_model_path is not None

    @property
    def is_responding(self) -> bool:
        return self._responding.is_set()

    @property
    def speech_model_path(self) -> Optional[str]:
        return self._speech_model_path_override or self._speech_model_path

    @property
    def intent_model_path(self) -> Optional[str]:
        return self._intent_model_path_override or self._intent_model_path

    @property
    def tts(self) -> TtsPicovoicePlugin:
        p = get_plugin('tts.picovoice')
        assert p, 'Picovoice TTS plugin not configured/found'
        return p

    def set_responding(self, responding: bool):
        if responding:
            self._responding.set()
        else:
            self._responding.clear()

    def should_stop(self):
        return self._stop_event.is_set()

    def wait_stop(self):
        self._stop_event.wait()

    @property
    def state(self) -> AssistantState:
        with self._state_lock:
            return self._state

    @state.setter
    def state(self, state: AssistantState):
        with self._state_lock:
            prev_state = self._state
            self._state = state
            new_state = self.state

        if prev_state == new_state:
            return

        self.logger.info('Assistant state transition: %s -> %s', prev_state, new_state)
        if prev_state == AssistantState.DETECTING_SPEECH:
            self.tts.stop()
            self._speech_model_path_override = None
            self._intent_model_path_override = None
            self._speech_processor.on_conversation_end()
            self._on_conversation_end()
        elif new_state == AssistantState.DETECTING_SPEECH:
            self._speech_processor.on_conversation_start()
            self._on_conversation_start()

        if new_state == AssistantState.DETECTING_HOTWORD:
            self.tts.stop()
            self._speech_processor.on_conversation_reset()
            # Put a null event on the event queue to unblock next_event
            self._evt_queue.put(None)
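
    # State transitions at a glance (derived from the setter above and the
    # methods below):
    #   - __enter__: IDLE -> DETECTING_HOTWORD (hotword engine configured) or
    #     DETECTING_SPEECH (speech/intent only)
    #   - _process_hotword: DETECTING_HOTWORD -> DETECTING_SPEECH (when
    #     start_conversation_on_hotword is set)
    #   - __next__: DETECTING_SPEECH -> DETECTING_HOTWORD once a speech, intent
    #     or timeout event has been emitted (if the hotword engine is enabled)
    #   - __exit__: any state -> IDLE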

    @property
    def porcupine(self) -> Optional[pvporcupine.Porcupine]:
        if not self.hotword_enabled:
            return None

        if not self._porcupine:
            args: Dict[str, Any] = {'access_key': self._access_key}
            if self.keywords:
                args['keywords'] = self.keywords
            if self.keyword_paths:
                args['keyword_paths'] = self.keyword_paths
            if self.keyword_model_path:
                args['model_path'] = self.keyword_model_path

            self._porcupine = pvporcupine.create(**args)

        return self._porcupine

    def _get_speech_engine_args(self) -> dict:
        args: Dict[str, Any] = {'access_key': self._access_key}
        if self.speech_model_path:
            args['model_path'] = self.speech_model_path
        if self.endpoint_duration:
            args['endpoint_duration_sec'] = self.endpoint_duration
        if self.enable_automatic_punctuation:
            args['enable_automatic_punctuation'] = self.enable_automatic_punctuation

        return args

    def _get_intent_engine_args(self) -> dict:
        args: Dict[str, Any] = {'access_key': self._access_key}
        args['context_path'] = self.intent_model_path
        if self.endpoint_duration:
            args['endpoint_duration_sec'] = self.endpoint_duration
        if self.enable_automatic_punctuation:
            args['enable_automatic_punctuation'] = self.enable_automatic_punctuation

        return args
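
    # For illustration only (the values below are hypothetical): with
    # endpoint_duration=1.5 and enable_automatic_punctuation=True,
    # _get_speech_engine_args() would return something like:
    #
    #     {
    #         'access_key': '...',
    #         'model_path': '/path/to/speech.pv',  # only if a model is set
    #         'endpoint_duration_sec': 1.5,
    #         'enable_automatic_punctuation': True,
    #     }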

    def __enter__(self):
        """
        Get the assistant ready to start processing audio frames.
        """
        if self.should_stop():
            return self

        assert not self.is_alive(), 'The assistant is already running'
        self._in_ctx = True

        if self._recorder:
            self.logger.info('A recording stream already exists')
        elif self.hotword_enabled or self.stt_enabled or self.intent_enabled:
            # The recorder's parameters are dictated by the first active
            # engine: Porcupine if the hotword engine is enabled, otherwise
            # the speech processor
            sample_rate = (self.porcupine or self._speech_processor).sample_rate
            frame_length = (self.porcupine or self._speech_processor).frame_length
            self._recorder = AudioRecorder(
                stop_event=self._stop_event,
                sample_rate=sample_rate,
                frame_size=frame_length,
                queue_size=self.audio_queue_size,
                paused=self._muted,
                channels=1,
            )

            self._speech_processor.__enter__()
            self._recorder.__enter__()

        if self.porcupine:
            self.state = AssistantState.DETECTING_HOTWORD
        else:
            self.state = AssistantState.DETECTING_SPEECH

        self.start()
        return self

    def __exit__(self, *_):
        """
        Stop the assistant and release all resources.
        """
        self._in_ctx = False
        if self._recorder:
            self._recorder.__exit__(*_)
            self._recorder = None

        self.state = AssistantState.IDLE

        if self._porcupine:
            self._porcupine.delete()
            self._porcupine = None

        self._speech_processor.__exit__(*_)

    def __iter__(self):
        """
        Iterate over processed assistant events.
        """
        return self

    def __next__(self):
        """
        Process the next audio frame and return the corresponding event.
        """
        if self.should_stop() or not self._recorder:
            raise StopIteration

        if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
            return self._evt_queue.get()

        evt = None
        if (
            self._speech_processor.enabled
            and self.state == AssistantState.DETECTING_SPEECH
        ):
            evt = self._speech_processor.next_event()

        if isinstance(evt, SpeechRecognizedEvent):
            self._on_speech_recognized(phrase=evt.args['phrase'])
        if isinstance(evt, IntentRecognizedEvent):
            self._on_intent_matched(
                intent=evt.args['intent'], slots=evt.args.get('slots', {})
            )
        if isinstance(evt, ConversationTimeoutEvent):
            self._on_conversation_timeout()

        if evt:
            self._speech_processor.reset()

        if (
            evt
            and self.state == AssistantState.DETECTING_SPEECH
            and self.hotword_enabled
        ):
            self.state = AssistantState.DETECTING_HOTWORD

        return evt
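
    # Note on event flow: hotword events are produced by the run() loop and
    # delivered through _evt_queue, while speech/intent/timeout events are
    # polled directly from the SpeechProcessor in __next__.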

    def mute(self):
        self._muted = True
        if self._recorder:
            self._recorder.pause()

    def unmute(self):
        self._muted = False
        if self._recorder:
            self._recorder.resume()

    def set_mic_mute(self, mute: bool):
        if mute:
            self.mute()
        else:
            self.unmute()

    def toggle_mic_mute(self):
        if self._muted:
            self.unmute()
        else:
            self.mute()

    def _process_hotword(self, frame) -> Optional[HotwordDetectedEvent]:
        if not self.porcupine:
            return None

        keyword_index = self.porcupine.process(frame)
        if keyword_index is None:
            return None  # No keyword detected

        if keyword_index >= 0 and self.keywords:
            if self.start_conversation_on_hotword:
                self.state = AssistantState.DETECTING_SPEECH

            self.tts.stop()  # Stop any ongoing TTS when the hotword is detected
            self._on_hotword_detected(hotword=self.keywords[keyword_index])
            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])

        return None

    def override_speech_model(self, model_path: Optional[str]):
        self._speech_model_path_override = model_path

    def override_intent_model(self, model_path: Optional[str]):
        self._intent_model_path_override = model_path

    def _put_event(self, evt: AssistantEvent):
        try:
            self._evt_queue.put_nowait(evt)
        except Full:
            self.logger.warning('The assistant event queue is full')

    def run(self):
        assert (
            self._in_ctx
        ), 'The assistant can only be started through a context manager'
        super().run()

        while not self.should_stop() and self._recorder:
            self._recorder.wait_start()
            if self.should_stop():
                break

            data = self._recorder.read()
            if data is None:
                continue

            frame, t = data
            if time() - t > self.frame_expiration:
                self.logger.info(
                    'Skipping audio frame older than %ss', self.frame_expiration
                )
                continue  # The audio frame is too old

            if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
                evt = self._process_hotword(frame)
                if evt:
                    self._put_event(evt)

                continue

            if (
                self._speech_processor.enabled
                and self.state == AssistantState.DETECTING_SPEECH
            ):
                self._speech_processor.process(frame, block=False)

        self.logger.info('Assistant stopped')

# vim:sw=4:ts=4:et: