import logging
import os
from queue import Full, Queue
from threading import Event, RLock, Thread
from time import time
from typing import Any, Dict, Optional, Sequence

import pvporcupine

from platypush.context import get_plugin
from platypush.message.event.assistant import (
    AssistantEvent,
    ConversationTimeoutEvent,
    HotwordDetectedEvent,
    IntentRecognizedEvent,
    SpeechRecognizedEvent,
)
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin

from ._recorder import AudioRecorder
from ._speech import SpeechProcessor
from ._state import AssistantState


class Assistant(Thread):
    """
    A facade class that wraps the Picovoice engines under an assistant API.
    """
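
    # A minimal usage sketch (hypothetical values; assumes a valid Picovoice
    # access key and a configured ``tts.picovoice`` plugin):
    #
    #   stop_event = Event()
    #   assistant = Assistant(
    #       access_key='<PICOVOICE_ACCESS_KEY>',
    #       stop_event=stop_event,
    #       keywords=['porcupine'],
    #       start_conversation_on_hotword=True,
    #   )
    #
    #   with assistant:
    #       for event in assistant:
    #           # AssistantEvent instances, or None on state changes
    #           print(event)
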
    @staticmethod
    def _default_callback(*_, **__):
        pass

    def __init__(
        self,
        access_key: str,
        stop_event: Event,
        hotword_enabled: bool = True,
        stt_enabled: bool = True,
        keywords: Optional[Sequence[str]] = None,
        keyword_paths: Optional[Sequence[str]] = None,
        keyword_model_path: Optional[str] = None,
        frame_expiration: float = 3.0,  # Don't process audio frames older than this
        speech_model_path: Optional[str] = None,
        intent_model_path: Optional[str] = None,
        endpoint_duration: Optional[float] = None,
        enable_automatic_punctuation: bool = False,
        start_conversation_on_hotword: bool = False,
        audio_queue_size: int = 100,
        muted: bool = False,
        conversation_timeout: Optional[float] = None,
        on_conversation_start=_default_callback,
        on_conversation_end=_default_callback,
        on_conversation_timeout=_default_callback,
        on_speech_recognized=_default_callback,
        on_intent_matched=_default_callback,
        on_hotword_detected=_default_callback,
    ):
        super().__init__(name='picovoice:Assistant')

        self._access_key = access_key
        self._stop_event = stop_event
        self.logger = logging.getLogger(__name__)
        self.hotword_enabled = hotword_enabled
        self.stt_enabled = stt_enabled
        self.keywords = list(keywords or [])
        self.keyword_paths = None
        self.keyword_model_path = None
        self.frame_expiration = frame_expiration
        self.endpoint_duration = endpoint_duration
        self.enable_automatic_punctuation = enable_automatic_punctuation
        self.start_conversation_on_hotword = start_conversation_on_hotword
        self.audio_queue_size = audio_queue_size
        self._responding = Event()
        self._muted = muted
        self._speech_model_path = speech_model_path
        self._speech_model_path_override = None
        self._intent_model_path = intent_model_path
        self._intent_model_path_override = None
        self._in_ctx = False

        self._speech_processor = SpeechProcessor(
            stop_event=stop_event,
            stt_enabled=stt_enabled,
            intent_enabled=self.intent_enabled,
            conversation_timeout=conversation_timeout,
            model_path=speech_model_path,
            get_cheetah_args=self._get_speech_engine_args,
            get_rhino_args=self._get_intent_engine_args,
        )

        self._on_conversation_start = on_conversation_start
        self._on_conversation_end = on_conversation_end
        self._on_conversation_timeout = on_conversation_timeout
        self._on_speech_recognized = on_speech_recognized
        self._on_intent_matched = on_intent_matched
        self._on_hotword_detected = on_hotword_detected

        self._recorder = None
        self._state = AssistantState.IDLE
        self._state_lock = RLock()
        self._evt_queue = Queue(maxsize=100)

        if hotword_enabled:
            if not keywords:
                raise ValueError(
                    'You need to provide a list of keywords if the wake-word engine is enabled'
                )

            if keyword_paths:
                keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
                missing_paths = [
                    path for path in keyword_paths if not os.path.isfile(path)
                ]
                if missing_paths:
                    raise FileNotFoundError(f'Keyword files not found: {missing_paths}')

                self.keyword_paths = keyword_paths

            if keyword_model_path:
                keyword_model_path = os.path.expanduser(keyword_model_path)
                if not os.path.isfile(keyword_model_path):
                    raise FileNotFoundError(
                        f'Keyword model file not found: {keyword_model_path}'
                    )

                self.keyword_model_path = keyword_model_path

        self._porcupine: Optional[pvporcupine.Porcupine] = None

    @property
    def intent_enabled(self) -> bool:
        return self.intent_model_path is not None

    @property
    def is_responding(self) -> bool:
        return self._responding.is_set()

    @property
    def speech_model_path(self) -> Optional[str]:
        return self._speech_model_path_override or self._speech_model_path

    @property
    def intent_model_path(self) -> Optional[str]:
        return self._intent_model_path_override or self._intent_model_path

    @property
    def tts(self) -> TtsPicovoicePlugin:
        p = get_plugin('tts.picovoice')
        assert p, 'Picovoice TTS plugin not configured/found'
        return p

    def set_responding(self, responding: bool):
        if responding:
            self._responding.set()
        else:
            self._responding.clear()

    def should_stop(self):
        return self._stop_event.is_set()

    def wait_stop(self):
        self._stop_event.wait()

    @property
    def state(self) -> AssistantState:
        with self._state_lock:
            return self._state

    @state.setter
    def state(self, state: AssistantState):
        with self._state_lock:
            prev_state = self._state
            self._state = state
            new_state = self.state

        if prev_state == new_state:
            return

        self.logger.info('Assistant state transition: %s -> %s', prev_state, new_state)
        if prev_state == AssistantState.DETECTING_SPEECH:
            self.tts.stop()
            self._speech_model_path_override = None
            self._intent_model_path_override = None
            self._speech_processor.on_conversation_end()
            self._on_conversation_end()
        elif new_state == AssistantState.DETECTING_SPEECH:
            self._speech_processor.on_conversation_start()
            self._on_conversation_start()

        if new_state == AssistantState.DETECTING_HOTWORD:
            self.tts.stop()
            self._speech_processor.on_conversation_reset()

        # Put a null event on the event queue to unblock next_event
        self._evt_queue.put(None)

    @property
    def porcupine(self) -> Optional[pvporcupine.Porcupine]:
        if not self.hotword_enabled:
            return None

        if not self._porcupine:
            args: Dict[str, Any] = {'access_key': self._access_key}
            if self.keywords:
                args['keywords'] = self.keywords
            if self.keyword_paths:
                args['keyword_paths'] = self.keyword_paths
            if self.keyword_model_path:
                args['model_path'] = self.keyword_model_path

            self._porcupine = pvporcupine.create(**args)

        return self._porcupine

    def _get_speech_engine_args(self) -> dict:
        args: Dict[str, Any] = {'access_key': self._access_key}
        if self.speech_model_path:
            args['model_path'] = self.speech_model_path
        if self.endpoint_duration:
            args['endpoint_duration_sec'] = self.endpoint_duration
        if self.enable_automatic_punctuation:
            args['enable_automatic_punctuation'] = self.enable_automatic_punctuation

        return args

    def _get_intent_engine_args(self) -> dict:
        args: Dict[str, Any] = {'access_key': self._access_key}
        args['context_path'] = self.intent_model_path
        if self.endpoint_duration:
            args['endpoint_duration_sec'] = self.endpoint_duration
        if self.enable_automatic_punctuation:
            args['enable_automatic_punctuation'] = self.enable_automatic_punctuation

        return args
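
    # A minimal sketch of how these argument dicts are presumably consumed
    # (the actual wiring lives in ._speech.SpeechProcessor; ``pvcheetah`` is
    # shown for illustration only and is an assumption, not part of this file):
    #
    #   import pvcheetah
    #   cheetah = pvcheetah.create(**self._get_speech_engine_args())
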
    def __enter__(self):
        """
        Get the assistant ready to start processing audio frames.
        """
        if self.should_stop():
            return self

        assert not self.is_alive(), 'The assistant is already running'
        self._in_ctx = True

        if self._recorder:
            self.logger.info('A recording stream already exists')
        elif self.hotword_enabled or self.stt_enabled or self.intent_enabled:
            sample_rate = (self.porcupine or self._speech_processor).sample_rate
            frame_length = (self.porcupine or self._speech_processor).frame_length
            self._recorder = AudioRecorder(
                stop_event=self._stop_event,
                sample_rate=sample_rate,
                frame_size=frame_length,
                queue_size=self.audio_queue_size,
                paused=self._muted,
                channels=1,
            )

            self._speech_processor.__enter__()
            self._recorder.__enter__()

        if self.porcupine:
            self.state = AssistantState.DETECTING_HOTWORD
        else:
            self.state = AssistantState.DETECTING_SPEECH

        self.start()
        return self

    def __exit__(self, *_):
        """
        Stop the assistant and release all resources.
        """
        self._in_ctx = False
        if self._recorder:
            self._recorder.__exit__(*_)
            self._recorder = None

        self.state = AssistantState.IDLE

        if self._porcupine:
            self._porcupine.delete()
            self._porcupine = None

        self._speech_processor.__exit__(*_)

    def __iter__(self):
        """
        Iterate over processed assistant events.
        """
        return self

    def __next__(self):
        """
        Process the next audio frame and return the corresponding event.
        """
        if self.should_stop() or not self._recorder:
            raise StopIteration

        if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
            # Note that this may also return None: the state setter pushes a
            # null event onto the queue to unblock consumers on state changes
            return self._evt_queue.get()

        evt = None
        if (
            self._speech_processor.enabled
            and self.state == AssistantState.DETECTING_SPEECH
        ):
            evt = self._speech_processor.next_event()

        if isinstance(evt, SpeechRecognizedEvent):
            self._on_speech_recognized(phrase=evt.args['phrase'])
        if isinstance(evt, IntentRecognizedEvent):
            self._on_intent_matched(
                intent=evt.args['intent'], slots=evt.args.get('slots', {})
            )
        if isinstance(evt, ConversationTimeoutEvent):
            self._on_conversation_timeout()

        if evt:
            self._speech_processor.reset()

        if (
            evt
            and self.state == AssistantState.DETECTING_SPEECH
            and self.hotword_enabled
        ):
            self.state = AssistantState.DETECTING_HOTWORD

        return evt

    def mute(self):
        self._muted = True
        if self._recorder:
            self._recorder.pause()

    def unmute(self):
        self._muted = False
        if self._recorder:
            self._recorder.resume()

    def set_mic_mute(self, mute: bool):
        if mute:
            self.mute()
        else:
            self.unmute()

    def toggle_mic_mute(self):
        if self._muted:
            self.unmute()
        else:
            self.mute()

    def _process_hotword(self, frame) -> Optional[HotwordDetectedEvent]:
        if not self.porcupine:
            return None

        keyword_index = self.porcupine.process(frame)
        if keyword_index is None:
            return None  # No keyword detected

        # Porcupine returns the index of the detected keyword, or -1 if no
        # keyword was detected in this frame
        if keyword_index >= 0 and self.keywords:
            if self.start_conversation_on_hotword:
                self.state = AssistantState.DETECTING_SPEECH

            self.tts.stop()  # Stop any ongoing TTS when the hotword is detected
            self._on_hotword_detected(hotword=self.keywords[keyword_index])
            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])

        return None

    def override_speech_model(self, model_path: Optional[str]):
        self._speech_model_path_override = model_path

    def override_intent_model(self, model_path: Optional[str]):
        self._intent_model_path_override = model_path

    def _put_event(self, evt: AssistantEvent):
        try:
            self._evt_queue.put_nowait(evt)
        except Full:
            self.logger.warning('The assistant event queue is full')

    def run(self):
        assert (
            self._in_ctx
        ), 'The assistant can only be started through a context manager'

        super().run()

        while not self.should_stop() and self._recorder:
            self._recorder.wait_start()
            if self.should_stop():
                break

            data = self._recorder.read()
            if data is None:
                continue

            frame, t = data
            if time() - t > self.frame_expiration:
                self.logger.info(
                    'Skipping audio frame older than %ss', self.frame_expiration
                )
                continue  # The audio frame is too old

            if self.hotword_enabled and self.state == AssistantState.DETECTING_HOTWORD:
                evt = self._process_hotword(frame)
                if evt:
                    self._put_event(evt)

                continue

            if (
                self._speech_processor.enabled
                and self.state == AssistantState.DETECTING_SPEECH
            ):
                self._speech_processor.process(frame, block=False)

        self.logger.info('Assistant stopped')
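

# A hypothetical wiring sketch for the callback hooks (illustrative names
# only; the real integration lives in the platypush picovoice assistant
# plugin):
#
#   def on_phrase(phrase: str):
#       print('Recognized:', phrase)
#
#   def on_hotword(hotword: str):
#       print('Hotword:', hotword)
#
#   assistant = Assistant(
#       access_key='<PICOVOICE_ACCESS_KEY>',
#       stop_event=Event(),
#       keywords=['computer'],
#       on_speech_recognized=on_phrase,
#       on_hotword_detected=on_hotword,
#   )
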
# vim:sw=4:ts=4:et: