[assistant.picovoice] Conversation flow improvements.

- The `Responding` state should be modelled as an extra event/binary
  flag, not as an assistant state. The assistant may be listening for
  hotwords even while the `tts` plugin is responding, and we don't want
  the two states to interfere with each other - nor do we want to build
  a more complex state machine that also needs to take concurrent states
  into account.

- Stop any responses being rendered by the `tts` plugin when new
  hotword audio is detected. If e.g. I say "Ok Google", I should always
  be able to trigger the assistant and stop any concurrent audio
  process.

- `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest
  audio frame results weren't marked as final, provided that the speech
  detection window has timed out. Cheetah's `is_final` detection seems to be quite
  buggy sometimes, and it may not properly detect the end of utterances,
  especially with non-native accents. The workaround is to flush out
  whatever text is available (if at least some speech was detected) into
  a `SpeechRecognizedEvent` upon timeout.
This commit is contained in:
Fabio Manganiello 2024-04-13 19:49:58 +02:00
parent a6f7b6e790
commit 9de49c71a1
4 changed files with 48 additions and 9 deletions

View file

@ -165,16 +165,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
def _on_response_render_start(self, text: Optional[str]): def _on_response_render_start(self, text: Optional[str]):
if self._assistant: if self._assistant:
self._assistant.state = AssistantState.RESPONDING self._assistant.set_responding(True)
return super()._on_response_render_start(text) return super()._on_response_render_start(text)
def _on_response_render_end(self): def _on_response_render_end(self):
if self._assistant: if self._assistant:
self._assistant.state = ( self._assistant.set_responding(False)
AssistantState.DETECTING_HOTWORD
if self._assistant.hotword_enabled
else AssistantState.IDLE
)
return super()._on_response_render_end() return super()._on_response_render_end()
@ -257,7 +253,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
with Assistant(**self._assistant_args) as self._assistant: with Assistant(**self._assistant_args) as self._assistant:
try: try:
for event in self._assistant: for event in self._assistant:
self.logger.debug('Picovoice assistant event: %s', event) if event is not None:
self.logger.debug('Picovoice assistant event: %s', event)
except KeyboardInterrupt: except KeyboardInterrupt:
break break
except Exception as e: except Exception as e:

View file

@ -63,6 +63,7 @@ class Assistant:
self.keywords = list(keywords or []) self.keywords = list(keywords or [])
self.keyword_paths = None self.keyword_paths = None
self.keyword_model_path = None self.keyword_model_path = None
self._responding = Event()
self.frame_expiration = frame_expiration self.frame_expiration = frame_expiration
self.endpoint_duration = endpoint_duration self.endpoint_duration = endpoint_duration
self.enable_automatic_punctuation = enable_automatic_punctuation self.enable_automatic_punctuation = enable_automatic_punctuation
@ -113,6 +114,10 @@ class Assistant:
self._porcupine: Optional[pvporcupine.Porcupine] = None self._porcupine: Optional[pvporcupine.Porcupine] = None
self._rhino: Optional[pvrhino.Rhino] = None self._rhino: Optional[pvrhino.Rhino] = None
@property
def is_responding(self):
return self._responding.is_set()
@property @property
def speech_model_path(self): def speech_model_path(self):
return self._speech_model_path_override or self._speech_model_path return self._speech_model_path_override or self._speech_model_path
@ -123,6 +128,12 @@ class Assistant:
assert p, 'Picovoice TTS plugin not configured/found' assert p, 'Picovoice TTS plugin not configured/found'
return p return p
def set_responding(self, responding: bool):
if responding:
self._responding.set()
else:
self._responding.clear()
def should_stop(self): def should_stop(self):
return self._stop_event.is_set() return self._stop_event.is_set()
@ -194,6 +205,9 @@ class Assistant:
return self._cheetah[self.speech_model_path] return self._cheetah[self.speech_model_path]
def __enter__(self): def __enter__(self):
"""
Get the assistant ready to start processing audio frames.
"""
if self.should_stop(): if self.should_stop():
return self return self
@ -223,6 +237,9 @@ class Assistant:
return self return self
def __exit__(self, *_): def __exit__(self, *_):
"""
Stop the assistant and release all resources.
"""
if self._recorder: if self._recorder:
self._recorder.__exit__(*_) self._recorder.__exit__(*_)
self._recorder = None self._recorder = None
@ -246,9 +263,15 @@ class Assistant:
self._rhino = None self._rhino = None
def __iter__(self): def __iter__(self):
"""
Iterate over processed assistant events.
"""
return self return self
def __next__(self): def __next__(self):
"""
Process the next audio frame and return the corresponding event.
"""
has_data = False has_data = False
if self.should_stop() or not self._recorder: if self.should_stop() or not self._recorder:
raise StopIteration raise StopIteration
@ -285,6 +308,7 @@ class Assistant:
if self.start_conversation_on_hotword: if self.start_conversation_on_hotword:
self.state = AssistantState.DETECTING_SPEECH self.state = AssistantState.DETECTING_SPEECH
self.tts.stop()
self._on_hotword_detected(hotword=self.keywords[keyword_index]) self._on_hotword_detected(hotword=self.keywords[keyword_index])
return HotwordDetectedEvent(hotword=self.keywords[keyword_index]) return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
@ -311,7 +335,7 @@ class Assistant:
phrase = self._ctx.transcript phrase = self._ctx.transcript
phrase = phrase[:1].lower() + phrase[1:] phrase = phrase[:1].lower() + phrase[1:]
if self._ctx.is_final and phrase: if phrase:
event = SpeechRecognizedEvent(phrase=phrase) event = SpeechRecognizedEvent(phrase=phrase)
self._on_speech_recognized(phrase=phrase) self._on_speech_recognized(phrase=phrase)
else: else:

View file

@ -42,11 +42,17 @@ class AudioRecorder:
) )
def __enter__(self): def __enter__(self):
"""
Start the audio stream.
"""
self._stop_event.clear() self._stop_event.clear()
self.stream.start() self.stream.start()
return self return self
def __exit__(self, *_): def __exit__(self, *_):
"""
Stop the audio stream.
"""
self.stop() self.stop()
def _audio_callback(self, indata, *_): def _audio_callback(self, indata, *_):
@ -59,6 +65,13 @@ class AudioRecorder:
self.logger.warning('Audio queue is full, dropping audio frame') self.logger.warning('Audio queue is full, dropping audio frame')
def read(self, timeout: Optional[float] = None): def read(self, timeout: Optional[float] = None):
"""
Read an audio frame from the queue.
:param timeout: Timeout in seconds. If None, the method will block until
an audio frame is available.
:return: Audio frame or None if the timeout has expired.
"""
try: try:
return self._audio_queue.get(timeout=timeout) return self._audio_queue.get(timeout=timeout)
except TimeoutError: except TimeoutError:
@ -66,6 +79,9 @@ class AudioRecorder:
return None return None
def stop(self): def stop(self):
"""
Stop the audio stream.
"""
self._stop_event.set() self._stop_event.set()
self.stream.stop() self.stream.stop()
@ -73,4 +89,7 @@ class AudioRecorder:
return self._stop_event.is_set() or self._upstream_stop_event.is_set() return self._stop_event.is_set() or self._upstream_stop_event.is_set()
def wait(self, timeout: Optional[float] = None): def wait(self, timeout: Optional[float] = None):
"""
Wait until the audio stream is stopped.
"""
wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout) wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)

View file

@ -9,7 +9,6 @@ class AssistantState(Enum):
IDLE = 'idle' IDLE = 'idle'
DETECTING_HOTWORD = 'detecting_hotword' DETECTING_HOTWORD = 'detecting_hotword'
DETECTING_SPEECH = 'detecting_speech' DETECTING_SPEECH = 'detecting_speech'
RESPONDING = 'responding'
# vim:sw=4:ts=4:et: # vim:sw=4:ts=4:et: