[assistant.picovoice] Conversation flow improvements.
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
- The `Responding` state should be modelled as an extra event/binary flag, not as an assistant state. The assistant may be listening for hotwords even while the `tts` plugin is responding, and we don't want the two states to interfere with each other - nor do we want to build a more complex state machine that also needs to take concurrent states into account. - Stop any responses being rendered by the `tts` plugin when new hotword audio is detected. If e.g. I say "Ok Google", I should always be able to trigger the assistant and stop any concurrent audio process. - `SpeechRecognizedEvent` should be emitted even if `cheetah`'s latest audio frame results weren't marked as final, and the speech detection window timed out. Cheetah's `is_final` detection seems to be quite buggy sometimes, and it may not properly detect the end of utterances, especially with non-native accents. The workaround is to flush out whatever text is available (if at least some speech was detected) into a `SpeechRecognizedEvent` upon timeout.
This commit is contained in:
parent
24e93ad160
commit
fa49db4107
4 changed files with 48 additions and 9 deletions
|
@ -165,16 +165,12 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
||||||
|
|
||||||
def _on_response_render_start(self, text: Optional[str]):
|
def _on_response_render_start(self, text: Optional[str]):
|
||||||
if self._assistant:
|
if self._assistant:
|
||||||
self._assistant.state = AssistantState.RESPONDING
|
self._assistant.set_responding(True)
|
||||||
return super()._on_response_render_start(text)
|
return super()._on_response_render_start(text)
|
||||||
|
|
||||||
def _on_response_render_end(self):
|
def _on_response_render_end(self):
|
||||||
if self._assistant:
|
if self._assistant:
|
||||||
self._assistant.state = (
|
self._assistant.set_responding(False)
|
||||||
AssistantState.DETECTING_HOTWORD
|
|
||||||
if self._assistant.hotword_enabled
|
|
||||||
else AssistantState.IDLE
|
|
||||||
)
|
|
||||||
|
|
||||||
return super()._on_response_render_end()
|
return super()._on_response_render_end()
|
||||||
|
|
||||||
|
@ -257,6 +253,7 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
||||||
with Assistant(**self._assistant_args) as self._assistant:
|
with Assistant(**self._assistant_args) as self._assistant:
|
||||||
try:
|
try:
|
||||||
for event in self._assistant:
|
for event in self._assistant:
|
||||||
|
if event is not None:
|
||||||
self.logger.debug('Picovoice assistant event: %s', event)
|
self.logger.debug('Picovoice assistant event: %s', event)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
break
|
break
|
||||||
|
|
|
@ -63,6 +63,7 @@ class Assistant:
|
||||||
self.keywords = list(keywords or [])
|
self.keywords = list(keywords or [])
|
||||||
self.keyword_paths = None
|
self.keyword_paths = None
|
||||||
self.keyword_model_path = None
|
self.keyword_model_path = None
|
||||||
|
self._responding = Event()
|
||||||
self.frame_expiration = frame_expiration
|
self.frame_expiration = frame_expiration
|
||||||
self.endpoint_duration = endpoint_duration
|
self.endpoint_duration = endpoint_duration
|
||||||
self.enable_automatic_punctuation = enable_automatic_punctuation
|
self.enable_automatic_punctuation = enable_automatic_punctuation
|
||||||
|
@ -113,6 +114,10 @@ class Assistant:
|
||||||
self._porcupine: Optional[pvporcupine.Porcupine] = None
|
self._porcupine: Optional[pvporcupine.Porcupine] = None
|
||||||
self._rhino: Optional[pvrhino.Rhino] = None
|
self._rhino: Optional[pvrhino.Rhino] = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_responding(self):
|
||||||
|
return self._responding.is_set()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def speech_model_path(self):
|
def speech_model_path(self):
|
||||||
return self._speech_model_path_override or self._speech_model_path
|
return self._speech_model_path_override or self._speech_model_path
|
||||||
|
@ -123,6 +128,12 @@ class Assistant:
|
||||||
assert p, 'Picovoice TTS plugin not configured/found'
|
assert p, 'Picovoice TTS plugin not configured/found'
|
||||||
return p
|
return p
|
||||||
|
|
||||||
|
def set_responding(self, responding: bool):
|
||||||
|
if responding:
|
||||||
|
self._responding.set()
|
||||||
|
else:
|
||||||
|
self._responding.clear()
|
||||||
|
|
||||||
def should_stop(self):
|
def should_stop(self):
|
||||||
return self._stop_event.is_set()
|
return self._stop_event.is_set()
|
||||||
|
|
||||||
|
@ -194,6 +205,9 @@ class Assistant:
|
||||||
return self._cheetah[self.speech_model_path]
|
return self._cheetah[self.speech_model_path]
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
|
"""
|
||||||
|
Get the assistant ready to start processing audio frames.
|
||||||
|
"""
|
||||||
if self.should_stop():
|
if self.should_stop():
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -223,6 +237,9 @@ class Assistant:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, *_):
|
def __exit__(self, *_):
|
||||||
|
"""
|
||||||
|
Stop the assistant and release all resources.
|
||||||
|
"""
|
||||||
if self._recorder:
|
if self._recorder:
|
||||||
self._recorder.__exit__(*_)
|
self._recorder.__exit__(*_)
|
||||||
self._recorder = None
|
self._recorder = None
|
||||||
|
@ -246,9 +263,15 @@ class Assistant:
|
||||||
self._rhino = None
|
self._rhino = None
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
|
"""
|
||||||
|
Iterate over processed assistant events.
|
||||||
|
"""
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __next__(self):
|
def __next__(self):
|
||||||
|
"""
|
||||||
|
Process the next audio frame and return the corresponding event.
|
||||||
|
"""
|
||||||
has_data = False
|
has_data = False
|
||||||
if self.should_stop() or not self._recorder:
|
if self.should_stop() or not self._recorder:
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
@ -285,6 +308,7 @@ class Assistant:
|
||||||
if self.start_conversation_on_hotword:
|
if self.start_conversation_on_hotword:
|
||||||
self.state = AssistantState.DETECTING_SPEECH
|
self.state = AssistantState.DETECTING_SPEECH
|
||||||
|
|
||||||
|
self.tts.stop()
|
||||||
self._on_hotword_detected(hotword=self.keywords[keyword_index])
|
self._on_hotword_detected(hotword=self.keywords[keyword_index])
|
||||||
return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
|
return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
|
||||||
|
|
||||||
|
@ -311,7 +335,7 @@ class Assistant:
|
||||||
phrase = self._ctx.transcript
|
phrase = self._ctx.transcript
|
||||||
phrase = phrase[:1].lower() + phrase[1:]
|
phrase = phrase[:1].lower() + phrase[1:]
|
||||||
|
|
||||||
if self._ctx.is_final and phrase:
|
if phrase:
|
||||||
event = SpeechRecognizedEvent(phrase=phrase)
|
event = SpeechRecognizedEvent(phrase=phrase)
|
||||||
self._on_speech_recognized(phrase=phrase)
|
self._on_speech_recognized(phrase=phrase)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -42,11 +42,17 @@ class AudioRecorder:
|
||||||
)
|
)
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
|
"""
|
||||||
|
Start the audio stream.
|
||||||
|
"""
|
||||||
self._stop_event.clear()
|
self._stop_event.clear()
|
||||||
self.stream.start()
|
self.stream.start()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, *_):
|
def __exit__(self, *_):
|
||||||
|
"""
|
||||||
|
Stop the audio stream.
|
||||||
|
"""
|
||||||
self.stop()
|
self.stop()
|
||||||
|
|
||||||
def _audio_callback(self, indata, *_):
|
def _audio_callback(self, indata, *_):
|
||||||
|
@ -59,6 +65,13 @@ class AudioRecorder:
|
||||||
self.logger.warning('Audio queue is full, dropping audio frame')
|
self.logger.warning('Audio queue is full, dropping audio frame')
|
||||||
|
|
||||||
def read(self, timeout: Optional[float] = None):
|
def read(self, timeout: Optional[float] = None):
|
||||||
|
"""
|
||||||
|
Read an audio frame from the queue.
|
||||||
|
|
||||||
|
:param timeout: Timeout in seconds. If None, the method will block until
|
||||||
|
an audio frame is available.
|
||||||
|
:return: Audio frame or None if the timeout has expired.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
return self._audio_queue.get(timeout=timeout)
|
return self._audio_queue.get(timeout=timeout)
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
|
@ -66,6 +79,9 @@ class AudioRecorder:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
|
"""
|
||||||
|
Stop the audio stream.
|
||||||
|
"""
|
||||||
self._stop_event.set()
|
self._stop_event.set()
|
||||||
self.stream.stop()
|
self.stream.stop()
|
||||||
|
|
||||||
|
@ -73,4 +89,7 @@ class AudioRecorder:
|
||||||
return self._stop_event.is_set() or self._upstream_stop_event.is_set()
|
return self._stop_event.is_set() or self._upstream_stop_event.is_set()
|
||||||
|
|
||||||
def wait(self, timeout: Optional[float] = None):
|
def wait(self, timeout: Optional[float] = None):
|
||||||
|
"""
|
||||||
|
Wait until the audio stream is stopped.
|
||||||
|
"""
|
||||||
wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)
|
wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)
|
||||||
|
|
|
@ -9,7 +9,6 @@ class AssistantState(Enum):
|
||||||
IDLE = 'idle'
|
IDLE = 'idle'
|
||||||
DETECTING_HOTWORD = 'detecting_hotword'
|
DETECTING_HOTWORD = 'detecting_hotword'
|
||||||
DETECTING_SPEECH = 'detecting_speech'
|
DETECTING_SPEECH = 'detecting_speech'
|
||||||
RESPONDING = 'responding'
|
|
||||||
|
|
||||||
|
|
||||||
# vim:sw=4:ts=4:et:
|
# vim:sw=4:ts=4:et:
|
||||||
|
|
Loading…
Reference in a new issue