[WIP] Added speech detection logic over Cheetah.
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Fabio Manganiello 2024-04-08 01:54:26 +02:00
parent 01dec0b7a4
commit f021b471aa
Signed by: blacklight
GPG Key ID: D90FBA7F76362774
11 changed files with 263 additions and 74 deletions

View File

@ -10,6 +10,4 @@ Backends
platypush/backend/midi.rst platypush/backend/midi.rst
platypush/backend/nodered.rst platypush/backend/nodered.rst
platypush/backend/redis.rst platypush/backend/redis.rst
platypush/backend/stt.picovoice.hotword.rst
platypush/backend/stt.picovoice.speech.rst
platypush/backend/tcp.rst platypush/backend/tcp.rst

View File

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.backend.stt.picovoice.hotword
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.backend.stt.picovoice.speech
:members:

View File

@ -0,0 +1,5 @@
``picovoice``
=============
.. automodule:: platypush.plugins.picovoice
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.plugins.stt.picovoice.hotword
:members:

View File

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.plugins.stt.picovoice.speech
:members:

View File

@ -95,6 +95,7 @@ Plugins
platypush/plugins/nmap.rst platypush/plugins/nmap.rst
platypush/plugins/ntfy.rst platypush/plugins/ntfy.rst
platypush/plugins/otp.rst platypush/plugins/otp.rst
platypush/plugins/picovoice.rst
platypush/plugins/pihole.rst platypush/plugins/pihole.rst
platypush/plugins/ping.rst platypush/plugins/ping.rst
platypush/plugins/printer.cups.rst platypush/plugins/printer.cups.rst
@ -119,8 +120,6 @@ Plugins
platypush/plugins/smartthings.rst platypush/plugins/smartthings.rst
platypush/plugins/sound.rst platypush/plugins/sound.rst
platypush/plugins/ssh.rst platypush/plugins/ssh.rst
platypush/plugins/stt.picovoice.hotword.rst
platypush/plugins/stt.picovoice.speech.rst
platypush/plugins/sun.rst platypush/plugins/sun.rst
platypush/plugins/switch.tplink.rst platypush/plugins/switch.tplink.rst
platypush/plugins/switch.wemo.rst platypush/plugins/switch.wemo.rst

View File

@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
keywords: Optional[Sequence[str]] = None, keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None, keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None, keyword_model_path: Optional[str] = None,
speech_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = 0.5,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = True,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = 5.0,
**kwargs, **kwargs,
): ):
""" """
:param access_key: Your Picovoice access key. You can get it by signing :param access_key: Your Picovoice access key. You can get it by signing
up at the `Picovoice console <https://console.picovoice.ai/>`. up at the `Picovoice console <https://console.picovoice.ai/>`.
:param hotword_enabled: Enable the wake-word engine (default: True). :param hotword_enabled: Enable the wake-word engine (default: True).
.. note:: The wake-word engine requires you to add Porcupine to the **Note**: The wake-word engine requires you to add Porcupine to the
products available in your Picovoice account. products available in your Picovoice account.
:param stt_enabled: Enable the speech-to-text engine (default: True). :param stt_enabled: Enable the speech-to-text engine (default: True).
.. note:: The speech-to-text engine requires you to add Cheetah to **Note**: The speech-to-text engine requires you to add Cheetah to
the products available in your Picovoice account. the products available in your Picovoice account.
:param intent_enabled: Enable the intent recognition engine (default: :param intent_enabled: Enable the intent recognition engine (default:
False). False).
.. note:: The intent recognition engine requires you to add Rhino **Note**: The intent recognition engine requires you to add Rhino
to the products available in your Picovoice account. to the products available in your Picovoice account.
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
google``...). Either ``keywords`` or ``keyword_paths`` must be google``...). This is required if the wake-word engine is enabled.
provided if the wake-word engine is enabled. This list can include See the `Picovoice repository
any of the default Picovoice keywords (available on the `Picovoice
repository
<https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_). <https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
for a list of the stock keywords available. If you have a custom
model, you can pass its path to the ``keyword_paths`` parameter and
its filename (without the path and the platform extension) here.
:param keyword_paths: List of paths to the keyword files to listen for. :param keyword_paths: List of paths to the keyword files to listen for.
Custom keyword files can be created using the `Picovoice console Custom keyword files can be created using the `Picovoice console
<https://console.picovoice.ai/ppn>`_ and downloaded from the <https://console.picovoice.ai/ppn>`_ and downloaded from the
@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
for its language. Model files are available for all the supported for its language. Model files are available for all the supported
languages through the `Picovoice repository languages through the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_. <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param speech_model_path: Path to the speech model file. If you are
using a language other than English, you can provide the path to the
model file for that language. Model files are available for all the
supported languages through the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param endpoint_duration: If set, the assistant will stop listening when
no speech is detected for the specified duration (in seconds) after
the end of an utterance.
:param enable_automatic_punctuation: Enable automatic punctuation
insertion.
:param start_conversation_on_hotword: If set to True (default), a speech
detection session will be started when the hotword is detected. If
set to False, you may want to start the conversation programmatically
by calling the :meth:`.start_conversation` method instead, or run any
custom hotword detection logic. This can be particularly useful
when you want to run the assistant in a push-to-talk mode, or when you
want different hotwords to trigger conversations with different models
or languages.
:param audio_queue_size: Maximum number of audio frames to hold in the
processing queue. You may want to increase this value if you are
running this integration on a slow device and/or the logs report
audio frame drops too often. Keep in mind that increasing this value
will increase the memory usage of the integration. Also, a higher
value may result in higher accuracy at the cost of higher latency.
:param conversation_timeout: Maximum time to wait for some speech to be
detected after the hotword is detected. If no speech is detected
within this time, the conversation will time out and the plugin will
go back into hotword detection mode, if the mode is enabled. Default:
5 seconds.
""" """
super().__init__(**kwargs) super().__init__(**kwargs)
self._assistant_args = { self._assistant_args = {
@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
'keywords': keywords, 'keywords': keywords,
'keyword_paths': keyword_paths, 'keyword_paths': keyword_paths,
'keyword_model_path': keyword_model_path, 'keyword_model_path': keyword_model_path,
'speech_model_path': speech_model_path,
'endpoint_duration': endpoint_duration,
'enable_automatic_punctuation': enable_automatic_punctuation,
'start_conversation_on_hotword': start_conversation_on_hotword,
'audio_queue_size': audio_queue_size,
'conversation_timeout': conversation_timeout,
} }
@action @action
@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
try: try:
for event in assistant: for event in assistant:
if event: if event:
event.args['assistant'] = 'picovoice'
get_bus().post(event) get_bus().post(event)
except KeyboardInterrupt: except KeyboardInterrupt:
break break

View File

@ -1,6 +1,6 @@
import logging import logging
import os import os
from threading import Event from threading import Event, RLock
from time import time from time import time
from typing import Any, Dict, Optional, Sequence from typing import Any, Dict, Optional, Sequence
@ -9,9 +9,18 @@ import pvleopard
import pvporcupine import pvporcupine
import pvrhino import pvrhino
from platypush.message.event.assistant import HotwordDetectedEvent from platypush.context import get_bus
from platypush.message.event.assistant import (
ConversationStartEvent,
ConversationEndEvent,
ConversationTimeoutEvent,
HotwordDetectedEvent,
SpeechRecognizedEvent,
)
from ._context import SpeechDetectionContext
from ._recorder import AudioRecorder from ._recorder import AudioRecorder
from ._state import AssistantState
class Assistant: class Assistant:
@ -30,10 +39,16 @@ class Assistant:
keyword_paths: Optional[Sequence[str]] = None, keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None, keyword_model_path: Optional[str] = None,
frame_expiration: float = 3.0, # Don't process audio frames older than this frame_expiration: float = 3.0, # Don't process audio frames older than this
speech_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = None,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = False,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = None,
): ):
self.logger = logging.getLogger(__name__)
self._access_key = access_key self._access_key = access_key
self._stop_event = stop_event self._stop_event = stop_event
self.logger = logging.getLogger(__name__)
self.hotword_enabled = hotword_enabled self.hotword_enabled = hotword_enabled
self.stt_enabled = stt_enabled self.stt_enabled = stt_enabled
self.intent_enabled = intent_enabled self.intent_enabled = intent_enabled
@ -41,9 +56,23 @@ class Assistant:
self.keyword_paths = None self.keyword_paths = None
self.keyword_model_path = None self.keyword_model_path = None
self.frame_expiration = frame_expiration self.frame_expiration = frame_expiration
self.speech_model_path = speech_model_path
self.endpoint_duration = endpoint_duration
self.enable_automatic_punctuation = enable_automatic_punctuation
self.start_conversation_on_hotword = start_conversation_on_hotword
self.audio_queue_size = audio_queue_size
self._recorder = None self._recorder = None
self._state = AssistantState.IDLE
self._state_lock = RLock()
self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)
if hotword_enabled: if hotword_enabled:
if not keywords:
raise ValueError(
'You need to provide a list of keywords if the wake-word engine is enabled'
)
if keyword_paths: if keyword_paths:
keyword_paths = [os.path.expanduser(path) for path in keyword_paths] keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
missing_paths = [ missing_paths = [
@ -74,46 +103,89 @@ class Assistant:
def wait_stop(self): def wait_stop(self):
self._stop_event.wait() self._stop_event.wait()
def _create_porcupine(self): @property
if not self.hotword_enabled: def state(self) -> AssistantState:
return None with self._state_lock:
return self._state
args: Dict[str, Any] = {'access_key': self._access_key} @state.setter
if not (self.keywords or self.keyword_paths): def state(self, state: AssistantState):
raise ValueError( with self._state_lock:
'You need to provide either a list of keywords or a list of ' prev_state = self._state
'keyword paths if the wake-word engine is enabled' self._state = state
) new_state = self.state
if self.keywords: if prev_state == new_state:
args['keywords'] = self.keywords return
if self.keyword_paths:
args['keyword_paths'] = self.keyword_paths
if self.keyword_model_path:
args['model_path'] = self.keyword_model_path
return pvporcupine.create(**args) if prev_state == AssistantState.DETECTING_SPEECH:
self._speech_ctx.stop()
self._post_event(ConversationEndEvent())
elif new_state == AssistantState.DETECTING_SPEECH:
self._speech_ctx.start()
self._post_event(ConversationStartEvent())
@property @property
def porcupine(self) -> Optional[pvporcupine.Porcupine]: def porcupine(self) -> Optional[pvporcupine.Porcupine]:
if not self.hotword_enabled:
return None
if not self._porcupine: if not self._porcupine:
self._porcupine = self._create_porcupine() args: Dict[str, Any] = {'access_key': self._access_key}
if self.keywords:
args['keywords'] = self.keywords
if self.keyword_paths:
args['keyword_paths'] = self.keyword_paths
if self.keyword_model_path:
args['model_path'] = self.keyword_model_path
self._porcupine = pvporcupine.create(**args)
return self._porcupine return self._porcupine
@property
def cheetah(self) -> Optional[pvcheetah.Cheetah]:
if not self.stt_enabled:
return None
if not self._cheetah:
args: Dict[str, Any] = {'access_key': self._access_key}
if self.speech_model_path:
args['model_path'] = self.speech_model_path
if self.endpoint_duration:
args['endpoint_duration_sec'] = self.endpoint_duration
if self.enable_automatic_punctuation:
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
self._cheetah = pvcheetah.create(**args)
return self._cheetah
def __enter__(self): def __enter__(self):
if self.should_stop():
return self
if self._recorder: if self._recorder:
self.logger.info('A recording stream already exists') self.logger.info('A recording stream already exists')
elif self.porcupine: elif self.porcupine or self.cheetah:
sample_rate = (self.porcupine or self.cheetah).sample_rate # type: ignore
frame_length = (self.porcupine or self.cheetah).frame_length # type: ignore
self._recorder = AudioRecorder( self._recorder = AudioRecorder(
stop_event=self._stop_event, stop_event=self._stop_event,
sample_rate=self.porcupine.sample_rate, sample_rate=sample_rate,
frame_size=self.porcupine.frame_length, frame_size=frame_length,
queue_size=self.audio_queue_size,
channels=1, channels=1,
) )
self._recorder.__enter__() self._recorder.__enter__()
if self.porcupine:
self.state = AssistantState.DETECTING_HOTWORD
else:
self.state = AssistantState.DETECTING_SPEECH
return self return self
def __exit__(self, *_): def __exit__(self, *_):
@ -121,6 +193,8 @@ class Assistant:
self._recorder.__exit__(*_) self._recorder.__exit__(*_)
self._recorder = None self._recorder = None
self.state = AssistantState.IDLE
if self._cheetah: if self._cheetah:
self._cheetah.delete() self._cheetah.delete()
self._cheetah = None self._cheetah = None
@ -146,26 +220,74 @@ class Assistant:
raise StopIteration raise StopIteration
while not (self.should_stop() or has_data): while not (self.should_stop() or has_data):
if self.porcupine: # TODO also check current state data = self._recorder.read()
data = self._recorder.read() if data is None:
if data is None: continue
continue
frame, t = data frame, t = data
if time() - t > self.frame_expiration: if time() - t > self.frame_expiration:
self.logger.info( self.logger.info(
'Skipping audio frame older than %ss', self.frame_expiration 'Skipping audio frame older than %ss', self.frame_expiration
) )
continue # The audio frame is too old continue # The audio frame is too old
keyword_index = self.porcupine.process(frame) if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
if keyword_index is None: return self._process_hotword(frame)
continue # No keyword detected
if keyword_index >= 0 and self.keywords: if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
return HotwordDetectedEvent(hotword=self.keywords[keyword_index]) return self._process_speech(frame)
raise StopIteration raise StopIteration
def _post_event(self, event):
    """
    Tag the given event with the ``picovoice`` assistant identifier and
    post it to the application bus. No-op if the event is falsy.
    """
    if not event:
        return

    event.args['assistant'] = 'picovoice'
    get_bus().post(event)
def _process_hotword(self, frame):
    """
    Feed an audio frame to the Porcupine wake-word engine.

    :return: A ``HotwordDetectedEvent`` if one of the configured keywords
        was detected in the frame, otherwise ``None``.
    """
    if not self.porcupine:
        return None

    index = self.porcupine.process(frame)
    # No keyword detected in this frame (engine returned None or a
    # negative index), or no keyword list to map the index back to.
    if index is None or index < 0 or not self.keywords:
        return None

    # Optionally switch straight into speech detection mode so the
    # utterance following the hotword is transcribed.
    if self.start_conversation_on_hotword:
        self.state = AssistantState.DETECTING_SPEECH

    return HotwordDetectedEvent(hotword=self.keywords[index])
def _process_speech(self, frame):
    """
    Feed an audio frame to the Cheetah speech-to-text engine.

    :return: A ``SpeechRecognizedEvent`` with the recognized phrase when
        the engine reports the end of the utterance, a
        ``ConversationTimeoutEvent`` when the conversation timed out
        (see ``SpeechDetectionContext.timed_out``), or ``None`` while
        detection is still in progress.
    """
    if not self.cheetah:
        return None

    event = None
    # Each call overwrites the context's partial transcript and the
    # is_final flag with the engine's output for this frame.
    (
        self._speech_ctx.partial_transcript,
        self._speech_ctx.is_final,
    ) = self.cheetah.process(frame)

    if self._speech_ctx.partial_transcript:
        self.logger.info(
            'Partial transcript: %s, is_final: %s',
            self._speech_ctx.partial_transcript,
            self._speech_ctx.is_final,
        )

    if self._speech_ctx.is_final or self._speech_ctx.timed_out:
        # NOTE(review): only ``cheetah.flush()`` is used as the recognized
        # phrase — the partial transcripts stored across previous frames
        # are overwritten, not accumulated. Confirm whether the full
        # phrase should be assembled from the per-frame partials too.
        event = (
            ConversationTimeoutEvent()
            if self._speech_ctx.timed_out
            else SpeechRecognizedEvent(phrase=self.cheetah.flush())
        )

        # Drop back into hotword detection mode when a wake-word engine
        # is configured; otherwise keep listening for speech.
        if self.porcupine:
            self.state = AssistantState.DETECTING_HOTWORD

    return event
# vim:sw=4:ts=4:et: # vim:sw=4:ts=4:et:

View File

@ -0,0 +1,43 @@
from dataclasses import dataclass
from time import time
from typing import Optional
@dataclass
class SpeechDetectionContext:
    """
    Context of a speech detection session.

    Tracks the latest partial transcript, whether the engine reported the
    utterance as final, and the session timestamps used to detect a
    conversation timeout.
    """

    # Latest partial transcript reported by the STT engine.
    partial_transcript: str = ''
    # Whether the STT engine flagged the current utterance as final.
    is_final: bool = False
    # Maximum time (in seconds) to wait for speech after the session
    # starts. None (or 0) disables the timeout.
    timeout: Optional[float] = None
    # Epoch timestamp of when the session started, or None if not running.
    t_start: Optional[float] = None
    # Epoch timestamp of when the session last stopped, or None.
    t_end: Optional[float] = None

    def start(self):
        """Reset the context and mark the beginning of a new session."""
        self.reset()
        self.t_start = time()

    def stop(self):
        """Reset the context and record the time the session ended."""
        self.reset()
        self.t_end = time()

    def reset(self):
        """Clear the transcript state and both timestamps."""
        self.partial_transcript = ''
        self.is_final = False
        self.t_start = None
        self.t_end = None

    @property
    def timed_out(self) -> bool:
        """
        ``True`` iff the session is running, no speech has been detected
        yet, and more than ``timeout`` seconds have elapsed since
        :meth:`start`.
        """
        # Fix: always return a proper bool. The original ``and`` chain
        # could short-circuit and leak a non-boolean value to callers
        # (e.g. None when ``timeout`` or ``t_start`` is unset, or '' when
        # a partial transcript is present).
        return bool(
            not self.partial_transcript
            and not self.is_final
            and self.timeout
            and self.t_start is not None
            and time() - self.t_start > self.timeout
        )
# vim:sw=4:ts=4:et:

View File

@ -26,7 +26,7 @@ class AudioRecorder:
frame_size: int, frame_size: int,
channels: int, channels: int,
dtype: str = 'int16', dtype: str = 'int16',
queue_size: int = 20, queue_size: int = 100,
): ):
self.logger = getLogger(__name__) self.logger = getLogger(__name__)
self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size) self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
@ -48,7 +48,6 @@ class AudioRecorder:
def __exit__(self, *_): def __exit__(self, *_):
self.stop() self.stop()
# self.stream.close()
def _audio_callback(self, indata, *_): def _audio_callback(self, indata, *_):
if self.should_stop(): if self.should_stop():