[WIP] Added speech detection logic over Cheetah.
continuous-integration/drone/push: Build is passing

Fabio Manganiello 2024-04-08 01:54:26 +02:00
parent 01dec0b7a4
commit f021b471aa
Signed by: blacklight
GPG Key ID: D90FBA7F76362774
11 changed files with 263 additions and 74 deletions


@@ -10,6 +10,4 @@ Backends
     platypush/backend/midi.rst
     platypush/backend/nodered.rst
     platypush/backend/redis.rst
-    platypush/backend/stt.picovoice.hotword.rst
-    platypush/backend/stt.picovoice.speech.rst
     platypush/backend/tcp.rst


@@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.backend.stt.picovoice.hotword
-    :members:


@@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.backend.stt.picovoice.speech
-    :members:


@@ -0,0 +1,5 @@
+``picovoice``
+=============
+
+.. automodule:: platypush.plugins.picovoice
+    :members:


@@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.hotword
-    :members:


@@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.speech
-    :members:


@@ -95,6 +95,7 @@ Plugins
     platypush/plugins/nmap.rst
     platypush/plugins/ntfy.rst
     platypush/plugins/otp.rst
+    platypush/plugins/picovoice.rst
     platypush/plugins/pihole.rst
     platypush/plugins/ping.rst
     platypush/plugins/printer.cups.rst
@@ -119,8 +120,6 @@ Plugins
     platypush/plugins/smartthings.rst
     platypush/plugins/sound.rst
     platypush/plugins/ssh.rst
-    platypush/plugins/stt.picovoice.hotword.rst
-    platypush/plugins/stt.picovoice.speech.rst
     platypush/plugins/sun.rst
     platypush/plugins/switch.tplink.rst
     platypush/plugins/switch.wemo.rst


@@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
         keywords: Optional[Sequence[str]] = None,
         keyword_paths: Optional[Sequence[str]] = None,
         keyword_model_path: Optional[str] = None,
+        speech_model_path: Optional[str] = None,
+        endpoint_duration: Optional[float] = 0.5,
+        enable_automatic_punctuation: bool = False,
+        start_conversation_on_hotword: bool = True,
+        audio_queue_size: int = 100,
+        conversation_timeout: Optional[float] = 5.0,
         **kwargs,
     ):
         """
         :param access_key: Your Picovoice access key. You can get it by signing
             up at the `Picovoice console <https://console.picovoice.ai/>`_.
         :param hotword_enabled: Enable the wake-word engine (default: True).
-            .. note:: The wake-word engine requires you to add Porcupine to the
-                products available in your Picovoice account.
+            **Note**: The wake-word engine requires you to add Porcupine to the
+            products available in your Picovoice account.
         :param stt_enabled: Enable the speech-to-text engine (default: True).
-            .. note:: The speech-to-text engine requires you to add Cheetah to
-                the products available in your Picovoice account.
+            **Note**: The speech-to-text engine requires you to add Cheetah to
+            the products available in your Picovoice account.
         :param intent_enabled: Enable the intent recognition engine (default:
             False).
-            .. note:: The intent recognition engine requires you to add Rhino
-                to the products available in your Picovoice account.
+            **Note**: The intent recognition engine requires you to add Rhino
+            to the products available in your Picovoice account.
         :param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
-            google``...). Either ``keywords`` or ``keyword_paths`` must be
-            provided if the wake-word engine is enabled. This list can include
-            any of the default Picovoice keywords (available on the `Picovoice
-            repository
+            google``...). This is required if the wake-word engine is enabled.
+            See the `Picovoice repository
             <https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_
+            for a list of the stock keywords available. If you have a custom
+            model, you can pass its path to the ``keyword_paths`` parameter and
+            its filename (without the path and the platform extension) here.
         :param keyword_paths: List of paths to the keyword files to listen for.
             Custom keyword files can be created using the `Picovoice console
             <https://console.picovoice.ai/ppn>`_ and downloaded from the
@@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
             for its language. Model files are available for all the supported
             languages through the `Picovoice repository
             <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
+        :param speech_model_path: Path to the speech model file. If you are
+            using a language other than English, you can provide the path to the
+            model file for that language. Model files are available for all the
+            supported languages through the `Picovoice repository
+            <https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
+        :param endpoint_duration: If set, the assistant will stop listening when
+            no speech is detected for the specified duration (in seconds) after
+            the end of an utterance.
+        :param enable_automatic_punctuation: Enable automatic punctuation
+            insertion.
+        :param start_conversation_on_hotword: If set to True (default), a speech
+            detection session will be started when the hotword is detected. If
+            set to False, you may want to start the conversation programmatically
+            by calling the :meth:`.start_conversation` method instead, or run any
+            custom hotword detection logic. This can be particularly useful
+            when you want to run the assistant in push-to-talk mode, or when you
+            want different hotwords to trigger conversations with different models
+            or languages.
+        :param audio_queue_size: Maximum number of audio frames to hold in the
+            processing queue. You may want to increase this value if you are
+            running this integration on a slow device and/or the logs report
+            audio frame drops too often. Keep in mind that increasing this value
+            will increase the memory usage of the integration. Also, a higher
+            value may result in higher accuracy at the cost of higher latency.
+        :param conversation_timeout: Maximum time to wait for some speech to be
+            detected after the hotword is detected. If no speech is detected
+            within this time, the conversation will time out and the plugin will
+            go back into hotword detection mode, if that mode is enabled. Default:
+            5 seconds.
         """
         super().__init__(**kwargs)
         self._assistant_args = {
@@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
             'keywords': keywords,
             'keyword_paths': keyword_paths,
             'keyword_model_path': keyword_model_path,
+            'speech_model_path': speech_model_path,
+            'endpoint_duration': endpoint_duration,
+            'enable_automatic_punctuation': enable_automatic_punctuation,
+            'start_conversation_on_hotword': start_conversation_on_hotword,
+            'audio_queue_size': audio_queue_size,
+            'conversation_timeout': conversation_timeout,
         }

     @action
@@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
             try:
                 for event in assistant:
                     if event:
+                        event.args['assistant'] = 'picovoice'
                         get_bus().post(event)
             except KeyboardInterrupt:
                 break
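
To make the new options concrete, here is a minimal sketch of the push-to-talk setup that the ``start_conversation_on_hotword`` docstring describes. It instantiates the plugin directly, purely for illustration (platypush normally wires plugins up from its configuration); the access key is a placeholder, and the ``start_conversation()`` call is assumed from the :meth:`.start_conversation` reference above:

from platypush.plugins.picovoice import PicovoicePlugin

plugin = PicovoicePlugin(
    access_key='YOUR_PICOVOICE_ACCESS_KEY',  # placeholder
    keywords=['computer'],                   # a stock Porcupine keyword
    start_conversation_on_hotword=False,     # don't auto-start on hotword
    conversation_timeout=5.0,
)

# e.g. invoked from a button-press handler (push-to-talk):
plugin.start_conversation()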


@@ -1,6 +1,6 @@
 import logging
 import os
-from threading import Event
+from threading import Event, RLock
 from time import time
 from typing import Any, Dict, Optional, Sequence
@@ -9,9 +9,18 @@ import pvleopard
 import pvporcupine
 import pvrhino

-from platypush.message.event.assistant import HotwordDetectedEvent
+from platypush.context import get_bus
+from platypush.message.event.assistant import (
+    ConversationStartEvent,
+    ConversationEndEvent,
+    ConversationTimeoutEvent,
+    HotwordDetectedEvent,
+    SpeechRecognizedEvent,
+)

+from ._context import SpeechDetectionContext
 from ._recorder import AudioRecorder
+from ._state import AssistantState


 class Assistant:
@@ -30,10 +39,16 @@ class Assistant:
         keyword_paths: Optional[Sequence[str]] = None,
         keyword_model_path: Optional[str] = None,
         frame_expiration: float = 3.0,  # Don't process audio frames older than this
+        speech_model_path: Optional[str] = None,
+        endpoint_duration: Optional[float] = None,
+        enable_automatic_punctuation: bool = False,
+        start_conversation_on_hotword: bool = False,
+        audio_queue_size: int = 100,
+        conversation_timeout: Optional[float] = None,
     ):
+        self.logger = logging.getLogger(__name__)
         self._access_key = access_key
         self._stop_event = stop_event
-        self.logger = logging.getLogger(__name__)
         self.hotword_enabled = hotword_enabled
         self.stt_enabled = stt_enabled
         self.intent_enabled = intent_enabled
@@ -41,9 +56,23 @@ class Assistant:
         self.keyword_paths = None
         self.keyword_model_path = None
         self.frame_expiration = frame_expiration
+        self.speech_model_path = speech_model_path
+        self.endpoint_duration = endpoint_duration
+        self.enable_automatic_punctuation = enable_automatic_punctuation
+        self.start_conversation_on_hotword = start_conversation_on_hotword
+        self.audio_queue_size = audio_queue_size
         self._recorder = None
+        self._state = AssistantState.IDLE
+        self._state_lock = RLock()
+        self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)
+
+        if hotword_enabled:
+            if not keywords:
+                raise ValueError(
+                    'You need to provide a list of keywords if the wake-word engine is enabled'
+                )

         if keyword_paths:
             keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
             missing_paths = [
@@ -74,46 +103,89 @@ class Assistant:
     def wait_stop(self):
         self._stop_event.wait()

-    def _create_porcupine(self):
-        if not self.hotword_enabled:
-            return None
+    @property
+    def state(self) -> AssistantState:
+        with self._state_lock:
+            return self._state

-        args: Dict[str, Any] = {'access_key': self._access_key}
-        if not (self.keywords or self.keyword_paths):
-            raise ValueError(
-                'You need to provide either a list of keywords or a list of '
-                'keyword paths if the wake-word engine is enabled'
-            )
+    @state.setter
+    def state(self, state: AssistantState):
+        with self._state_lock:
+            prev_state = self._state
+            self._state = state
+            new_state = self.state

-        if self.keywords:
-            args['keywords'] = self.keywords
-        if self.keyword_paths:
-            args['keyword_paths'] = self.keyword_paths
-        if self.keyword_model_path:
-            args['model_path'] = self.keyword_model_path
+        if prev_state == new_state:
+            return

-        return pvporcupine.create(**args)
+        if prev_state == AssistantState.DETECTING_SPEECH:
+            self._speech_ctx.stop()
+            self._post_event(ConversationEndEvent())
+        elif new_state == AssistantState.DETECTING_SPEECH:
+            self._speech_ctx.start()
+            self._post_event(ConversationStartEvent())

     @property
     def porcupine(self) -> Optional[pvporcupine.Porcupine]:
         if not self.hotword_enabled:
             return None

         if not self._porcupine:
-            self._porcupine = self._create_porcupine()
+            args: Dict[str, Any] = {'access_key': self._access_key}
+            if self.keywords:
+                args['keywords'] = self.keywords
+            if self.keyword_paths:
+                args['keyword_paths'] = self.keyword_paths
+            if self.keyword_model_path:
+                args['model_path'] = self.keyword_model_path
+
+            self._porcupine = pvporcupine.create(**args)

         return self._porcupine

+    @property
+    def cheetah(self) -> Optional[pvcheetah.Cheetah]:
+        if not self.stt_enabled:
+            return None
+
+        if not self._cheetah:
+            args: Dict[str, Any] = {'access_key': self._access_key}
+            if self.speech_model_path:
+                args['model_path'] = self.speech_model_path
+            if self.endpoint_duration:
+                args['endpoint_duration_sec'] = self.endpoint_duration
+            if self.enable_automatic_punctuation:
+                args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
+
+            self._cheetah = pvcheetah.create(**args)
+
+        return self._cheetah
+
     def __enter__(self):
         if self.should_stop():
             return self

         if self._recorder:
             self.logger.info('A recording stream already exists')
-        elif self.porcupine:
+        elif self.porcupine or self.cheetah:
+            sample_rate = (self.porcupine or self.cheetah).sample_rate  # type: ignore
+            frame_length = (self.porcupine or self.cheetah).frame_length  # type: ignore
+
             self._recorder = AudioRecorder(
                 stop_event=self._stop_event,
-                sample_rate=self.porcupine.sample_rate,
-                frame_size=self.porcupine.frame_length,
+                sample_rate=sample_rate,
+                frame_size=frame_length,
+                queue_size=self.audio_queue_size,
                 channels=1,
             )

             self._recorder.__enter__()

+            if self.porcupine:
+                self.state = AssistantState.DETECTING_HOTWORD
+            else:
+                self.state = AssistantState.DETECTING_SPEECH
+
         return self

     def __exit__(self, *_):
@@ -121,6 +193,8 @@ class Assistant:
             self._recorder.__exit__(*_)
             self._recorder = None

+        self.state = AssistantState.IDLE
+
         if self._cheetah:
             self._cheetah.delete()
             self._cheetah = None
@@ -146,26 +220,74 @@ class Assistant:
             raise StopIteration

         while not (self.should_stop() or has_data):
-            if self.porcupine:  # TODO also check current state
-                data = self._recorder.read()
-                if data is None:
-                    continue
+            data = self._recorder.read()
+            if data is None:
+                continue

-                frame, t = data
-                if time() - t > self.frame_expiration:
-                    self.logger.info(
-                        'Skipping audio frame older than %ss', self.frame_expiration
-                    )
-                    continue  # The audio frame is too old
+            frame, t = data
+            if time() - t > self.frame_expiration:
+                self.logger.info(
+                    'Skipping audio frame older than %ss', self.frame_expiration
+                )
+                continue  # The audio frame is too old

-                keyword_index = self.porcupine.process(frame)
-                if keyword_index is None:
-                    continue  # No keyword detected
+            if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
+                return self._process_hotword(frame)

-                if keyword_index >= 0 and self.keywords:
-                    return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+            if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
+                return self._process_speech(frame)

         raise StopIteration

+    def _post_event(self, event):
+        if event:
+            event.args['assistant'] = 'picovoice'
+            get_bus().post(event)
+
+    def _process_hotword(self, frame):
+        if not self.porcupine:
+            return None
+
+        keyword_index = self.porcupine.process(frame)
+        if keyword_index is None:
+            return None  # No keyword detected
+
+        if keyword_index >= 0 and self.keywords:
+            if self.start_conversation_on_hotword:
+                self.state = AssistantState.DETECTING_SPEECH
+            return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+
+        return None
+
+    def _process_speech(self, frame):
+        if not self.cheetah:
+            return None
+
+        event = None
+        (
+            self._speech_ctx.partial_transcript,
+            self._speech_ctx.is_final,
+        ) = self.cheetah.process(frame)
+
+        if self._speech_ctx.partial_transcript:
+            self.logger.info(
+                'Partial transcript: %s, is_final: %s',
+                self._speech_ctx.partial_transcript,
+                self._speech_ctx.is_final,
+            )
+
+        if self._speech_ctx.is_final or self._speech_ctx.timed_out:
+            event = (
+                ConversationTimeoutEvent()
+                if self._speech_ctx.timed_out
+                else SpeechRecognizedEvent(phrase=self.cheetah.flush())
+            )
+
+            if self.porcupine:
+                self.state = AssistantState.DETECTING_HOTWORD
+
+        return event


 # vim:sw=4:ts=4:et:
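
The ``AssistantState`` values used above come from the new ``._state`` module, which is not part of this changeset. Judging only from the states referenced in the code, a plausible sketch of it (an assumption, not the committed file) is:

from enum import Enum


class AssistantState(Enum):
    """Possible assistant states (inferred from the usages in the diff above)."""

    IDLE = 'idle'
    DETECTING_HOTWORD = 'detecting_hotword'
    DETECTING_SPEECH = 'detecting_speech'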

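For reference, the Cheetah calls that ``_process_speech`` relies on (``create``, ``process``, ``flush``, ``delete``, all visible in the diff above) implement a streaming transcription loop. Condensed to a standalone sketch, with the access key and the audio source as placeholders:

import pvcheetah

cheetah = pvcheetah.create(access_key='YOUR_PICOVOICE_ACCESS_KEY')  # placeholder
try:
    # audio_frames is a placeholder: an iterable of 16-bit PCM frames,
    # each cheetah.frame_length samples long at cheetah.sample_rate Hz.
    for frame in audio_frames:
        partial_transcript, is_final = cheetah.process(frame)
        if is_final:
            # flush() returns the remaining transcript for the utterance
            print(cheetah.flush())
            break
finally:
    cheetah.delete()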

@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+from time import time
+from typing import Optional
+
+
+@dataclass
+class SpeechDetectionContext:
+    """
+    Context of the speech detection process.
+    """
+
+    partial_transcript: str = ''
+    is_final: bool = False
+    timeout: Optional[float] = None
+    t_start: Optional[float] = None
+    t_end: Optional[float] = None
+
+    def start(self):
+        self.reset()
+        self.t_start = time()
+
+    def stop(self):
+        self.reset()
+        self.t_end = time()
+
+    def reset(self):
+        self.partial_transcript = ''
+        self.is_final = False
+        self.t_start = None
+        self.t_end = None
+
+    @property
+    def timed_out(self):
+        return (
+            not self.partial_transcript
+            and not self.is_final
+            and self.timeout
+            and self.t_start
+            and time() - self.t_start > self.timeout
+        )
+
+
+# vim:sw=4:ts=4:et:
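
The ``timed_out`` property above only fires while a started context has produced no transcript at all. A small illustration of that behavior (assuming the module is importable as ``platypush.plugins.picovoice._context``, per the relative import in the assistant code):

from time import sleep

from platypush.plugins.picovoice._context import SpeechDetectionContext

ctx = SpeechDetectionContext(timeout=0.1)
ctx.start()
assert not ctx.timed_out      # still within the timeout window
sleep(0.2)
assert ctx.timed_out          # no speech detected in time
ctx.partial_transcript = 'hello'
assert not ctx.timed_out      # once speech arrives, it is no longer a timeout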


@@ -26,7 +26,7 @@ class AudioRecorder:
         frame_size: int,
         channels: int,
         dtype: str = 'int16',
-        queue_size: int = 20,
+        queue_size: int = 100,
     ):
         self.logger = getLogger(__name__)
         self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
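
The bumped default matches the new ``audio_queue_size`` plugin option. The pattern at play is a bounded producer/consumer queue: the audio callback pushes timestamped frames, and frames get dropped when the assistant loop cannot drain the queue fast enough. A generic sketch of that pattern (an illustration, not the committed ``AudioRecorder`` code):

from queue import Full, Queue
from time import time

# Frames wait here until the assistant loop consumes them; a larger
# maxsize absorbs longer consumer stalls at the cost of more memory.
audio_queue: Queue = Queue(maxsize=100)

def on_audio_frame(frame):
    try:
        # Timestamp each frame so stale ones can be skipped downstream
        # (cf. frame_expiration in the assistant loop above).
        audio_queue.put_nowait((frame, time()))
    except Full:
        pass  # consumer too slow: drop the frame rather than block the callback
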
@@ -48,7 +48,6 @@ class AudioRecorder:
     def __exit__(self, *_):
         self.stop()
-        # self.stream.close()

     def _audio_callback(self, indata, *_):
         if self.should_stop():