forked from platypush/platypush
[WIP] Added speech detection logic over Cheetah.
This commit is contained in:
parent
a9498ea191
commit
f7517eb321
11 changed files with 263 additions and 74 deletions
|
@ -10,6 +10,4 @@ Backends
|
|||
platypush/backend/midi.rst
|
||||
platypush/backend/nodered.rst
|
||||
platypush/backend/redis.rst
|
||||
platypush/backend/stt.picovoice.hotword.rst
|
||||
platypush/backend/stt.picovoice.speech.rst
|
||||
platypush/backend/tcp.rst
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.hotword``
|
||||
===========================================
|
||||
|
||||
.. automodule:: platypush.backend.stt.picovoice.hotword
|
||||
:members:
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.speech``
|
||||
==========================================
|
||||
|
||||
.. automodule:: platypush.backend.stt.picovoice.speech
|
||||
:members:
|
5
docs/source/platypush/plugins/picovoice.rst
Normal file
5
docs/source/platypush/plugins/picovoice.rst
Normal file
|
@ -0,0 +1,5 @@
|
|||
``picovoice``
|
||||
=============
|
||||
|
||||
.. automodule:: platypush.plugins.picovoice
|
||||
:members:
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.hotword``
|
||||
===========================================
|
||||
|
||||
.. automodule:: platypush.plugins.stt.picovoice.hotword
|
||||
:members:
|
|
@ -1,5 +0,0 @@
|
|||
``stt.picovoice.speech``
|
||||
==========================================
|
||||
|
||||
.. automodule:: platypush.plugins.stt.picovoice.speech
|
||||
:members:
|
|
@ -95,6 +95,7 @@ Plugins
|
|||
platypush/plugins/nmap.rst
|
||||
platypush/plugins/ntfy.rst
|
||||
platypush/plugins/otp.rst
|
||||
platypush/plugins/picovoice.rst
|
||||
platypush/plugins/pihole.rst
|
||||
platypush/plugins/ping.rst
|
||||
platypush/plugins/printer.cups.rst
|
||||
|
@ -119,8 +120,6 @@ Plugins
|
|||
platypush/plugins/smartthings.rst
|
||||
platypush/plugins/sound.rst
|
||||
platypush/plugins/ssh.rst
|
||||
platypush/plugins/stt.picovoice.hotword.rst
|
||||
platypush/plugins/stt.picovoice.speech.rst
|
||||
platypush/plugins/sun.rst
|
||||
platypush/plugins/switch.tplink.rst
|
||||
platypush/plugins/switch.wemo.rst
|
||||
|
|
|
@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
|||
keywords: Optional[Sequence[str]] = None,
|
||||
keyword_paths: Optional[Sequence[str]] = None,
|
||||
keyword_model_path: Optional[str] = None,
|
||||
speech_model_path: Optional[str] = None,
|
||||
endpoint_duration: Optional[float] = 0.5,
|
||||
enable_automatic_punctuation: bool = False,
|
||||
start_conversation_on_hotword: bool = True,
|
||||
audio_queue_size: int = 100,
|
||||
conversation_timeout: Optional[float] = 5.0,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
:param access_key: Your Picovoice access key. You can get it by signing
|
||||
up at the `Picovoice console <https://console.picovoice.ai/>`_.
|
||||
:param hotword_enabled: Enable the wake-word engine (default: True).
|
||||
.. note:: The wake-word engine requires you to add Porcupine to the
|
||||
products available in your Picovoice account.
|
||||
**Note**: The wake-word engine requires you to add Porcupine to the
|
||||
products available in your Picovoice account.
|
||||
:param stt_enabled: Enable the speech-to-text engine (default: True).
|
||||
.. note:: The speech-to-text engine requires you to add Cheetah to
|
||||
the products available in your Picovoice account.
|
||||
**Note**: The speech-to-text engine requires you to add Cheetah to
|
||||
the products available in your Picovoice account.
|
||||
:param intent_enabled: Enable the intent recognition engine (default:
|
||||
False).
|
||||
.. note:: The intent recognition engine requires you to add Rhino
|
||||
to the products available in your Picovoice account.
|
||||
**Note**: The intent recognition engine requires you to add Rhino
|
||||
to the products available in your Picovoice account.
|
||||
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
|
||||
google``...). Either ``keywords`` or ``keyword_paths`` must be
|
||||
provided if the wake-word engine is enabled. This list can include
|
||||
any of the default Picovoice keywords (available on the `Picovoice
|
||||
repository
|
||||
google``...). This is required if the wake-word engine is enabled.
|
||||
See the `Picovoice repository
|
||||
<https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
|
||||
for a list of the stock keywords available. If you have a custom
|
||||
model, you can pass its path to the ``keyword_paths`` parameter and
|
||||
its filename (without the path and the platform extension) here.
|
||||
:param keyword_paths: List of paths to the keyword files to listen for.
|
||||
Custom keyword files can be created using the `Picovoice console
|
||||
<https://console.picovoice.ai/ppn>`_ and downloaded from the
|
||||
|
@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
|||
for its language. Model files are available for all the supported
|
||||
languages through the `Picovoice repository
|
||||
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
|
||||
:param speech_model_path: Path to the speech model file. If you are
|
||||
using a language other than English, you can provide the path to the
|
||||
model file for that language. Model files are available for all the
|
||||
supported languages through the `Picovoice repository
|
||||
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
|
||||
:param endpoint_duration: If set, the assistant will stop listening when
|
||||
no speech is detected for the specified duration (in seconds) after
|
||||
the end of an utterance.
|
||||
:param enable_automatic_punctuation: Enable automatic punctuation
|
||||
insertion.
|
||||
:param start_conversation_on_hotword: If set to True (default), a speech
|
||||
detection session will be started when the hotword is detected. If
|
||||
set to False, you may want to start the conversation programmatically
|
||||
by calling the :meth:`.start_conversation` method instead, or run any
|
||||
custom hotword detection logic. This can be particularly useful
|
||||
when you want to run the assistant in a push-to-talk mode, or when you
|
||||
want different hotwords to trigger conversations with different models
|
||||
or languages.
|
||||
:param audio_queue_size: Maximum number of audio frames to hold in the
|
||||
processing queue. You may want to increase this value if you are
|
||||
running this integration on a slow device and/or the logs report
|
||||
audio frame drops too often. Keep in mind that increasing this value
|
||||
will increase the memory usage of the integration. Also, a higher
|
||||
value may result in higher accuracy at the cost of higher latency.
|
||||
:param conversation_timeout: Maximum time to wait for some speech to be
|
||||
detected after the hotword is detected. If no speech is detected
|
||||
within this time, the conversation will time out and the plugin will
|
||||
go back into hotword detection mode, if the mode is enabled. Default:
|
||||
5 seconds.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self._assistant_args = {
|
||||
|
@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
|||
'keywords': keywords,
|
||||
'keyword_paths': keyword_paths,
|
||||
'keyword_model_path': keyword_model_path,
|
||||
'speech_model_path': speech_model_path,
|
||||
'endpoint_duration': endpoint_duration,
|
||||
'enable_automatic_punctuation': enable_automatic_punctuation,
|
||||
'start_conversation_on_hotword': start_conversation_on_hotword,
|
||||
'audio_queue_size': audio_queue_size,
|
||||
'conversation_timeout': conversation_timeout,
|
||||
}
|
||||
|
||||
@action
|
||||
|
@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
|||
try:
|
||||
for event in assistant:
|
||||
if event:
|
||||
event.args['assistant'] = 'picovoice'
|
||||
get_bus().post(event)
|
||||
except KeyboardInterrupt:
|
||||
break
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import logging
|
||||
import os
|
||||
from threading import Event
|
||||
from threading import Event, RLock
|
||||
from time import time
|
||||
from typing import Any, Dict, Optional, Sequence
|
||||
|
||||
|
@ -9,9 +9,18 @@ import pvleopard
|
|||
import pvporcupine
|
||||
import pvrhino
|
||||
|
||||
from platypush.message.event.assistant import HotwordDetectedEvent
|
||||
from platypush.context import get_bus
|
||||
from platypush.message.event.assistant import (
|
||||
ConversationStartEvent,
|
||||
ConversationEndEvent,
|
||||
ConversationTimeoutEvent,
|
||||
HotwordDetectedEvent,
|
||||
SpeechRecognizedEvent,
|
||||
)
|
||||
|
||||
from ._context import SpeechDetectionContext
|
||||
from ._recorder import AudioRecorder
|
||||
from ._state import AssistantState
|
||||
|
||||
|
||||
class Assistant:
|
||||
|
@ -30,10 +39,16 @@ class Assistant:
|
|||
keyword_paths: Optional[Sequence[str]] = None,
|
||||
keyword_model_path: Optional[str] = None,
|
||||
frame_expiration: float = 3.0, # Don't process audio frames older than this
|
||||
speech_model_path: Optional[str] = None,
|
||||
endpoint_duration: Optional[float] = None,
|
||||
enable_automatic_punctuation: bool = False,
|
||||
start_conversation_on_hotword: bool = False,
|
||||
audio_queue_size: int = 100,
|
||||
conversation_timeout: Optional[float] = None,
|
||||
):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._access_key = access_key
|
||||
self._stop_event = stop_event
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.hotword_enabled = hotword_enabled
|
||||
self.stt_enabled = stt_enabled
|
||||
self.intent_enabled = intent_enabled
|
||||
|
@ -41,9 +56,23 @@ class Assistant:
|
|||
self.keyword_paths = None
|
||||
self.keyword_model_path = None
|
||||
self.frame_expiration = frame_expiration
|
||||
self.speech_model_path = speech_model_path
|
||||
self.endpoint_duration = endpoint_duration
|
||||
self.enable_automatic_punctuation = enable_automatic_punctuation
|
||||
self.start_conversation_on_hotword = start_conversation_on_hotword
|
||||
self.audio_queue_size = audio_queue_size
|
||||
|
||||
self._recorder = None
|
||||
self._state = AssistantState.IDLE
|
||||
self._state_lock = RLock()
|
||||
self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)
|
||||
|
||||
if hotword_enabled:
|
||||
if not keywords:
|
||||
raise ValueError(
|
||||
'You need to provide a list of keywords if the wake-word engine is enabled'
|
||||
)
|
||||
|
||||
if keyword_paths:
|
||||
keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
|
||||
missing_paths = [
|
||||
|
@ -74,46 +103,89 @@ class Assistant:
|
|||
def wait_stop(self):
|
||||
self._stop_event.wait()
|
||||
|
||||
def _create_porcupine(self):
|
||||
if not self.hotword_enabled:
|
||||
return None
|
||||
@property
|
||||
def state(self) -> AssistantState:
|
||||
with self._state_lock:
|
||||
return self._state
|
||||
|
||||
args: Dict[str, Any] = {'access_key': self._access_key}
|
||||
if not (self.keywords or self.keyword_paths):
|
||||
raise ValueError(
|
||||
'You need to provide either a list of keywords or a list of '
|
||||
'keyword paths if the wake-word engine is enabled'
|
||||
)
|
||||
@state.setter
|
||||
def state(self, state: AssistantState):
|
||||
with self._state_lock:
|
||||
prev_state = self._state
|
||||
self._state = state
|
||||
new_state = self.state
|
||||
|
||||
if self.keywords:
|
||||
args['keywords'] = self.keywords
|
||||
if self.keyword_paths:
|
||||
args['keyword_paths'] = self.keyword_paths
|
||||
if self.keyword_model_path:
|
||||
args['model_path'] = self.keyword_model_path
|
||||
if prev_state == new_state:
|
||||
return
|
||||
|
||||
return pvporcupine.create(**args)
|
||||
if prev_state == AssistantState.DETECTING_SPEECH:
|
||||
self._speech_ctx.stop()
|
||||
self._post_event(ConversationEndEvent())
|
||||
elif new_state == AssistantState.DETECTING_SPEECH:
|
||||
self._speech_ctx.start()
|
||||
self._post_event(ConversationStartEvent())
|
||||
|
||||
@property
|
||||
def porcupine(self) -> Optional[pvporcupine.Porcupine]:
|
||||
if not self.hotword_enabled:
|
||||
return None
|
||||
|
||||
if not self._porcupine:
|
||||
self._porcupine = self._create_porcupine()
|
||||
args: Dict[str, Any] = {'access_key': self._access_key}
|
||||
if self.keywords:
|
||||
args['keywords'] = self.keywords
|
||||
if self.keyword_paths:
|
||||
args['keyword_paths'] = self.keyword_paths
|
||||
if self.keyword_model_path:
|
||||
args['model_path'] = self.keyword_model_path
|
||||
|
||||
self._porcupine = pvporcupine.create(**args)
|
||||
|
||||
return self._porcupine
|
||||
|
||||
@property
def cheetah(self) -> Optional[pvcheetah.Cheetah]:
    """
    Lazily created Cheetah speech-to-text engine.

    :return: ``None`` if speech-to-text is disabled, otherwise the cached
        engine instance (created on first access from the configured
        access key, model path, endpoint duration and punctuation flag).
    """
    if not self.stt_enabled:
        return None

    if not self._cheetah:
        args: Dict[str, Any] = {'access_key': self._access_key}
        if self.speech_model_path:
            args['model_path'] = self.speech_model_path
        # Compare against None rather than relying on truthiness: an
        # explicit 0 is a meaningful value (per the Cheetah documentation
        # it disables endpoint detection) and must reach the engine
        # instead of being silently replaced by the library default.
        if self.endpoint_duration is not None:
            args['endpoint_duration_sec'] = self.endpoint_duration
        if self.enable_automatic_punctuation:
            args['enable_automatic_punctuation'] = self.enable_automatic_punctuation

        self._cheetah = pvcheetah.create(**args)

    return self._cheetah
|
||||
|
||||
def __enter__(self):
|
||||
if self.should_stop():
|
||||
return self
|
||||
|
||||
if self._recorder:
|
||||
self.logger.info('A recording stream already exists')
|
||||
elif self.porcupine:
|
||||
elif self.porcupine or self.cheetah:
|
||||
sample_rate = (self.porcupine or self.cheetah).sample_rate # type: ignore
|
||||
frame_length = (self.porcupine or self.cheetah).frame_length # type: ignore
|
||||
|
||||
self._recorder = AudioRecorder(
|
||||
stop_event=self._stop_event,
|
||||
sample_rate=self.porcupine.sample_rate,
|
||||
frame_size=self.porcupine.frame_length,
|
||||
sample_rate=sample_rate,
|
||||
frame_size=frame_length,
|
||||
queue_size=self.audio_queue_size,
|
||||
channels=1,
|
||||
)
|
||||
|
||||
self._recorder.__enter__()
|
||||
|
||||
if self.porcupine:
|
||||
self.state = AssistantState.DETECTING_HOTWORD
|
||||
else:
|
||||
self.state = AssistantState.DETECTING_SPEECH
|
||||
|
||||
return self
|
||||
|
||||
def __exit__(self, *_):
|
||||
|
@ -121,6 +193,8 @@ class Assistant:
|
|||
self._recorder.__exit__(*_)
|
||||
self._recorder = None
|
||||
|
||||
self.state = AssistantState.IDLE
|
||||
|
||||
if self._cheetah:
|
||||
self._cheetah.delete()
|
||||
self._cheetah = None
|
||||
|
@ -146,26 +220,74 @@ class Assistant:
|
|||
raise StopIteration
|
||||
|
||||
while not (self.should_stop() or has_data):
|
||||
if self.porcupine: # TODO also check current state
|
||||
data = self._recorder.read()
|
||||
if data is None:
|
||||
continue
|
||||
data = self._recorder.read()
|
||||
if data is None:
|
||||
continue
|
||||
|
||||
frame, t = data
|
||||
if time() - t > self.frame_expiration:
|
||||
self.logger.info(
|
||||
'Skipping audio frame older than %ss', self.frame_expiration
|
||||
)
|
||||
continue # The audio frame is too old
|
||||
frame, t = data
|
||||
if time() - t > self.frame_expiration:
|
||||
self.logger.info(
|
||||
'Skipping audio frame older than %ss', self.frame_expiration
|
||||
)
|
||||
continue # The audio frame is too old
|
||||
|
||||
keyword_index = self.porcupine.process(frame)
|
||||
if keyword_index is None:
|
||||
continue # No keyword detected
|
||||
if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
|
||||
return self._process_hotword(frame)
|
||||
|
||||
if keyword_index >= 0 and self.keywords:
|
||||
return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
|
||||
if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
|
||||
return self._process_speech(frame)
|
||||
|
||||
raise StopIteration
|
||||
|
||||
def _post_event(self, event):
    """
    Tag an event as originating from this assistant and post it on the bus.

    :param event: Event to post. Falsy values are ignored.
    """
    if not event:
        return

    event.args['assistant'] = 'picovoice'
    get_bus().post(event)
|
||||
|
||||
def _process_hotword(self, frame):
    """
    Feed an audio frame to the hotword engine.

    :param frame: Audio frame to process.
    :return: A ``HotwordDetectedEvent`` carrying the matched keyword, or
        ``None`` if the engine is unavailable, no keyword list is
        configured, or the engine reported no match (``None`` or a
        negative index).
    """
    engine = self.porcupine
    if not engine:
        return None

    idx = engine.process(frame)
    if idx is None or idx < 0 or not self.keywords:
        return None  # No keyword detected

    # Optionally hand over to speech detection as soon as the hotword fires
    if self.start_conversation_on_hotword:
        self.state = AssistantState.DETECTING_SPEECH

    return HotwordDetectedEvent(hotword=self.keywords[idx])
|
||||
|
||||
def _process_speech(self, frame):
    """
    Feed an audio frame to the speech-to-text engine.

    The text returned by each ``process`` call is appended to the speech
    context instead of replacing it, so that the recognized phrase contains
    everything transcribed during the session (not only the remainder
    returned by ``flush``) and so that the "no speech detected" timeout
    cannot fire once some speech has already been transcribed.

    :param frame: Audio frame to process.
    :return: A ``SpeechRecognizedEvent`` when the engine reports the end of
        the utterance, a ``ConversationTimeoutEvent`` when the context timed
        out without any transcript, ``None`` otherwise (including when the
        speech engine is not available).
    """
    if not self.cheetah:
        return None

    partial_transcript, is_final = self.cheetah.process(frame)
    self._speech_ctx.is_final = is_final

    if partial_transcript:
        # Accumulate: process() only returns the newly transcribed text
        self._speech_ctx.partial_transcript += partial_transcript
        self.logger.info(
            'Partial transcript: %s, is_final: %s',
            self._speech_ctx.partial_transcript,
            self._speech_ctx.is_final,
        )

    # Evaluate the time-dependent property once, so the decision to emit an
    # event and the choice of the event type cannot disagree.
    timed_out = self._speech_ctx.timed_out
    if not (self._speech_ctx.is_final or timed_out):
        return None

    # Always flush, so no buffered audio/text leaks into the next session
    remainder = self.cheetah.flush() or ''
    if timed_out:
        event = ConversationTimeoutEvent()
    else:
        event = SpeechRecognizedEvent(
            phrase=self._speech_ctx.partial_transcript + remainder
        )

    if self.porcupine:
        # The state change also resets the speech context
        self.state = AssistantState.DETECTING_HOTWORD
    else:
        # No hotword engine: remain in speech detection mode, but restart
        # the context so the next utterance begins with an empty transcript
        # and a fresh timeout window.
        self._speech_ctx.start()

    return event
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
||||
|
|
43
platypush/plugins/picovoice/_context.py
Normal file
43
platypush/plugins/picovoice/_context.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
from dataclasses import dataclass
|
||||
from time import time
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class SpeechDetectionContext:
    """
    Context of the speech detection process.

    Holds the transcript state of a speech detection session together with
    the timestamps needed to decide whether the session timed out before any
    speech was transcribed.
    """

    # Transcript text recorded for the current session ('' if none yet)
    partial_transcript: str = ''
    # Whether the transcript has been flagged as final
    is_final: bool = False
    # Seconds to wait for a transcript before timing out (falsy: no timeout)
    timeout: Optional[float] = None
    # time() timestamp of the last start() call, None if not started
    t_start: Optional[float] = None
    # time() timestamp of the last stop() call, None if not stopped
    t_end: Optional[float] = None

    def start(self) -> None:
        """Clear the context and record the session start time."""
        self.reset()
        self.t_start = time()

    def stop(self) -> None:
        """Clear the context and record the session end time."""
        self.reset()
        self.t_end = time()

    def reset(self) -> None:
        """Clear the transcript state and both timestamps."""
        self.partial_transcript = ''
        self.is_final = False
        self.t_start = None
        self.t_end = None

    @property
    def timed_out(self) -> bool:
        """
        ``True`` if a timeout is configured, the session has been started,
        no transcript has been recorded, it is not final, and more than
        ``timeout`` seconds have elapsed since the start.
        """
        # Coerce to bool: the bare ``and`` chain would otherwise return
        # whichever falsy operand short-circuited it (None, 0, ...).
        return bool(
            not self.partial_transcript
            and not self.is_final
            and self.timeout
            and self.t_start
            and time() - self.t_start > self.timeout
        )
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
|
@ -26,7 +26,7 @@ class AudioRecorder:
|
|||
frame_size: int,
|
||||
channels: int,
|
||||
dtype: str = 'int16',
|
||||
queue_size: int = 20,
|
||||
queue_size: int = 100,
|
||||
):
|
||||
self.logger = getLogger(__name__)
|
||||
self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
|
||||
|
@ -48,7 +48,6 @@ class AudioRecorder:
|
|||
|
||||
def __exit__(self, *_):
|
||||
self.stop()
|
||||
# self.stream.close()
|
||||
|
||||
def _audio_callback(self, indata, *_):
|
||||
if self.should_stop():
|
||||
|
|
Loading…
Reference in a new issue