diff --git a/docs/source/backends.rst b/docs/source/backends.rst
index 2a43daeec..4171e8825 100644
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@@ -10,6 +10,4 @@ Backends
platypush/backend/midi.rst
platypush/backend/nodered.rst
platypush/backend/redis.rst
- platypush/backend/stt.picovoice.hotword.rst
- platypush/backend/stt.picovoice.speech.rst
platypush/backend/tcp.rst
diff --git a/docs/source/platypush/backend/stt.picovoice.hotword.rst b/docs/source/platypush/backend/stt.picovoice.hotword.rst
deleted file mode 100644
index 858386889..000000000
--- a/docs/source/platypush/backend/stt.picovoice.hotword.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.backend.stt.picovoice.hotword
- :members:
diff --git a/docs/source/platypush/backend/stt.picovoice.speech.rst b/docs/source/platypush/backend/stt.picovoice.speech.rst
deleted file mode 100644
index 8b5809662..000000000
--- a/docs/source/platypush/backend/stt.picovoice.speech.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.backend.stt.picovoice.speech
- :members:
diff --git a/docs/source/platypush/plugins/picovoice.rst b/docs/source/platypush/plugins/picovoice.rst
new file mode 100644
index 000000000..f1f8acded
--- /dev/null
+++ b/docs/source/platypush/plugins/picovoice.rst
@@ -0,0 +1,5 @@
+``picovoice``
+=============
+
+.. automodule:: platypush.plugins.picovoice
+ :members:
diff --git a/docs/source/platypush/plugins/stt.picovoice.hotword.rst b/docs/source/platypush/plugins/stt.picovoice.hotword.rst
deleted file mode 100644
index 11eb37dd5..000000000
--- a/docs/source/platypush/plugins/stt.picovoice.hotword.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.hotword``
-===========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.hotword
- :members:
diff --git a/docs/source/platypush/plugins/stt.picovoice.speech.rst b/docs/source/platypush/plugins/stt.picovoice.speech.rst
deleted file mode 100644
index 890c904cc..000000000
--- a/docs/source/platypush/plugins/stt.picovoice.speech.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``stt.picovoice.speech``
-==========================================
-
-.. automodule:: platypush.plugins.stt.picovoice.speech
- :members:
diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst
index 5e583f5e5..783cb841e 100644
--- a/docs/source/plugins.rst
+++ b/docs/source/plugins.rst
@@ -95,6 +95,7 @@ Plugins
platypush/plugins/nmap.rst
platypush/plugins/ntfy.rst
platypush/plugins/otp.rst
+ platypush/plugins/picovoice.rst
platypush/plugins/pihole.rst
platypush/plugins/ping.rst
platypush/plugins/printer.cups.rst
@@ -119,8 +120,6 @@ Plugins
platypush/plugins/smartthings.rst
platypush/plugins/sound.rst
platypush/plugins/ssh.rst
- platypush/plugins/stt.picovoice.hotword.rst
- platypush/plugins/stt.picovoice.speech.rst
platypush/plugins/sun.rst
platypush/plugins/switch.tplink.rst
platypush/plugins/switch.wemo.rst
diff --git a/platypush/plugins/picovoice/__init__.py b/platypush/plugins/picovoice/__init__.py
index a861f66bd..c1e55570f 100644
--- a/platypush/plugins/picovoice/__init__.py
+++ b/platypush/plugins/picovoice/__init__.py
@@ -51,27 +51,34 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None,
+ speech_model_path: Optional[str] = None,
+ endpoint_duration: Optional[float] = 0.5,
+ enable_automatic_punctuation: bool = False,
+ start_conversation_on_hotword: bool = True,
+ audio_queue_size: int = 100,
+ conversation_timeout: Optional[float] = 5.0,
**kwargs,
):
"""
:param access_key: Your Picovoice access key. You can get it by signing
up at the `Picovoice console `.
:param hotword_enabled: Enable the wake-word engine (default: True).
- .. note:: The wake-word engine requires you to add Porcupine to the
- products available in your Picovoice account.
+ **Note**: The wake-word engine requires you to add Porcupine to the
+ products available in your Picovoice account.
:param stt_enabled: Enable the speech-to-text engine (default: True).
- .. note:: The speech-to-text engine requires you to add Cheetah to
- the products available in your Picovoice account.
+ **Note**: The speech-to-text engine requires you to add Cheetah to
+ the products available in your Picovoice account.
:param intent_enabled: Enable the intent recognition engine (default:
False).
- .. note:: The intent recognition engine requires you to add Rhino
- to the products available in your Picovoice account.
+ **Note**: The intent recognition engine requires you to add Rhino
+ to the products available in your Picovoice account.
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
- google``...). Either ``keywords`` or ``keyword_paths`` must be
- provided if the wake-word engine is enabled. This list can include
- any of the default Picovoice keywords (available on the `Picovoice
- repository
+ google``...). This is required if the wake-word engine is enabled.
+ See the `Picovoice repository
`_).
+ for a list of the stock keywords available. If you have a custom
+ model, you can pass its path to the ``keyword_paths`` parameter and
+ its filename (without the path and the platform extension) here.
:param keyword_paths: List of paths to the keyword files to listen for.
Custom keyword files can be created using the `Picovoice console
`_ and downloaded from the
@@ -81,6 +88,35 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
for its language. Model files are available for all the supported
languages through the `Picovoice repository
`_.
+ :param speech_model_path: Path to the speech model file. If you are
+ using a language other than English, you can provide the path to the
+ model file for that language. Model files are available for all the
+ supported languages through the `Picovoice repository
+ `_.
+ :param endpoint_duration: If set, the assistant will stop listening when
+ no speech is detected for the specified duration (in seconds) after
+ the end of an utterance.
+ :param enable_automatic_punctuation: Enable automatic punctuation
+ insertion.
+ :param start_conversation_on_hotword: If set to True (default), a speech
+ detection session will be started when the hotword is detected. If
+ set to False, you may want to start the conversation programmatically
+ by calling the :meth:`.start_conversation` method instead, or run any
+            custom hotword detection logic. This can be particularly useful
+ when you want to run the assistant in a push-to-talk mode, or when you
+ want different hotwords to trigger conversations with different models
+ or languages.
+ :param audio_queue_size: Maximum number of audio frames to hold in the
+ processing queue. You may want to increase this value if you are
+ running this integration on a slow device and/or the logs report
+ audio frame drops too often. Keep in mind that increasing this value
+ will increase the memory usage of the integration. Also, a higher
+ value may result in higher accuracy at the cost of higher latency.
+ :param conversation_timeout: Maximum time to wait for some speech to be
+ detected after the hotword is detected. If no speech is detected
+ within this time, the conversation will time out and the plugin will
+ go back into hotword detection mode, if the mode is enabled. Default:
+ 5 seconds.
"""
super().__init__(**kwargs)
self._assistant_args = {
@@ -92,6 +128,12 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
'keywords': keywords,
'keyword_paths': keyword_paths,
'keyword_model_path': keyword_model_path,
+ 'speech_model_path': speech_model_path,
+ 'endpoint_duration': endpoint_duration,
+ 'enable_automatic_punctuation': enable_automatic_punctuation,
+ 'start_conversation_on_hotword': start_conversation_on_hotword,
+ 'audio_queue_size': audio_queue_size,
+ 'conversation_timeout': conversation_timeout,
}
@action
@@ -151,6 +193,7 @@ class PicovoicePlugin(AssistantPlugin, RunnablePlugin):
try:
for event in assistant:
if event:
+ event.args['assistant'] = 'picovoice'
get_bus().post(event)
except KeyboardInterrupt:
break
diff --git a/platypush/plugins/picovoice/_assistant.py b/platypush/plugins/picovoice/_assistant.py
index 5181aa572..27a727129 100644
--- a/platypush/plugins/picovoice/_assistant.py
+++ b/platypush/plugins/picovoice/_assistant.py
@@ -1,6 +1,6 @@
import logging
import os
-from threading import Event
+from threading import Event, RLock
from time import time
from typing import Any, Dict, Optional, Sequence
@@ -9,9 +9,18 @@ import pvleopard
import pvporcupine
import pvrhino
-from platypush.message.event.assistant import HotwordDetectedEvent
+from platypush.context import get_bus
+from platypush.message.event.assistant import (
+ ConversationStartEvent,
+ ConversationEndEvent,
+ ConversationTimeoutEvent,
+ HotwordDetectedEvent,
+ SpeechRecognizedEvent,
+)
+from ._context import SpeechDetectionContext
from ._recorder import AudioRecorder
+from ._state import AssistantState
class Assistant:
@@ -30,10 +39,16 @@ class Assistant:
keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None,
frame_expiration: float = 3.0, # Don't process audio frames older than this
+ speech_model_path: Optional[str] = None,
+ endpoint_duration: Optional[float] = None,
+ enable_automatic_punctuation: bool = False,
+ start_conversation_on_hotword: bool = False,
+ audio_queue_size: int = 100,
+ conversation_timeout: Optional[float] = None,
):
- self.logger = logging.getLogger(__name__)
self._access_key = access_key
self._stop_event = stop_event
+ self.logger = logging.getLogger(__name__)
self.hotword_enabled = hotword_enabled
self.stt_enabled = stt_enabled
self.intent_enabled = intent_enabled
@@ -41,9 +56,23 @@ class Assistant:
self.keyword_paths = None
self.keyword_model_path = None
self.frame_expiration = frame_expiration
+ self.speech_model_path = speech_model_path
+ self.endpoint_duration = endpoint_duration
+ self.enable_automatic_punctuation = enable_automatic_punctuation
+ self.start_conversation_on_hotword = start_conversation_on_hotword
+ self.audio_queue_size = audio_queue_size
+
self._recorder = None
+ self._state = AssistantState.IDLE
+ self._state_lock = RLock()
+ self._speech_ctx = SpeechDetectionContext(timeout=conversation_timeout)
if hotword_enabled:
+ if not keywords:
+ raise ValueError(
+ 'You need to provide a list of keywords if the wake-word engine is enabled'
+ )
+
if keyword_paths:
keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
missing_paths = [
@@ -74,46 +103,89 @@ class Assistant:
def wait_stop(self):
self._stop_event.wait()
- def _create_porcupine(self):
- if not self.hotword_enabled:
- return None
+ @property
+ def state(self) -> AssistantState:
+ with self._state_lock:
+ return self._state
- args: Dict[str, Any] = {'access_key': self._access_key}
- if not (self.keywords or self.keyword_paths):
- raise ValueError(
- 'You need to provide either a list of keywords or a list of '
- 'keyword paths if the wake-word engine is enabled'
- )
+ @state.setter
+ def state(self, state: AssistantState):
+ with self._state_lock:
+ prev_state = self._state
+ self._state = state
+ new_state = self.state
- if self.keywords:
- args['keywords'] = self.keywords
- if self.keyword_paths:
- args['keyword_paths'] = self.keyword_paths
- if self.keyword_model_path:
- args['model_path'] = self.keyword_model_path
+ if prev_state == new_state:
+ return
- return pvporcupine.create(**args)
+ if prev_state == AssistantState.DETECTING_SPEECH:
+ self._speech_ctx.stop()
+ self._post_event(ConversationEndEvent())
+ elif new_state == AssistantState.DETECTING_SPEECH:
+ self._speech_ctx.start()
+ self._post_event(ConversationStartEvent())
@property
def porcupine(self) -> Optional[pvporcupine.Porcupine]:
+ if not self.hotword_enabled:
+ return None
+
if not self._porcupine:
- self._porcupine = self._create_porcupine()
+ args: Dict[str, Any] = {'access_key': self._access_key}
+ if self.keywords:
+ args['keywords'] = self.keywords
+ if self.keyword_paths:
+ args['keyword_paths'] = self.keyword_paths
+ if self.keyword_model_path:
+ args['model_path'] = self.keyword_model_path
+
+ self._porcupine = pvporcupine.create(**args)
return self._porcupine
+ @property
+ def cheetah(self) -> Optional[pvcheetah.Cheetah]:
+ if not self.stt_enabled:
+ return None
+
+ if not self._cheetah:
+ args: Dict[str, Any] = {'access_key': self._access_key}
+ if self.speech_model_path:
+ args['model_path'] = self.speech_model_path
+ if self.endpoint_duration:
+ args['endpoint_duration_sec'] = self.endpoint_duration
+ if self.enable_automatic_punctuation:
+ args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
+
+ self._cheetah = pvcheetah.create(**args)
+
+ return self._cheetah
+
def __enter__(self):
+ if self.should_stop():
+ return self
+
if self._recorder:
self.logger.info('A recording stream already exists')
- elif self.porcupine:
+ elif self.porcupine or self.cheetah:
+ sample_rate = (self.porcupine or self.cheetah).sample_rate # type: ignore
+ frame_length = (self.porcupine or self.cheetah).frame_length # type: ignore
+
self._recorder = AudioRecorder(
stop_event=self._stop_event,
- sample_rate=self.porcupine.sample_rate,
- frame_size=self.porcupine.frame_length,
+ sample_rate=sample_rate,
+ frame_size=frame_length,
+ queue_size=self.audio_queue_size,
channels=1,
)
self._recorder.__enter__()
+ if self.porcupine:
+ self.state = AssistantState.DETECTING_HOTWORD
+ else:
+ self.state = AssistantState.DETECTING_SPEECH
+
return self
def __exit__(self, *_):
@@ -121,6 +193,8 @@ class Assistant:
self._recorder.__exit__(*_)
self._recorder = None
+ self.state = AssistantState.IDLE
+
if self._cheetah:
self._cheetah.delete()
self._cheetah = None
@@ -146,26 +220,74 @@ class Assistant:
raise StopIteration
while not (self.should_stop() or has_data):
- if self.porcupine: # TODO also check current state
- data = self._recorder.read()
- if data is None:
- continue
+ data = self._recorder.read()
+ if data is None:
+ continue
- frame, t = data
- if time() - t > self.frame_expiration:
- self.logger.info(
- 'Skipping audio frame older than %ss', self.frame_expiration
- )
- continue # The audio frame is too old
+ frame, t = data
+ if time() - t > self.frame_expiration:
+ self.logger.info(
+ 'Skipping audio frame older than %ss', self.frame_expiration
+ )
+ continue # The audio frame is too old
- keyword_index = self.porcupine.process(frame)
- if keyword_index is None:
- continue # No keyword detected
+ if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
+ return self._process_hotword(frame)
- if keyword_index >= 0 and self.keywords:
- return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+ if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
+ return self._process_speech(frame)
raise StopIteration
+ def _post_event(self, event):
+ if event:
+ event.args['assistant'] = 'picovoice'
+ get_bus().post(event)
+
+ def _process_hotword(self, frame):
+ if not self.porcupine:
+ return None
+
+ keyword_index = self.porcupine.process(frame)
+ if keyword_index is None:
+ return None # No keyword detected
+
+ if keyword_index >= 0 and self.keywords:
+ if self.start_conversation_on_hotword:
+ self.state = AssistantState.DETECTING_SPEECH
+
+ return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
+
+ return None
+
+ def _process_speech(self, frame):
+ if not self.cheetah:
+ return None
+
+ event = None
+ (
+ self._speech_ctx.partial_transcript,
+ self._speech_ctx.is_final,
+ ) = self.cheetah.process(frame)
+
+ if self._speech_ctx.partial_transcript:
+ self.logger.info(
+ 'Partial transcript: %s, is_final: %s',
+ self._speech_ctx.partial_transcript,
+ self._speech_ctx.is_final,
+ )
+
+ if self._speech_ctx.is_final or self._speech_ctx.timed_out:
+ event = (
+ ConversationTimeoutEvent()
+ if self._speech_ctx.timed_out
+ else SpeechRecognizedEvent(phrase=self.cheetah.flush())
+ )
+
+ if self.porcupine:
+ self.state = AssistantState.DETECTING_HOTWORD
+
+ return event
+
# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/picovoice/_context.py b/platypush/plugins/picovoice/_context.py
new file mode 100644
index 000000000..cb7546105
--- /dev/null
+++ b/platypush/plugins/picovoice/_context.py
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+from time import time
+from typing import Optional
+
+
+@dataclass
+class SpeechDetectionContext:
+ """
+ Context of the speech detection process.
+ """
+
+ partial_transcript: str = ''
+ is_final: bool = False
+ timeout: Optional[float] = None
+ t_start: Optional[float] = None
+ t_end: Optional[float] = None
+
+ def start(self):
+ self.reset()
+ self.t_start = time()
+
+ def stop(self):
+ self.reset()
+ self.t_end = time()
+
+ def reset(self):
+ self.partial_transcript = ''
+ self.is_final = False
+ self.t_start = None
+ self.t_end = None
+
+ @property
+ def timed_out(self):
+ return (
+ not self.partial_transcript
+ and not self.is_final
+ and self.timeout
+ and self.t_start
+ and time() - self.t_start > self.timeout
+ )
+
+
+# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/picovoice/_recorder.py b/platypush/plugins/picovoice/_recorder.py
index 9df81e7c9..e0c23a8e9 100644
--- a/platypush/plugins/picovoice/_recorder.py
+++ b/platypush/plugins/picovoice/_recorder.py
@@ -26,7 +26,7 @@ class AudioRecorder:
frame_size: int,
channels: int,
dtype: str = 'int16',
- queue_size: int = 20,
+ queue_size: int = 100,
):
self.logger = getLogger(__name__)
self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
@@ -48,7 +48,6 @@ class AudioRecorder:
def __exit__(self, *_):
self.stop()
- # self.stream.close()
def _audio_callback(self, indata, *_):
if self.should_stop():