Compare commits

...

6 commits

28 changed files with 728 additions and 757 deletions

View file

@ -10,6 +10,4 @@ Backends
platypush/backend/midi.rst
platypush/backend/nodered.rst
platypush/backend/redis.rst
platypush/backend/stt.picovoice.hotword.rst
platypush/backend/stt.picovoice.speech.rst
platypush/backend/tcp.rst

View file

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.backend.stt.picovoice.hotword
:members:

View file

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.backend.stt.picovoice.speech
:members:

View file

@ -0,0 +1,5 @@
``picovoice``
=============
.. automodule:: platypush.plugins.picovoice
:members:

View file

@ -1,5 +0,0 @@
``stt.picovoice.hotword``
===========================================
.. automodule:: platypush.plugins.stt.picovoice.hotword
:members:

View file

@ -1,5 +0,0 @@
``stt.picovoice.speech``
==========================================
.. automodule:: platypush.plugins.stt.picovoice.speech
:members:

View file

@ -95,6 +95,7 @@ Plugins
platypush/plugins/nmap.rst
platypush/plugins/ntfy.rst
platypush/plugins/otp.rst
platypush/plugins/picovoice.rst
platypush/plugins/pihole.rst
platypush/plugins/ping.rst
platypush/plugins/printer.cups.rst
@ -119,8 +120,6 @@ Plugins
platypush/plugins/smartthings.rst
platypush/plugins/sound.rst
platypush/plugins/ssh.rst
platypush/plugins/stt.picovoice.hotword.rst
platypush/plugins/stt.picovoice.speech.rst
platypush/plugins/sun.rst
platypush/plugins/switch.tplink.rst
platypush/plugins/switch.wemo.rst

View file

@ -1,40 +0,0 @@
import time
from platypush.backend import Backend
from platypush.context import get_plugin
from platypush.plugins.stt import SttPlugin
class SttBackend(Backend):
"""
Base class for speech-to-text backends.
"""
def __init__(self, plugin_name: str, retry_sleep: float = 5.0, *args, **kwargs):
"""
:param plugin_name: Plugin name of the class that will be used for speech detection. Must be an instance of
:class:`platypush.plugins.stt.SttPlugin`.
:param retry_sleep: Number of seconds the backend will wait on failure before re-initializing the plugin
(default: 5 seconds).
"""
super().__init__(*args, **kwargs)
self.plugin_name = plugin_name
self.retry_sleep = retry_sleep
def run(self):
super().run()
self.logger.info('Starting {} speech-to-text backend'.format(self.__class__.__name__))
while not self.should_stop():
try:
plugin: SttPlugin = get_plugin(self.plugin_name)
with plugin:
# noinspection PyProtectedMember
plugin._detection_thread.join()
except Exception as e:
self.logger.exception(e)
self.logger.warning('Encountered an unexpected error, retrying in {} seconds'.format(self.retry_sleep))
time.sleep(self.retry_sleep)
# vim:sw=4:ts=4:et:

View file

@ -1,21 +0,0 @@
from platypush.backend.stt import SttBackend
class SttPicovoiceHotwordBackend(SttBackend):
"""
Backend for the PicoVoice hotword detection plugin. Set this plugin to ``enabled`` if you
want to run the hotword engine continuously instead of programmatically using
``start_detection`` and ``stop_detection``.
Requires:
- The :class:`platypush.plugins.stt.deepspeech.SttPicovoiceHotwordPlugin` plugin configured and its dependencies
installed.
"""
def __init__(self, *args, **kwargs):
super().__init__('stt.picovoice.hotword', *args, **kwargs)
# vim:sw=4:ts=4:et:

View file

@ -1,6 +0,0 @@
manifest:
events: {}
install:
pip: []
package: platypush.backend.stt.picovoice.hotword
type: backend

View file

@ -1,21 +0,0 @@
from platypush.backend.stt import SttBackend
class SttPicovoiceSpeechBackend(SttBackend):
"""
Backend for the PicoVoice speech detection plugin. Set this plugin to ``enabled`` if you
want to run the speech engine continuously instead of programmatically using
``start_detection`` and ``stop_detection``.
Requires:
- The :class:`platypush.plugins.stt.deepspeech.SttPicovoiceSpeechPlugin` plugin configured and its dependencies
installed.
"""
def __init__(self, *args, **kwargs):
super().__init__('stt.picovoice.speech', *args, **kwargs)
# vim:sw=4:ts=4:et:

View file

@ -1,6 +0,0 @@
manifest:
events: {}
install:
pip: []
package: platypush.backend.stt.picovoice.speech
type: backend

View file

@ -9,21 +9,22 @@ from platypush.context import get_bus, get_plugin
from platypush.entities.assistants import Assistant
from platypush.entities.managers.assistants import AssistantEntityManager
from platypush.message.event.assistant import (
AssistantEvent,
ConversationStartEvent,
ConversationEndEvent,
ConversationTimeoutEvent,
ResponseEvent,
NoResponseEvent,
SpeechRecognizedEvent,
AlarmStartedEvent,
AlarmEndEvent,
TimerStartedEvent,
TimerEndEvent,
AlertStartedEvent,
AlarmStartedEvent,
AlertEndEvent,
AlertStartedEvent,
AssistantEvent,
ConversationEndEvent,
ConversationStartEvent,
ConversationTimeoutEvent,
HotwordDetectedEvent,
MicMutedEvent,
MicUnmutedEvent,
NoResponseEvent,
ResponseEvent,
SpeechRecognizedEvent,
TimerEndEvent,
TimerStartedEvent,
)
from platypush.plugins import Plugin, action
from platypush.utils import get_plugin_name_by_class
@ -235,6 +236,9 @@ class AssistantPlugin(Plugin, AssistantEntityManager, ABC):
self.stop_conversation()
tts.say(text=text, **self.tts_plugin_args)
def _on_hotword_detected(self, hotword: Optional[str]):
self._send_event(HotwordDetectedEvent, hotword=hotword)
def _on_speech_recognized(self, phrase: Optional[str]):
phrase = (phrase or '').lower().strip()
self._last_query = phrase

View file

@ -0,0 +1,234 @@
from typing import Optional, Sequence
from platypush.plugins import RunnablePlugin, action
from platypush.plugins.assistant import AssistantPlugin
from ._assistant import Assistant
from ._state import AssistantState
# pylint: disable=too-many-ancestors
class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
"""
A voice assistant that runs on your device, based on the `Picovoice
<https://picovoice.ai/>`_ engine.
.. note:: You will need a PicoVoice account and a personal access key to
use this integration.
You can get your personal access key by signing up at the `Picovoice
console <https://console.picovoice.ai/>`_. You may be asked to submit a
reason for using the service (feel free to mention a personal Platypush
integration), and you will receive your personal access key.
You may also be asked to select which products you want to use. The default
configuration of this plugin requires the following:
* **Porcupine**: wake-word engine, if you want the device to listen for
a specific wake word in order to start the assistant.
* **Cheetah**: speech-to-text engine, if you want your voice
interactions to be transcribed into free text - either programmatically
or when triggered by the wake word. Or:
* **Rhino**: intent recognition engine, if you want to extract *intents*
out of your voice commands - for instance, the phrase "set the living
room temperature to 20 degrees" could be mapped to the intent with the
following parameters: ``intent``: ``set_temperature``, ``room``:
``living_room``, ``temperature``: ``20``.
* **Leopard**: speech-to-text engine aimed at offline transcription of
audio files rather than real-time transcription.
* **Orca**: text-to-speech engine, if you want to create your custom
logic to respond to user's voice commands and render the responses as
audio.
"""
def __init__(
self,
access_key: str,
hotword_enabled: bool = True,
stt_enabled: bool = True,
intent_enabled: bool = False,
keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None,
speech_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = 0.5,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = True,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = 7.5,
**kwargs,
):
"""
:param access_key: Your Picovoice access key. You can get it by signing
up at the `Picovoice console <https://console.picovoice.ai/>`.
:param hotword_enabled: Enable the wake-word engine (default: True).
**Note**: The wake-word engine requires you to add Porcupine to the
products available in your Picovoice account.
:param stt_enabled: Enable the speech-to-text engine (default: True).
**Note**: The speech-to-text engine requires you to add Cheetah to
the products available in your Picovoice account.
:param intent_enabled: Enable the intent recognition engine (default:
False).
**Note**: The intent recognition engine requires you to add Rhino
to the products available in your Picovoice account.
:param keywords: List of keywords to listen for (e.g. ``alexa``, ``ok
google``...). This is required if the wake-word engine is enabled.
See the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/resources/keyword_files>`_).
for a list of the stock keywords available. If you have a custom
model, you can pass its path to the ``keyword_paths`` parameter and
its filename (without the path and the platform extension) here.
:param keyword_paths: List of paths to the keyword files to listen for.
Custom keyword files can be created using the `Picovoice console
<https://console.picovoice.ai/ppn>`_ and downloaded from the
console itself.
:param keyword_model_path: If you are using a keyword file in a
non-English language, you can provide the path to the model file
for its language. Model files are available for all the supported
languages through the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param speech_model_path: Path to the speech model file. If you are
using a language other than English, you can provide the path to the
model file for that language. Model files are available for all the
supported languages through the `Picovoice repository
<https://github.com/Picovoice/porcupine/tree/master/lib/common>`_.
:param endpoint_duration: If set, the assistant will stop listening when
no speech is detected for the specified duration (in seconds) after
the end of an utterance.
:param enable_automatic_punctuation: Enable automatic punctuation
insertion.
:param start_conversation_on_hotword: If set to True (default), a speech
detection session will be started when the hotword is detected. If
set to False, you may want to start the conversation programmatically
by calling the :meth:`.start_conversation` method instead, or run any
custom logic hotword detection logic. This can be particularly useful
when you want to run the assistant in a push-to-talk mode, or when you
want different hotwords to trigger conversations with different models
or languages.
:param audio_queue_size: Maximum number of audio frames to hold in the
processing queue. You may want to increase this value if you are
running this integration on a slow device and/or the logs report
audio frame drops too often. Keep in mind that increasing this value
will increase the memory usage of the integration. Also, a higher
value may result in higher accuracy at the cost of higher latency.
:param conversation_timeout: Maximum time to wait for some speech to be
detected after the hotword is detected. If no speech is detected
within this time, the conversation will time out and the plugin will
go back into hotword detection mode, if the mode is enabled. Default:
7.5 seconds.
"""
super().__init__(**kwargs)
self._assistant = None
self._assistant_args = {
'stop_event': self._should_stop,
'access_key': access_key,
'hotword_enabled': hotword_enabled,
'stt_enabled': stt_enabled,
'intent_enabled': intent_enabled,
'keywords': keywords,
'keyword_paths': keyword_paths,
'keyword_model_path': keyword_model_path,
'speech_model_path': speech_model_path,
'endpoint_duration': endpoint_duration,
'enable_automatic_punctuation': enable_automatic_punctuation,
'start_conversation_on_hotword': start_conversation_on_hotword,
'audio_queue_size': audio_queue_size,
'conversation_timeout': conversation_timeout,
'on_conversation_start': self._on_conversation_start,
'on_conversation_end': self._on_conversation_end,
'on_conversation_timeout': self._on_conversation_timeout,
'on_speech_recognized': self._on_speech_recognized,
'on_hotword_detected': self._on_hotword_detected,
}
@action
def start_conversation(self, *_, **__):
"""
Programmatically start a conversation with the assistant
"""
if not self._assistant:
self.logger.warning('Assistant not initialized')
return
self._assistant.state = AssistantState.DETECTING_SPEECH
@action
def stop_conversation(self, *_, **__):
"""
Programmatically stop a running conversation with the assistant
"""
if not self._assistant:
self.logger.warning('Assistant not initialized')
return
if self._assistant.hotword_enabled:
self._assistant.state = AssistantState.DETECTING_HOTWORD
else:
self._assistant.state = AssistantState.IDLE
@action
def mute(self, *_, **__):
"""
Mute the microphone. Alias for :meth:`.set_mic_mute` with
``muted=True``.
"""
@action
def unmute(self, *_, **__):
"""
Unmute the microphone. Alias for :meth:`.set_mic_mute` with
``muted=False``.
"""
@action
def set_mic_mute(self, muted: bool):
"""
Programmatically mute/unmute the microphone.
:param muted: Set to True or False.
"""
@action
def toggle_mute(self, *_, **__):
"""
Toggle the mic mute state.
"""
@action
def send_text_query(self, *_, query: str, **__):
"""
Send a text query to the assistant.
This is equivalent to saying something to the assistant.
:param query: Query to be sent.
"""
def main(self):
while not self.should_stop():
self.logger.info('Starting Picovoice assistant')
with Assistant(**self._assistant_args) as self._assistant:
try:
for event in self._assistant:
self.logger.debug('Picovoice assistant event: %s', event)
except KeyboardInterrupt:
break
except Exception as e:
self.logger.error('Picovoice assistant error: %s', e, exc_info=True)
self.wait_stop(5)
def stop(self):
try:
self.stop_conversation()
except RuntimeError:
pass
super().stop()
# vim:sw=4:ts=4:et:

View file

@ -0,0 +1,308 @@
import logging
import os
from threading import Event, RLock
from time import time
from typing import Any, Dict, Optional, Sequence
import pvcheetah
import pvleopard
import pvporcupine
import pvrhino
from platypush.message.event.assistant import (
ConversationTimeoutEvent,
HotwordDetectedEvent,
SpeechRecognizedEvent,
)
from ._context import ConversationContext
from ._recorder import AudioRecorder
from ._state import AssistantState
class Assistant:
"""
A facade class that wraps the Picovoice engines under an assistant API.
"""
def _default_callback(*_, **__):
pass
def __init__(
self,
access_key: str,
stop_event: Event,
hotword_enabled: bool = True,
stt_enabled: bool = True,
intent_enabled: bool = False,
keywords: Optional[Sequence[str]] = None,
keyword_paths: Optional[Sequence[str]] = None,
keyword_model_path: Optional[str] = None,
frame_expiration: float = 3.0, # Don't process audio frames older than this
speech_model_path: Optional[str] = None,
endpoint_duration: Optional[float] = None,
enable_automatic_punctuation: bool = False,
start_conversation_on_hotword: bool = False,
audio_queue_size: int = 100,
conversation_timeout: Optional[float] = None,
on_conversation_start=_default_callback,
on_conversation_end=_default_callback,
on_conversation_timeout=_default_callback,
on_speech_recognized=_default_callback,
on_hotword_detected=_default_callback,
):
self._access_key = access_key
self._stop_event = stop_event
self.logger = logging.getLogger(__name__)
self.hotword_enabled = hotword_enabled
self.stt_enabled = stt_enabled
self.intent_enabled = intent_enabled
self.keywords = list(keywords or [])
self.keyword_paths = None
self.keyword_model_path = None
self.frame_expiration = frame_expiration
self.speech_model_path = speech_model_path
self.endpoint_duration = endpoint_duration
self.enable_automatic_punctuation = enable_automatic_punctuation
self.start_conversation_on_hotword = start_conversation_on_hotword
self.audio_queue_size = audio_queue_size
self._on_conversation_start = on_conversation_start
self._on_conversation_end = on_conversation_end
self._on_conversation_timeout = on_conversation_timeout
self._on_speech_recognized = on_speech_recognized
self._on_hotword_detected = on_hotword_detected
self._recorder = None
self._state = AssistantState.IDLE
self._state_lock = RLock()
self._ctx = ConversationContext(timeout=conversation_timeout)
if hotword_enabled:
if not keywords:
raise ValueError(
'You need to provide a list of keywords if the wake-word engine is enabled'
)
if keyword_paths:
keyword_paths = [os.path.expanduser(path) for path in keyword_paths]
missing_paths = [
path for path in keyword_paths if not os.path.isfile(path)
]
if missing_paths:
raise FileNotFoundError(f'Keyword files not found: {missing_paths}')
self.keyword_paths = keyword_paths
if keyword_model_path:
keyword_model_path = os.path.expanduser(keyword_model_path)
if not os.path.isfile(keyword_model_path):
raise FileNotFoundError(
f'Keyword model file not found: {keyword_model_path}'
)
self.keyword_model_path = keyword_model_path
self._cheetah: Optional[pvcheetah.Cheetah] = None
self._leopard: Optional[pvleopard.Leopard] = None
self._porcupine: Optional[pvporcupine.Porcupine] = None
self._rhino: Optional[pvrhino.Rhino] = None
def should_stop(self):
return self._stop_event.is_set()
def wait_stop(self):
self._stop_event.wait()
@property
def state(self) -> AssistantState:
with self._state_lock:
return self._state
@state.setter
def state(self, state: AssistantState):
with self._state_lock:
prev_state = self._state
self._state = state
new_state = self.state
if prev_state == new_state:
return
if prev_state == AssistantState.DETECTING_SPEECH:
self._ctx.stop()
self._on_conversation_end()
elif new_state == AssistantState.DETECTING_SPEECH:
self._ctx.start()
self._on_conversation_start()
@property
def porcupine(self) -> Optional[pvporcupine.Porcupine]:
if not self.hotword_enabled:
return None
if not self._porcupine:
args: Dict[str, Any] = {'access_key': self._access_key}
if self.keywords:
args['keywords'] = self.keywords
if self.keyword_paths:
args['keyword_paths'] = self.keyword_paths
if self.keyword_model_path:
args['model_path'] = self.keyword_model_path
self._porcupine = pvporcupine.create(**args)
return self._porcupine
@property
def cheetah(self) -> Optional[pvcheetah.Cheetah]:
if not self.stt_enabled:
return None
if not self._cheetah:
args: Dict[str, Any] = {'access_key': self._access_key}
if self.speech_model_path:
args['model_path'] = self.speech_model_path
if self.endpoint_duration:
args['endpoint_duration_sec'] = self.endpoint_duration
if self.enable_automatic_punctuation:
args['enable_automatic_punctuation'] = self.enable_automatic_punctuation
self._cheetah = pvcheetah.create(**args)
return self._cheetah
def __enter__(self):
if self.should_stop():
return self
if self._recorder:
self.logger.info('A recording stream already exists')
elif self.porcupine or self.cheetah:
sample_rate = (self.porcupine or self.cheetah).sample_rate # type: ignore
frame_length = (self.porcupine or self.cheetah).frame_length # type: ignore
self._recorder = AudioRecorder(
stop_event=self._stop_event,
sample_rate=sample_rate,
frame_size=frame_length,
queue_size=self.audio_queue_size,
channels=1,
)
self._recorder.__enter__()
if self.porcupine:
self.state = AssistantState.DETECTING_HOTWORD
else:
self.state = AssistantState.DETECTING_SPEECH
return self
def __exit__(self, *_):
if self._recorder:
self._recorder.__exit__(*_)
self._recorder = None
self.state = AssistantState.IDLE
if self._cheetah:
self._cheetah.delete()
self._cheetah = None
if self._leopard:
self._leopard.delete()
self._leopard = None
if self._porcupine:
self._porcupine.delete()
self._porcupine = None
if self._rhino:
self._rhino.delete()
self._rhino = None
def __iter__(self):
return self
def __next__(self):
has_data = False
if self.should_stop() or not self._recorder:
raise StopIteration
while not (self.should_stop() or has_data):
data = self._recorder.read()
if data is None:
continue
frame, t = data
if time() - t > self.frame_expiration:
self.logger.info(
'Skipping audio frame older than %ss', self.frame_expiration
)
continue # The audio frame is too old
if self.porcupine and self.state == AssistantState.DETECTING_HOTWORD:
return self._process_hotword(frame)
if self.cheetah and self.state == AssistantState.DETECTING_SPEECH:
return self._process_speech(frame)
raise StopIteration
def _process_hotword(self, frame):
if not self.porcupine:
return None
keyword_index = self.porcupine.process(frame)
if keyword_index is None:
return None # No keyword detected
if keyword_index >= 0 and self.keywords:
if self.start_conversation_on_hotword:
self.state = AssistantState.DETECTING_SPEECH
self._on_hotword_detected(hotword=self.keywords[keyword_index])
return HotwordDetectedEvent(hotword=self.keywords[keyword_index])
return None
def _process_speech(self, frame):
if not self.cheetah:
return None
event = None
partial_transcript, self._ctx.is_final = self.cheetah.process(frame)
if partial_transcript:
self._ctx.partial_transcript += partial_transcript
self.logger.info(
'Partial transcript: %s, is_final: %s',
self._ctx.partial_transcript,
self._ctx.is_final,
)
if self._ctx.is_final or self._ctx.timed_out:
phrase = ''
if self.cheetah:
phrase = self.cheetah.flush()
self._ctx.partial_transcript += phrase
phrase = self._ctx.partial_transcript
phrase = phrase[:1].lower() + phrase[1:]
if self._ctx.is_final or phrase:
event = SpeechRecognizedEvent(phrase=phrase)
self._on_speech_recognized(phrase=phrase)
else:
event = ConversationTimeoutEvent()
self._on_conversation_timeout()
self._ctx.reset()
if self.hotword_enabled:
self.state = AssistantState.DETECTING_HOTWORD
return event
# vim:sw=4:ts=4:et:

View file

@ -0,0 +1,43 @@
from dataclasses import dataclass
from time import time
from typing import Optional
@dataclass
class ConversationContext:
"""
Context of the conversation process.
"""
partial_transcript: str = ''
is_final: bool = False
timeout: Optional[float] = None
t_start: Optional[float] = None
t_end: Optional[float] = None
def start(self):
self.reset()
self.t_start = time()
def stop(self):
self.reset()
self.t_end = time()
def reset(self):
self.partial_transcript = ''
self.is_final = False
self.t_start = None
self.t_end = None
@property
def timed_out(self):
return (
not self.partial_transcript
and not self.is_final
and self.timeout
and self.t_start
and time() - self.t_start > self.timeout
)
# vim:sw=4:ts=4:et:

View file

@ -0,0 +1,76 @@
from collections import namedtuple
from logging import getLogger
from queue import Full, Queue
from threading import Event
from time import time
from typing import Optional
import sounddevice as sd
from platypush.utils import wait_for_either
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
class AudioRecorder:
"""
Audio recorder component that uses the sounddevice library to record audio
from the microphone.
"""
def __init__(
self,
stop_event: Event,
sample_rate: int,
frame_size: int,
channels: int,
dtype: str = 'int16',
queue_size: int = 100,
):
self.logger = getLogger(__name__)
self._audio_queue: Queue[AudioFrame] = Queue(maxsize=queue_size)
self.frame_size = frame_size
self._stop_event = Event()
self._upstream_stop_event = stop_event
self.stream = sd.InputStream(
samplerate=sample_rate,
channels=channels,
dtype=dtype,
blocksize=frame_size,
callback=self._audio_callback,
)
def __enter__(self):
self._stop_event.clear()
self.stream.start()
return self
def __exit__(self, *_):
self.stop()
def _audio_callback(self, indata, *_):
if self.should_stop():
return
try:
self._audio_queue.put_nowait(AudioFrame(indata.reshape(-1), time()))
except Full:
self.logger.warning('Audio queue is full, dropping audio frame')
def read(self, timeout: Optional[float] = None):
try:
return self._audio_queue.get(timeout=timeout)
except TimeoutError:
self.logger.debug('Audio queue is empty')
return None
def stop(self):
self._stop_event.set()
self.stream.stop()
def should_stop(self):
return self._stop_event.is_set() or self._upstream_stop_event.is_set()
def wait(self, timeout: Optional[float] = None):
wait_for_either(self._stop_event, self._upstream_stop_event, timeout=timeout)

View file

@ -0,0 +1,14 @@
from enum import Enum
class AssistantState(Enum):
"""
Possible states of the assistant.
"""
IDLE = 'idle'
DETECTING_HOTWORD = 'detecting_hotword'
DETECTING_SPEECH = 'detecting_speech'
# vim:sw=4:ts=4:et:

View file

@ -0,0 +1,23 @@
manifest:
package: platypush.plugins.assistant.picovoice
type: plugin
events:
- platypush.message.event.assistant.ConversationEndEvent
- platypush.message.event.assistant.ConversationStartEvent
- platypush.message.event.assistant.ConversationTimeoutEvent
- platypush.message.event.assistant.HotwordDetectedEvent
- platypush.message.event.assistant.MicMutedEvent
- platypush.message.event.assistant.MicUnmutedEvent
- platypush.message.event.assistant.NoResponseEvent
- platypush.message.event.assistant.ResponseEvent
- platypush.message.event.assistant.SpeechRecognizedEvent
install:
pacman:
- python-sounddevice
pip:
- pvcheetah
- pvleopard
- pvorca
- pvporcupine
- pvrhino
- sounddevice

View file

@ -247,9 +247,11 @@ class AudioManager:
wait_start = time()
for audio_thread in streams_to_stop:
audio_thread.join(
timeout=max(0, timeout - (time() - wait_start))
if timeout is not None
else None
timeout=(
max(0, timeout - (time() - wait_start))
if timeout is not None
else None
)
)
# Remove references

View file

@ -1,336 +0,0 @@
import queue
import threading
from abc import ABC, abstractmethod
from typing import Optional, Union, List
import sounddevice as sd
from platypush.context import get_bus
from platypush.message.event.stt import (
SpeechDetectionStartedEvent,
SpeechDetectionStoppedEvent,
SpeechStartedEvent,
SpeechDetectedEvent,
HotwordDetectedEvent,
ConversationDetectedEvent,
)
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import Plugin, action
class SttPlugin(ABC, Plugin):
"""
Abstract class for speech-to-text plugins.
"""
_thread_stop_timeout = 10.0
rate = 16000
channels = 1
def __init__(
self,
input_device: Optional[Union[int, str]] = None,
hotword: Optional[str] = None,
hotwords: Optional[List[str]] = None,
conversation_timeout: Optional[float] = 10.0,
block_duration: float = 1.0,
):
"""
:param input_device: PortAudio device index or name that will be used for recording speech (default: default
system audio input device).
:param hotword: When this word is detected, the plugin will trigger a
:class:`platypush.message.event.stt.HotwordDetectedEvent` instead of a
:class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can use these events for hooking other
assistants.
:param hotwords: Use a list of hotwords instead of a single one.
:param conversation_timeout: If ``hotword`` or ``hotwords`` are set and ``conversation_timeout`` is set,
the next speech detected event will trigger a :class:`platypush.message.event.stt.ConversationDetectedEvent`
instead of a :class:`platypush.message.event.stt.SpeechDetectedEvent` event. You can hook custom hooks
here to run any logic depending on the detected speech - it can emulate a kind of
"OK, Google. Turn on the lights" interaction without using an external assistant (default: 10 seconds).
:param block_duration: Duration of the acquired audio blocks (default: 1 second).
"""
super().__init__()
self.input_device = input_device
self.conversation_timeout = conversation_timeout
self.block_duration = block_duration
self.hotwords = set(hotwords or [])
if hotword:
self.hotwords = {hotword}
self._conversation_event = threading.Event()
self._input_stream: Optional[sd.InputStream] = None
self._recording_thread: Optional[threading.Thread] = None
self._detection_thread: Optional[threading.Thread] = None
self._audio_queue: Optional[queue.Queue] = None
self._current_text = ''
def _get_input_device(self, device: Optional[Union[int, str]] = None) -> int:
"""
Get the index of the input device by index or name.
:param device: Device index or name. If None is set then the function will return the index of the
default audio input device.
:return: Index of the audio input device.
"""
if not device:
device = self.input_device
if not device:
return sd.query_hostapis()[0].get('default_input_device')
if isinstance(device, int):
assert device <= len(sd.query_devices())
return device
for i, dev in enumerate(sd.query_devices()):
if dev['name'] == device:
return i
raise AssertionError('Device {} not found'.format(device))
def on_speech_detected(self, speech: str) -> None:
"""
Hook called when speech is detected. Triggers the right event depending on the current context.
:param speech: Detected speech.
"""
speech = speech.strip()
if speech in self.hotwords:
event = HotwordDetectedEvent(hotword=speech)
if self.conversation_timeout:
self._conversation_event.set()
threading.Timer(
self.conversation_timeout, lambda: self._conversation_event.clear()
).start()
elif self._conversation_event.is_set():
event = ConversationDetectedEvent(speech=speech)
else:
event = SpeechDetectedEvent(speech=speech)
get_bus().post(event)
@staticmethod
def convert_frames(frames: bytes) -> bytes:
"""
Conversion method for raw audio frames. It just returns the input frames as bytes. Override it if required
by your logic.
:param frames: Input audio frames, as bytes.
:return: The audio frames as passed on the input. Override if required.
"""
return frames
def on_detection_started(self) -> None:
"""
Method called when the ``detection_thread`` starts. Initialize your context variables and models here if
required.
"""
pass
def on_detection_ended(self) -> None:
"""
Method called when the ``detection_thread`` stops. Clean up your context variables and models here.
"""
pass
def before_recording(self) -> None:
"""
Method called when the ``recording_thread`` starts. Put here any logic that you may want to run before the
recording thread starts.
"""
pass
def on_recording_started(self) -> None:
"""
Method called after the ``recording_thread`` opens the audio device. Put here any logic that you may want to
run after the recording starts.
"""
pass
def on_recording_ended(self) -> None:
"""
Method called when the ``recording_thread`` stops. Put here any logic that you want to run after the audio
device is closed.
"""
pass
@abstractmethod
def detect_speech(self, frames) -> str:
"""
Method called within the ``detection_thread`` when new audio frames have been captured. Must be implemented
by the derived classes.
:param frames: Audio frames, as returned by ``convert_frames``.
:return: Detected text, as a string. Returns an empty string if no text has been detected.
"""
raise NotImplementedError
def process_text(self, text: str) -> None:
if (not text and self._current_text) or (text and text == self._current_text):
self.on_speech_detected(self._current_text)
self._current_text = ''
else:
if text:
if not self._current_text:
get_bus().post(SpeechStartedEvent())
self.logger.info('Intermediate speech results: [{}]'.format(text))
self._current_text = text
def detection_thread(self) -> None:
"""
This thread reads frames from ``_audio_queue``, performs the speech-to-text detection and calls
"""
self._current_text = ''
self.logger.debug('Detection thread started')
self.on_detection_started()
while self._audio_queue:
try:
frames = self._audio_queue.get()
frames = self.convert_frames(frames)
except Exception as e:
self.logger.warning(
'Error while feeding audio to the model: {}'.format(str(e))
)
continue
text = self.detect_speech(frames).strip()
self.process_text(text)
self.on_detection_ended()
self.logger.debug('Detection thread terminated')
def recording_thread(
self,
block_duration: Optional[float] = None,
block_size: Optional[int] = None,
input_device: Optional[str] = None,
) -> None:
"""
Recording thread. It reads raw frames from the audio device and dispatches them to ``detection_thread``.
:param block_duration: Audio blocks duration. Specify either ``block_duration`` or ``block_size``.
:param block_size: Size of the audio blocks. Specify either ``block_duration`` or ``block_size``.
:param input_device: Input device
"""
assert (block_duration or block_size) and not (
block_duration and block_size
), 'Please specify either block_duration or block_size'
if not block_size:
block_size = int(self.rate * self.channels * block_duration)
self.before_recording()
self.logger.debug('Recording thread started')
device = self._get_input_device(input_device)
self._input_stream = sd.InputStream(
samplerate=self.rate,
device=device,
channels=self.channels,
dtype='int16',
latency=0,
blocksize=block_size,
)
self._input_stream.start()
self.on_recording_started()
get_bus().post(SpeechDetectionStartedEvent())
while self._input_stream:
try:
frames = self._input_stream.read(block_size)[0]
except Exception as e:
self.logger.warning(
'Error while reading from the audio input: {}'.format(str(e))
)
continue
self._audio_queue.put(frames)
get_bus().post(SpeechDetectionStoppedEvent())
self.on_recording_ended()
self.logger.debug('Recording thread terminated')
@abstractmethod
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file. Must be implemented by the derived classes.
:param audio_file: Path to the audio file.
"""
raise NotImplementedError
def __enter__(self):
"""
Context manager enter. Starts detection and returns self.
"""
self.start_detection()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""
Context manager exit. Stops detection.
"""
self.stop_detection()
@action
def start_detection(
self,
input_device: Optional[str] = None,
seconds: Optional[float] = None,
block_duration: Optional[float] = None,
) -> None:
"""
Start the speech detection engine.
:param input_device: Audio input device name/index override
:param seconds: If set, then the detection engine will stop after this many seconds, otherwise it'll
start running until ``stop_detection`` is called or application stop.
:param block_duration: ``block_duration`` override.
"""
assert (
not self._input_stream and not self._recording_thread
), 'Speech detection is already running'
block_duration = block_duration or self.block_duration
input_device = input_device if input_device is not None else self.input_device
self._audio_queue = queue.Queue()
self._recording_thread = threading.Thread(
target=lambda: self.recording_thread(
block_duration=block_duration, input_device=input_device
)
)
self._recording_thread.start()
self._detection_thread = threading.Thread(
target=lambda: self.detection_thread()
)
self._detection_thread.start()
if seconds:
threading.Timer(seconds, lambda: self.stop_detection()).start()
@action
def stop_detection(self) -> None:
"""
Stop the speech detection engine.
"""
assert self._input_stream, 'Speech detection is not running'
self._input_stream.stop(ignore_errors=True)
self._input_stream.close(ignore_errors=True)
self._input_stream = None
if self._recording_thread:
self._recording_thread.join(timeout=self._thread_stop_timeout)
self._recording_thread = None
self._audio_queue = None
if self._detection_thread:
self._detection_thread.join(timeout=self._thread_stop_timeout)
self._detection_thread = None
# vim:sw=4:ts=4:et:

View file

@ -1,120 +0,0 @@
import os
import struct
from typing import Optional, List
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import action
from platypush.plugins.stt import SttPlugin
class SttPicovoiceHotwordPlugin(SttPlugin):
"""
This plugin performs hotword detection using `PicoVoice <https://github.com/Picovoice>`_.
"""
def __init__(
self,
library_path: Optional[str] = None,
model_file_path: Optional[str] = None,
keyword_file_paths: Optional[List[str]] = None,
sensitivity: float = 0.5,
sensitivities: Optional[List[float]] = None,
*args,
**kwargs
):
from pvporcupine import Porcupine
from pvporcupine.resources.util.python.util import (
LIBRARY_PATH,
MODEL_FILE_PATH,
KEYWORD_FILE_PATHS,
)
super().__init__(*args, **kwargs)
self.hotwords = list(self.hotwords)
self._hotword_engine: Optional[Porcupine] = None
self._library_path = os.path.abspath(
os.path.expanduser(library_path or LIBRARY_PATH)
)
self._model_file_path = os.path.abspath(
os.path.expanduser(model_file_path or MODEL_FILE_PATH)
)
if not keyword_file_paths:
hotwords = KEYWORD_FILE_PATHS
assert all(
hotword in hotwords for hotword in self.hotwords
), 'Not all the hotwords could be found. Available hotwords: {}'.format(
list(hotwords.keys())
)
self._keyword_file_paths = [
os.path.abspath(os.path.expanduser(hotwords[hotword]))
for hotword in self.hotwords
]
else:
self._keyword_file_paths = [
os.path.abspath(os.path.expanduser(p)) for p in keyword_file_paths
]
self._sensitivities = []
if sensitivities:
assert len(self._keyword_file_paths) == len(
sensitivities
), 'Please specify as many sensitivities as the number of configured hotwords'
self._sensitivities = sensitivities
else:
self._sensitivities = [sensitivity] * len(self._keyword_file_paths)
def convert_frames(self, frames: bytes) -> tuple:
assert self._hotword_engine, 'The hotword engine is not running'
return struct.unpack_from("h" * self._hotword_engine.frame_length, frames)
def on_detection_ended(self) -> None:
if self._hotword_engine:
self._hotword_engine.delete()
self._hotword_engine = None
def detect_speech(self, frames: tuple) -> str:
index = self._hotword_engine.process(frames)
if index < 0:
return ''
if index is True:
index = 0
return self.hotwords[index]
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file.
:param audio_file: Path to the audio file.
"""
pass
def recording_thread(
self, input_device: Optional[str] = None, *args, **kwargs
) -> None:
assert self._hotword_engine, 'The hotword engine has not yet been initialized'
super().recording_thread(
block_size=self._hotword_engine.frame_length, input_device=input_device
)
@action
def start_detection(self, *args, **kwargs) -> None:
from pvporcupine import Porcupine
self._hotword_engine = Porcupine(
library_path=self._library_path,
model_file_path=self._model_file_path,
keyword_file_paths=self._keyword_file_paths,
sensitivities=self._sensitivities,
)
self.rate = self._hotword_engine.sample_rate
super().start_detection(*args, **kwargs)
# vim:sw=4:ts=4:et:

View file

@ -1,7 +0,0 @@
manifest:
events: {}
install:
pip:
- pvporcupine
package: platypush.plugins.stt.picovoice.hotword
type: plugin

View file

@ -1,154 +0,0 @@
import inspect
import os
import platform
import struct
import threading
from typing import Optional
from platypush.message.event.stt import SpeechStartedEvent
from platypush.context import get_bus
from platypush.message.response.stt import SpeechDetectedResponse
from platypush.plugins import action
from platypush.plugins.stt import SttPlugin
class SttPicovoiceSpeechPlugin(SttPlugin):
"""
This plugin performs speech detection using `PicoVoice <https://github.com/Picovoice>`_.
NOTE: The PicoVoice product used for real-time speech-to-text (Cheetah) can be used freely for
personal applications on x86_64 Linux. Other architectures and operating systems require a commercial license.
You can ask for a license `here <https://picovoice.ai/contact.html>`_.
"""
def __init__(
self,
library_path: Optional[str] = None,
acoustic_model_path: Optional[str] = None,
language_model_path: Optional[str] = None,
license_path: Optional[str] = None,
end_of_speech_timeout: int = 1,
*args,
**kwargs
):
"""
:param library_path: Path to the Cheetah binary library for your OS
(default: ``CHEETAH_INSTALL_DIR/lib/OS/ARCH/libpv_cheetah.EXT``).
:param acoustic_model_path: Path to the acoustic speech model
(default: ``CHEETAH_INSTALL_DIR/lib/common/acoustic_model.pv``).
:param language_model_path: Path to the language model
(default: ``CHEETAH_INSTALL_DIR/lib/common/language_model.pv``).
:param license_path: Path to your PicoVoice license
(default: ``CHEETAH_INSTALL_DIR/resources/license/cheetah_eval_linux_public.lic``).
:param end_of_speech_timeout: Number of seconds of silence during speech recognition before considering
a phrase over (default: 1).
"""
from pvcheetah import Cheetah
super().__init__(*args, **kwargs)
self._basedir = os.path.abspath(
os.path.join(inspect.getfile(Cheetah), '..', '..', '..')
)
if not library_path:
library_path = self._get_library_path()
if not language_model_path:
language_model_path = os.path.join(
self._basedir, 'lib', 'common', 'language_model.pv'
)
if not acoustic_model_path:
acoustic_model_path = os.path.join(
self._basedir, 'lib', 'common', 'acoustic_model.pv'
)
if not license_path:
license_path = os.path.join(
self._basedir, 'resources', 'license', 'cheetah_eval_linux_public.lic'
)
self._library_path = library_path
self._language_model_path = language_model_path
self._acoustic_model_path = acoustic_model_path
self._license_path = license_path
self._end_of_speech_timeout = end_of_speech_timeout
self._stt_engine: Optional[Cheetah] = None
self._speech_in_progress = threading.Event()
def _get_library_path(self) -> str:
path = os.path.join(
self._basedir, 'lib', platform.system().lower(), platform.machine()
)
return os.path.join(
path, [f for f in os.listdir(path) if f.startswith('libpv_cheetah.')][0]
)
def convert_frames(self, frames: bytes) -> tuple:
assert self._stt_engine, 'The speech engine is not running'
return struct.unpack_from("h" * self._stt_engine.frame_length, frames)
def on_detection_ended(self) -> None:
if self._stt_engine:
self._stt_engine.delete()
self._stt_engine = None
def detect_speech(self, frames: tuple) -> str:
text, is_endpoint = self._stt_engine.process(frames)
text = text.strip()
if text:
if not self._speech_in_progress.is_set():
self._speech_in_progress.set()
get_bus().post(SpeechStartedEvent())
self._current_text += ' ' + text.strip()
if is_endpoint:
text = self._stt_engine.flush().strip().strip()
if text:
self._current_text += ' ' + text
self._speech_in_progress.clear()
if self._current_text:
self.on_speech_detected(self._current_text)
self._current_text = ''
return self._current_text
def process_text(self, text: str) -> None:
pass
@action
def detect(self, audio_file: str) -> SpeechDetectedResponse:
"""
Perform speech-to-text analysis on an audio file.
:param audio_file: Path to the audio file.
"""
pass
def recording_thread(
self, input_device: Optional[str] = None, *args, **kwargs
) -> None:
assert self._stt_engine, 'The hotword engine has not yet been initialized'
super().recording_thread(
block_size=self._stt_engine.frame_length, input_device=input_device
)
@action
def start_detection(self, *args, **kwargs) -> None:
from pvcheetah import Cheetah
self._stt_engine = Cheetah(
library_path=self._library_path,
acoustic_model_path=self._acoustic_model_path,
language_model_path=self._language_model_path,
license_path=self._license_path,
endpoint_duration_sec=self._end_of_speech_timeout,
)
self.rate = self._stt_engine.sample_rate
self._speech_in_progress.clear()
super().start_detection(*args, **kwargs)
# vim:sw=4:ts=4:et:

View file

@ -1,7 +0,0 @@
manifest:
events: {}
install:
pip:
- cheetah
package: platypush.plugins.stt.picovoice.speech
type: plugin

View file

@ -83,7 +83,10 @@ mock_imports = [
"pmw3901",
"psutil",
"pvcheetah",
"pvporcupine ",
"pvleopard",
"pvorca",
"pvporcupine",
"pvrhino",
"pyHS100",
"pyaudio",
"pychromecast",