Implemented PicoVoice speech-to-text integration [closes #130]
This commit is contained in:
parent
a5c08ed3e4
commit
ce0f3227ec
10 changed files with 188 additions and 26 deletions
|
@ -243,6 +243,7 @@ autodoc_mock_imports = ['googlesamples.assistant.grpc.audio_helpers',
|
||||||
'deepspeech',
|
'deepspeech',
|
||||||
'wave',
|
'wave',
|
||||||
'pvporcupine ',
|
'pvporcupine ',
|
||||||
|
'pvcheetah',
|
||||||
]
|
]
|
||||||
|
|
||||||
sys.path.insert(0, os.path.abspath('../..'))
|
sys.path.insert(0, os.path.abspath('../..'))
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
from platypush.backend.stt import SttBackend
|
|
||||||
|
|
||||||
|
|
||||||
class SttPicovoiceBackend(SttBackend):
    """
    Backend for the PicoVoice speech-to-text engine plugin. Set this plugin to ``enabled`` if you
    want to run the speech-to-text engine continuously instead of programmatically using
    ``start_detection`` and ``stop_detection``.

    Requires:

        - The :class:`platypush.plugins.stt.picovoice.SttPicovoicePlugin` plugin configured and its dependencies
          installed.

    """

    def __init__(self, *args, **kwargs):
        """
        :param args: Extra positional arguments forwarded to :class:`platypush.backend.stt.SttBackend`.
        :param kwargs: Extra keyword arguments forwarded to :class:`platypush.backend.stt.SttBackend`.
        """
        # 'stt.picovoice' is passed as the first argument of the base backend;
        # presumably the name of the plugin it drives - confirm against SttBackend.
        super().__init__('stt.picovoice', *args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# vim:sw=4:ts=4:et:
|
|
0
platypush/backend/stt/picovoice/__init__.py
Normal file
0
platypush/backend/stt/picovoice/__init__.py
Normal file
21
platypush/backend/stt/picovoice/hotword.py
Normal file
21
platypush/backend/stt/picovoice/hotword.py
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
from platypush.backend.stt import SttBackend
|
||||||
|
|
||||||
|
|
||||||
|
class SttPicovoiceHotwordBackend(SttBackend):
    """
    Backend for the PicoVoice hotword detection plugin. Set this plugin to ``enabled`` if you
    want to run the hotword engine continuously instead of programmatically using
    ``start_detection`` and ``stop_detection``.

    Requires:

        - The :class:`platypush.plugins.stt.picovoice.hotword.SttPicovoiceHotwordPlugin` plugin configured and its
          dependencies installed.

    """

    def __init__(self, *args, **kwargs):
        """
        :param args: Extra positional arguments forwarded to :class:`platypush.backend.stt.SttBackend`.
        :param kwargs: Extra keyword arguments forwarded to :class:`platypush.backend.stt.SttBackend`.
        """
        # 'stt.picovoice.hotword' is passed as the first argument of the base backend;
        # presumably the name of the plugin it drives - confirm against SttBackend.
        super().__init__('stt.picovoice.hotword', *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
21
platypush/backend/stt/picovoice/speech.py
Normal file
21
platypush/backend/stt/picovoice/speech.py
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
from platypush.backend.stt import SttBackend
|
||||||
|
|
||||||
|
|
||||||
|
class SttPicovoiceSpeechBackend(SttBackend):
    """
    Backend for the PicoVoice speech detection plugin. Set this plugin to ``enabled`` if you
    want to run the speech engine continuously instead of programmatically using
    ``start_detection`` and ``stop_detection``.

    Requires:

        - The :class:`platypush.plugins.stt.picovoice.speech.SttPicovoiceSpeechPlugin` plugin configured and its
          dependencies installed.

    """

    def __init__(self, *args, **kwargs):
        """
        :param args: Extra positional arguments forwarded to :class:`platypush.backend.stt.SttBackend`.
        :param kwargs: Extra keyword arguments forwarded to :class:`platypush.backend.stt.SttBackend`.
        """
        # 'stt.picovoice.speech' is passed as the first argument of the base backend;
        # presumably the name of the plugin it drives - confirm against SttBackend.
        super().__init__('stt.picovoice.speech', *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
0
platypush/plugins/stt/picovoice/__init__.py
Normal file
0
platypush/plugins/stt/picovoice/__init__.py
Normal file
|
@ -7,10 +7,10 @@ from platypush.plugins import action
|
||||||
from platypush.plugins.stt import SttPlugin
|
from platypush.plugins.stt import SttPlugin
|
||||||
|
|
||||||
|
|
||||||
class SttPicovoicePlugin(SttPlugin):
|
class SttPicovoiceHotwordPlugin(SttPlugin):
|
||||||
"""
|
"""
|
||||||
This plugin performs speech-to-text and speech detection using the
|
This plugin performs hotword detection using
|
||||||
`PicoVoice <https://github.com/Picovoice>`_ speech-to-text integrations.
|
`PicoVoice <https://github.com/Picovoice>`_.
|
||||||
|
|
||||||
Requires:
|
Requires:
|
||||||
|
|
135
platypush/plugins/stt/picovoice/speech.py
Normal file
135
platypush/plugins/stt/picovoice/speech.py
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
import inspect
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import struct
|
||||||
|
import threading
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from platypush.message.event.stt import SpeechStartedEvent
|
||||||
|
|
||||||
|
from platypush.context import get_bus
|
||||||
|
from platypush.message.response.stt import SpeechDetectedResponse
|
||||||
|
from platypush.plugins import action
|
||||||
|
from platypush.plugins.stt import SttPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class SttPicovoiceSpeechPlugin(SttPlugin):
    """
    This plugin performs speech detection using `PicoVoice <https://github.com/Picovoice>`_.

    Requires:

        * **cheetah** (``pip install git+https://github.com/BlackLight/cheetah``)

    """

    def __init__(self,
                 library_path: Optional[str] = None,
                 acoustic_model_path: Optional[str] = None,
                 language_model_path: Optional[str] = None,
                 license_path: Optional[str] = None,
                 end_of_speech_timeout: int = 1,
                 *args, **kwargs):
        """
        :param library_path: Path to the Cheetah binary library for your OS
            (default: ``CHEETAH_INSTALL_DIR/lib/OS/ARCH/libpv_cheetah.EXT``).
        :param acoustic_model_path: Path to the acoustic speech model
            (default: ``CHEETAH_INSTALL_DIR/lib/common/acoustic_model.pv``).
        :param language_model_path: Path to the language model
            (default: ``CHEETAH_INSTALL_DIR/lib/common/language_model.pv``).
        :param license_path: Path to your PicoVoice license
            (default: ``CHEETAH_INSTALL_DIR/resources/license/cheetah_eval_linux_public.lic``).
        :param end_of_speech_timeout: Number of seconds of silence during speech recognition before considering
            a phrase over (default: 1).
        """
        # Imported lazily so that the module can be loaded (e.g. by the docs
        # builder) even when the optional pvcheetah dependency is missing.
        from pvcheetah import Cheetah
        super().__init__(*args, **kwargs)

        # CHEETAH_INSTALL_DIR: three levels above the file that defines Cheetah.
        self._basedir = os.path.abspath(os.path.join(inspect.getfile(Cheetah), '..', '..', '..'))
        if not library_path:
            library_path = self._get_library_path()
        if not language_model_path:
            language_model_path = os.path.join(self._basedir, 'lib', 'common', 'language_model.pv')
        if not acoustic_model_path:
            acoustic_model_path = os.path.join(self._basedir, 'lib', 'common', 'acoustic_model.pv')
        if not license_path:
            # NOTE: the default license file name is Linux-specific.
            license_path = os.path.join(self._basedir, 'resources', 'license', 'cheetah_eval_linux_public.lic')

        self._library_path = library_path
        self._language_model_path = language_model_path
        self._acoustic_model_path = acoustic_model_path
        self._license_path = license_path
        self._end_of_speech_timeout = end_of_speech_timeout
        # The engine is only created by start_detection and released by on_detection_ended.
        self._stt_engine: Optional[Cheetah] = None
        # Set while a phrase is being recognized, so SpeechStartedEvent is posted once per phrase.
        self._speech_in_progress = threading.Event()

    def _get_library_path(self) -> str:
        """
        Locate the default Cheetah shared library for the current platform.

        The library is searched under ``CHEETAH_INSTALL_DIR/lib/<system>/<machine>``; candidates are sorted
        so that the choice is deterministic when several ``libpv_cheetah.*`` files are present.

        :return: Absolute path of the first matching ``libpv_cheetah.*`` file.
        :raises FileNotFoundError: If the platform directory does not exist or contains no matching library.
        """
        path = os.path.join(self._basedir, 'lib', platform.system().lower(), platform.machine())
        libraries = sorted(name for name in os.listdir(path) if name.startswith('libpv_cheetah.'))
        if not libraries:
            raise FileNotFoundError(f'No libpv_cheetah.* library found in {path}')

        return os.path.join(path, libraries[0])

    def convert_frames(self, frames: bytes) -> tuple:
        """
        Unpack a raw audio buffer into the samples expected by the engine.

        :param frames: Raw buffer; its first ``frame_length`` native-order signed 16-bit values are unpacked.
        :return: Tuple of ``frame_length`` integers.
        """
        assert self._stt_engine, 'The speech engine is not running'
        return struct.unpack_from("h" * self._stt_engine.frame_length, frames)

    def on_detection_ended(self) -> None:
        """
        Release the engine, if any, once the detection is over.
        """
        if self._stt_engine:
            self._stt_engine.delete()
        self._stt_engine = None

    def detect_speech(self, frames: tuple) -> str:
        """
        Feed a block of samples to the engine and accumulate the recognized text.

        A :class:`platypush.message.event.stt.SpeechStartedEvent` is posted on the first non-empty transcript
        of a phrase. When the engine reports an endpoint the pending text is flushed, ``on_speech_detected``
        is called with the accumulated phrase (if not empty) and the accumulator is reset.

        :param frames: Samples as returned by :meth:`convert_frames`.
        :return: The text accumulated so far for the current phrase (empty right after an endpoint).
        """
        assert self._stt_engine, 'The speech engine is not running'
        text, is_endpoint = self._stt_engine.process(frames)
        text = text.strip()

        if text:
            if not self._speech_in_progress.is_set():
                self._speech_in_progress.set()
                get_bus().post(SpeechStartedEvent())

            # self._current_text is not set by this class; presumably initialized by SttPlugin.
            self._current_text += ' ' + text

        if is_endpoint:
            text = self._stt_engine.flush().strip()
            if text:
                self._current_text += ' ' + text

            self._speech_in_progress.clear()
            if self._current_text:
                self.on_speech_detected(self._current_text)

            self._current_text = ''

        return self._current_text

    def process_text(self, text: str) -> None:
        """
        Does nothing in this plugin: phrases are dispatched by :meth:`detect_speech`.

        :param text: Ignored.
        """
        pass

    @action
    def detect(self, audio_file: str) -> SpeechDetectedResponse:
        """
        Perform speech-to-text analysis on an audio file.

        .. note:: Not implemented yet: the method body is empty and the audio file is not processed.

        :param audio_file: Path to the audio file.
        """
        pass

    def recording_thread(self, input_device: Optional[str] = None, *args, **kwargs) -> None:
        """
        Run the parent recording thread using the engine frame length as block size.

        :param input_device: Input device forwarded to the parent implementation.
        :param args: Ignored.
        :param kwargs: Ignored.
        """
        assert self._stt_engine, 'The speech engine has not yet been initialized'
        super().recording_thread(block_size=self._stt_engine.frame_length, input_device=input_device)

    @action
    def start_detection(self, *args, **kwargs) -> None:
        """
        Create the Cheetah engine from the configured paths and start the detection.

        :param args: Positional arguments forwarded to the parent ``start_detection``.
        :param kwargs: Keyword arguments forwarded to the parent ``start_detection``.
        """
        from pvcheetah import Cheetah
        self._stt_engine = Cheetah(
            library_path=self._library_path,
            acoustic_model_path=self._acoustic_model_path,
            language_model_path=self._language_model_path,
            license_path=self._license_path,
            endpoint_duration_sec=self._end_of_speech_timeout,
        )

        # Record at the sample rate required by the engine.
        self.rate = self._stt_engine.sample_rate
        self._speech_in_progress.clear()
        super().start_detection(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
|
@ -235,5 +235,8 @@ croniter
|
||||||
# numpy
|
# numpy
|
||||||
# sounddevice
|
# sounddevice
|
||||||
|
|
||||||
# Support for PicoVoice speech-to-text engine
|
# Support for PicoVoice hotword engine
|
||||||
# pvporcupine
|
# pvporcupine
|
||||||
|
|
||||||
|
# Support for PicoVoice speech-to-text engine
|
||||||
|
# pvcheetah
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -285,7 +285,9 @@ setup(
|
||||||
'zwave': ['python-openzwave'],
|
'zwave': ['python-openzwave'],
|
||||||
# Support for Mozilla DeepSpeech speech-to-text engine
|
# Support for Mozilla DeepSpeech speech-to-text engine
|
||||||
'deepspeech': ['deepspeech', 'numpy','sounddevice'],
|
'deepspeech': ['deepspeech', 'numpy','sounddevice'],
|
||||||
|
# Support for PicoVoice hotword detection engine
|
||||||
|
'picovoice-hotword': ['pvporcupine'],
|
||||||
# Support for PicoVoice speech-to-text engine
|
# Support for PicoVoice speech-to-text engine
|
||||||
'picovoice': ['pvporcupine'],
|
'picovoice-speech': ['pvcheetah @ git+https://github.com/BlackLight/cheetah'],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue