forked from platypush/platypush
parent 3528b3646f
commit 7a8f30e5e0
15 changed files with 851 additions and 71 deletions
@@ -1,5 +1,9 @@
 # Changelog
 
+## [1.0.7] - 2024-06-02
+
+- [#384] Added `assistant.openai` and `tts.openai` plugins.
+
 ## [1.0.6] - 2024-06-01
 
 - 🐛 Bug fix on one of the entities modules that prevented the application from
5 docs/source/platypush/plugins/assistant.openai.rst Normal file

@@ -0,0 +1,5 @@
``assistant.openai``
====================

.. automodule:: platypush.plugins.assistant.openai
    :members:
5 docs/source/platypush/plugins/tts.openai.rst Normal file

@@ -0,0 +1,5 @@
``tts.openai``
==============

.. automodule:: platypush.plugins.tts.openai
    :members:
@@ -11,6 +11,7 @@ Plugins
     platypush/plugins/application.rst
     platypush/plugins/arduino.rst
     platypush/plugins/assistant.google.rst
+    platypush/plugins/assistant.openai.rst
     platypush/plugins/assistant.picovoice.rst
     platypush/plugins/autoremote.rst
     platypush/plugins/bluetooth.rst

@@ -134,6 +135,7 @@ Plugins
     platypush/plugins/tts.rst
     platypush/plugins/tts.google.rst
     platypush/plugins/tts.mimic3.rst
+    platypush/plugins/tts.openai.rst
     platypush/plugins/tts.picovoice.rst
     platypush/plugins/tv.samsung.ws.rst
     platypush/plugins/twilio.rst
@@ -8,6 +8,9 @@
     "assistant.google": {
         "class": "fas fa-microphone-lines"
     },
+    "assistant.openai": {
+        "class": "fas fa-microphone-lines"
+    },
     "assistant.picovoice": {
         "class": "fas fa-microphone-lines"
     },
@@ -1,8 +1,6 @@
-from collections import namedtuple
-from dataclasses import dataclass, field
 from logging import getLogger
 from queue import Full, Queue
-from threading import Event, RLock
+from threading import Event
 from time import time
 from typing import Optional
 

@@ -10,63 +8,7 @@ import sounddevice as sd
 
 from platypush.utils import wait_for_either
 
-
-AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
-
-
-@dataclass
-class PauseState:
-    """
-    Data class to hold the boilerplate (state + synchronization events) for the
-    audio recorder pause API.
-    """
-
-    _paused_event: Event = field(default_factory=Event)
-    _recording_event: Event = field(default_factory=Event)
-    _state_lock: RLock = field(default_factory=RLock)
-
-    @property
-    def paused(self):
-        with self._state_lock:
-            return self._paused_event.is_set()
-
-    def pause(self):
-        """
-        Pause the audio recorder.
-        """
-        with self._state_lock:
-            self._paused_event.set()
-            self._recording_event.clear()
-
-    def resume(self):
-        """
-        Resume the audio recorder.
-        """
-        with self._state_lock:
-            self._paused_event.clear()
-            self._recording_event.set()
-
-    def toggle(self):
-        """
-        Toggle the audio recorder pause state.
-        """
-        with self._state_lock:
-            if self.paused:
-                self.resume()
-            else:
-                self.pause()
-
-    def wait_paused(self, timeout: Optional[float] = None):
-        """
-        Wait until the audio recorder is paused.
-        """
-        self._paused_event.wait(timeout=timeout)
-
-    def wait_recording(self, timeout: Optional[float] = None):
-        """
-        Wait until the audio recorder is resumed.
-        """
-        self._recording_event.wait(timeout=timeout)
+from ._state import AudioFrame, PauseState
 
 
 class AudioRecorder:

@@ -112,9 +54,7 @@ class AudioRecorder:
         """
         Start the audio stream.
         """
-        self._stop_event.clear()
-        self.stream.start()
-        return self
+        return self.start()
 
     def __exit__(self, *_):
         """

@@ -145,6 +85,14 @@ class AudioRecorder:
             self.logger.debug('Audio queue is empty')
         return None
 
+    def start(self):
+        """
+        Start the audio stream.
+        """
+        self._stop_event.clear()
+        self.stream.start()
+        return self
+
     def stop(self):
         """
         Stop the audio stream.

@@ -186,6 +134,6 @@ class AudioRecorder:
         wait_for_either(
             self._stop_event,
             self._upstream_stop_event,
-            self._paused_state._recording_event,
+            self._paused_state._recording_event,  # pylint: disable=protected-access
             timeout=timeout,
         )
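The refactor above extracts an explicit ``start()`` method and makes ``__enter__`` delegate to it, so the recorder can be used either as a context manager or driven manually. A rough sketch under those assumptions (the constructor keywords are taken from the ``assistant.openai`` plugin below, and ``read()`` is assumed to return ``None`` when the audio queue is empty, as the hunk above suggests):

    from threading import Event

    from platypush.common.assistant import AudioRecorder

    stop_event = Event()

    # Context-manager style: __enter__ now simply returns self.start()
    with AudioRecorder(
        stop_event=stop_event, sample_rate=16000, frame_size=16384, channels=1
    ) as recorder:
        frame = recorder.read()  # AudioFrame(data, timestamp), or None if empty

    # Manual style: the same start()/stop() pair is now available directly
    recorder = AudioRecorder(
        stop_event=stop_event, sample_rate=16000, frame_size=16384, channels=1
    ).start()
    recorder.stop()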
61 platypush/common/assistant/_state.py Normal file

@@ -0,0 +1,61 @@
from collections import namedtuple
from dataclasses import dataclass, field
from threading import Event, RLock
from typing import Optional

AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])


@dataclass
class PauseState:
    """
    Data class to hold the boilerplate (state + synchronization events) for the
    audio recorder pause API.
    """

    _paused_event: Event = field(default_factory=Event)
    _recording_event: Event = field(default_factory=Event)
    _state_lock: RLock = field(default_factory=RLock)

    @property
    def paused(self):
        with self._state_lock:
            return self._paused_event.is_set()

    def pause(self):
        """
        Pause the audio recorder.
        """
        with self._state_lock:
            self._paused_event.set()
            self._recording_event.clear()

    def resume(self):
        """
        Resume the audio recorder.
        """
        with self._state_lock:
            self._paused_event.clear()
            self._recording_event.set()

    def toggle(self):
        """
        Toggle the audio recorder pause state.
        """
        with self._state_lock:
            if self.paused:
                self.resume()
            else:
                self.pause()

    def wait_paused(self, timeout: Optional[float] = None):
        """
        Wait until the audio recorder is paused.
        """
        self._paused_event.wait(timeout=timeout)

    def wait_recording(self, timeout: Optional[float] = None):
        """
        Wait until the audio recorder is resumed.
        """
        self._recording_event.wait(timeout=timeout)
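For context, a minimal sketch (not part of the commit) of how the extracted ``PauseState`` is meant to be driven, with one thread blocking on the resume signal while another toggles the state:

    from threading import Thread
    from time import sleep

    from platypush.common.assistant._state import PauseState

    state = PauseState()
    state.pause()  # sets the paused event, clears the recording event

    def resume_later():
        sleep(1)
        state.toggle()  # paused -> recording

    Thread(target=resume_later).start()

    # A recorder thread would block here until resume()/toggle() is called
    state.wait_recording(timeout=5)
    print(state.paused)  # False once the other thread has resumed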
447 platypush/plugins/assistant/openai/__init__.py Normal file

@@ -0,0 +1,447 @@
from io import BytesIO
from threading import Event
from typing import Optional

import numpy as np
from pydub import AudioSegment

from platypush.common.assistant import AudioRecorder
from platypush.context import get_plugin
from platypush.plugins import RunnablePlugin, action
from platypush.plugins.assistant import AssistantPlugin
from platypush.plugins.openai import OpenaiPlugin

from ._state import RecordingState


# pylint: disable=too-many-ancestors
class AssistantOpenaiPlugin(AssistantPlugin, RunnablePlugin):
    """
    A voice assistant based on the OpenAI API.

    It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
    configured with an OpenAI API key.

    Hotword detection
    -----------------

    This plugin doesn't have hotword detection, as OpenAI doesn't provide
    an API for that. Instead, the assistant can be started and stopped
    programmatically through the :meth:`.start_conversation` action.

    If you want to implement hotword detection, you can use a separate plugin
    such as
    :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.

    The configuration in this case would be like:

    .. code-block:: yaml

        assistant.picovoice:
            access_key: YOUR_PICOVOICE_ACCESS_KEY

            # List of hotwords to listen for
            keywords:
                - alexa
                - computer
                - ok google

            # Disable speech-to-text and intent recognition, only use hotword
            # detection
            stt_enabled: false
            hotword_enabled: true

            conversation_start_sound: /sound/to/play/when/the/conversation/starts.mp3
            # speech_model_path: /mnt/hd/models/picovoice/cheetah/custom-en.pv
            # intent_model_path: /mnt/hd/models/picovoice/rhino/custom-en-x86.rhn

        openai:
            api_key: YOUR_OPENAI_API_KEY

            # Customize your assistant's context and knowledge base to your
            # liking
            context:
                - role: system
                  content: >
                      You are a 16th century noble lady who talks in
                      Shakespearean English to her peers.

        # Enable the assistant plugin
        assistant.openai:

        # Enable the text-to-speech plugin
        tts.openai:
            # Customize the voice model
            voice: nova

    Then you can call :meth:`.start_conversation` when the hotword is detected
    and a :class:`platypush.message.event.assistant.HotwordDetectedEvent` is
    triggered:

    .. code-block:: python

        from platypush import run, when
        from platypush.message.event.assistant import HotwordDetectedEvent

        @when(HotwordDetectedEvent)
        # You can also customize it by running a different assistant logic
        # depending on the hotword
        # @when(HotwordDetectedEvent, hotword='computer')
        def on_hotword_detected():
            run("assistant.openai.start_conversation")

    This configuration will:

    1. Start the hotword detection when the application starts.
    2. Start the OpenAI assistant when the hotword is detected.

    AI responses
    ------------

    By default (unless you set ``stop_conversation_on_speech_match`` to ``False``),
    the plugin will:

    1. Process the speech through the OpenAI API (the GPT model to be used is
       configurable in the OpenAI plugin ``model`` configuration).

    2. Render the response through the configured ``tts_plugin`` (default:
       ``tts.openai``). If ``tts_plugin`` is not set, then the response will
       be returned as a string.

    Custom speech processing
    ------------------------

    You can create custom hooks on
    :class:`platypush.message.event.assistant.SpeechRecognizedEvent` with
    custom ``phrase`` strings or (regex) patterns. For example:

    .. code-block:: python

        from platypush import run, when
        from platypush.message.event.assistant import SpeechRecognizedEvent

        # Matches any phrase that contains either "play music" or "play the
        # music"
        @when(SpeechRecognizedEvent, phrase='play (the)? music')
        def play_music():
            run('music.mpd.play')

    If at least one custom hook with a non-empty ``phrase`` string is matched,
    then the default response will be disabled. If you still want the assistant
    to say something when the event is handled, you can call
    ``event.assistant.render_response`` on the hook:

    .. code-block:: python

        from datetime import datetime
        from textwrap import dedent
        from time import time

        from platypush import run, when
        from platypush.message.event.assistant import SpeechRecognizedEvent

        @when(SpeechRecognizedEvent, phrase='weather today')
        def weather_forecast(event: SpeechRecognizedEvent):
            limit = time() + 24 * 60 * 60  # 24 hours from now
            forecast = [
                weather
                for weather in run("weather.openweathermap.get_forecast")
                if datetime.fromisoformat(weather["time"]).timestamp() < limit
            ]

            min_temp = round(
                min(weather["temperature"] for weather in forecast)
            )
            max_temp = round(
                max(weather["temperature"] for weather in forecast)
            )
            max_wind_gust = round(
                (max(weather["wind_gust"] for weather in forecast)) * 3.6
            )
            summaries = [weather["summary"] for weather in forecast]
            most_common_summary = max(summaries, key=summaries.count)
            avg_cloud_cover = round(
                sum(weather["cloud_cover"] for weather in forecast) / len(forecast)
            )

            event.assistant.render_response(
                dedent(
                    f\"\"\"
                    The forecast for today is: {most_common_summary}, with
                    a minimum of {min_temp} and a maximum of {max_temp}
                    degrees, wind gust of {max_wind_gust} km/h, and an
                    average cloud cover of {avg_cloud_cover}%.
                    \"\"\"
                )
            )

    Conversation follow-up
    ----------------------

    A conversation will have a follow-up (i.e. the assistant will listen for a
    phrase after rendering a response) if the response is not empty and ends
    with a question mark. If you want to force a follow-up even if the response
    doesn't end with a question mark, you can call :meth:`.start_conversation`
    programmatically from your hooks.
    """

    def __init__(
        self,
        model: str = "whisper-1",
        tts_plugin: Optional[str] = "tts.openai",
        min_silence_secs: float = 1.0,
        silence_threshold: int = -22,
        sample_rate: int = 16000,
        frame_size: int = 16384,
        channels: int = 1,
        conversation_start_timeout: float = 5.0,
        conversation_end_timeout: float = 1.0,
        conversation_max_duration: float = 15.0,
        **kwargs,
    ):
        """
        :param model: OpenAI model to use for audio transcription (default:
            ``whisper-1``).
        :param tts_plugin: Name of the TTS plugin to use for rendering the responses
            (default: ``tts.openai``).
        :param min_silence_secs: Minimum silence duration in seconds to detect
            the end of a conversation (default: 1.0 seconds).
        :param silence_threshold: Silence threshold in dBFS (default: -22).
            A value of 0 is the maximum amplitude, while -120 corresponds to
            silent or nearly silent audio; thus, the higher the value, the more
            sensitive the silence detection will be.
        :param sample_rate: Recording sample rate in Hz (default: 16000).
        :param frame_size: Recording frame size in samples (default: 16384).
            Note that it's important to make sure that ``frame_size`` /
            ``sample_rate`` isn't smaller than the minimum silence duration,
            otherwise the silence detection won't work properly.
        :param channels: Number of recording channels (default: 1).
        :param conversation_start_timeout: How long to wait for the
            conversation to start (i.e. the first non-silent audio frame to be
            detected) before giving up and stopping the recording (default: 5.0
            seconds).
        :param conversation_end_timeout: How many seconds of silence to wait
            after the last non-silent audio frame before stopping the recording
            (default: 1.0 seconds).
        :param conversation_max_duration: Maximum conversation duration in seconds
            (default: 15.0 seconds).
        """
        kwargs["tts_plugin"] = tts_plugin
        super().__init__(**kwargs)

        self._model = model
        self._min_silence_secs = min_silence_secs
        self._silence_threshold = silence_threshold
        self._sample_rate = sample_rate
        self._frame_size = frame_size
        self._channels = channels
        self._conversation_start_timeout = conversation_start_timeout
        self._conversation_end_timeout = conversation_end_timeout
        self._conversation_max_duration = conversation_max_duration
        self._start_recording_event = Event()
        self._disable_default_response = False
        self._recording_state = RecordingState(
            sample_rate=sample_rate,
            channels=channels,
            min_silence_secs=min_silence_secs,
            silence_threshold=silence_threshold,
        )

        self._recorder: Optional[AudioRecorder] = None

    def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
        return AudioSegment(
            data.tobytes(),
            frame_rate=self._sample_rate,
            sample_width=data.dtype.itemsize,
            channels=self._channels,
        )

    def _is_conversation_ended(self):
        # End if the recording has been stopped
        if not self._recorder or self._recorder.should_stop():
            return True

        # End if we reached the max conversation duration
        if self._recording_state.duration >= self._conversation_max_duration:
            return True

        # End if the conversation hasn't started yet and we reached the
        # conversation start timeout
        if (
            not self._recording_state.conversation_started
            and self._recording_state.duration >= self._conversation_start_timeout
        ):
            return True

        # End if the conversation has started and the user has been silent for
        # more than the conversation end timeout
        if (
            self._recording_state.conversation_started
            and self._recording_state.silence_duration >= self._conversation_end_timeout
        ):
            return True

        return False

    @property
    def _openai(self) -> OpenaiPlugin:
        openai: Optional[OpenaiPlugin] = get_plugin("openai")
        assert openai, (
            "OpenAI plugin not found. "
            "Please configure the `openai` plugin to use `assistant.openai`"
        )
        return openai

    def _get_prediction(self, audio: BytesIO) -> str:
        return self._openai.transcribe_raw(
            audio.getvalue(), extension='mp3', model=self._model
        )

    def _capture_audio(self, recorder: AudioRecorder):
        while not self.should_stop() and not self._is_conversation_ended():
            audio_data = recorder.read()
            if not audio_data:
                continue

            self._recording_state.add_audio(audio_data)

    def _audio_loop(self):
        while not self.should_stop():
            self._wait_recording_start()
            self._recording_state.reset()
            self._on_conversation_start()

            try:
                with AudioRecorder(
                    stop_event=self._should_stop,
                    sample_rate=self._sample_rate,
                    frame_size=self._frame_size,
                    channels=self._channels,
                ) as self._recorder:
                    self._capture_audio(self._recorder)
            finally:
                if self._recorder:
                    try:
                        self._recorder.stream.close()
                    except Exception as e:
                        self.logger.warning("Error closing the audio stream: %s", e)

                    self._recorder = None

            if self._recording_state.is_silent():
                self._on_conversation_timeout()
            else:
                audio = self._recording_state.export_audio()
                text = self._get_prediction(audio)
                self._on_speech_recognized(text)

    def _wait_recording_start(self):
        self._start_recording_event.wait()
        self._start_recording_event.clear()

    def _start_conversation(self, *_, **__):
        self._disable_default_response = False
        self._recording_state.reset()
        self._start_recording_event.set()

    def _stop_conversation(self, *_, **__):
        self._disable_default_response = True
        super()._stop_conversation()
        self._recording_state.reset()
        if self._recorder:
            self._recorder.stop()

        self._on_conversation_end()

    def _on_speech_recognized(self, phrase: Optional[str]):
        super()._on_speech_recognized(phrase)

        # Dirty hack: wait a bit before stopping the conversation to make sure
        # that there aren't event hooks triggered in other threads that are
        # supposed to handle it.
        if self.stop_conversation_on_speech_match:
            self.wait_stop(0.5)
            if self.should_stop():
                return

        if self._disable_default_response:
            self.logger.debug("Default response disabled, skipping response")
            return

        response = self._openai.get_response(phrase).output
        if response:
            self.render_response(response)
        else:
            self._on_no_response()

    @action
    def start_conversation(self, *_, **__):
        """
        Start a conversation with the assistant. The conversation will be
        automatically stopped after ``conversation_max_duration`` seconds of
        audio, or after ``conversation_start_timeout`` seconds of silence
        with no audio detected, or after ``conversation_end_timeout`` seconds
        after the last non-silent audio frame has been detected, or when the
        :meth:`.stop_conversation` method is called.
        """
        self._start_conversation()

    @action
    def mute(self, *_, **__):
        """
        .. note:: This plugin has no hotword detection, thus no continuous
            audio detection. Speech processing is done on-demand through the
            :meth:`.start_conversation` and :meth:`.stop_conversation` methods.
            Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
            implemented.
        """
        self.logger.warning(
            "assistant.openai.mute is not implemented because this plugin "
            "has no hotword detection, and the only way to stop a conversation "
            "is by calling stop_conversation()"
        )

    @action
    def unmute(self, *_, **__):
        """
        .. note:: This plugin has no hotword detection, thus no continuous
            audio detection. Speech processing is done on-demand through the
            :meth:`.start_conversation` and :meth:`.stop_conversation` methods.
            Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
            implemented.
        """
        self.logger.warning(
            "assistant.openai.unmute is not implemented because this plugin "
            "has no hotword detection, and the only way to start a conversation "
            "is by calling start_conversation()"
        )

    @action
    def send_text_query(self, text: str, *_, **__):
        """
        If the ``tts_plugin`` configuration is set, then the assistant will
        process the given text query through
        :meth:`platypush.plugins.openai.OpenaiPlugin.get_response` and render
        the response through the specified TTS plugin.

        :return: The response received from
            :meth:`platypush.plugins.openai.OpenaiPlugin.get_response`.
        """
        response = self._openai.get_response(text).output
        self.render_response(response)
        return response

    def main(self):
        while not self.should_stop():
            try:
                self._audio_loop()
            except Exception as e:
                self.logger.error("Audio loop error: %s", e, exc_info=True)
                self.wait_stop(5)
            finally:
                self.stop_conversation()

    def stop(self):
        self._stop_conversation()
        super().stop()
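Since the new plugin above exposes its functionality through actions, a conversation can also be driven entirely from user code (procedures or event hooks) with no hotword integration at all. A minimal sketch following the same ``run()``/``@when()`` convention used in the docstring examples; it assumes a configured ``openai`` plugin and the default parameters, and the event class name is taken from the plugin manifest below:

    from platypush import run, when
    from platypush.message.event.assistant import ResponseEvent

    # Capture speech once: recording stops after conversation_max_duration
    # seconds, after conversation_start_timeout seconds of initial silence,
    # or after conversation_end_timeout seconds of silence once speech started
    run("assistant.openai.start_conversation")

    # Or skip audio capture and only exercise the text -> GPT -> TTS path
    answer = run("assistant.openai.send_text_query", text="How far away is the Moon?")

    @when(ResponseEvent)
    def on_assistant_response(event):
        # The rendered response text is available among the event arguments
        print("Assistant response event:", event.args)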
80 platypush/plugins/assistant/openai/_state.py Normal file

@@ -0,0 +1,80 @@
from io import BytesIO
from dataclasses import dataclass, field
from typing import List

import numpy as np
from pydub import AudioSegment, silence

from platypush.common.assistant import AudioFrame


@dataclass
class RecordingState:
    """
    Current state of the audio recording.
    """

    sample_rate: int
    channels: int
    min_silence_secs: float
    silence_threshold: int
    silence_duration: float = 0.0
    audio_segments: List[AudioSegment] = field(default_factory=list)
    duration: float = 0.0
    conversation_started: bool = False

    def _silence_duration(self, audio: AudioSegment) -> float:
        silent_frames = [
            (start / 1000, stop / 1000)
            for start, stop in silence.detect_silence(
                audio,
                min_silence_len=int(self.min_silence_secs * 1000),
                silence_thresh=int(self.silence_threshold),
            )
        ]

        return sum(stop - start for start, stop in silent_frames)

    def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
        return AudioSegment(
            data.tobytes(),
            frame_rate=self.sample_rate,
            sample_width=data.dtype.itemsize,
            channels=self.channels,
        )

    def _add_audio_segment(self, audio: AudioSegment):
        self.audio_segments.append(audio)
        self.duration += audio.duration_seconds
        silence_duration = self._silence_duration(audio)
        is_mostly_silent = silence_duration >= audio.duration_seconds * 0.75

        if is_mostly_silent:
            self.silence_duration += silence_duration
        else:
            self.conversation_started = True
            self.silence_duration = 0.0

    def is_silent(self) -> bool:
        return self.silence_duration >= self.duration

    def add_audio(self, audio: AudioFrame):
        self._add_audio_segment(self._to_audio_segment(audio.data))

    def export_audio(self) -> BytesIO:
        buffer = BytesIO()
        if not self.audio_segments:
            return buffer

        audio = self.audio_segments[0]
        for segment in self.audio_segments[1:]:
            audio += segment

        audio.export(buffer, format="mp3", bitrate='92')
        return buffer

    def reset(self):
        self.audio_segments.clear()
        self.duration = 0.0
        self.silence_duration = 0.0
        self.conversation_started = False
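To make the silence bookkeeping above concrete, here is a small illustrative sketch (not part of the commit) that feeds synthetic int16 frames into ``RecordingState``. It assumes numpy and pydub are installed and ffmpeg is available for the MP3 export, matching the dependencies listed in the manifest below:

    import numpy as np

    from platypush.common.assistant import AudioFrame
    from platypush.plugins.assistant.openai._state import RecordingState

    state = RecordingState(
        sample_rate=16000, channels=1, min_silence_secs=1.0, silence_threshold=-22
    )

    # Two seconds of digital silence: counted as silence, conversation not started
    silent = np.zeros(32000, dtype=np.int16)
    state.add_audio(AudioFrame(data=silent, timestamp=0.0))
    print(state.is_silent(), state.conversation_started)  # True False

    # A loud frame resets the silence counter and marks the conversation as started
    loud = (np.random.uniform(-0.5, 0.5, 32000) * 32767).astype(np.int16)
    state.add_audio(AudioFrame(data=loud, timestamp=2.0))
    print(state.is_silent(), state.conversation_started)  # False True

    mp3_buffer = state.export_audio()  # concatenated segments, exported as MP3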
44 platypush/plugins/assistant/openai/manifest.json Normal file

@@ -0,0 +1,44 @@
{
  "manifest": {
    "package": "platypush.plugins.assistant.openai",
    "type": "plugin",
    "events": [
      "platypush.message.event.assistant.ConversationEndEvent",
      "platypush.message.event.assistant.ConversationStartEvent",
      "platypush.message.event.assistant.ConversationTimeoutEvent",
      "platypush.message.event.assistant.HotwordDetectedEvent",
      "platypush.message.event.assistant.IntentRecognizedEvent",
      "platypush.message.event.assistant.MicMutedEvent",
      "platypush.message.event.assistant.MicUnmutedEvent",
      "platypush.message.event.assistant.NoResponseEvent",
      "platypush.message.event.assistant.ResponseEndEvent",
      "platypush.message.event.assistant.ResponseEvent",
      "platypush.message.event.assistant.SpeechRecognizedEvent"
    ],
    "install": {
      "apk": [
        "ffmpeg",
        "py3-numpy"
      ],
      "apt": [
        "ffmpeg",
        "python3-numpy",
        "python3-pydub"
      ],
      "dnf": [
        "ffmpeg",
        "python-numpy"
      ],
      "pacman": [
        "ffmpeg",
        "python-numpy",
        "python-sounddevice"
      ],
      "pip": [
        "numpy",
        "pydub",
        "sounddevice"
      ]
    }
  }
}
@@ -562,11 +562,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         self._assistant.override_speech_model(model_file)
         self._assistant.state = AssistantState.DETECTING_SPEECH
 
-    @action
-    def stop_conversation(self, *_, **__):
-        """
-        Programmatically stop a running conversation with the assistant
-        """
+    def _stop_conversation(self, *_, **__):
+        super()._stop_conversation()
         if not self._assistant:
             self.logger.warning('Assistant not initialized')
             return
@@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, Sequence
 
 import pvporcupine
 
+from platypush.common.assistant import AudioRecorder
 from platypush.context import get_plugin
 from platypush.message.event.assistant import (
     AssistantEvent,

@@ -16,8 +17,6 @@ from platypush.message.event.assistant import (
     SpeechRecognizedEvent,
 )
 from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
 
-from ._recorder import AudioRecorder
-
 from ._speech import SpeechProcessor
 from ._state import AssistantState
150 platypush/plugins/tts/openai/__init__.py Normal file

@@ -0,0 +1,150 @@
import os
import tempfile
from contextlib import contextmanager
from multiprocessing import Process
from typing import Generator, Optional

import requests

from platypush.context import get_plugin
from platypush.plugins import action
from platypush.plugins.openai import OpenaiPlugin
from platypush.plugins.tts import TtsPlugin


class TtsOpenaiPlugin(TtsPlugin):
    r"""
    This plugin provides an interface to the `OpenAI text-to-speech API
    <https://platform.openai.com/docs/guides/text-to-speech>`_.

    It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
    configured.
    """

    _BUFSIZE = 1024

    def __init__(
        self,
        model: str = 'tts-1',
        voice: str = 'nova',
        timeout: float = 10,
        **kwargs,
    ):
        """
        :param model: Model to be used for the text-to-speech conversion.
            See the `OpenAI API models documentation
            <https://platform.openai.com/docs/models/tts>`_ for the list of
            available models (default: ``tts-1``).
        :param voice: Default voice to be used. See the `OpenAI API
            voices documentation
            <https://platform.openai.com/docs/guides/text-to-speech/voice-options>`_
            for the list of available voices (default: ``nova``).
        :param timeout: Default timeout for the API requests (default: 10s).
        """
        super().__init__(**kwargs)
        openai = get_plugin('openai')
        assert openai, 'openai plugin not configured'

        self.openai: OpenaiPlugin = openai
        self.model = model
        self.voice = voice
        self.timeout = timeout
        self._audio_proc: Optional[Process] = None

    def _process_response(
        self,
        response: requests.Response,
        audio_file: str,
    ) -> Process:
        def proc_fn():
            try:
                with open(audio_file, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=self._BUFSIZE):
                        if chunk:
                            file.write(chunk)
                            file.flush()
            except KeyboardInterrupt:
                pass

        self._audio_proc = Process(target=proc_fn, name='openai-tts-response-processor')
        self._audio_proc.start()
        return self._audio_proc

    def _make_request(
        self,
        text: str,
        model: Optional[str] = None,
        voice: Optional[str] = None,
    ) -> requests.Response:
        rs = requests.post(
            "https://api.openai.com/v1/audio/speech",
            timeout=self.timeout,
            stream=True,
            headers={
                "Authorization": f"Bearer {self.openai._api_key}",  # pylint: disable=protected-access
                "Content-Type": "application/json",
            },
            json={
                "model": model or self.model,
                "voice": voice or self.voice,
                "input": text,
            },
        )

        rs.raise_for_status()
        return rs

    @contextmanager
    def _audio_fifo(self) -> Generator[str, None, None]:
        fifo_dir = tempfile.mkdtemp()
        fifo_path = os.path.join(fifo_dir, 'platypush-tts-openai-fifo')
        os.mkfifo(fifo_path)
        yield fifo_path

        os.unlink(fifo_path)
        os.rmdir(fifo_dir)

    @action
    def say(
        self,
        text: str,
        *_,
        model: Optional[str] = None,
        voice: Optional[str] = None,
        **player_args,
    ):
        """
        Say some text.

        :param text: Text to say.
        :param model: Default ``model`` override.
        :param voice: Default ``voice`` override.
        :param player_args: Additional arguments to be passed to
            :meth:`platypush.plugins.sound.SoundPlugin.play` (like volume,
            duration, channels etc.).
        """
        response_processor: Optional[Process] = None

        try:
            response = self._make_request(text, model=model, voice=voice)

            with self._audio_fifo() as audio_file:
                response_processor = self._process_response(
                    response=response, audio_file=audio_file
                )
                self._playback(audio_file, **player_args)
                response_processor.join()
                response_processor = None
        finally:
            if response_processor:
                response_processor.terminate()

    @action
    def stop(self):
        super().stop()
        if self._audio_proc and self._audio_proc.is_alive():
            self._audio_proc.terminate()
            self._audio_proc.join()


# vim:sw=4:ts=4:et:
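Usage-wise the new backend behaves like any other TTS plugin: the OpenAI response is streamed into a FIFO and handed to the sound plugin for playback. A small sketch using the same ``run()`` convention as the assistant examples above (``alloy`` is just an example voice name; extra keyword arguments are forwarded to the sound plugin's ``play`` as described in the ``say`` docstring):

    from platypush import run

    # Use the defaults configured for tts.openai (model=tts-1, voice=nova)
    run("tts.openai.say", text="The kettle has finished boiling.")

    # Override the voice and pass player arguments through to the sound plugin
    run("tts.openai.say", text="Goodnight!", voice="alloy", volume=70)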
34 platypush/plugins/tts/openai/manifest.json Normal file

@@ -0,0 +1,34 @@
{
  "manifest": {
    "events": {},
    "install": {
      "apk": [
        "ffmpeg",
        "portaudio-dev",
        "py3-numpy"
      ],
      "apt": [
        "ffmpeg",
        "portaudio19-dev",
        "python3-numpy"
      ],
      "dnf": [
        "ffmpeg",
        "portaudio-devel",
        "python-numpy"
      ],
      "pacman": [
        "ffmpeg",
        "portaudio",
        "python-numpy",
        "python-sounddevice"
      ],
      "pip": [
        "numpy",
        "sounddevice"
      ]
    },
    "package": "platypush.plugins.tts.openai",
    "type": "plugin"
  }
}
@@ -93,6 +93,7 @@ mock_imports = [
     "pychromecast",
     "pyclip",
     "pydbus",
+    "pydub",
     "pyfirmata2",
     "pyngrok",
     "pyotp",