[#384] Added assistant.openai and tts.openai plugins.

All checks were successful
continuous-integration/drone/push Build is passing

Closes: #384

This commit is contained in:
parent c3673391f7
commit fa52bbfb5b

15 changed files with 851 additions and 71 deletions
@@ -1,5 +1,9 @@
 # Changelog
 
+## [1.0.7] - 2024-06-02
+
+- [#384] Added `assistant.openai` and `tts.openai` plugins.
+
 ## [1.0.6] - 2024-06-01
 
 - 🐛 Bug fix on one of the entities modules that prevented the application from
docs/source/platypush/plugins/assistant.openai.rst (new file, 5 lines)

``assistant.openai``
====================

.. automodule:: platypush.plugins.assistant.openai
    :members:
docs/source/platypush/plugins/tts.openai.rst (new file, 5 lines)

``tts.openai``
==============

.. automodule:: platypush.plugins.tts.openai
    :members:
@@ -11,6 +11,7 @@ Plugins
     platypush/plugins/application.rst
     platypush/plugins/arduino.rst
     platypush/plugins/assistant.google.rst
+    platypush/plugins/assistant.openai.rst
     platypush/plugins/assistant.picovoice.rst
     platypush/plugins/autoremote.rst
     platypush/plugins/bluetooth.rst

@@ -134,6 +135,7 @@ Plugins
     platypush/plugins/tts.rst
     platypush/plugins/tts.google.rst
     platypush/plugins/tts.mimic3.rst
+    platypush/plugins/tts.openai.rst
     platypush/plugins/tts.picovoice.rst
     platypush/plugins/tv.samsung.ws.rst
     platypush/plugins/twilio.rst
@@ -8,6 +8,9 @@
     "assistant.google": {
         "class": "fas fa-microphone-lines"
     },
+    "assistant.openai": {
+        "class": "fas fa-microphone-lines"
+    },
     "assistant.picovoice": {
         "class": "fas fa-microphone-lines"
     },
@@ -1,8 +1,6 @@
-from collections import namedtuple
-from dataclasses import dataclass, field
 from logging import getLogger
 from queue import Full, Queue
-from threading import Event, RLock
+from threading import Event
 from time import time
 from typing import Optional
 

@@ -10,63 +8,7 @@ import sounddevice as sd
 
 from platypush.utils import wait_for_either
 
-AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
+from ._state import AudioFrame, PauseState
 
 
-@dataclass
-class PauseState:
-    """
-    Data class to hold the boilerplate (state + synchronization events) for the
-    audio recorder pause API.
-    """
-
-    _paused_event: Event = field(default_factory=Event)
-    _recording_event: Event = field(default_factory=Event)
-    _state_lock: RLock = field(default_factory=RLock)
-
-    @property
-    def paused(self):
-        with self._state_lock:
-            return self._paused_event.is_set()
-
-    def pause(self):
-        """
-        Pause the audio recorder.
-        """
-        with self._state_lock:
-            self._paused_event.set()
-            self._recording_event.clear()
-
-    def resume(self):
-        """
-        Resume the audio recorder.
-        """
-        with self._state_lock:
-            self._paused_event.clear()
-            self._recording_event.set()
-
-    def toggle(self):
-        """
-        Toggle the audio recorder pause state.
-        """
-        with self._state_lock:
-            if self.paused:
-                self.resume()
-            else:
-                self.pause()
-
-    def wait_paused(self, timeout: Optional[float] = None):
-        """
-        Wait until the audio recorder is paused.
-        """
-        self._paused_event.wait(timeout=timeout)
-
-    def wait_recording(self, timeout: Optional[float] = None):
-        """
-        Wait until the audio recorder is resumed.
-        """
-        self._recording_event.wait(timeout=timeout)
-
-
 class AudioRecorder:

@@ -112,9 +54,7 @@ class AudioRecorder:
         """
         Start the audio stream.
         """
-        self._stop_event.clear()
-        self.stream.start()
-        return self
+        return self.start()
 
     def __exit__(self, *_):
         """

@@ -145,6 +85,14 @@ class AudioRecorder:
             self.logger.debug('Audio queue is empty')
             return None
 
+    def start(self):
+        """
+        Start the audio stream.
+        """
+        self._stop_event.clear()
+        self.stream.start()
+        return self
+
     def stop(self):
         """
         Stop the audio stream.

@@ -186,6 +134,6 @@ class AudioRecorder:
         wait_for_either(
             self._stop_event,
             self._upstream_stop_event,
-            self._paused_state._recording_event,
+            self._paused_state._recording_event,  # pylint: disable=protected-access
             timeout=timeout,
         )
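Note on the refactor above: ``__enter__`` now just delegates to the new ``start()`` method, so the recorder can be driven either as a context manager or started and stopped explicitly. A minimal sketch of the two usages, assuming the constructor arguments that appear elsewhere in this diff (``stop_event``, ``sample_rate``, ``frame_size``, ``channels``); this is illustrative, not code from the commit:

# Hedged sketch: both forms should behave the same after this refactor,
# since __enter__ now simply returns self.start().
from threading import Event

from platypush.common.assistant import AudioRecorder

stop_event = Event()

# 1. Context-manager style (the style used by the assistant.openai plugin below)
with AudioRecorder(
    stop_event=stop_event, sample_rate=16000, frame_size=16384, channels=1
) as recorder:
    frame = recorder.read()

# 2. Explicit start/stop style, made possible by the new start() method
recorder = AudioRecorder(
    stop_event=stop_event, sample_rate=16000, frame_size=16384, channels=1
)
recorder.start()
try:
    frame = recorder.read()
finally:
    recorder.stop()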
platypush/common/assistant/_state.py (new file, 61 lines)

from collections import namedtuple
from dataclasses import dataclass, field
from threading import Event, RLock
from typing import Optional

AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])


@dataclass
class PauseState:
    """
    Data class to hold the boilerplate (state + synchronization events) for the
    audio recorder pause API.
    """

    _paused_event: Event = field(default_factory=Event)
    _recording_event: Event = field(default_factory=Event)
    _state_lock: RLock = field(default_factory=RLock)

    @property
    def paused(self):
        with self._state_lock:
            return self._paused_event.is_set()

    def pause(self):
        """
        Pause the audio recorder.
        """
        with self._state_lock:
            self._paused_event.set()
            self._recording_event.clear()

    def resume(self):
        """
        Resume the audio recorder.
        """
        with self._state_lock:
            self._paused_event.clear()
            self._recording_event.set()

    def toggle(self):
        """
        Toggle the audio recorder pause state.
        """
        with self._state_lock:
            if self.paused:
                self.resume()
            else:
                self.pause()

    def wait_paused(self, timeout: Optional[float] = None):
        """
        Wait until the audio recorder is paused.
        """
        self._paused_event.wait(timeout=timeout)

    def wait_recording(self, timeout: Optional[float] = None):
        """
        Wait until the audio recorder is resumed.
        """
        self._recording_event.wait(timeout=timeout)
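``PauseState`` keeps the pause/resume bookkeeping out of the recorder itself. A minimal sketch of how a recording loop might consume it; the loop, the callables and the direct import from the private ``_state`` module are illustrative assumptions, not part of this commit:

# Illustrative sketch only: a worker loop gating its reads on PauseState.
from platypush.common.assistant._state import PauseState

pause_state = PauseState()

def record_loop(read_frame, handle_frame, should_stop):
    while not should_stop():
        if pause_state.paused:
            # Block until resume() (or toggle()) sets the recording event again
            pause_state.wait_recording(timeout=1.0)
            continue

        handle_frame(read_frame())

# Elsewhere, e.g. from a UI callback or an event hook:
#   pause_state.pause()    # the loop goes idle
#   pause_state.resume()   # the loop starts reading frames again
#   pause_state.toggle()   # flip between the two states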
platypush/plugins/assistant/openai/__init__.py (new file, 447 lines)

from io import BytesIO
from threading import Event
from typing import Optional

import numpy as np
from pydub import AudioSegment

from platypush.common.assistant import AudioRecorder
from platypush.context import get_plugin
from platypush.plugins import RunnablePlugin, action
from platypush.plugins.assistant import AssistantPlugin
from platypush.plugins.openai import OpenaiPlugin

from ._state import RecordingState


# pylint: disable=too-many-ancestors
class AssistantOpenaiPlugin(AssistantPlugin, RunnablePlugin):
    """
    A voice assistant based on the OpenAI API.

    It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
    configured with an OpenAI API key.

    Hotword detection
    -----------------

    This plugin doesn't have hotword detection, as OpenAI doesn't provide
    an API for that. Instead, the assistant can be started and stopped
    programmatically through the :meth:`.start_conversation` action.

    If you want to implement hotword detection, you can use a separate plugin
    such as
    :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.

    The configuration in this case would be like:

        .. code-block:: yaml

            assistant.picovoice:
                access_key: YOUR_PICOVOICE_ACCESS_KEY

                # List of hotwords to listen for
                keywords:
                    - alexa
                    - computer
                    - ok google

                # Disable speech-to-text and intent recognition, only use hotword
                # detection
                stt_enabled: false
                hotword_enabled: true

                conversation_start_sound: /sound/to/play/when/the/conversation/starts.mp3
                # speech_model_path: /mnt/hd/models/picovoice/cheetah/custom-en.pv
                # intent_model_path: /mnt/hd/models/picovoice/rhino/custom-en-x86.rhn

            openai:
                api_key: YOUR_OPENAI_API_KEY

                # Customize your assistant's context and knowledge base to your
                # liking
                context:
                    - role: system
                      content: >
                          You are a 16th century noble lady who talks in
                          Shakespearean English to her peers.

            # Enable the assistant plugin
            assistant.openai:

            # Enable the text-to-speech plugin
            tts.openai:
                # Customize the voice model
                voice: nova

    Then you can call :meth:`.start_conversation` when the hotword is detected
    and a :class:`platypush.message.event.assistant.HotwordDetectedEvent` is
    triggered:

        .. code-block:: python

            from platypush import run, when
            from platypush.message.event.assistant import HotwordDetectedEvent

            @when(HotwordDetectedEvent)
            # You can also customize it by running a different assistant logic
            # depending on the hotword
            # @when(HotwordDetectedEvent, hotword='computer')
            def on_hotword_detected():
                run("assistant.openai.start_conversation")

    This configuration will:

        1. Start the hotword detection when the application starts.
        2. Start the OpenAI assistant when the hotword is detected.

    AI responses
    ------------

    By default (unless you set ``stop_conversation_on_speech_match`` to ``False``),
    the plugin will:

        1. Process the speech through the OpenAI API (the GPT model to be used
           is configurable through the OpenAI plugin's ``model`` setting).

        2. Render the response through the configured ``tts_plugin`` (default:
           ``tts.openai``). If ``tts_plugin`` is not set, then the response will
           be returned as a string.

    Custom speech processing
    ------------------------

    You can create custom hooks on
    :class:`platypush.message.event.assistant.SpeechRecognizedEvent` with
    custom ``phrase`` strings or (regex) patterns. For example:

        .. code-block:: python

            from platypush import run, when
            from platypush.message.event.assistant import SpeechRecognizedEvent

            # Matches any phrase that contains either "play music" or "play the
            # music"
            @when(SpeechRecognizedEvent, phrase='play (the)? music')
            def play_music():
                run('music.mpd.play')

    If at least one custom hook with a non-empty ``phrase`` string is matched,
    then the default response will be disabled. If you still want the assistant
    to say something when the event is handled, you can call
    ``event.assistant.render_response`` on the hook:

        .. code-block:: python

            from datetime import datetime
            from textwrap import dedent
            from time import time

            from platypush import run, when
            from platypush.message.event.assistant import SpeechRecognizedEvent

            @when(SpeechRecognizedEvent, phrase='weather today')
            def weather_forecast(event: SpeechRecognizedEvent):
                limit = time() + 24 * 60 * 60  # 24 hours from now
                forecast = [
                    weather
                    for weather in run("weather.openweathermap.get_forecast")
                    if datetime.fromisoformat(weather["time"]).timestamp() < limit
                ]

                min_temp = round(
                    min(weather["temperature"] for weather in forecast)
                )
                max_temp = round(
                    max(weather["temperature"] for weather in forecast)
                )
                max_wind_gust = round(
                    (max(weather["wind_gust"] for weather in forecast)) * 3.6
                )
                summaries = [weather["summary"] for weather in forecast]
                most_common_summary = max(summaries, key=summaries.count)
                avg_cloud_cover = round(
                    sum(weather["cloud_cover"] for weather in forecast) / len(forecast)
                )

                event.assistant.render_response(
                    dedent(
                        f\"\"\"
                        The forecast for today is: {most_common_summary}, with
                        a minimum of {min_temp} and a maximum of {max_temp}
                        degrees, wind gust of {max_wind_gust} km/h, and an
                        average cloud cover of {avg_cloud_cover}%.
                        \"\"\"
                    )
                )

    Conversation follow-up
    ----------------------

    A conversation will have a follow-up (i.e. the assistant will listen for a
    phrase after rendering a response) if the response is not empty and ends
    with a question mark. If you want to force a follow-up even if the response
    doesn't end with a question mark, you can call :meth:`.start_conversation`
    programmatically from your hooks.
    """

    def __init__(
        self,
        model: str = "whisper-1",
        tts_plugin: Optional[str] = "tts.openai",
        min_silence_secs: float = 1.0,
        silence_threshold: int = -22,
        sample_rate: int = 16000,
        frame_size: int = 16384,
        channels: int = 1,
        conversation_start_timeout: float = 5.0,
        conversation_end_timeout: float = 1.0,
        conversation_max_duration: float = 15.0,
        **kwargs,
    ):
        """
        :param model: OpenAI model to use for audio transcription (default:
            ``whisper-1``).
        :param tts_plugin: Name of the TTS plugin to use for rendering the responses
            (default: ``tts.openai``).
        :param min_silence_secs: Minimum silence duration in seconds to detect
            the end of a conversation (default: 1.0 seconds).
        :param silence_threshold: Silence threshold in dBFS (default: -22).
            A value of 0 is the maximum amplitude, while -120 corresponds to
            silent or nearly silent audio; thus, the higher the value, the more
            sensitive the silence detection will be.
        :param sample_rate: Recording sample rate in Hz (default: 16000).
        :param frame_size: Recording frame size in samples (default: 16384).
            Note that it's important to make sure that ``frame_size`` /
            ``sample_rate`` isn't smaller than the minimum silence duration,
            otherwise the silence detection won't work properly.
        :param channels: Number of recording channels (default: 1).
        :param conversation_start_timeout: How long to wait for the
            conversation to start (i.e. the first non-silent audio frame to be
            detected) before giving up and stopping the recording (default: 5.0
            seconds).
        :param conversation_end_timeout: How many seconds of silence to wait
            after the last non-silent audio frame before stopping the recording
            (default: 1.0 seconds).
        :param conversation_max_duration: Maximum conversation duration in seconds
            (default: 15.0 seconds).
        """
        kwargs["tts_plugin"] = tts_plugin
        super().__init__(**kwargs)

        self._model = model
        self._min_silence_secs = min_silence_secs
        self._silence_threshold = silence_threshold
        self._sample_rate = sample_rate
        self._frame_size = frame_size
        self._channels = channels
        self._conversation_start_timeout = conversation_start_timeout
        self._conversation_end_timeout = conversation_end_timeout
        self._conversation_max_duration = conversation_max_duration
        self._start_recording_event = Event()
        self._disable_default_response = False
        self._recording_state = RecordingState(
            sample_rate=sample_rate,
            channels=channels,
            min_silence_secs=min_silence_secs,
            silence_threshold=silence_threshold,
        )

        self._recorder: Optional[AudioRecorder] = None

    def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
        return AudioSegment(
            data.tobytes(),
            frame_rate=self._sample_rate,
            sample_width=data.dtype.itemsize,
            channels=self._channels,
        )

    def _is_conversation_ended(self):
        # End if the recording has been stopped
        if not self._recorder or self._recorder.should_stop():
            return True

        # End if we reached the max conversation duration
        if self._recording_state.duration >= self._conversation_max_duration:
            return True

        # End if the conversation hasn't started yet and we reached the
        # conversation start timeout
        if (
            not self._recording_state.conversation_started
            and self._recording_state.duration >= self._conversation_start_timeout
        ):
            return True

        # End if the conversation has started and the user has been silent for
        # more than the conversation end timeout
        if (
            self._recording_state.conversation_started
            and self._recording_state.silence_duration >= self._conversation_end_timeout
        ):
            return True

        return False

    @property
    def _openai(self) -> OpenaiPlugin:
        openai: Optional[OpenaiPlugin] = get_plugin("openai")
        assert openai, (
            "OpenAI plugin not found. "
            "Please configure the `openai` plugin to use `assistant.openai`"
        )
        return openai

    def _get_prediction(self, audio: BytesIO) -> str:
        return self._openai.transcribe_raw(
            audio.getvalue(), extension='mp3', model=self._model
        )

    def _capture_audio(self, recorder: AudioRecorder):
        while not self.should_stop() and not self._is_conversation_ended():
            audio_data = recorder.read()
            if not audio_data:
                continue

            self._recording_state.add_audio(audio_data)

    def _audio_loop(self):
        while not self.should_stop():
            self._wait_recording_start()
            self._recording_state.reset()
            self._on_conversation_start()

            try:
                with AudioRecorder(
                    stop_event=self._should_stop,
                    sample_rate=self._sample_rate,
                    frame_size=self._frame_size,
                    channels=self._channels,
                ) as self._recorder:
                    self._capture_audio(self._recorder)
            finally:
                if self._recorder:
                    try:
                        self._recorder.stream.close()
                    except Exception as e:
                        self.logger.warning("Error closing the audio stream: %s", e)

                    self._recorder = None

            if self._recording_state.is_silent():
                self._on_conversation_timeout()
            else:
                audio = self._recording_state.export_audio()
                text = self._get_prediction(audio)
                self._on_speech_recognized(text)

    def _wait_recording_start(self):
        self._start_recording_event.wait()
        self._start_recording_event.clear()

    def _start_conversation(self, *_, **__):
        self._disable_default_response = False
        self._recording_state.reset()
        self._start_recording_event.set()

    def _stop_conversation(self, *_, **__):
        self._disable_default_response = True
        super()._stop_conversation()
        self._recording_state.reset()
        if self._recorder:
            self._recorder.stop()

        self._on_conversation_end()

    def _on_speech_recognized(self, phrase: Optional[str]):
        super()._on_speech_recognized(phrase)

        # Dirty hack: wait a bit before stopping the conversation to make sure
        # that there aren't event hooks triggered in other threads that are
        # supposed to handle it.
        if self.stop_conversation_on_speech_match:
            self.wait_stop(0.5)
            if self.should_stop():
                return

        if self._disable_default_response:
            self.logger.debug("Default response disabled, skipping response")
            return

        response = self._openai.get_response(phrase).output
        if response:
            self.render_response(response)
        else:
            self._on_no_response()

    @action
    def start_conversation(self, *_, **__):
        """
        Start a conversation with the assistant. The conversation will be
        automatically stopped after ``conversation_max_duration`` seconds of
        audio, or after ``conversation_start_timeout`` seconds of silence
        with no audio detected, or after ``conversation_end_timeout`` seconds
        after the last non-silent audio frame has been detected, or when the
        :meth:`.stop_conversation` method is called.
        """
        self._start_conversation()

    @action
    def mute(self, *_, **__):
        """
        .. note:: This plugin has no hotword detection, thus no continuous
            audio detection. Speech processing is done on-demand through the
            :meth:`.start_conversation` and :meth:`.stop_conversation` methods.
            Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
            implemented.
        """
        self.logger.warning(
            "assistant.openai.mute is not implemented because this plugin "
            "has no hotword detection, and the only way to stop a conversation "
            "is by calling stop_conversation()"
        )

    @action
    def unmute(self, *_, **__):
        """
        .. note:: This plugin has no hotword detection, thus no continuous
            audio detection. Speech processing is done on-demand through the
            :meth:`.start_conversation` and :meth:`.stop_conversation` methods.
            Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
            implemented.
        """
        self.logger.warning(
            "assistant.openai.unmute is not implemented because this plugin "
            "has no hotword detection, and the only way to start a conversation "
            "is by calling start_conversation()"
        )

    @action
    def send_text_query(self, text: str, *_, **__):
        """
        If the ``tts_plugin`` configuration is set, then the assistant will
        process the given text query through
        :meth:`platypush.plugins.openai.OpenaiPlugin.get_response` and render
        the response through the specified TTS plugin.

        :return: The response received from
            :meth:`platypush.plugins.openai.OpenaiPlugin.get_response`.
        """
        response = self._openai.get_response(text).output
        self.render_response(response)
        return response

    def main(self):
        while not self.should_stop():
            try:
                self._audio_loop()
            except Exception as e:
                self.logger.error("Audio loop error: %s", e, exc_info=True)
                self.wait_stop(5)
            finally:
                self.stop_conversation()

    def stop(self):
        self._stop_conversation()
        super().stop()
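The ``frame_size`` caveat in the docstring above is easy to sanity-check numerically: each recorded frame must span at least ``min_silence_secs``, otherwise the silence detector never sees a long enough window. A quick illustrative check with the plugin defaults (not code from the commit):

# Sanity check of the frame_size / sample_rate constraint with the defaults
# used by assistant.openai (illustrative only).
sample_rate = 16000        # Hz
frame_size = 16384         # samples per frame
min_silence_secs = 1.0     # minimum silence window

frame_duration = frame_size / sample_rate   # = 1.024 seconds per frame
assert frame_duration >= min_silence_secs, (
    "Each frame is shorter than the minimum silence window: "
    "silence detection would never trigger"
)
print(f"{frame_duration:.3f}s per frame >= {min_silence_secs}s minimum silence: OK")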
platypush/plugins/assistant/openai/_state.py (new file, 80 lines)

from io import BytesIO
from dataclasses import dataclass, field
from typing import List

import numpy as np
from pydub import AudioSegment, silence

from platypush.common.assistant import AudioFrame


@dataclass
class RecordingState:
    """
    Current state of the audio recording.
    """

    sample_rate: int
    channels: int
    min_silence_secs: float
    silence_threshold: int
    silence_duration: float = 0.0
    audio_segments: List[AudioSegment] = field(default_factory=list)
    duration: float = 0.0
    conversation_started: bool = False

    def _silence_duration(self, audio: AudioSegment) -> float:
        silent_frames = [
            (start / 1000, stop / 1000)
            for start, stop in silence.detect_silence(
                audio,
                min_silence_len=int(self.min_silence_secs * 1000),
                silence_thresh=int(self.silence_threshold),
            )
        ]

        return sum(stop - start for start, stop in silent_frames)

    def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
        return AudioSegment(
            data.tobytes(),
            frame_rate=self.sample_rate,
            sample_width=data.dtype.itemsize,
            channels=self.channels,
        )

    def _add_audio_segment(self, audio: AudioSegment):
        self.audio_segments.append(audio)
        self.duration += audio.duration_seconds
        silence_duration = self._silence_duration(audio)
        is_mostly_silent = silence_duration >= audio.duration_seconds * 0.75

        if is_mostly_silent:
            self.silence_duration += silence_duration
        else:
            self.conversation_started = True
            self.silence_duration = 0.0

    def is_silent(self) -> bool:
        return self.silence_duration >= self.duration

    def add_audio(self, audio: AudioFrame):
        self._add_audio_segment(self._to_audio_segment(audio.data))

    def export_audio(self) -> BytesIO:
        buffer = BytesIO()
        if not self.audio_segments:
            return buffer

        audio = self.audio_segments[0]
        for segment in self.audio_segments[1:]:
            audio += segment

        audio.export(buffer, format="mp3", bitrate='92')
        return buffer

    def reset(self):
        self.audio_segments.clear()
        self.duration = 0.0
        self.silence_duration = 0.0
        self.conversation_started = False
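``RecordingState`` counts a frame as "mostly silent" when pydub's detector flags at least 75% of its duration as silence. A small standalone sketch of that check, run on a synthetic silent segment; the segment, thresholds and print are illustrative assumptions, not part of the plugin:

# Illustrative check of the silence heuristic used by RecordingState.
from pydub import AudioSegment, silence

# One 1.024 s frame of pure silence, matching the plugin's default frame duration
frame = AudioSegment.silent(duration=1024, frame_rate=16000)

silent_ranges = silence.detect_silence(
    frame,
    min_silence_len=1000,   # min_silence_secs * 1000
    silence_thresh=-22,     # silence_threshold in dBFS
)
silence_secs = sum((stop - start) / 1000 for start, stop in silent_ranges)

# Same 75% rule as RecordingState._add_audio_segment()
is_mostly_silent = silence_secs >= frame.duration_seconds * 0.75
print(f"silence: {silence_secs:.2f}s of {frame.duration_seconds:.2f}s "
      f"-> mostly silent: {is_mostly_silent}")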
platypush/plugins/assistant/openai/manifest.json (new file, 44 lines)

{
  "manifest": {
    "package": "platypush.plugins.assistant.openai",
    "type": "plugin",
    "events": [
      "platypush.message.event.assistant.ConversationEndEvent",
      "platypush.message.event.assistant.ConversationStartEvent",
      "platypush.message.event.assistant.ConversationTimeoutEvent",
      "platypush.message.event.assistant.HotwordDetectedEvent",
      "platypush.message.event.assistant.IntentRecognizedEvent",
      "platypush.message.event.assistant.MicMutedEvent",
      "platypush.message.event.assistant.MicUnmutedEvent",
      "platypush.message.event.assistant.NoResponseEvent",
      "platypush.message.event.assistant.ResponseEndEvent",
      "platypush.message.event.assistant.ResponseEvent",
      "platypush.message.event.assistant.SpeechRecognizedEvent"
    ],
    "install": {
      "apk": [
        "ffmpeg",
        "py3-numpy"
      ],
      "apt": [
        "ffmpeg",
        "python3-numpy",
        "python3-pydub"
      ],
      "dnf": [
        "ffmpeg",
        "python-numpy"
      ],
      "pacman": [
        "ffmpeg",
        "python-numpy",
        "python-sounddevice"
      ],
      "pip": [
        "numpy",
        "pydub",
        "sounddevice"
      ]
    }
  }
}
@@ -562,11 +562,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         self._assistant.override_speech_model(model_file)
         self._assistant.state = AssistantState.DETECTING_SPEECH
 
-    @action
-    def stop_conversation(self, *_, **__):
-        """
-        Programmatically stop a running conversation with the assistant
-        """
+    def _stop_conversation(self, *_, **__):
+        super()._stop_conversation()
         if not self._assistant:
             self.logger.warning('Assistant not initialized')
             return
@@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, Sequence
 
 import pvporcupine
 
+from platypush.common.assistant import AudioRecorder
 from platypush.context import get_plugin
 from platypush.message.event.assistant import (
     AssistantEvent,

@@ -16,8 +17,6 @@ from platypush.message.event.assistant import (
     SpeechRecognizedEvent,
 )
 from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
 
-from ._recorder import AudioRecorder
-
 from ._speech import SpeechProcessor
 from ._state import AssistantState
platypush/plugins/tts/openai/__init__.py (new file, 150 lines)

import os
import tempfile
from contextlib import contextmanager
from multiprocessing import Process
from typing import Generator, Optional

import requests

from platypush.context import get_plugin
from platypush.plugins import action
from platypush.plugins.openai import OpenaiPlugin
from platypush.plugins.tts import TtsPlugin


class TtsOpenaiPlugin(TtsPlugin):
    r"""
    This plugin provides an interface to the `OpenAI text-to-speech API
    <https://platform.openai.com/docs/guides/text-to-speech>`_.

    It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
    configured.
    """

    _BUFSIZE = 1024

    def __init__(
        self,
        model: str = 'tts-1',
        voice: str = 'nova',
        timeout: float = 10,
        **kwargs,
    ):
        """
        :param model: Model to be used for the text-to-speech conversion.
            See the `OpenAI API models documentation
            <https://platform.openai.com/docs/models/tts>`_ for the list of
            available models (default: ``tts-1``).
        :param voice: Default voice to be used. See the `OpenAI API
            voices documentation
            <https://platform.openai.com/docs/guides/text-to-speech/voice-options>`_
            for the list of available voices (default: ``nova``).
        :param timeout: Default timeout for the API requests (default: 10s).
        """
        super().__init__(**kwargs)
        openai = get_plugin('openai')
        assert openai, 'openai plugin not configured'

        self.openai: OpenaiPlugin = openai
        self.model = model
        self.voice = voice
        self.timeout = timeout
        self._audio_proc: Optional[Process] = None

    def _process_response(
        self,
        response: requests.Response,
        audio_file: str,
    ) -> Process:
        def proc_fn():
            try:
                with open(audio_file, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=self._BUFSIZE):
                        if chunk:
                            file.write(chunk)
                            file.flush()
            except KeyboardInterrupt:
                pass

        self._audio_proc = Process(target=proc_fn, name='openai-tts-response-processor')
        self._audio_proc.start()
        return self._audio_proc

    def _make_request(
        self,
        text: str,
        model: Optional[str] = None,
        voice: Optional[str] = None,
    ) -> requests.Response:
        rs = requests.post(
            "https://api.openai.com/v1/audio/speech",
            timeout=self.timeout,
            stream=True,
            headers={
                "Authorization": f"Bearer {self.openai._api_key}",  # pylint: disable=protected-access
                "Content-Type": "application/json",
            },
            json={
                "model": model or self.model,
                "voice": voice or self.voice,
                "input": text,
            },
        )

        rs.raise_for_status()
        return rs

    @contextmanager
    def _audio_fifo(self) -> Generator[str, None, None]:
        fifo_dir = tempfile.mkdtemp()
        fifo_path = os.path.join(fifo_dir, 'platypush-tts-openai-fifo')
        os.mkfifo(fifo_path)
        yield fifo_path

        os.unlink(fifo_path)
        os.rmdir(fifo_dir)

    @action
    def say(
        self,
        text: str,
        *_,
        model: Optional[str] = None,
        voice: Optional[str] = None,
        **player_args,
    ):
        """
        Say some text.

        :param text: Text to say.
        :param model: Default ``model`` override.
        :param voice: Default ``voice`` override.
        :param player_args: Extends the additional arguments to be passed to
            :meth:`platypush.plugins.sound.SoundPlugin.play` (like volume,
            duration, channels etc.).
        """
        response_processor: Optional[Process] = None

        try:
            response = self._make_request(text, model=model, voice=voice)

            with self._audio_fifo() as audio_file:
                response_processor = self._process_response(
                    response=response, audio_file=audio_file
                )
                self._playback(audio_file, **player_args)
                response_processor.join()
                response_processor = None
        finally:
            if response_processor:
                response_processor.terminate()

    @action
    def stop(self):
        super().stop()
        if self._audio_proc and self._audio_proc.is_alive():
            self._audio_proc.terminate()
            self._audio_proc.join()


# vim:sw=4:ts=4:et:
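Once the ``openai`` plugin is configured, the new ``say`` action can be invoked like any other Platypush action. A hedged usage sketch from a hook or procedure; the ``onyx`` voice and ``tts-1-hd`` model names are taken from the OpenAI documentation linked above, not from this commit:

# Illustrative usage of the new tts.openai plugin from a Python hook/procedure.
from platypush import run

# Use the configured defaults (model=tts-1, voice=nova)
run("tts.openai.say", text="The kettle has finished boiling")

# Override the voice and model for a single call
run("tts.openai.say", text="Goodnight", voice="onyx", model="tts-1-hd")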
platypush/plugins/tts/openai/manifest.json (new file, 34 lines)

{
  "manifest": {
    "events": {},
    "install": {
      "apk": [
        "ffmpeg",
        "portaudio-dev",
        "py3-numpy"
      ],
      "apt": [
        "ffmpeg",
        "portaudio19-dev",
        "python3-numpy"
      ],
      "dnf": [
        "ffmpeg",
        "portaudio-devel",
        "python-numpy"
      ],
      "pacman": [
        "ffmpeg",
        "portaudio",
        "python-numpy",
        "python-sounddevice"
      ],
      "pip": [
        "numpy",
        "sounddevice"
      ]
    },
    "package": "platypush.plugins.tts.openai",
    "type": "plugin"
  }
}
@@ -93,6 +93,7 @@ mock_imports = [
     "pychromecast",
     "pyclip",
     "pydbus",
+    "pydub",
     "pyfirmata2",
     "pyngrok",
     "pyotp",