[#384] Added assistant.openai and tts.openai plugins.

Closes: #384
Fabio Manganiello 2024-06-02 17:28:04 +02:00
parent c3673391f7
commit fa52bbfb5b
Signed by: blacklight
GPG key ID: D90FBA7F76362774
15 changed files with 851 additions and 71 deletions


@@ -1,5 +1,9 @@
# Changelog
## [1.0.7] - 2024-06-02
- [#384] Added `assistant.openai` and `tts.openai` plugins.
## [1.0.6] - 2024-06-01
- 🐛 Bug fix on one of the entities modules that prevented the application from


@@ -0,0 +1,5 @@
``assistant.openai``
====================
.. automodule:: platypush.plugins.assistant.openai
:members:


@@ -0,0 +1,5 @@
``tts.openai``
==============
.. automodule:: platypush.plugins.tts.openai
:members:


@@ -11,6 +11,7 @@ Plugins
platypush/plugins/application.rst
platypush/plugins/arduino.rst
platypush/plugins/assistant.google.rst
platypush/plugins/assistant.openai.rst
platypush/plugins/assistant.picovoice.rst
platypush/plugins/autoremote.rst
platypush/plugins/bluetooth.rst
@@ -134,6 +135,7 @@ Plugins
platypush/plugins/tts.rst
platypush/plugins/tts.google.rst
platypush/plugins/tts.mimic3.rst
platypush/plugins/tts.openai.rst
platypush/plugins/tts.picovoice.rst
platypush/plugins/tv.samsung.ws.rst
platypush/plugins/twilio.rst


@@ -8,6 +8,9 @@
"assistant.google": {
"class": "fas fa-microphone-lines"
},
"assistant.openai": {
"class": "fas fa-microphone-lines"
},
"assistant.picovoice": {
"class": "fas fa-microphone-lines"
},


@@ -1,8 +1,6 @@
from collections import namedtuple
from dataclasses import dataclass, field
from logging import getLogger
from queue import Full, Queue
from threading import Event, RLock
from threading import Event
from time import time
from typing import Optional
@@ -10,63 +8,7 @@ import sounddevice as sd
from platypush.utils import wait_for_either
from ._state import AudioFrame, PauseState
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
@dataclass
class PauseState:
"""
Data class to hold the boilerplate (state + synchronization events) for the
audio recorder pause API.
"""
_paused_event: Event = field(default_factory=Event)
_recording_event: Event = field(default_factory=Event)
_state_lock: RLock = field(default_factory=RLock)
@property
def paused(self):
with self._state_lock:
return self._paused_event.is_set()
def pause(self):
"""
Pause the audio recorder.
"""
with self._state_lock:
self._paused_event.set()
self._recording_event.clear()
def resume(self):
"""
Resume the audio recorder.
"""
with self._state_lock:
self._paused_event.clear()
self._recording_event.set()
def toggle(self):
"""
Toggle the audio recorder pause state.
"""
with self._state_lock:
if self.paused:
self.resume()
else:
self.pause()
def wait_paused(self, timeout: Optional[float] = None):
"""
Wait until the audio recorder is paused.
"""
self._paused_event.wait(timeout=timeout)
def wait_recording(self, timeout: Optional[float] = None):
"""
Wait until the audio recorder is resumed.
"""
self._recording_event.wait(timeout=timeout)
class AudioRecorder:
@@ -112,9 +54,7 @@ class AudioRecorder:
"""
Start the audio stream.
"""
self._stop_event.clear()
self.stream.start()
return self
return self.start()
def __exit__(self, *_):
"""
@@ -145,6 +85,14 @@ class AudioRecorder:
self.logger.debug('Audio queue is empty')
return None
def start(self):
"""
Start the audio stream.
"""
self._stop_event.clear()
self.stream.start()
return self
def stop(self):
"""
Stop the audio stream.
@@ -186,6 +134,6 @@ class AudioRecorder:
wait_for_either(
self._stop_event,
self._upstream_stop_event,
self._paused_state._recording_event,
self._paused_state._recording_event,  # pylint: disable=protected-access
timeout=timeout,
)


@@ -0,0 +1,61 @@
from collections import namedtuple
from dataclasses import dataclass, field
from threading import Event, RLock
from typing import Optional
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
@dataclass
class PauseState:
"""
Data class to hold the boilerplate (state + synchronization events) for the
audio recorder pause API.
"""
_paused_event: Event = field(default_factory=Event)
_recording_event: Event = field(default_factory=Event)
_state_lock: RLock = field(default_factory=RLock)
@property
def paused(self):
with self._state_lock:
return self._paused_event.is_set()
def pause(self):
"""
Pause the audio recorder.
"""
with self._state_lock:
self._paused_event.set()
self._recording_event.clear()
def resume(self):
"""
Resume the audio recorder.
"""
with self._state_lock:
self._paused_event.clear()
self._recording_event.set()
def toggle(self):
"""
Toggle the audio recorder pause state.
"""
with self._state_lock:
if self.paused:
self.resume()
else:
self.pause()
def wait_paused(self, timeout: Optional[float] = None):
"""
Wait until the audio recorder is paused.
"""
self._paused_event.wait(timeout=timeout)
def wait_recording(self, timeout: Optional[float] = None):
"""
Wait until the audio recorder is resumed.
"""
self._recording_event.wait(timeout=timeout)


@@ -0,0 +1,447 @@
from io import BytesIO
from threading import Event
from typing import Optional
import numpy as np
from pydub import AudioSegment
from platypush.common.assistant import AudioRecorder
from platypush.context import get_plugin
from platypush.plugins import RunnablePlugin, action
from platypush.plugins.assistant import AssistantPlugin
from platypush.plugins.openai import OpenaiPlugin
from ._state import RecordingState
# pylint: disable=too-many-ancestors
class AssistantOpenaiPlugin(AssistantPlugin, RunnablePlugin):
"""
A voice assistant based on the OpenAI API.
It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
configured with an OpenAI API key.
Hotword detection
-----------------
This plugin doesn't have hotword detection, as OpenAI doesn't provide
an API for that. Instead, the assistant can be started and stopped
programmatically through the :meth:`.start_conversation` action.
If you want to implement hotword detection, you can use a separate plugin
such as
:class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.
The configuration in this case would be like:
.. code-block:: yaml
assistant.picovoice:
access_key: YOUR_PICOVOICE_ACCESS_KEY
# List of hotwords to listen for
keywords:
- alexa
- computer
- ok google
# Disable speech-to-text and intent recognition, only use hotword
# detection
stt_enabled: false
hotword_enabled: true
conversation_start_sound: /sound/to/play/when/the/conversation/starts.mp3
# speech_model_path: /mnt/hd/models/picovoice/cheetah/custom-en.pv
# intent_model_path: /mnt/hd/models/picovoice/rhino/custom-en-x86.rhn
openai:
api_key: YOUR_OPENAI_API_KEY
# Customize your assistant's context and knowledge base to your
# liking
context:
- role: system
content: >
You are a 16th century noble lady who talks in
Shakespearean English to her peers.
# Enable the assistant plugin
assistant.openai:
# Enable the text-to-speech plugin
tts.openai:
# Customize the voice model
voice: nova
Then you can call :meth:`.start_conversation` when the hotword is detected
and a :class:`platypush.message.event.assistant.HotwordDetectedEvent` is
triggered:
.. code-block:: python
from platypush import run, when
from platypush.message.event.assistant import HotwordDetectedEvent
@when(HotwordDetectedEvent)
# You can also customize it by running a different assistant logic
# depending on the hotword
# @when(HotwordDetectedEvent, hotword='computer')
def on_hotword_detected():
run("assistant.openai.start_conversation")
This configuration will:
1. Start the hotword detection when the application starts.
2. Start the OpenAI assistant when the hotword is detected.
AI responses
------------
By default (unless you set ``stop_conversation_on_speech_match`` to ``False``),
the plugin will:
1. Process the speech through the OpenAI API (the GPT model to be used is
configurable through the ``model`` setting of the OpenAI plugin).
2. Render the response through the configured ``tts_plugin`` (default:
``tts.openai``), as shown in the example below. If ``tts_plugin`` is not
set, then the response will be returned as a string.
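
For example, if you'd rather render the responses through a different
text-to-speech engine, you can point ``tts_plugin`` to another TTS plugin.
A minimal sketch (``tts.google`` here is just an illustration; any
configured TTS plugin will do):

.. code-block:: yaml

    assistant.openai:
        tts_plugin: tts.google
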
Custom speech processing
------------------------
You can create custom hooks on
:class:`platypush.message.event.assistant.SpeechRecognizedEvent` with
custom ``phrase`` strings or (regex) patterns. For example:
.. code-block:: python
from platypush import run, when
from platypush.message.event.assistant import SpeechRecognizedEvent
# Matches any phrase that contains either "play music" or "play the
# music"
@when(SpeechRecognizedEvent, phrase='play (the)? music')
def play_music():
run('music.mpd.play')
If at least one custom hook with a non-empty ``phrase`` string is matched,
then the default response will be disabled. If you still want the assistant
to say something when the event is handled, you can call
``event.assistant.render_response`` in the hook:
.. code-block:: python
from datetime import datetime
from textwrap import dedent
from time import time
from platypush import run, when
from platypush.message.event.assistant import SpeechRecognizedEvent
@when(SpeechRecognizedEvent, phrase='weather today')
def weather_forecast(event: SpeechRecognizedEvent):
limit = time() + 24 * 60 * 60 # 24 hours from now
forecast = [
weather
for weather in run("weather.openweathermap.get_forecast")
if datetime.fromisoformat(weather["time"]).timestamp() < limit
]
min_temp = round(
min(weather["temperature"] for weather in forecast)
)
max_temp = round(
max(weather["temperature"] for weather in forecast)
)
max_wind_gust = round(
(max(weather["wind_gust"] for weather in forecast)) * 3.6
)
summaries = [weather["summary"] for weather in forecast]
most_common_summary = max(summaries, key=summaries.count)
avg_cloud_cover = round(
sum(weather["cloud_cover"] for weather in forecast) / len(forecast)
)
event.assistant.render_response(
dedent(
f\"\"\"
The forecast for today is: {most_common_summary}, with
a minimum of {min_temp} and a maximum of {max_temp}
degrees, wind gust of {max_wind_gust} km/h, and an
average cloud cover of {avg_cloud_cover}%.
\"\"\"
)
)
Conversation follow-up
----------------------
A conversation will have a follow-up (i.e. the assistant will listen for a
phrase after rendering a response) if the response is not empty and ends
with a question mark. If you want to force a follow-up even if the response
doesn't end with a question mark, you can call :meth:`.start_conversation`
programmatically from your hooks.
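
For example, a minimal sketch of a hook that renders a plain statement and
then explicitly re-opens the conversation (the ``start dictation`` phrase
and the response text are just placeholders):

.. code-block:: python

    from platypush import run, when
    from platypush.message.event.assistant import SpeechRecognizedEvent

    @when(SpeechRecognizedEvent, phrase='start dictation')
    def start_dictation(event: SpeechRecognizedEvent):
        # The response doesn't end with a question mark, so no follow-up
        # would be triggered automatically
        event.assistant.render_response('Dictation started')
        # Explicitly ask the assistant to keep listening
        run('assistant.openai.start_conversation')
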
"""
def __init__(
self,
model: str = "whisper-1",
tts_plugin: Optional[str] = "tts.openai",
min_silence_secs: float = 1.0,
silence_threshold: int = -22,
sample_rate: int = 16000,
frame_size: int = 16384,
channels: int = 1,
conversation_start_timeout: float = 5.0,
conversation_end_timeout: float = 1.0,
conversation_max_duration: float = 15.0,
**kwargs,
):
"""
:param model: OpenAI model to use for audio transcription (default:
``whisper-1``).
:param tts_plugin: Name of the TTS plugin to use for rendering the responses
(default: ``tts.openai``).
:param min_silence_secs: Minimum silence duration in seconds to detect
the end of a conversation (default: 1.0 seconds).
:param silence_threshold: Silence threshold in dBFS (default: -22).
A value of 0 is the maximum amplitude, while values around -120
correspond to silent or nearly silent audio; the higher the
threshold, the more sensitive the silence detection will be.
:param sample_rate: Recording sample rate in Hz (default: 16000).
:param frame_size: Recording frame size in samples (default: 16384).
Note that it's important to make sure that ``frame_size`` /
``sample_rate`` isn't smaller than the minimum silence duration,
otherwise the silence detection won't work properly. With the
defaults, 16384 / 16000 = 1.024 seconds per frame, which is just
above the default ``min_silence_secs`` of 1.0 seconds.
:param channels: Number of recording channels (default: 1).
:param conversation_start_timeout: How long to wait for the
conversation to start (i.e. the first non-silent audio frame to be
detected) before giving up and stopping the recording (default: 5.0
seconds).
:param conversation_end_timeout: How many seconds of silence to wait
after the last non-silent audio frame before stopping the recording
(default: 1.0 seconds).
:param conversation_max_duration: Maximum conversation duration in seconds
(default: 15.0 seconds).
"""
kwargs["tts_plugin"] = tts_plugin
super().__init__(**kwargs)
self._model = model
self._min_silence_secs = min_silence_secs
self._silence_threshold = silence_threshold
self._sample_rate = sample_rate
self._frame_size = frame_size
self._channels = channels
self._conversation_start_timeout = conversation_start_timeout
self._conversation_end_timeout = conversation_end_timeout
self._conversation_max_duration = conversation_max_duration
self._start_recording_event = Event()
self._disable_default_response = False
self._recording_state = RecordingState(
sample_rate=sample_rate,
channels=channels,
min_silence_secs=min_silence_secs,
silence_threshold=silence_threshold,
)
self._recorder: Optional[AudioRecorder] = None
def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
return AudioSegment(
data.tobytes(),
frame_rate=self._sample_rate,
sample_width=data.dtype.itemsize,
channels=self._channels,
)
def _is_conversation_ended(self):
# End if the recording has been stopped
if not self._recorder or self._recorder.should_stop():
return True
# End if we reached the max conversation duration
if self._recording_state.duration >= self._conversation_max_duration:
return True
# End if the conversation hasn't started yet and we reached the
# conversation start timeout
if (
not self._recording_state.conversation_started
and self._recording_state.duration >= self._conversation_start_timeout
):
return True
# End if the conversation has started and the user has been silent for
# more than the conversation end timeout
if (
self._recording_state.conversation_started
and self._recording_state.silence_duration >= self._conversation_end_timeout
):
return True
return False
@property
def _openai(self) -> OpenaiPlugin:
openai: Optional[OpenaiPlugin] = get_plugin("openai")
assert openai, (
"OpenAI plugin not found. "
"Please configure the `openai` plugin to use `assistant.openai`"
)
return openai
def _get_prediction(self, audio: BytesIO) -> str:
return self._openai.transcribe_raw(
audio.getvalue(), extension='mp3', model=self._model
)
def _capture_audio(self, recorder: AudioRecorder):
while not self.should_stop() and not self._is_conversation_ended():
audio_data = recorder.read()
if not audio_data:
continue
self._recording_state.add_audio(audio_data)
def _audio_loop(self):
while not self.should_stop():
self._wait_recording_start()
self._recording_state.reset()
self._on_conversation_start()
try:
with AudioRecorder(
stop_event=self._should_stop,
sample_rate=self._sample_rate,
frame_size=self._frame_size,
channels=self._channels,
) as self._recorder:
self._capture_audio(self._recorder)
finally:
if self._recorder:
try:
self._recorder.stream.close()
except Exception as e:
self.logger.warning("Error closing the audio stream: %s", e)
self._recorder = None
if self._recording_state.is_silent():
self._on_conversation_timeout()
else:
audio = self._recording_state.export_audio()
text = self._get_prediction(audio)
self._on_speech_recognized(text)
def _wait_recording_start(self):
self._start_recording_event.wait()
self._start_recording_event.clear()
def _start_conversation(self, *_, **__):
self._disable_default_response = False
self._recording_state.reset()
self._start_recording_event.set()
def _stop_conversation(self, *_, **__):
self._disable_default_response = True
super()._stop_conversation()
self._recording_state.reset()
if self._recorder:
self._recorder.stop()
self._on_conversation_end()
def _on_speech_recognized(self, phrase: Optional[str]):
super()._on_speech_recognized(phrase)
# Dirty hack: wait a bit before stopping the conversation, so that
# event hooks triggered in other threads have a chance to handle the
# recognized speech first.
if self.stop_conversation_on_speech_match:
self.wait_stop(0.5)
if self.should_stop():
return
if self._disable_default_response:
self.logger.debug("Default response disabled, skipping response")
return
response = self._openai.get_response(phrase).output
if response:
self.render_response(response)
else:
self._on_no_response()
@action
def start_conversation(self, *_, **__):
"""
Start a conversation with the assistant. The conversation will be
automatically stopped after ``conversation_max_duration`` seconds of
audio, or after ``conversation_start_timeout`` seconds of silence
with no audio detected, or after ``conversation_end_timeout`` seconds
after the last non-silent audio frame has been detected, or when the
:meth:`.stop_conversation` method is called.
"""
self._start_conversation()
@action
def mute(self, *_, **__):
"""
.. note:: This plugin has no hotword detection, thus no continuous
audio detection. Speech processing is done on-demand through the
:meth:`.start_conversation` and :meth:`.stop_conversation` methods.
Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
implemented.
"""
self.logger.warning(
"assistant.openai.mute is not implemented because this plugin "
"has no hotword detection, and the only way to stop a conversation "
"is by calling stop_conversation()"
)
@action
def unmute(self, *_, **__):
"""
.. note:: This plugin has no hotword detection, thus no continuous
audio detection. Speech processing is done on-demand through the
:meth:`.start_conversation` and :meth:`.stop_conversation` methods.
Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
implemented.
"""
self.logger.warning(
"assistant.openai.unmute is not implemented because this plugin "
"has no hotword detection, and the only way to start a conversation "
"is by calling start_conversation()"
)
@action
def send_text_query(self, text: str, *_, **__):
"""
If the ``tts_plugin`` configuration is set, then the assistant will
process the given text query through
:meth:`platypush.plugins.openai.OpenaiPlugin.get_response` and render
the response through the specified TTS plugin.
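
Example (a minimal sketch; the query text is arbitrary):

.. code-block:: python

    from platypush import run

    response = run('assistant.openai.send_text_query',
                   text='What is the capital of France?')
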
:return: The response received from
:meth:`platypush.plugins.openai.OpenaiPlugin.get_response`.
"""
response = self._openai.get_response(text).output
self.render_response(response)
return response
def main(self):
while not self.should_stop():
try:
self._audio_loop()
except Exception as e:
self.logger.error("Audio loop error: %s", e, exc_info=True)
self.wait_stop(5)
finally:
self.stop_conversation()
def stop(self):
self._stop_conversation()
super().stop()


@@ -0,0 +1,80 @@
from io import BytesIO
from dataclasses import dataclass, field
from typing import List
import numpy as np
from pydub import AudioSegment, silence
from platypush.common.assistant import AudioFrame
@dataclass
class RecordingState:
"""
Current state of the audio recording.
"""
sample_rate: int
channels: int
min_silence_secs: float
silence_threshold: int
silence_duration: float = 0.0
audio_segments: List[AudioSegment] = field(default_factory=list)
duration: float = 0.0
conversation_started: bool = False
def _silence_duration(self, audio: AudioSegment) -> float:
silent_frames = [
(start / 1000, stop / 1000)
for start, stop in silence.detect_silence(
audio,
min_silence_len=int(self.min_silence_secs * 1000),
silence_thresh=int(self.silence_threshold),
)
]
return sum(stop - start for start, stop in silent_frames)
def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
return AudioSegment(
data.tobytes(),
frame_rate=self.sample_rate,
sample_width=data.dtype.itemsize,
channels=self.channels,
)
def _add_audio_segment(self, audio: AudioSegment):
self.audio_segments.append(audio)
self.duration += audio.duration_seconds
silence_duration = self._silence_duration(audio)
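# Treat the frame as mostly silent if at least 75% of its duration is below the silence threshold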
is_mostly_silent = silence_duration >= audio.duration_seconds * 0.75
if is_mostly_silent:
self.silence_duration += silence_duration
else:
self.conversation_started = True
self.silence_duration = 0.0
def is_silent(self) -> bool:
return self.silence_duration >= self.duration
def add_audio(self, audio: AudioFrame):
self._add_audio_segment(self._to_audio_segment(audio.data))
def export_audio(self) -> BytesIO:
buffer = BytesIO()
if not self.audio_segments:
return buffer
audio = self.audio_segments[0]
for segment in self.audio_segments[1:]:
audio += segment
audio.export(buffer, format="mp3", bitrate='92')
return buffer
def reset(self):
self.audio_segments.clear()
self.duration = 0.0
self.silence_duration = 0.0
self.conversation_started = False


@@ -0,0 +1,44 @@
{
"manifest": {
"package": "platypush.plugins.assistant.openai",
"type": "plugin",
"events": [
"platypush.message.event.assistant.ConversationEndEvent",
"platypush.message.event.assistant.ConversationStartEvent",
"platypush.message.event.assistant.ConversationTimeoutEvent",
"platypush.message.event.assistant.HotwordDetectedEvent",
"platypush.message.event.assistant.IntentRecognizedEvent",
"platypush.message.event.assistant.MicMutedEvent",
"platypush.message.event.assistant.MicUnmutedEvent",
"platypush.message.event.assistant.NoResponseEvent",
"platypush.message.event.assistant.ResponseEndEvent",
"platypush.message.event.assistant.ResponseEvent",
"platypush.message.event.assistant.SpeechRecognizedEvent"
],
"install": {
"apk": [
"ffmpeg",
"py3-numpy"
],
"apt": [
"ffmpeg",
"python3-numpy",
"python3-pydub"
],
"dnf": [
"ffmpeg",
"python-numpy"
],
"pacman": [
"ffmpeg",
"python-numpy",
"python-sounddevice"
],
"pip": [
"numpy",
"pydub",
"sounddevice"
]
}
}
}


@@ -562,11 +562,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
self._assistant.override_speech_model(model_file)
self._assistant.state = AssistantState.DETECTING_SPEECH
@action
def stop_conversation(self, *_, **__):
"""
Programmatically stop a running conversation with the assistant
"""
def _stop_conversation(self, *_, **__):
super()._stop_conversation()
if not self._assistant:
self.logger.warning('Assistant not initialized')
return


@@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, Sequence
import pvporcupine
from platypush.common.assistant import AudioRecorder
from platypush.context import get_plugin
from platypush.message.event.assistant import (
AssistantEvent,
@@ -16,8 +17,6 @@ from platypush.message.event.assistant import (
SpeechRecognizedEvent,
)
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
from ._recorder import AudioRecorder
from ._speech import SpeechProcessor
from ._state import AssistantState


@@ -0,0 +1,150 @@
import os
import tempfile
from contextlib import contextmanager
from multiprocessing import Process
from typing import Generator, Optional
import requests
from platypush.context import get_plugin
from platypush.plugins import action
from platypush.plugins.openai import OpenaiPlugin
from platypush.plugins.tts import TtsPlugin
class TtsOpenaiPlugin(TtsPlugin):
r"""
This plugin provides an interface to the `OpenAI text-to-speech API
<https://platform.openai.com/docs/guides/text-to-speech>`_.
It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
configured.
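
A minimal configuration sketch (the API key below is a placeholder):

.. code-block:: yaml

    openai:
        api_key: YOUR_OPENAI_API_KEY

    tts.openai:
        model: tts-1
        voice: nova

Text can then be rendered through the :meth:`.say` action.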
"""
_BUFSIZE = 1024
def __init__(
self,
model: str = 'tts-1',
voice: str = 'nova',
timeout: float = 10,
**kwargs,
):
"""
:param model: Model to be used for the text-to-speech conversion.
See the `OpenAI API models documentation
<https://platform.openai.com/docs/models/tts>`_ for the list of
available models (default: ``tts-1``).
:param voice: Default voice to be used. See the `OpenAI API
voices documentation
<https://platform.openai.com/docs/guides/text-to-speech/voice-options>`_
for the list of available voices (default: ``nova``).
:param timeout: Default timeout for the API requests (default: 10s).
"""
super().__init__(**kwargs)
openai = get_plugin('openai')
assert openai, 'openai plugin not configured'
self.openai: OpenaiPlugin = openai
self.model = model
self.voice = voice
self.timeout = timeout
self._audio_proc: Optional[Process] = None
def _process_response(
self,
response: requests.Response,
audio_file: str,
) -> Process:
def proc_fn():
try:
with open(audio_file, 'wb') as file:
for chunk in response.iter_content(chunk_size=self._BUFSIZE):
if chunk:
file.write(chunk)
file.flush()
except KeyboardInterrupt:
pass
self._audio_proc = Process(target=proc_fn, name='openai-tts-response-processor')
self._audio_proc.start()
return self._audio_proc
def _make_request(
self,
text: str,
model: Optional[str] = None,
voice: Optional[str] = None,
) -> requests.Response:
rs = requests.post(
"https://api.openai.com/v1/audio/speech",
timeout=self.timeout,
stream=True,
headers={
"Authorization": f"Bearer {self.openai._api_key}", # pylint: disable=protected-access
"Content-Type": "application/json",
},
json={
"model": model or self.model,
"voice": voice or self.voice,
"input": text,
},
)
rs.raise_for_status()
return rs
@contextmanager
def _audio_fifo(self) -> Generator[str, None, None]:
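# Create a temporary named pipe (FIFO) for the downloaded audio, so that
# playback can start while the API response is still being downloaded.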
fifo_dir = tempfile.mkdtemp()
fifo_path = os.path.join(fifo_dir, 'platypush-tts-openai-fifo')
os.mkfifo(fifo_path)
yield fifo_path
os.unlink(fifo_path)
os.rmdir(fifo_dir)
@action
def say(
self,
text: str,
*_,
model: Optional[str] = None,
voice: Optional[str] = None,
**player_args,
):
"""
Say some text.
:param text: Text to say.
:param model: Default ``model`` override.
:param voice: Default ``voice`` override.
:param player_args: Additional arguments to be passed to
:meth:`platypush.plugins.sound.SoundPlugin.play` (like volume,
duration, channels etc.).
"""
response_processor: Optional[Process] = None
try:
response = self._make_request(text, model=model, voice=voice)
with self._audio_fifo() as audio_file:
response_processor = self._process_response(
response=response, audio_file=audio_file
)
self._playback(audio_file, **player_args)
response_processor.join()
response_processor = None
finally:
if response_processor:
response_processor.terminate()
@action
def stop(self):
super().stop()
if self._audio_proc and self._audio_proc.is_alive():
self._audio_proc.terminate()
self._audio_proc.join()
# vim:sw=4:ts=4:et:


@@ -0,0 +1,34 @@
{
"manifest": {
"events": {},
"install": {
"apk": [
"ffmpeg",
"portaudio-dev",
"py3-numpy"
],
"apt": [
"ffmpeg",
"portaudio19-dev",
"python3-numpy"
],
"dnf": [
"ffmpeg",
"portaudio-devel",
"python-numpy"
],
"pacman": [
"ffmpeg",
"portaudio",
"python-numpy",
"python-sounddevice"
],
"pip": [
"numpy",
"sounddevice"
]
},
"package": "platypush.plugins.tts.openai",
"type": "plugin"
}
}


@@ -93,6 +93,7 @@ mock_imports = [
"pychromecast",
"pyclip",
"pydbus",
"pydub",
"pyfirmata2",
"pyngrok",
"pyotp",