From 7a8f30e5e06f2a911956a15204303caa541e7f33 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello
Date: Sun, 2 Jun 2024 17:28:04 +0200
Subject: [PATCH] [#384] Added `assistant.openai` and `tts.openai` plugins.

Closes: #384
---
 CHANGELOG.md                                   |   4 +
 .../platypush/plugins/assistant.openai.rst     |   5 +
 docs/source/platypush/plugins/tts.openai.rst   |   5 +
 docs/source/plugins.rst                        |   2 +
 .../backend/http/webapp/src/assets/icons.json  |   3 +
 .../assistant/__init__.py}                     |  76 +--
 platypush/common/assistant/_state.py           |  61 +++
 .../plugins/assistant/openai/__init__.py       | 447 ++++++++++++++++++
 platypush/plugins/assistant/openai/_state.py   |  80 ++++
 .../plugins/assistant/openai/manifest.json     |  44 ++
 .../plugins/assistant/picovoice/__init__.py    |   7 +-
 .../plugins/assistant/picovoice/_assistant.py  |   3 +-
 platypush/plugins/tts/openai/__init__.py       | 150 ++++++
 platypush/plugins/tts/openai/manifest.json     |  34 ++
 platypush/utils/mock/modules.py                |   1 +
 15 files changed, 851 insertions(+), 71 deletions(-)
 create mode 100644 docs/source/platypush/plugins/assistant.openai.rst
 create mode 100644 docs/source/platypush/plugins/tts.openai.rst
 rename platypush/{plugins/assistant/picovoice/_recorder.py => common/assistant/__init__.py} (67%)
 create mode 100644 platypush/common/assistant/_state.py
 create mode 100644 platypush/plugins/assistant/openai/__init__.py
 create mode 100644 platypush/plugins/assistant/openai/_state.py
 create mode 100644 platypush/plugins/assistant/openai/manifest.json
 create mode 100644 platypush/plugins/tts/openai/__init__.py
 create mode 100644 platypush/plugins/tts/openai/manifest.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52f084cc5f..941814af1c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## [1.0.7] - 2024-06-02
+
+- [#384] Added `assistant.openai` and `tts.openai` plugins.
+
 ## [1.0.6] - 2024-06-01
 
 - 🐛 Bug fix on one of the entities modules that prevented the application from
diff --git a/docs/source/platypush/plugins/assistant.openai.rst b/docs/source/platypush/plugins/assistant.openai.rst
new file mode 100644
index 0000000000..d5b4a08567
--- /dev/null
+++ b/docs/source/platypush/plugins/assistant.openai.rst
@@ -0,0 +1,5 @@
+``assistant.openai``
+====================
+
+.. automodule:: platypush.plugins.assistant.openai
+   :members:
diff --git a/docs/source/platypush/plugins/tts.openai.rst b/docs/source/platypush/plugins/tts.openai.rst
new file mode 100644
index 0000000000..a8f4d52f67
--- /dev/null
+++ b/docs/source/platypush/plugins/tts.openai.rst
@@ -0,0 +1,5 @@
+``tts.openai``
+==============
+
+..
automodule:: platypush.plugins.tts.openai + :members: diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst index 81659ade3f..5ee5802011 100644 --- a/docs/source/plugins.rst +++ b/docs/source/plugins.rst @@ -11,6 +11,7 @@ Plugins platypush/plugins/application.rst platypush/plugins/arduino.rst platypush/plugins/assistant.google.rst + platypush/plugins/assistant.openai.rst platypush/plugins/assistant.picovoice.rst platypush/plugins/autoremote.rst platypush/plugins/bluetooth.rst @@ -134,6 +135,7 @@ Plugins platypush/plugins/tts.rst platypush/plugins/tts.google.rst platypush/plugins/tts.mimic3.rst + platypush/plugins/tts.openai.rst platypush/plugins/tts.picovoice.rst platypush/plugins/tv.samsung.ws.rst platypush/plugins/twilio.rst diff --git a/platypush/backend/http/webapp/src/assets/icons.json b/platypush/backend/http/webapp/src/assets/icons.json index c78dd04b7b..f95532ac3b 100644 --- a/platypush/backend/http/webapp/src/assets/icons.json +++ b/platypush/backend/http/webapp/src/assets/icons.json @@ -8,6 +8,9 @@ "assistant.google": { "class": "fas fa-microphone-lines" }, + "assistant.openai": { + "class": "fas fa-microphone-lines" + }, "assistant.picovoice": { "class": "fas fa-microphone-lines" }, diff --git a/platypush/plugins/assistant/picovoice/_recorder.py b/platypush/common/assistant/__init__.py similarity index 67% rename from platypush/plugins/assistant/picovoice/_recorder.py rename to platypush/common/assistant/__init__.py index 523806be7b..42d47d69ce 100644 --- a/platypush/plugins/assistant/picovoice/_recorder.py +++ b/platypush/common/assistant/__init__.py @@ -1,8 +1,6 @@ -from collections import namedtuple -from dataclasses import dataclass, field from logging import getLogger from queue import Full, Queue -from threading import Event, RLock +from threading import Event from time import time from typing import Optional @@ -10,63 +8,7 @@ import sounddevice as sd from platypush.utils import wait_for_either - -AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp']) - - -@dataclass -class PauseState: - """ - Data class to hold the boilerplate (state + synchronization events) for the - audio recorder pause API. - """ - - _paused_event: Event = field(default_factory=Event) - _recording_event: Event = field(default_factory=Event) - _state_lock: RLock = field(default_factory=RLock) - - @property - def paused(self): - with self._state_lock: - return self._paused_event.is_set() - - def pause(self): - """ - Pause the audio recorder. - """ - with self._state_lock: - self._paused_event.set() - self._recording_event.clear() - - def resume(self): - """ - Resume the audio recorder. - """ - with self._state_lock: - self._paused_event.clear() - self._recording_event.set() - - def toggle(self): - """ - Toggle the audio recorder pause state. - """ - with self._state_lock: - if self.paused: - self.resume() - else: - self.pause() - - def wait_paused(self, timeout: Optional[float] = None): - """ - Wait until the audio recorder is paused. - """ - self._paused_event.wait(timeout=timeout) - - def wait_recording(self, timeout: Optional[float] = None): - """ - Wait until the audio recorder is resumed. - """ - self._recording_event.wait(timeout=timeout) +from ._state import AudioFrame, PauseState class AudioRecorder: @@ -112,9 +54,7 @@ class AudioRecorder: """ Start the audio stream. 
""" - self._stop_event.clear() - self.stream.start() - return self + return self.start() def __exit__(self, *_): """ @@ -145,6 +85,14 @@ class AudioRecorder: self.logger.debug('Audio queue is empty') return None + def start(self): + """ + Start the audio stream. + """ + self._stop_event.clear() + self.stream.start() + return self + def stop(self): """ Stop the audio stream. @@ -186,6 +134,6 @@ class AudioRecorder: wait_for_either( self._stop_event, self._upstream_stop_event, - self._paused_state._recording_event, + self._paused_state._recording_event, # pylint: disable=protected-access timeout=timeout, ) diff --git a/platypush/common/assistant/_state.py b/platypush/common/assistant/_state.py new file mode 100644 index 0000000000..0b69818fc3 --- /dev/null +++ b/platypush/common/assistant/_state.py @@ -0,0 +1,61 @@ +from collections import namedtuple +from dataclasses import dataclass, field +from threading import Event, RLock +from typing import Optional + +AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp']) + + +@dataclass +class PauseState: + """ + Data class to hold the boilerplate (state + synchronization events) for the + audio recorder pause API. + """ + + _paused_event: Event = field(default_factory=Event) + _recording_event: Event = field(default_factory=Event) + _state_lock: RLock = field(default_factory=RLock) + + @property + def paused(self): + with self._state_lock: + return self._paused_event.is_set() + + def pause(self): + """ + Pause the audio recorder. + """ + with self._state_lock: + self._paused_event.set() + self._recording_event.clear() + + def resume(self): + """ + Resume the audio recorder. + """ + with self._state_lock: + self._paused_event.clear() + self._recording_event.set() + + def toggle(self): + """ + Toggle the audio recorder pause state. + """ + with self._state_lock: + if self.paused: + self.resume() + else: + self.pause() + + def wait_paused(self, timeout: Optional[float] = None): + """ + Wait until the audio recorder is paused. + """ + self._paused_event.wait(timeout=timeout) + + def wait_recording(self, timeout: Optional[float] = None): + """ + Wait until the audio recorder is resumed. + """ + self._recording_event.wait(timeout=timeout) diff --git a/platypush/plugins/assistant/openai/__init__.py b/platypush/plugins/assistant/openai/__init__.py new file mode 100644 index 0000000000..2f06fb2d33 --- /dev/null +++ b/platypush/plugins/assistant/openai/__init__.py @@ -0,0 +1,447 @@ +from io import BytesIO +from threading import Event +from typing import Optional + +import numpy as np +from pydub import AudioSegment + +from platypush.common.assistant import AudioRecorder +from platypush.context import get_plugin +from platypush.plugins import RunnablePlugin, action +from platypush.plugins.assistant import AssistantPlugin +from platypush.plugins.openai import OpenaiPlugin + +from ._state import RecordingState + + +# pylint: disable=too-many-ancestors +class AssistantOpenaiPlugin(AssistantPlugin, RunnablePlugin): + """ + A voice assistant based on the OpenAI API. + + It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be + configured with an OpenAI API key. + + Hotword detection + ----------------- + + This plugin doesn't have hotword detection, as OpenAI doesn't provide + an API for that. Instead, the assistant can be started and stopped + programmatically through the :meth:`.start_conversation` action. 
+
+    If you want to implement hotword detection, you can use a separate plugin
+    such as
+    :class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.
+
+    The configuration in this case would be like:
+
+        .. code-block:: yaml
+
+            assistant.picovoice:
+                access_key: YOUR_PICOVOICE_ACCESS_KEY
+
+                # List of hotwords to listen for
+                keywords:
+                    - alexa
+                    - computer
+                    - ok google
+
+                # Disable speech-to-text and intent recognition, only use hotword
+                # detection
+                stt_enabled: false
+                hotword_enabled: true
+
+                conversation_start_sound: /sound/to/play/when/the/conversation/starts.mp3
+                # speech_model_path: /mnt/hd/models/picovoice/cheetah/custom-en.pv
+                # intent_model_path: /mnt/hd/models/picovoice/rhino/custom-en-x86.rhn
+
+            openai:
+                api_key: YOUR_OPENAI_API_KEY
+
+                # Customize your assistant's context and knowledge base to your
+                # liking
+                context:
+                    - role: system
+                      content: >
+                          You are a 16th century noble lady who talks in
+                          Shakespearean English to her peers.
+
+            # Enable the assistant plugin
+            assistant.openai:
+
+            # Enable the text-to-speech plugin
+            tts.openai:
+                # Customize the voice model
+                voice: nova
+
+    Then you can call :meth:`.start_conversation` when the hotword is detected
+    and a :class:`platypush.message.event.assistant.HotwordDetectedEvent` is
+    triggered:
+
+        .. code-block:: python
+
+            from platypush import run, when
+            from platypush.message.event.assistant import HotwordDetectedEvent
+
+            @when(HotwordDetectedEvent)
+            # You can also customize it by running a different assistant logic
+            # depending on the hotword
+            # @when(HotwordDetectedEvent, hotword='computer')
+            def on_hotword_detected():
+                run("assistant.openai.start_conversation")
+
+    This configuration will:
+
+        1. Start the hotword detection when the application starts.
+        2. Start the OpenAI assistant when the hotword is detected.
+
+    AI responses
+    ------------
+
+    By default (unless you set ``stop_conversation_on_speech_match`` to
+    ``False``), the plugin will:
+
+        1. Process the speech through the OpenAI API (the GPT model to be used
+           is configurable through the OpenAI plugin's ``model`` setting).
+
+        2. Render the response through the configured ``tts_plugin`` (default:
+           ``tts.openai``). If ``tts_plugin`` is not set, then the response
+           will be returned as a string.
+
+    Custom speech processing
+    ------------------------
+
+    You can create custom hooks on
+    :class:`platypush.message.event.assistant.SpeechRecognizedEvent` with
+    custom ``phrase`` strings or (regex) patterns. For example:
+
+        .. code-block:: python
+
+            from platypush import run, when
+            from platypush.message.event.assistant import SpeechRecognizedEvent
+
+            # Matches any phrase that contains either "play music" or "play the
+            # music"
+            @when(SpeechRecognizedEvent, phrase='play (the)? music')
+            def play_music():
+                run('music.mpd.play')
+
+    If at least one custom hook with a non-empty ``phrase`` string is matched,
+    then the default response will be disabled. If you still want the assistant
+    to say something when the event is handled, you can call
+    ``event.assistant.render_response`` from the hook:
+
+        .. code-block:: python
+
+            from datetime import datetime
+            from textwrap import dedent
+            from time import time
+
+            from platypush import run, when
+            from platypush.message.event.assistant import SpeechRecognizedEvent
+
+            @when(SpeechRecognizedEvent, phrase='weather today')
+            def weather_forecast(event: SpeechRecognizedEvent):
+                limit = time() + 24 * 60 * 60  # 24 hours from now
+                forecast = [
+                    weather
+                    for weather in run("weather.openweathermap.get_forecast")
+                    if datetime.fromisoformat(weather["time"]).timestamp() < limit
+                ]
+
+                min_temp = round(
+                    min(weather["temperature"] for weather in forecast)
+                )
+                max_temp = round(
+                    max(weather["temperature"] for weather in forecast)
+                )
+                max_wind_gust = round(
+                    (max(weather["wind_gust"] for weather in forecast)) * 3.6
+                )
+                summaries = [weather["summary"] for weather in forecast]
+                most_common_summary = max(summaries, key=summaries.count)
+                avg_cloud_cover = round(
+                    sum(weather["cloud_cover"] for weather in forecast) / len(forecast)
+                )
+
+                event.assistant.render_response(
+                    dedent(
+                        f\"\"\"
+                        The forecast for today is: {most_common_summary}, with
+                        a minimum of {min_temp} and a maximum of {max_temp}
+                        degrees, wind gust of {max_wind_gust} km/h, and an
+                        average cloud cover of {avg_cloud_cover}%.
+                        \"\"\"
+                    )
+                )
+
+    Conversation follow-up
+    ----------------------
+
+    A conversation will have a follow-up (i.e. the assistant will listen for a
+    phrase after rendering a response) if the response is not empty and ends
+    with a question mark. If you want to force a follow-up even if the response
+    doesn't end with a question mark, you can call :meth:`.start_conversation`
+    programmatically from your hooks.
+    """
+
+    def __init__(
+        self,
+        model: str = "whisper-1",
+        tts_plugin: Optional[str] = "tts.openai",
+        min_silence_secs: float = 1.0,
+        silence_threshold: int = -22,
+        sample_rate: int = 16000,
+        frame_size: int = 16384,
+        channels: int = 1,
+        conversation_start_timeout: float = 5.0,
+        conversation_end_timeout: float = 1.0,
+        conversation_max_duration: float = 15.0,
+        **kwargs,
+    ):
+        """
+        :param model: OpenAI model to use for audio transcription (default:
+            ``whisper-1``).
+        :param tts_plugin: Name of the TTS plugin to use for rendering the
+            responses (default: ``tts.openai``).
+        :param min_silence_secs: Minimum silence duration in seconds to detect
+            the end of a conversation (default: 1.0 seconds).
+        :param silence_threshold: Silence threshold in dBFS (default: -22).
+            The value of 0 is the maximum amplitude, and -120 corresponds to
+            silent or nearly silent audio, so the higher the value, the more
+            sensitive the silence detection will be.
+        :param sample_rate: Recording sample rate in Hz (default: 16000).
+        :param frame_size: Recording frame size in samples (default: 16384).
+            Note that ``frame_size`` / ``sample_rate`` (i.e. the duration of
+            each audio frame) shouldn't be smaller than the minimum silence
+            duration, otherwise the silence detection won't work properly.
+        :param channels: Number of recording channels (default: 1).
+        :param conversation_start_timeout: How long to wait for the
+            conversation to start (i.e. the first non-silent audio frame to be
+            detected) before giving up and stopping the recording (default: 5.0
+            seconds).
+        :param conversation_end_timeout: How many seconds of silence to wait
+            after the last non-silent audio frame before stopping the recording
+            (default: 1.0 seconds).
+        :param conversation_max_duration: Maximum conversation duration in seconds
+            (default: 15.0 seconds).
+ """ + kwargs["tts_plugin"] = tts_plugin + super().__init__(**kwargs) + + self._model = model + self._min_silence_secs = min_silence_secs + self._silence_threshold = silence_threshold + self._sample_rate = sample_rate + self._frame_size = frame_size + self._channels = channels + self._conversation_start_timeout = conversation_start_timeout + self._conversation_end_timeout = conversation_end_timeout + self._conversation_max_duration = conversation_max_duration + self._start_recording_event = Event() + self._disable_default_response = False + self._recording_state = RecordingState( + sample_rate=sample_rate, + channels=channels, + min_silence_secs=min_silence_secs, + silence_threshold=silence_threshold, + ) + + self._recorder: Optional[AudioRecorder] = None + + def _to_audio_segment(self, data: np.ndarray) -> AudioSegment: + return AudioSegment( + data.tobytes(), + frame_rate=self._sample_rate, + sample_width=data.dtype.itemsize, + channels=self._channels, + ) + + def _is_conversation_ended(self): + # End if the recording has been stopped + if not self._recorder or self._recorder.should_stop(): + return True + + # End if we reached the max conversation duration + if self._recording_state.duration >= self._conversation_max_duration: + return True + + # End if the conversation hasn't started yet and we reached the + # conversation start timeout + if ( + not self._recording_state.conversation_started + and self._recording_state.duration >= self._conversation_start_timeout + ): + return True + + # End if the conversation has started and the user has been silent for + # more than the conversation end timeout + if ( + self._recording_state.conversation_started + and self._recording_state.silence_duration >= self._conversation_end_timeout + ): + return True + + return False + + @property + def _openai(self) -> OpenaiPlugin: + openai: Optional[OpenaiPlugin] = get_plugin("openai") + assert openai, ( + "OpenAI plugin not found. 
" + "Please configure the `openai` plugin to use `assistant.openai`" + ) + return openai + + def _get_prediction(self, audio: BytesIO) -> str: + return self._openai.transcribe_raw( + audio.getvalue(), extension='mp3', model=self._model + ) + + def _capture_audio(self, recorder: AudioRecorder): + while not self.should_stop() and not self._is_conversation_ended(): + audio_data = recorder.read() + if not audio_data: + continue + + self._recording_state.add_audio(audio_data) + + def _audio_loop(self): + while not self.should_stop(): + self._wait_recording_start() + self._recording_state.reset() + self._on_conversation_start() + + try: + with AudioRecorder( + stop_event=self._should_stop, + sample_rate=self._sample_rate, + frame_size=self._frame_size, + channels=self._channels, + ) as self._recorder: + self._capture_audio(self._recorder) + finally: + if self._recorder: + try: + self._recorder.stream.close() + except Exception as e: + self.logger.warning("Error closing the audio stream: %s", e) + + self._recorder = None + + if self._recording_state.is_silent(): + self._on_conversation_timeout() + else: + audio = self._recording_state.export_audio() + text = self._get_prediction(audio) + self._on_speech_recognized(text) + + def _wait_recording_start(self): + self._start_recording_event.wait() + self._start_recording_event.clear() + + def _start_conversation(self, *_, **__): + self._disable_default_response = False + self._recording_state.reset() + self._start_recording_event.set() + + def _stop_conversation(self, *_, **__): + self._disable_default_response = True + super()._stop_conversation() + self._recording_state.reset() + if self._recorder: + self._recorder.stop() + + self._on_conversation_end() + + def _on_speech_recognized(self, phrase: Optional[str]): + super()._on_speech_recognized(phrase) + + # Dirty hack: wait a bit before stopping the conversation to make sure + # that there aren't event hooks triggered in other threads that are + # supposed to handle. + if self.stop_conversation_on_speech_match: + self.wait_stop(0.5) + if self.should_stop(): + return + + if self._disable_default_response: + self.logger.debug("Default response disabled, skipping response") + return + + response = self._openai.get_response(phrase).output + if response: + self.render_response(response) + else: + self._on_no_response() + + @action + def start_conversation(self, *_, **__): + """ + Start a conversation with the assistant. The conversation will be + automatically stopped after ``conversation_max_duration`` seconds of + audio, or after ``conversation_start_timeout`` seconds of silence + with no audio detected, or after ``conversation_end_timeout`` seconds + after the last non-silent audio frame has been detected, or when the + :meth:`.stop_conversation` method is called. + """ + self._start_conversation() + + @action + def mute(self, *_, **__): + """ + .. note:: This plugin has no hotword detection, thus no continuous + audio detection. Speech processing is done on-demand through the + :meth:`.start_conversation` and :meth:`.stop_conversation` methods. + Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not + implemented. + """ + self.logger.warning( + "assistant.openai.mute is not implemented because this plugin " + "has no hotword detection, and the only way to stop a conversation " + "is by calling stop_conversation()" + ) + + @action + def unmute(self, *_, **__): + """ + .. note:: This plugin has no hotword detection, thus no continuous + audio detection. 
Speech processing is done on-demand through the + :meth:`.start_conversation` and :meth:`.stop_conversation` methods. + Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not + implemented. + """ + self.logger.warning( + "assistant.openai.unmute is not implemented because this plugin " + "has no hotword detection, and the only way to start a conversation " + "is by calling start_conversation()" + ) + + @action + def send_text_query(self, text: str, *_, **__): + """ + If the ``tts_plugin`` configuration is set, then the assistant will + process the given text query through + :meth:`platypush.plugins.openai.OpenaiPlugin.get_response` and render + the response through the specified TTS plugin. + + :return: The response received from + :meth:`platypush.plugins.openai.OpenaiPlugin.get_response`. + """ + response = self._openai.get_response(text).output + self.render_response(response) + return response + + def main(self): + while not self.should_stop(): + try: + self._audio_loop() + except Exception as e: + self.logger.error("Audio loop error: %s", e, exc_info=True) + self.wait_stop(5) + finally: + self.stop_conversation() + + def stop(self): + self._stop_conversation() + super().stop() diff --git a/platypush/plugins/assistant/openai/_state.py b/platypush/plugins/assistant/openai/_state.py new file mode 100644 index 0000000000..c695e10879 --- /dev/null +++ b/platypush/plugins/assistant/openai/_state.py @@ -0,0 +1,80 @@ +from io import BytesIO +from dataclasses import dataclass, field +from typing import List + +import numpy as np +from pydub import AudioSegment, silence + +from platypush.common.assistant import AudioFrame + + +@dataclass +class RecordingState: + """ + Current state of the audio recording. + """ + + sample_rate: int + channels: int + min_silence_secs: float + silence_threshold: int + silence_duration: float = 0.0 + audio_segments: List[AudioSegment] = field(default_factory=list) + duration: float = 0.0 + conversation_started: bool = False + + def _silence_duration(self, audio: AudioSegment) -> float: + silent_frames = [ + (start / 1000, stop / 1000) + for start, stop in silence.detect_silence( + audio, + min_silence_len=int(self.min_silence_secs * 1000), + silence_thresh=int(self.silence_threshold), + ) + ] + + return sum(stop - start for start, stop in silent_frames) + + def _to_audio_segment(self, data: np.ndarray) -> AudioSegment: + return AudioSegment( + data.tobytes(), + frame_rate=self.sample_rate, + sample_width=data.dtype.itemsize, + channels=self.channels, + ) + + def _add_audio_segment(self, audio: AudioSegment): + self.audio_segments.append(audio) + self.duration += audio.duration_seconds + silence_duration = self._silence_duration(audio) + is_mostly_silent = silence_duration >= audio.duration_seconds * 0.75 + + if is_mostly_silent: + self.silence_duration += silence_duration + else: + self.conversation_started = True + self.silence_duration = 0.0 + + def is_silent(self) -> bool: + return self.silence_duration >= self.duration + + def add_audio(self, audio: AudioFrame): + self._add_audio_segment(self._to_audio_segment(audio.data)) + + def export_audio(self) -> BytesIO: + buffer = BytesIO() + if not self.audio_segments: + return buffer + + audio = self.audio_segments[0] + for segment in self.audio_segments[1:]: + audio += segment + + audio.export(buffer, format="mp3", bitrate='92') + return buffer + + def reset(self): + self.audio_segments.clear() + self.duration = 0.0 + self.silence_duration = 0.0 + self.conversation_started = False diff --git 
a/platypush/plugins/assistant/openai/manifest.json b/platypush/plugins/assistant/openai/manifest.json new file mode 100644 index 0000000000..7263b901a6 --- /dev/null +++ b/platypush/plugins/assistant/openai/manifest.json @@ -0,0 +1,44 @@ +{ + "manifest": { + "package": "platypush.plugins.assistant.openai", + "type": "plugin", + "events": [ + "platypush.message.event.assistant.ConversationEndEvent", + "platypush.message.event.assistant.ConversationStartEvent", + "platypush.message.event.assistant.ConversationTimeoutEvent", + "platypush.message.event.assistant.HotwordDetectedEvent", + "platypush.message.event.assistant.IntentRecognizedEvent", + "platypush.message.event.assistant.MicMutedEvent", + "platypush.message.event.assistant.MicUnmutedEvent", + "platypush.message.event.assistant.NoResponseEvent", + "platypush.message.event.assistant.ResponseEndEvent", + "platypush.message.event.assistant.ResponseEvent", + "platypush.message.event.assistant.SpeechRecognizedEvent" + ], + "install": { + "apk": [ + "ffmpeg", + "py3-numpy" + ], + "apt": [ + "ffmpeg", + "python3-numpy", + "python3-pydub" + ], + "dnf": [ + "ffmpeg", + "python-numpy" + ], + "pacman": [ + "ffmpeg", + "python-numpy", + "python-sounddevice" + ], + "pip": [ + "numpy", + "pydub", + "sounddevice" + ] + } + } +} diff --git a/platypush/plugins/assistant/picovoice/__init__.py b/platypush/plugins/assistant/picovoice/__init__.py index 4a32a7978c..55e7bc5b56 100644 --- a/platypush/plugins/assistant/picovoice/__init__.py +++ b/platypush/plugins/assistant/picovoice/__init__.py @@ -562,11 +562,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin): self._assistant.override_speech_model(model_file) self._assistant.state = AssistantState.DETECTING_SPEECH - @action - def stop_conversation(self, *_, **__): - """ - Programmatically stop a running conversation with the assistant - """ + def _stop_conversation(self, *_, **__): + super()._stop_conversation() if not self._assistant: self.logger.warning('Assistant not initialized') return diff --git a/platypush/plugins/assistant/picovoice/_assistant.py b/platypush/plugins/assistant/picovoice/_assistant.py index 9fcf66d04e..c76fcad717 100644 --- a/platypush/plugins/assistant/picovoice/_assistant.py +++ b/platypush/plugins/assistant/picovoice/_assistant.py @@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, Sequence import pvporcupine +from platypush.common.assistant import AudioRecorder from platypush.context import get_plugin from platypush.message.event.assistant import ( AssistantEvent, @@ -16,8 +17,6 @@ from platypush.message.event.assistant import ( SpeechRecognizedEvent, ) from platypush.plugins.tts.picovoice import TtsPicovoicePlugin - -from ._recorder import AudioRecorder from ._speech import SpeechProcessor from ._state import AssistantState diff --git a/platypush/plugins/tts/openai/__init__.py b/platypush/plugins/tts/openai/__init__.py new file mode 100644 index 0000000000..2cd04c7e26 --- /dev/null +++ b/platypush/plugins/tts/openai/__init__.py @@ -0,0 +1,150 @@ +import os +import tempfile +from contextlib import contextmanager +from multiprocessing import Process +from typing import Generator, Optional + +import requests + +from platypush.context import get_plugin +from platypush.plugins import action +from platypush.plugins.openai import OpenaiPlugin +from platypush.plugins.tts import TtsPlugin + + +class TtsOpenaiPlugin(TtsPlugin): + r""" + This plugin provides an interface to the `OpenAI text-to-speech API + `_. 
+ + It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be + configured. + """ + + _BUFSIZE = 1024 + + def __init__( + self, + model: str = 'tts-1', + voice: str = 'nova', + timeout: float = 10, + **kwargs, + ): + """ + :param model: Model to be used for the text-to-speech conversion. + See the `OpenAI API models documentation + `_ for the list of + available models (default: ``tts-1``). + :param voice: Default voice to be used. See the `OpenAI API + voices documentation + `_ + for the list of available voices (default: ``nova``). + :param timeout: Default timeout for the API requests (default: 10s). + """ + super().__init__(**kwargs) + openai = get_plugin('openai') + assert openai, 'openai plugin not configured' + + self.openai: OpenaiPlugin = openai + self.model = model + self.voice = voice + self.timeout = timeout + self._audio_proc: Optional[Process] = None + + def _process_response( + self, + response: requests.Response, + audio_file: str, + ) -> Process: + def proc_fn(): + try: + with open(audio_file, 'wb') as file: + for chunk in response.iter_content(chunk_size=self._BUFSIZE): + if chunk: + file.write(chunk) + file.flush() + except KeyboardInterrupt: + pass + + self._audio_proc = Process(target=proc_fn, name='openai-tts-response-processor') + self._audio_proc.start() + return self._audio_proc + + def _make_request( + self, + text: str, + model: Optional[str] = None, + voice: Optional[str] = None, + ) -> requests.Response: + rs = requests.post( + "https://api.openai.com/v1/audio/speech", + timeout=self.timeout, + stream=True, + headers={ + "Authorization": f"Bearer {self.openai._api_key}", # pylint: disable=protected-access + "Content-Type": "application/json", + }, + json={ + "model": model or self.model, + "voice": voice or self.voice, + "input": text, + }, + ) + + rs.raise_for_status() + return rs + + @contextmanager + def _audio_fifo(self) -> Generator[str, None, None]: + fifo_dir = tempfile.mkdtemp() + fifo_path = os.path.join(fifo_dir, 'platypush-tts-openai-fifo') + os.mkfifo(fifo_path) + yield fifo_path + + os.unlink(fifo_path) + os.rmdir(fifo_dir) + + @action + def say( + self, + text: str, + *_, + model: Optional[str] = None, + voice: Optional[str] = None, + **player_args, + ): + """ + Say some text. + + :param text: Text to say. + :param model: Default ``model`` override. + :param voice: Default ``voice`` override. + :param player_args: Extends the additional arguments to be passed to + :meth:`platypush.plugins.sound.SoundPlugin.play` (like volume, + duration, channels etc.). 
+ """ + response_processor: Optional[Process] = None + + try: + response = self._make_request(text, model=model, voice=voice) + + with self._audio_fifo() as audio_file: + response_processor = self._process_response( + response=response, audio_file=audio_file + ) + self._playback(audio_file, **player_args) + response_processor.join() + response_processor = None + finally: + if response_processor: + response_processor.terminate() + + @action + def stop(self): + super().stop() + if self._audio_proc and self._audio_proc.is_alive(): + self._audio_proc.terminate() + self._audio_proc.join() + + +# vim:sw=4:ts=4:et: diff --git a/platypush/plugins/tts/openai/manifest.json b/platypush/plugins/tts/openai/manifest.json new file mode 100644 index 0000000000..271bd03f0f --- /dev/null +++ b/platypush/plugins/tts/openai/manifest.json @@ -0,0 +1,34 @@ +{ + "manifest": { + "events": {}, + "install": { + "apk": [ + "ffmpeg", + "portaudio-dev", + "py3-numpy" + ], + "apt": [ + "ffmpeg", + "portaudio19-dev", + "python3-numpy" + ], + "dnf": [ + "ffmpeg", + "portaudio-devel", + "python-numpy" + ], + "pacman": [ + "ffmpeg", + "portaudio", + "python-numpy", + "python-sounddevice" + ], + "pip": [ + "numpy", + "sounddevice" + ] + }, + "package": "platypush.plugins.tts.openai", + "type": "plugin" + } +} diff --git a/platypush/utils/mock/modules.py b/platypush/utils/mock/modules.py index 69968f0116..86adf89511 100644 --- a/platypush/utils/mock/modules.py +++ b/platypush/utils/mock/modules.py @@ -93,6 +93,7 @@ mock_imports = [ "pychromecast", "pyclip", "pydbus", + "pydub", "pyfirmata2", "pyngrok", "pyotp",