[#384] Added assistant.openai and tts.openai plugins.
All checks were successful
continuous-integration/drone/push Build is passing
Closes: #384
This commit is contained in:
parent c3673391f7
commit fa52bbfb5b
15 changed files with 851 additions and 71 deletions
|
@@ -1,5 +1,9 @@
|
|||
# Changelog
|
||||
|
||||
## [1.0.7] - 2024-06-02
|
||||
|
||||
- [#384] Added `assistant.openai` and `tts.openai` plugins.
|
||||
|
||||
## [1.0.6] - 2024-06-01
|
||||
|
||||
- 🐛 Bug fix on one of the entities modules that prevented the application from
|
||||
|
|
5
docs/source/platypush/plugins/assistant.openai.rst
Normal file
|
@@ -0,0 +1,5 @@
|
|||
``assistant.openai``
|
||||
====================
|
||||
|
||||
.. automodule:: platypush.plugins.assistant.openai
|
||||
:members:
|
5
docs/source/platypush/plugins/tts.openai.rst
Normal file
|
@@ -0,0 +1,5 @@
|
|||
``tts.openai``
|
||||
==============
|
||||
|
||||
.. automodule:: platypush.plugins.tts.openai
|
||||
:members:
|
|
@@ -11,6 +11,7 @@ Plugins
|
|||
platypush/plugins/application.rst
|
||||
platypush/plugins/arduino.rst
|
||||
platypush/plugins/assistant.google.rst
|
||||
platypush/plugins/assistant.openai.rst
|
||||
platypush/plugins/assistant.picovoice.rst
|
||||
platypush/plugins/autoremote.rst
|
||||
platypush/plugins/bluetooth.rst
|
||||
|
@@ -134,6 +135,7 @@ Plugins
|
|||
platypush/plugins/tts.rst
|
||||
platypush/plugins/tts.google.rst
|
||||
platypush/plugins/tts.mimic3.rst
|
||||
platypush/plugins/tts.openai.rst
|
||||
platypush/plugins/tts.picovoice.rst
|
||||
platypush/plugins/tv.samsung.ws.rst
|
||||
platypush/plugins/twilio.rst
|
||||
|
|
|
@@ -8,6 +8,9 @@
|
|||
"assistant.google": {
|
||||
"class": "fas fa-microphone-lines"
|
||||
},
|
||||
"assistant.openai": {
|
||||
"class": "fas fa-microphone-lines"
|
||||
},
|
||||
"assistant.picovoice": {
|
||||
"class": "fas fa-microphone-lines"
|
||||
},
|
||||
|
|
|
@@ -1,8 +1,6 @@
|
|||
from collections import namedtuple
|
||||
from dataclasses import dataclass, field
|
||||
from logging import getLogger
|
||||
from queue import Full, Queue
|
||||
from threading import Event, RLock
|
||||
from threading import Event
|
||||
from time import time
|
||||
from typing import Optional
|
||||
|
||||
|
@@ -10,63 +8,7 @@ import sounddevice as sd
|
|||
|
||||
from platypush.utils import wait_for_either
|
||||
|
||||
|
||||
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
|
||||
|
||||
|
||||
@dataclass
|
||||
class PauseState:
|
||||
"""
|
||||
Data class to hold the boilerplate (state + synchronization events) for the
|
||||
audio recorder pause API.
|
||||
"""
|
||||
|
||||
_paused_event: Event = field(default_factory=Event)
|
||||
_recording_event: Event = field(default_factory=Event)
|
||||
_state_lock: RLock = field(default_factory=RLock)
|
||||
|
||||
@property
|
||||
def paused(self):
|
||||
with self._state_lock:
|
||||
return self._paused_event.is_set()
|
||||
|
||||
def pause(self):
|
||||
"""
|
||||
Pause the audio recorder.
|
||||
"""
|
||||
with self._state_lock:
|
||||
self._paused_event.set()
|
||||
self._recording_event.clear()
|
||||
|
||||
def resume(self):
|
||||
"""
|
||||
Resume the audio recorder.
|
||||
"""
|
||||
with self._state_lock:
|
||||
self._paused_event.clear()
|
||||
self._recording_event.set()
|
||||
|
||||
def toggle(self):
|
||||
"""
|
||||
Toggle the audio recorder pause state.
|
||||
"""
|
||||
with self._state_lock:
|
||||
if self.paused:
|
||||
self.resume()
|
||||
else:
|
||||
self.pause()
|
||||
|
||||
def wait_paused(self, timeout: Optional[float] = None):
|
||||
"""
|
||||
Wait until the audio recorder is paused.
|
||||
"""
|
||||
self._paused_event.wait(timeout=timeout)
|
||||
|
||||
def wait_recording(self, timeout: Optional[float] = None):
|
||||
"""
|
||||
Wait until the audio recorder is resumed.
|
||||
"""
|
||||
self._recording_event.wait(timeout=timeout)
|
||||
from ._state import AudioFrame, PauseState
|
||||
|
||||
|
||||
class AudioRecorder:
|
||||
|
@@ -112,9 +54,7 @@ class AudioRecorder:
|
|||
"""
|
||||
Start the audio stream.
|
||||
"""
|
||||
self._stop_event.clear()
|
||||
self.stream.start()
|
||||
return self
|
||||
return self.start()
|
||||
|
||||
def __exit__(self, *_):
|
||||
"""
|
||||
|
@@ -145,6 +85,14 @@ class AudioRecorder:
|
|||
self.logger.debug('Audio queue is empty')
|
||||
return None
|
||||
|
||||
def start(self):
|
||||
"""
|
||||
Start the audio stream.
|
||||
"""
|
||||
self._stop_event.clear()
|
||||
self.stream.start()
|
||||
return self
|
||||
|
||||
def stop(self):
|
||||
"""
|
||||
Stop the audio stream.
|
||||
|
@@ -186,6 +134,6 @@ class AudioRecorder:
|
|||
wait_for_either(
|
||||
self._stop_event,
|
||||
self._upstream_stop_event,
|
||||
self._paused_state._recording_event,
|
||||
self._paused_state._recording_event, # pylint: disable=protected-access
|
||||
timeout=timeout,
|
||||
)
|
61
platypush/common/assistant/_state.py
Normal file
|
@@ -0,0 +1,61 @@
|
|||
from collections import namedtuple
|
||||
from dataclasses import dataclass, field
|
||||
from threading import Event, RLock
|
||||
from typing import Optional
|
||||
|
||||
AudioFrame = namedtuple('AudioFrame', ['data', 'timestamp'])
|
||||
|
||||
|
||||
@dataclass
|
||||
class PauseState:
|
||||
"""
|
||||
Data class to hold the boilerplate (state + synchronization events) for the
|
||||
audio recorder pause API.
|
||||
"""
|
||||
|
||||
_paused_event: Event = field(default_factory=Event)
|
||||
_recording_event: Event = field(default_factory=Event)
|
||||
_state_lock: RLock = field(default_factory=RLock)
|
||||
|
||||
@property
|
||||
def paused(self):
|
||||
with self._state_lock:
|
||||
return self._paused_event.is_set()
|
||||
|
||||
def pause(self):
|
||||
"""
|
||||
Pause the audio recorder.
|
||||
"""
|
||||
with self._state_lock:
|
||||
self._paused_event.set()
|
||||
self._recording_event.clear()
|
||||
|
||||
def resume(self):
|
||||
"""
|
||||
Resume the audio recorder.
|
||||
"""
|
||||
with self._state_lock:
|
||||
self._paused_event.clear()
|
||||
self._recording_event.set()
|
||||
|
||||
def toggle(self):
|
||||
"""
|
||||
Toggle the audio recorder pause state.
|
||||
"""
|
||||
with self._state_lock:
|
||||
if self.paused:
|
||||
self.resume()
|
||||
else:
|
||||
self.pause()
|
||||
|
||||
def wait_paused(self, timeout: Optional[float] = None):
|
||||
"""
|
||||
Wait until the audio recorder is paused.
|
||||
"""
|
||||
self._paused_event.wait(timeout=timeout)
|
||||
|
||||
def wait_recording(self, timeout: Optional[float] = None):
|
||||
"""
|
||||
Wait until the audio recorder is resumed.
|
||||
"""
|
||||
self._recording_event.wait(timeout=timeout)
|
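
For reference, a minimal sketch of how the pause API extracted into ``_state.py`` can be driven (illustrative only, not part of this commit; it assumes ``PauseState`` is imported from the ``platypush.common.assistant._state`` module added above):

.. code-block:: python

    # Illustrative sketch: driving the PauseState pause/resume API that
    # _recorder.py now imports from _state.py.
    from platypush.common.assistant._state import PauseState

    state = PauseState()
    state.pause()                    # sets the paused event, clears the recording event
    assert state.paused
    state.toggle()                   # paused -> recording
    state.wait_recording(timeout=1)  # returns immediately: the recording event is set
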
447
platypush/plugins/assistant/openai/__init__.py
Normal file
|
@@ -0,0 +1,447 @@
|
|||
from io import BytesIO
|
||||
from threading import Event
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from pydub import AudioSegment
|
||||
|
||||
from platypush.common.assistant import AudioRecorder
|
||||
from platypush.context import get_plugin
|
||||
from platypush.plugins import RunnablePlugin, action
|
||||
from platypush.plugins.assistant import AssistantPlugin
|
||||
from platypush.plugins.openai import OpenaiPlugin
|
||||
|
||||
from ._state import RecordingState
|
||||
|
||||
|
||||
# pylint: disable=too-many-ancestors
|
||||
class AssistantOpenaiPlugin(AssistantPlugin, RunnablePlugin):
|
||||
"""
|
||||
A voice assistant based on the OpenAI API.
|
||||
|
||||
It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
|
||||
configured with an OpenAI API key.
|
||||
|
||||
Hotword detection
|
||||
-----------------
|
||||
|
||||
This plugin doesn't have hotword detection, as OpenAI doesn't provide
|
||||
an API for that. Instead, the assistant can be started and stopped
|
||||
programmatically through the :meth:`.start_conversation` action.
|
||||
|
||||
If you want to implement hotword detection, you can use a separate plugin
|
||||
such as
|
||||
:class:`platypush.plugins.assistant.picovoice.AssistantPicovoicePlugin`.
|
||||
|
||||
The configuration in this case would be like:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
assistant.picovoice:
|
||||
access_key: YOUR_PICOVOICE_ACCESS_KEY
|
||||
|
||||
# List of hotwords to listen for
|
||||
keywords:
|
||||
- alexa
|
||||
- computer
|
||||
- ok google
|
||||
|
||||
# Disable speech-to-text and intent recognition, only use hotword
|
||||
# detection
|
||||
stt_enabled: false
|
||||
hotword_enabled: true
|
||||
|
||||
conversation_start_sound: /sound/to/play/when/the/conversation/starts.mp3
|
||||
# speech_model_path: /mnt/hd/models/picovoice/cheetah/custom-en.pv
|
||||
# intent_model_path: /mnt/hd/models/picovoice/rhino/custom-en-x86.rhn
|
||||
|
||||
openai:
|
||||
api_key: YOUR_OPENAI_API_KEY
|
||||
|
||||
# Customize your assistant's context and knowledge base to your
|
||||
# liking
|
||||
context:
|
||||
- role: system
|
||||
content: >
|
||||
You are a 16th century noble lady who talks in
|
||||
Shakespearean English to her peers.
|
||||
|
||||
# Enable the assistant plugin
|
||||
assistant.openai:
|
||||
|
||||
# Enable the text-to-speech plugin
|
||||
tts.openai:
|
||||
# Customize the voice model
|
||||
voice: nova
|
||||
|
||||
Then you can call :meth:`.start_conversation` when the hotword is detected and a
|
||||
:class:`platypush.message.event.assistant.HotwordDetectedEvent` is
|
||||
triggered:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import run, when
|
||||
from platypush.message.event.assistant import HotwordDetectedEvent
|
||||
|
||||
@when(HotwordDetectedEvent)
|
||||
# You can also customize it by running a different assistant logic
|
||||
# depending on the hotword
|
||||
# @when(HotwordDetectedEvent, hotword='computer')
|
||||
def on_hotword_detected():
|
||||
run("assistant.openai.start_conversation")
|
||||
|
||||
This configuration will:
|
||||
|
||||
1. Start the hotword detection when the application starts.
|
||||
2. Start the OpenAI assistant when the hotword is detected.
|
||||
|
||||
AI responses
|
||||
------------
|
||||
|
||||
By default (unless you set ``stop_conversation_on_speech_match`` to ``False``),
|
||||
the plugin will:
|
||||
|
||||
1. Process the speech through the OpenAI API (the GPT model to be used is
|
||||
configurable in the OpenAI plugin ``model`` configuration).
|
||||
|
||||
2. Render the response through the configured ``tts_plugin`` (default:
|
||||
``tts.openai``). If ``tts_plugin`` is not set, then the response will
|
||||
be returned as a string.
|
||||
|
||||
Custom speech processing
|
||||
------------------------
|
||||
|
||||
You can create custom hooks on
|
||||
:class:`platypush.message.event.assistant.SpeechRecognizedEvent` with
|
||||
custom ``phrase`` strings or (regex) patterns. For example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from platypush import run, when
|
||||
from platypush.message.event.assistant import SpeechRecognizedEvent
|
||||
|
||||
# Matches any phrase that contains either "play music" or "play the
|
||||
# music"
|
||||
@when(SpeechRecognizedEvent, phrase='play (the)? music')
|
||||
def play_music():
|
||||
run('music.mpd.play')
|
||||
|
||||
If at least one custom hook with a non-empty ``phrase`` string is matched,
|
||||
then the default response will be disabled. If you still want the assistant
|
||||
to say something when the event is handled, you can call
|
||||
``event.assistant.render_response`` on the hook:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from datetime import datetime
|
||||
from textwrap import dedent
|
||||
from time import time
|
||||
|
||||
from platypush import run, when
|
||||
from platypush.message.event.assistant import SpeechRecognizedEvent
|
||||
|
||||
@when(SpeechRecognizedEvent, phrase='weather today')
|
||||
def weather_forecast(event: SpeechRecognizedEvent):
|
||||
limit = time() + 24 * 60 * 60 # 24 hours from now
|
||||
forecast = [
|
||||
weather
|
||||
for weather in run("weather.openweathermap.get_forecast")
|
||||
if datetime.fromisoformat(weather["time"]).timestamp() < limit
|
||||
]
|
||||
|
||||
min_temp = round(
|
||||
min(weather["temperature"] for weather in forecast)
|
||||
)
|
||||
max_temp = round(
|
||||
max(weather["temperature"] for weather in forecast)
|
||||
)
|
||||
max_wind_gust = round(
|
||||
(max(weather["wind_gust"] for weather in forecast)) * 3.6
|
||||
)
|
||||
summaries = [weather["summary"] for weather in forecast]
|
||||
most_common_summary = max(summaries, key=summaries.count)
|
||||
avg_cloud_cover = round(
|
||||
sum(weather["cloud_cover"] for weather in forecast) / len(forecast)
|
||||
)
|
||||
|
||||
event.assistant.render_response(
|
||||
dedent(
|
||||
f\"\"\"
|
||||
The forecast for today is: {most_common_summary}, with
|
||||
a minimum of {min_temp} and a maximum of {max_temp}
|
||||
degrees, wind gust of {max_wind_gust} km/h, and an
|
||||
average cloud cover of {avg_cloud_cover}%.
|
||||
\"\"\"
|
||||
)
|
||||
)
|
||||
|
||||
Conversation follow-up
|
||||
----------------------
|
||||
|
||||
A conversation will have a follow-up (i.e. the assistant will listen for a
|
||||
phrase after rendering a response) if the response is not empty and ends
|
||||
with a question mark. If you want to force a follow-up even if the response
|
||||
doesn't end with a question mark, you can call :meth:`.start_conversation`
|
||||
programmatically from your hooks.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str = "whisper-1",
|
||||
tts_plugin: Optional[str] = "tts.openai",
|
||||
min_silence_secs: float = 1.0,
|
||||
silence_threshold: int = -22,
|
||||
sample_rate: int = 16000,
|
||||
frame_size: int = 16384,
|
||||
channels: int = 1,
|
||||
conversation_start_timeout: float = 5.0,
|
||||
conversation_end_timeout: float = 1.0,
|
||||
conversation_max_duration: float = 15.0,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
:param model: OpenAI model to use for audio transcription (default:
|
||||
``whisper-1``).
|
||||
:param tts_plugin: Name of the TTS plugin to use for rendering the responses
|
||||
(default: ``tts.openai``).
|
||||
:param min_silence_secs: Minimum silence duration in seconds to detect
|
||||
the end of a conversation (default: 1.0 seconds).
|
||||
:param silence_threshold: Silence threshold in dBFS (default: -22).
|
||||
The value of 0 is the maximum amplitude, while -120 corresponds to
|
||||
silent or nearly silent audio, so the higher the value, the more
|
||||
sensitive the silence detection will be.
|
||||
:param sample_rate: Recording sample rate in Hz (default: 16000).
|
||||
:param frame_size: Recording frame size in samples (default: 16384).
|
||||
Note that it's important to make sure that ``frame_size`` /
|
||||
``sample_rate`` isn't smaller than the minimum silence duration,
|
||||
otherwise the silence detection won't work properly.
|
||||
:param channels: Number of recording channels (default: 1).
|
||||
:param conversation_start_timeout: How long to wait for the
|
||||
conversation to start (i.e. the first non-silent audio frame to be
|
||||
detected) before giving up and stopping the recording (default: 5.0
|
||||
seconds).
|
||||
:param conversation_end_timeout: How many seconds of silence to wait
|
||||
after the last non-silent audio frame before stopping the recording
|
||||
(default: 1.0 seconds).
|
||||
:param conversation_max_duration: Maximum conversation duration in seconds
|
||||
(default: 15.0 seconds).
|
||||
"""
|
||||
kwargs["tts_plugin"] = tts_plugin
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._model = model
|
||||
self._min_silence_secs = min_silence_secs
|
||||
self._silence_threshold = silence_threshold
|
||||
self._sample_rate = sample_rate
|
||||
self._frame_size = frame_size
|
||||
self._channels = channels
|
||||
self._conversation_start_timeout = conversation_start_timeout
|
||||
self._conversation_end_timeout = conversation_end_timeout
|
||||
self._conversation_max_duration = conversation_max_duration
|
||||
self._start_recording_event = Event()
|
||||
self._disable_default_response = False
|
||||
self._recording_state = RecordingState(
|
||||
sample_rate=sample_rate,
|
||||
channels=channels,
|
||||
min_silence_secs=min_silence_secs,
|
||||
silence_threshold=silence_threshold,
|
||||
)
|
||||
|
||||
self._recorder: Optional[AudioRecorder] = None
|
||||
|
||||
def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
|
||||
return AudioSegment(
|
||||
data.tobytes(),
|
||||
frame_rate=self._sample_rate,
|
||||
sample_width=data.dtype.itemsize,
|
||||
channels=self._channels,
|
||||
)
|
||||
|
||||
def _is_conversation_ended(self):
|
||||
# End if the recording has been stopped
|
||||
if not self._recorder or self._recorder.should_stop():
|
||||
return True
|
||||
|
||||
# End if we reached the max conversation duration
|
||||
if self._recording_state.duration >= self._conversation_max_duration:
|
||||
return True
|
||||
|
||||
# End if the conversation hasn't started yet and we reached the
|
||||
# conversation start timeout
|
||||
if (
|
||||
not self._recording_state.conversation_started
|
||||
and self._recording_state.duration >= self._conversation_start_timeout
|
||||
):
|
||||
return True
|
||||
|
||||
# End if the conversation has started and the user has been silent for
|
||||
# more than the conversation end timeout
|
||||
if (
|
||||
self._recording_state.conversation_started
|
||||
and self._recording_state.silence_duration >= self._conversation_end_timeout
|
||||
):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
@property
|
||||
def _openai(self) -> OpenaiPlugin:
|
||||
openai: Optional[OpenaiPlugin] = get_plugin("openai")
|
||||
assert openai, (
|
||||
"OpenAI plugin not found. "
|
||||
"Please configure the `openai` plugin to use `assistant.openai`"
|
||||
)
|
||||
return openai
|
||||
|
||||
def _get_prediction(self, audio: BytesIO) -> str:
|
||||
return self._openai.transcribe_raw(
|
||||
audio.getvalue(), extension='mp3', model=self._model
|
||||
)
|
||||
|
||||
def _capture_audio(self, recorder: AudioRecorder):
|
||||
while not self.should_stop() and not self._is_conversation_ended():
|
||||
audio_data = recorder.read()
|
||||
if not audio_data:
|
||||
continue
|
||||
|
||||
self._recording_state.add_audio(audio_data)
|
||||
|
||||
def _audio_loop(self):
|
||||
while not self.should_stop():
|
||||
self._wait_recording_start()
|
||||
self._recording_state.reset()
|
||||
self._on_conversation_start()
|
||||
|
||||
try:
|
||||
with AudioRecorder(
|
||||
stop_event=self._should_stop,
|
||||
sample_rate=self._sample_rate,
|
||||
frame_size=self._frame_size,
|
||||
channels=self._channels,
|
||||
) as self._recorder:
|
||||
self._capture_audio(self._recorder)
|
||||
finally:
|
||||
if self._recorder:
|
||||
try:
|
||||
self._recorder.stream.close()
|
||||
except Exception as e:
|
||||
self.logger.warning("Error closing the audio stream: %s", e)
|
||||
|
||||
self._recorder = None
|
||||
|
||||
if self._recording_state.is_silent():
|
||||
self._on_conversation_timeout()
|
||||
else:
|
||||
audio = self._recording_state.export_audio()
|
||||
text = self._get_prediction(audio)
|
||||
self._on_speech_recognized(text)
|
||||
|
||||
def _wait_recording_start(self):
|
||||
self._start_recording_event.wait()
|
||||
self._start_recording_event.clear()
|
||||
|
||||
def _start_conversation(self, *_, **__):
|
||||
self._disable_default_response = False
|
||||
self._recording_state.reset()
|
||||
self._start_recording_event.set()
|
||||
|
||||
def _stop_conversation(self, *_, **__):
|
||||
self._disable_default_response = True
|
||||
super()._stop_conversation()
|
||||
self._recording_state.reset()
|
||||
if self._recorder:
|
||||
self._recorder.stop()
|
||||
|
||||
self._on_conversation_end()
|
||||
|
||||
def _on_speech_recognized(self, phrase: Optional[str]):
|
||||
super()._on_speech_recognized(phrase)
|
||||
|
||||
# Dirty hack: wait a bit before stopping the conversation, so that any
|
||||
# event hooks triggered in other threads get a chance to handle the
|
||||
# recognized phrase first.
|
||||
if self.stop_conversation_on_speech_match:
|
||||
self.wait_stop(0.5)
|
||||
if self.should_stop():
|
||||
return
|
||||
|
||||
if self._disable_default_response:
|
||||
self.logger.debug("Default response disabled, skipping response")
|
||||
return
|
||||
|
||||
response = self._openai.get_response(phrase).output
|
||||
if response:
|
||||
self.render_response(response)
|
||||
else:
|
||||
self._on_no_response()
|
||||
|
||||
@action
|
||||
def start_conversation(self, *_, **__):
|
||||
"""
|
||||
Start a conversation with the assistant. The conversation will be
|
||||
automatically stopped after ``conversation_max_duration`` seconds of
|
||||
audio, or after ``conversation_start_timeout`` seconds of silence
|
||||
with no audio detected, or after ``conversation_end_timeout`` seconds
|
||||
after the last non-silent audio frame has been detected, or when the
|
||||
:meth:`.stop_conversation` method is called.
|
||||
"""
|
||||
self._start_conversation()
|
||||
|
||||
@action
|
||||
def mute(self, *_, **__):
|
||||
"""
|
||||
.. note:: This plugin has no hotword detection, thus no continuous
|
||||
audio detection. Speech processing is done on-demand through the
|
||||
:meth:`.start_conversation` and :meth:`.stop_conversation` methods.
|
||||
Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
|
||||
implemented.
|
||||
"""
|
||||
self.logger.warning(
|
||||
"assistant.openai.mute is not implemented because this plugin "
|
||||
"has no hotword detection, and the only way to stop a conversation "
|
||||
"is by calling stop_conversation()"
|
||||
)
|
||||
|
||||
@action
|
||||
def unmute(self, *_, **__):
|
||||
"""
|
||||
.. note:: This plugin has no hotword detection, thus no continuous
|
||||
audio detection. Speech processing is done on-demand through the
|
||||
:meth:`.start_conversation` and :meth:`.stop_conversation` methods.
|
||||
Therefore, the :meth:`.mute` and :meth:`.unmute` methods are not
|
||||
implemented.
|
||||
"""
|
||||
self.logger.warning(
|
||||
"assistant.openai.unmute is not implemented because this plugin "
|
||||
"has no hotword detection, and the only way to start a conversation "
|
||||
"is by calling start_conversation()"
|
||||
)
|
||||
|
||||
@action
|
||||
def send_text_query(self, text: str, *_, **__):
|
||||
"""
|
||||
If the ``tts_plugin`` configuration is set, then the assistant will
|
||||
process the given text query through
|
||||
:meth:`platypush.plugins.openai.OpenaiPlugin.get_response` and render
|
||||
the response through the specified TTS plugin.
|
||||
|
||||
:return: The response received from
|
||||
:meth:`platypush.plugins.openai.OpenaiPlugin.get_response`.
|
||||
"""
|
||||
response = self._openai.get_response(text).output
|
||||
self.render_response(response)
|
||||
return response
|
||||
|
||||
def main(self):
|
||||
while not self.should_stop():
|
||||
try:
|
||||
self._audio_loop()
|
||||
except Exception as e:
|
||||
self.logger.error("Audio loop error: %s", e, exc_info=True)
|
||||
self.wait_stop(5)
|
||||
finally:
|
||||
self.stop_conversation()
|
||||
|
||||
def stop(self):
|
||||
self._stop_conversation()
|
||||
super().stop()
|
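
A minimal usage sketch for the new plugin (illustrative only; it assumes the ``openai`` plugin is configured and that ``platypush.run`` forwards keyword arguments to the action, as in the hook examples in the docstring above):

.. code-block:: python

    # Illustrative sketch: send a text query to the assistant and print the
    # response. If tts_plugin is configured (default: tts.openai), the
    # response is also rendered as speech.
    from platypush import run

    answer = run('assistant.openai.send_text_query', text='What is the weather like on Mars?')
    print(answer)
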
80
platypush/plugins/assistant/openai/_state.py
Normal file
|
@@ -0,0 +1,80 @@
|
|||
from io import BytesIO
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from pydub import AudioSegment, silence
|
||||
|
||||
from platypush.common.assistant import AudioFrame
|
||||
|
||||
|
||||
@dataclass
|
||||
class RecordingState:
|
||||
"""
|
||||
Current state of the audio recording.
|
||||
"""
|
||||
|
||||
sample_rate: int
|
||||
channels: int
|
||||
min_silence_secs: float
|
||||
silence_threshold: int
|
||||
silence_duration: float = 0.0
|
||||
audio_segments: List[AudioSegment] = field(default_factory=list)
|
||||
duration: float = 0.0
|
||||
conversation_started: bool = False
|
||||
|
||||
def _silence_duration(self, audio: AudioSegment) -> float:
|
||||
silent_frames = [
|
||||
(start / 1000, stop / 1000)
|
||||
for start, stop in silence.detect_silence(
|
||||
audio,
|
||||
min_silence_len=int(self.min_silence_secs * 1000),
|
||||
silence_thresh=int(self.silence_threshold),
|
||||
)
|
||||
]
|
||||
|
||||
return sum(stop - start for start, stop in silent_frames)
|
||||
|
||||
def _to_audio_segment(self, data: np.ndarray) -> AudioSegment:
|
||||
return AudioSegment(
|
||||
data.tobytes(),
|
||||
frame_rate=self.sample_rate,
|
||||
sample_width=data.dtype.itemsize,
|
||||
channels=self.channels,
|
||||
)
|
||||
|
||||
def _add_audio_segment(self, audio: AudioSegment):
|
||||
self.audio_segments.append(audio)
|
||||
self.duration += audio.duration_seconds
|
||||
silence_duration = self._silence_duration(audio)
|
||||
is_mostly_silent = silence_duration >= audio.duration_seconds * 0.75
|
||||
|
||||
if is_mostly_silent:
|
||||
self.silence_duration += silence_duration
|
||||
else:
|
||||
self.conversation_started = True
|
||||
self.silence_duration = 0.0
|
||||
|
||||
def is_silent(self) -> bool:
|
||||
return self.silence_duration >= self.duration
|
||||
|
||||
def add_audio(self, audio: AudioFrame):
|
||||
self._add_audio_segment(self._to_audio_segment(audio.data))
|
||||
|
||||
def export_audio(self) -> BytesIO:
|
||||
buffer = BytesIO()
|
||||
if not self.audio_segments:
|
||||
return buffer
|
||||
|
||||
audio = self.audio_segments[0]
|
||||
for segment in self.audio_segments[1:]:
|
||||
audio += segment
|
||||
|
||||
audio.export(buffer, format="mp3", bitrate='92')
|
||||
return buffer
|
||||
|
||||
def reset(self):
|
||||
self.audio_segments.clear()
|
||||
self.duration = 0.0
|
||||
self.silence_duration = 0.0
|
||||
self.conversation_started = False
|
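
A quick back-of-the-envelope check (illustrative only) of the constraint mentioned in the plugin docstring, i.e. that ``frame_size / sample_rate`` should not be smaller than the minimum silence duration, otherwise ``RecordingState._silence_duration`` can never see a silence long enough within a single frame:

.. code-block:: python

    # Illustrative check with the plugin defaults: each audio frame must span
    # at least min_silence_secs, or silence detection won't trigger.
    sample_rate = 16000       # Hz (plugin default)
    frame_size = 16384        # samples per frame (plugin default)
    min_silence_secs = 1.0    # plugin default

    frame_duration = frame_size / sample_rate   # 1.024 seconds
    assert frame_duration >= min_silence_secs
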
44
platypush/plugins/assistant/openai/manifest.json
Normal file
|
@@ -0,0 +1,44 @@
|
|||
{
|
||||
"manifest": {
|
||||
"package": "platypush.plugins.assistant.openai",
|
||||
"type": "plugin",
|
||||
"events": [
|
||||
"platypush.message.event.assistant.ConversationEndEvent",
|
||||
"platypush.message.event.assistant.ConversationStartEvent",
|
||||
"platypush.message.event.assistant.ConversationTimeoutEvent",
|
||||
"platypush.message.event.assistant.HotwordDetectedEvent",
|
||||
"platypush.message.event.assistant.IntentRecognizedEvent",
|
||||
"platypush.message.event.assistant.MicMutedEvent",
|
||||
"platypush.message.event.assistant.MicUnmutedEvent",
|
||||
"platypush.message.event.assistant.NoResponseEvent",
|
||||
"platypush.message.event.assistant.ResponseEndEvent",
|
||||
"platypush.message.event.assistant.ResponseEvent",
|
||||
"platypush.message.event.assistant.SpeechRecognizedEvent"
|
||||
],
|
||||
"install": {
|
||||
"apk": [
|
||||
"ffmpeg",
|
||||
"py3-numpy"
|
||||
],
|
||||
"apt": [
|
||||
"ffmpeg",
|
||||
"python3-numpy",
|
||||
"python3-pydub"
|
||||
],
|
||||
"dnf": [
|
||||
"ffmpeg",
|
||||
"python-numpy"
|
||||
],
|
||||
"pacman": [
|
||||
"ffmpeg",
|
||||
"python-numpy",
|
||||
"python-sounddevice"
|
||||
],
|
||||
"pip": [
|
||||
"numpy",
|
||||
"pydub",
|
||||
"sounddevice"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -562,11 +562,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
|
|||
self._assistant.override_speech_model(model_file)
|
||||
self._assistant.state = AssistantState.DETECTING_SPEECH
|
||||
|
||||
@action
|
||||
def stop_conversation(self, *_, **__):
|
||||
"""
|
||||
Programmatically stop a running conversation with the assistant
|
||||
"""
|
||||
def _stop_conversation(self, *_, **__):
|
||||
super()._stop_conversation()
|
||||
if not self._assistant:
|
||||
self.logger.warning('Assistant not initialized')
|
||||
return
|
||||
|
|
|
@@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, Sequence
|
|||
|
||||
import pvporcupine
|
||||
|
||||
from platypush.common.assistant import AudioRecorder
|
||||
from platypush.context import get_plugin
|
||||
from platypush.message.event.assistant import (
|
||||
AssistantEvent,
|
||||
|
@@ -16,8 +17,6 @@ from platypush.message.event.assistant import (
|
|||
SpeechRecognizedEvent,
|
||||
)
|
||||
from platypush.plugins.tts.picovoice import TtsPicovoicePlugin
|
||||
|
||||
from ._recorder import AudioRecorder
|
||||
from ._speech import SpeechProcessor
|
||||
from ._state import AssistantState
|
||||
|
||||
|
|
150
platypush/plugins/tts/openai/__init__.py
Normal file
|
@@ -0,0 +1,150 @@
|
|||
import os
|
||||
import tempfile
|
||||
from contextlib import contextmanager
|
||||
from multiprocessing import Process
|
||||
from typing import Generator, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from platypush.context import get_plugin
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.openai import OpenaiPlugin
|
||||
from platypush.plugins.tts import TtsPlugin
|
||||
|
||||
|
||||
class TtsOpenaiPlugin(TtsPlugin):
|
||||
r"""
|
||||
This plugin provides an interface to the `OpenAI text-to-speech API
|
||||
<https://platform.openai.com/docs/guides/text-to-speech>`_.
|
||||
|
||||
It requires the :class:`platypush.plugins.openai.OpenaiPlugin` plugin to be
|
||||
configured.
|
||||
"""
|
||||
|
||||
_BUFSIZE = 1024
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str = 'tts-1',
|
||||
voice: str = 'nova',
|
||||
timeout: float = 10,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
:param model: Model to be used for the text-to-speech conversion.
|
||||
See the `OpenAI API models documentation
|
||||
<https://platform.openai.com/docs/models/tts>`_ for the list of
|
||||
available models (default: ``tts-1``).
|
||||
:param voice: Default voice to be used. See the `OpenAI API
|
||||
voices documentation
|
||||
<https://platform.openai.com/docs/guides/text-to-speech/voice-options>`_
|
||||
for the list of available voices (default: ``nova``).
|
||||
:param timeout: Default timeout for the API requests (default: 10s).
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
openai = get_plugin('openai')
|
||||
assert openai, 'openai plugin not configured'
|
||||
|
||||
self.openai: OpenaiPlugin = openai
|
||||
self.model = model
|
||||
self.voice = voice
|
||||
self.timeout = timeout
|
||||
self._audio_proc: Optional[Process] = None
|
||||
|
||||
def _process_response(
|
||||
self,
|
||||
response: requests.Response,
|
||||
audio_file: str,
|
||||
) -> Process:
|
||||
def proc_fn():
|
||||
try:
|
||||
with open(audio_file, 'wb') as file:
|
||||
for chunk in response.iter_content(chunk_size=self._BUFSIZE):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
file.flush()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
self._audio_proc = Process(target=proc_fn, name='openai-tts-response-processor')
|
||||
self._audio_proc.start()
|
||||
return self._audio_proc
|
||||
|
||||
def _make_request(
|
||||
self,
|
||||
text: str,
|
||||
model: Optional[str] = None,
|
||||
voice: Optional[str] = None,
|
||||
) -> requests.Response:
|
||||
rs = requests.post(
|
||||
"https://api.openai.com/v1/audio/speech",
|
||||
timeout=self.timeout,
|
||||
stream=True,
|
||||
headers={
|
||||
"Authorization": f"Bearer {self.openai._api_key}", # pylint: disable=protected-access
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={
|
||||
"model": model or self.model,
|
||||
"voice": voice or self.voice,
|
||||
"input": text,
|
||||
},
|
||||
)
|
||||
|
||||
rs.raise_for_status()
|
||||
return rs
|
||||
|
||||
@contextmanager
|
||||
def _audio_fifo(self) -> Generator[str, None, None]:
|
||||
fifo_dir = tempfile.mkdtemp()
|
||||
fifo_path = os.path.join(fifo_dir, 'platypush-tts-openai-fifo')
|
||||
os.mkfifo(fifo_path)
|
||||
yield fifo_path
|
||||
|
||||
os.unlink(fifo_path)
|
||||
os.rmdir(fifo_dir)
|
||||
|
||||
@action
|
||||
def say(
|
||||
self,
|
||||
text: str,
|
||||
*_,
|
||||
model: Optional[str] = None,
|
||||
voice: Optional[str] = None,
|
||||
**player_args,
|
||||
):
|
||||
"""
|
||||
Say some text.
|
||||
|
||||
:param text: Text to say.
|
||||
:param model: Default ``model`` override.
|
||||
:param voice: Default ``voice`` override.
|
||||
:param player_args: Additional arguments to be passed to
|
||||
:meth:`platypush.plugins.sound.SoundPlugin.play` (like volume,
|
||||
duration, channels etc.).
|
||||
"""
|
||||
response_processor: Optional[Process] = None
|
||||
|
||||
try:
|
||||
response = self._make_request(text, model=model, voice=voice)
|
||||
|
||||
with self._audio_fifo() as audio_file:
|
||||
response_processor = self._process_response(
|
||||
response=response, audio_file=audio_file
|
||||
)
|
||||
self._playback(audio_file, **player_args)
|
||||
response_processor.join()
|
||||
response_processor = None
|
||||
finally:
|
||||
if response_processor:
|
||||
response_processor.terminate()
|
||||
|
||||
@action
|
||||
def stop(self):
|
||||
super().stop()
|
||||
if self._audio_proc and self._audio_proc.is_alive():
|
||||
self._audio_proc.terminate()
|
||||
self._audio_proc.join()
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
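
A minimal usage sketch for the new TTS plugin (illustrative only; it assumes ``platypush.run`` forwards keyword arguments to the action, and that the overridden voice is any of those listed in the OpenAI voice documentation linked above):

.. code-block:: python

    # Illustrative sketch: render some text through tts.openai, overriding
    # the default voice for this call only.
    from platypush import run

    run('tts.openai.say', text='The new OpenAI plugins are ready.', voice='shimmer')
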
34
platypush/plugins/tts/openai/manifest.json
Normal file
|
@@ -0,0 +1,34 @@
|
|||
{
|
||||
"manifest": {
|
||||
"events": {},
|
||||
"install": {
|
||||
"apk": [
|
||||
"ffmpeg",
|
||||
"portaudio-dev",
|
||||
"py3-numpy"
|
||||
],
|
||||
"apt": [
|
||||
"ffmpeg",
|
||||
"portaudio19-dev",
|
||||
"python3-numpy"
|
||||
],
|
||||
"dnf": [
|
||||
"ffmpeg",
|
||||
"portaudio-devel",
|
||||
"python-numpy"
|
||||
],
|
||||
"pacman": [
|
||||
"ffmpeg",
|
||||
"portaudio",
|
||||
"python-numpy",
|
||||
"python-sounddevice"
|
||||
],
|
||||
"pip": [
|
||||
"numpy",
|
||||
"sounddevice"
|
||||
]
|
||||
},
|
||||
"package": "platypush.plugins.tts.openai",
|
||||
"type": "plugin"
|
||||
}
|
||||
}
|
|
@@ -93,6 +93,7 @@ mock_imports = [
|
|||
"pychromecast",
|
||||
"pyclip",
|
||||
"pydbus",
|
||||
"pydub",
|
||||
"pyfirmata2",
|
||||
"pyngrok",
|
||||
"pyotp",
|
||||
|
|