[assistant.picovoice] Various improvements.
- Added `intent_model_path` parameter.
- Always apply `expanduser` to configuration paths.
- Better logic to infer the fallback model path.
- The Picovoice Leopard object should always be removed after `assistant.picovoice.transcribe` is called.
parent f0a780b759, commit bb9b6cd319
1 changed file with 82 additions and 15 deletions
```diff
@@ -59,6 +59,7 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         keyword_paths: Optional[Sequence[str]] = None,
         keyword_model_path: Optional[str] = None,
         speech_model_path: Optional[str] = None,
+        intent_model_path: Optional[str] = None,
         endpoint_duration: Optional[float] = 0.5,
         enable_automatic_punctuation: bool = False,
         start_conversation_on_hotword: bool = True,
```
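For context, the new parameter is meant to be set through the plugin's configuration. A minimal sketch of what that might look like in a platypush `config.yaml`, assuming the plugin's usual `access_key`/`keywords` options; all values below are placeholders, not part of this commit:

```yaml
# Hypothetical configuration sketch; all values are placeholders.
assistant.picovoice:
  access_key: YOUR_PICOVOICE_ACCESS_KEY   # assumed existing option
  keywords:
    - computer
  # New in this commit. A leading ~ now works, since expanduser is
  # applied to all configured paths.
  intent_model_path: ~/models/smart_home.rhn
```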
```diff
@@ -106,6 +107,54 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             to train your custom models. You can use a base model and fine-tune
             it by boosting the detection of your own words and phrases and edit
             the phonetic representation of the words you want to detect.
+        :param intent_model_path: Path to the Rhino context model. This is
+            required if you want to use the intent recognition engine through
+            Rhino. The context model is a file that contains a list of intents
+            that can be recognized by the engine. An intent is an action or a
+            class of actions that the assistant can recognize, and it can
+            contain an optional number of slots to model context variables -
+            e.g. temperature, lights group, location, device state etc.
+            You can create your own context model using the `Rhino console
+            <https://console.picovoice.ai/rhn>`_. For example, you can define a
+            context file to control smart home devices by defining the
+            following slots:
+
+                - ``device_type``: The device to control (e.g. lights, music)
+                - ``device_state``: The target state of the device (e.g. on,
+                  off)
+                - ``location``: The location of the device (e.g. living
+                  room, kitchen, bedroom)
+                - ``media_type``: The type of media to play (e.g. music, video)
+                - ``media_state``: The state of the media (e.g. play, pause,
+                  stop)
+
+            You can then define the following intents:
+
+                - ``device_ctrl``: Control a device state. Supported phrases:
+                    - "turn ``$device_state:state`` the ``$location:location``
+                      ``$device_type:device``"
+                    - "turn ``$device_state:state`` the ``$device_type:device``"
+
+                - ``media_ctrl``: Control media state. Supported phrases:
+                    - "``$media_state:state`` the ``$media_type:media``"
+                    - "``$media_state:state`` the ``$media_type:media`` in the
+                      ``$location:location``"
+
+            Then a phrase like "turn on the lights in the living room" would
+            trigger a
+            :class:`platypush.message.event.assistant.IntentMatchedEvent` with:
+
+                .. code-block:: json
+
+                    {
+                        "intent": "device_ctrl",
+                        "slots": {
+                            "type": "lights",
+                            "state": "on",
+                            "location": "living room"
+                        }
+                    }
+
         :param endpoint_duration: If set, the assistant will stop listening when
             no speech is detected for the specified duration (in seconds) after
             the end of an utterance.
```
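To make the docstring example concrete, a user script could react to the matched intent through platypush's event hook API. A minimal sketch, assuming the `@when` decorator exported by recent platypush releases and that the event carries the `intent` and `slots` fields shown in the JSON payload above:

```python
from platypush import when, run
from platypush.message.event.assistant import IntentMatchedEvent


@when(IntentMatchedEvent, intent='device_ctrl')
def on_device_ctrl(event: IntentMatchedEvent):
    # Assumption: the event arguments mirror the "slots" object shown in
    # the docstring's JSON payload.
    slots = event.args.get('slots', {})
    if slots.get('type') == 'lights' and slots.get('state') == 'on':
        # 'light.hue.on' is a placeholder action; substitute whichever
        # plugin actually controls your lights.
        run('light.hue.on', groups=[slots.get('location')])
```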
```diff
@@ -144,9 +193,19 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             'stt_enabled': stt_enabled,
             'intent_enabled': intent_enabled,
             'keywords': keywords,
-            'keyword_paths': keyword_paths,
-            'keyword_model_path': keyword_model_path,
-            'speech_model_path': speech_model_path,
+            'keyword_paths': (
+                os.path.expanduser(keyword_path)
+                for keyword_path in (keyword_paths or [])
+            ),
+            'keyword_model_path': (
+                os.path.expanduser(keyword_model_path) if keyword_model_path else None
+            ),
+            'speech_model_path': (
+                os.path.expanduser(speech_model_path) if speech_model_path else None
+            ),
+            'intent_model_path': (
+                os.path.expanduser(intent_model_path) if intent_model_path else None
+            ),
             'endpoint_duration': endpoint_duration,
             'enable_automatic_punctuation': enable_automatic_punctuation,
             'start_conversation_on_hotword': start_conversation_on_hotword,
```
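The effect of this hunk is that every configured path now tolerates a leading `~`. As a quick reminder of the standard-library behavior being relied on, `os.path.expanduser` rewrites only the home-directory prefix and passes other paths through unchanged:

```python
import os

# A leading ~ resolves against the current user's home directory...
print(os.path.expanduser('~/models/keyword.ppn'))   # e.g. /home/user/models/keyword.ppn
# ...while absolute paths are returned untouched.
print(os.path.expanduser('/opt/models/rhino.rhn'))  # /opt/models/rhino.rhn
```

Note also that `keyword_paths` is mapped through a generator expression, so the expansion happens lazily, when the consumer iterates over the paths.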
```diff
@@ -193,6 +252,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             self.logger.warning('Assistant not initialized')
             return
 
+        if not model_file:
+            model_file = self._assistant_args['speech_model_path']
         if model_file:
             model_file = os.path.expanduser(model_file)
 
```
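The same fallback is repeated in `transcribe` in the next hunk. Condensed into a standalone function for illustration (hypothetical; `assistant_args` stands in for the plugin's `_assistant_args` dict):

```python
import os
from typing import Optional


def resolve_model_file(model_file: Optional[str], assistant_args: dict) -> Optional[str]:
    # Prefer an explicitly passed model file; otherwise fall back to the
    # speech model configured on the plugin.
    if not model_file:
        model_file = assistant_args.get('speech_model_path')
    # Normalize a leading ~ in whichever path was selected.
    return os.path.expanduser(model_file) if model_file else None
```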
```diff
@@ -278,6 +339,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         import pvleopard
 
         audio_file = os.path.expanduser(audio_file)
+        if not model_file:
+            model_file = self._assistant_args['speech_model_path']
         if model_file:
             model_file = os.path.expanduser(model_file)
 
```
```diff
@@ -286,18 +349,22 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         )
 
         transcript, words = leopard.process_file(audio_file)
-        return {
-            'transcription': transcript,
-            'words': [
-                {
-                    'word': word.word,
-                    'start': word.start_sec,
-                    'end': word.end_sec,
-                    'confidence': word.confidence,
-                }
-                for word in words
-            ],
-        }
+
+        try:
+            return {
+                'transcription': transcript,
+                'words': [
+                    {
+                        'word': word.word,
+                        'start': word.start_sec,
+                        'end': word.end_sec,
+                        'confidence': word.confidence,
+                    }
+                    for word in words
+                ],
+            }
+        finally:
+            leopard.delete()
 
     @action
     def mute(self, *_, **__):
```
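The `try`/`finally` guarantees that the Leopard engine is deleted even if building the response fails, which is what the commit message means by always removing the Leopard object after `assistant.picovoice.transcribe`. The same guarantee could be packaged as a context manager; a sketch, assuming pvleopard's documented `create(access_key=..., model_path=...)` factory:

```python
import contextlib

import pvleopard


@contextlib.contextmanager
def leopard_session(access_key, model_path=None):
    # pvleopard.create allocates native resources that must be freed
    # explicitly with delete(), just like the finally block in the diff.
    leopard = pvleopard.create(access_key=access_key, model_path=model_path)
    try:
        yield leopard
    finally:
        leopard.delete()


# Usage sketch:
# with leopard_session('YOUR_ACCESS_KEY') as leopard:
#     transcript, words = leopard.process_file('audio.wav')
```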