[assistant.picovoice] Various improvements.
- Added `intent_model_path` parameter.
- Always apply `expanduser` to configuration paths.
- Better logic to infer the fallback model path.
- The Picovoice Leopard object should always be removed after `assistant.picovoice.transcribe` is called.
parent f0a780b759, commit bb9b6cd319
1 changed file with 82 additions and 15 deletions
```diff
@@ -59,6 +59,7 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         keyword_paths: Optional[Sequence[str]] = None,
         keyword_model_path: Optional[str] = None,
         speech_model_path: Optional[str] = None,
+        intent_model_path: Optional[str] = None,
         endpoint_duration: Optional[float] = 0.5,
         enable_automatic_punctuation: bool = False,
         start_conversation_on_hotword: bool = True,
```
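For context, the new parameter is meant to be set through the plugin's configuration. A minimal sketch of what that might look like in a platypush `config.yaml`, assuming the plugin's usual `access_key`/`keywords` options; all values below are placeholders, not part of this commit:

```yaml
# Hypothetical configuration sketch; all values are placeholders.
assistant.picovoice:
  access_key: YOUR_PICOVOICE_ACCESS_KEY   # assumed existing option
  keywords:
    - computer
  # New in this commit. A leading ~ now works, since expanduser is
  # applied to all configured paths.
  intent_model_path: ~/models/smart_home.rhn
```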
```diff
@@ -106,6 +107,54 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             to train your custom models. You can use a base model and fine-tune
             it by boosting the detection of your own words and phrases and edit
             the phonetic representation of the words you want to detect.
+        :param intent_model_path: Path to the Rhino context model. This is
+            required if you want to use the intent recognition engine through
+            Rhino. The context model is a file that contains a list of intents
+            that can be recognized by the engine. An intent is an action or a
+            class of actions that the assistant can recognize, and it can
+            contain an optional number of slots to model context variables -
+            e.g. temperature, lights group, location, device state etc.
+            You can create your own context model using the `Rhino console
+            <https://console.picovoice.ai/rhn>`_. For example, you can define a
+            context file to control smart home devices by defining the
+            following slots:
+
+                - ``device_type``: The device to control (e.g. lights, music)
+                - ``device_state``: The target state of the device (e.g. on,
+                  off)
+                - ``location``: The location of the device (e.g. living
+                  room, kitchen, bedroom)
+                - ``media_type``: The type of media to play (e.g. music, video)
+                - ``media_state``: The state of the media (e.g. play, pause,
+                  stop)
+
+            You can then define the following intents:
+
+                - ``device_ctrl``: Control a device state. Supported phrases:
+                    - "turn ``$device_state:state`` the ``$location:location``
+                      ``$device_type:device``"
+                    - "turn ``$device_state:state`` the ``$device_type:device``"
+
+                - ``media_ctrl``: Control media state. Supported phrases:
+                    - "``$media_state:state`` the ``$media_type:media``"
+                    - "``$media_state:state`` the ``$media_type:media`` in the
+                      ``$location:location``"
+
+            Then a phrase like "turn on the lights in the living room" would
+            trigger a
+            :class:`platypush.message.event.assistant.IntentMatchedEvent` with:
+
+                .. code-block:: json
+
+                    {
+                        "intent": "device_ctrl",
+                        "slots": {
+                            "type": "lights",
+                            "state": "on",
+                            "location": "living room"
+                        }
+                    }
+
         :param endpoint_duration: If set, the assistant will stop listening when
             no speech is detected for the specified duration (in seconds) after
             the end of an utterance.
```
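To make the docstring example concrete, a user script could react to the matched intent through platypush's event hook API. A minimal sketch, assuming the `@when` decorator exported by recent platypush releases and that the event carries the `intent` and `slots` fields shown in the JSON payload above:

```python
from platypush import when, run
from platypush.message.event.assistant import IntentMatchedEvent


@when(IntentMatchedEvent, intent='device_ctrl')
def on_device_ctrl(event: IntentMatchedEvent):
    # Assumption: the event arguments mirror the "slots" object shown in
    # the docstring's JSON payload.
    slots = event.args.get('slots', {})
    if slots.get('type') == 'lights' and slots.get('state') == 'on':
        # 'light.hue.on' is a placeholder action; substitute whichever
        # plugin actually controls your lights.
        run('light.hue.on', groups=[slots.get('location')])
```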
```diff
@@ -144,9 +193,19 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             'stt_enabled': stt_enabled,
             'intent_enabled': intent_enabled,
             'keywords': keywords,
-            'keyword_paths': keyword_paths,
-            'keyword_model_path': keyword_model_path,
-            'speech_model_path': speech_model_path,
+            'keyword_paths': (
+                os.path.expanduser(keyword_path)
+                for keyword_path in (keyword_paths or [])
+            ),
+            'keyword_model_path': (
+                os.path.expanduser(keyword_model_path) if keyword_model_path else None
+            ),
+            'speech_model_path': (
+                os.path.expanduser(speech_model_path) if speech_model_path else None
+            ),
+            'intent_model_path': (
+                os.path.expanduser(intent_model_path) if intent_model_path else None
+            ),
             'endpoint_duration': endpoint_duration,
             'enable_automatic_punctuation': enable_automatic_punctuation,
             'start_conversation_on_hotword': start_conversation_on_hotword,
```
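The effect of this hunk is that every configured path now tolerates a leading `~`. As a quick reminder of the standard-library behavior being relied on, `os.path.expanduser` rewrites only the home-directory prefix and passes other paths through unchanged:

```python
import os

# A leading ~ resolves against the current user's home directory...
print(os.path.expanduser('~/models/keyword.ppn'))   # e.g. /home/user/models/keyword.ppn
# ...while absolute paths are returned untouched.
print(os.path.expanduser('/opt/models/rhino.rhn'))  # /opt/models/rhino.rhn
```

Note also that `keyword_paths` is mapped through a generator expression, so the expansion happens lazily, when the consumer iterates over the paths.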
```diff
@@ -193,6 +252,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
             self.logger.warning('Assistant not initialized')
             return
 
+        if not model_file:
+            model_file = self._assistant_args['speech_model_path']
         if model_file:
             model_file = os.path.expanduser(model_file)
 
```
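The same fallback is repeated in `transcribe` in the next hunk. Condensed into a standalone function for illustration (hypothetical; `assistant_args` stands in for the plugin's `_assistant_args` dict):

```python
import os
from typing import Optional


def resolve_model_file(model_file: Optional[str], assistant_args: dict) -> Optional[str]:
    # Prefer an explicitly passed model file; otherwise fall back to the
    # speech model configured on the plugin.
    if not model_file:
        model_file = assistant_args.get('speech_model_path')
    # Normalize a leading ~ in whichever path was selected.
    return os.path.expanduser(model_file) if model_file else None
```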
```diff
@@ -278,6 +339,8 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         import pvleopard
 
         audio_file = os.path.expanduser(audio_file)
+        if not model_file:
+            model_file = self._assistant_args['speech_model_path']
         if model_file:
             model_file = os.path.expanduser(model_file)
 
```
```diff
@@ -286,18 +349,22 @@ class AssistantPicovoicePlugin(AssistantPlugin, RunnablePlugin):
         )
 
         transcript, words = leopard.process_file(audio_file)
-        return {
-            'transcription': transcript,
-            'words': [
-                {
-                    'word': word.word,
-                    'start': word.start_sec,
-                    'end': word.end_sec,
-                    'confidence': word.confidence,
-                }
-                for word in words
-            ],
-        }
+
+        try:
+            return {
+                'transcription': transcript,
+                'words': [
+                    {
+                        'word': word.word,
+                        'start': word.start_sec,
+                        'end': word.end_sec,
+                        'confidence': word.confidence,
+                    }
+                    for word in words
+                ],
+            }
+        finally:
+            leopard.delete()
 
     @action
     def mute(self, *_, **__):
```
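The `try`/`finally` guarantees that the Leopard engine is deleted even if building the response fails, which is what the commit message means by always removing the Leopard object after `assistant.picovoice.transcribe`. The same guarantee could be packaged as a context manager; a sketch, assuming pvleopard's documented `create(access_key=..., model_path=...)` factory:

```python
import contextlib

import pvleopard


@contextlib.contextmanager
def leopard_session(access_key, model_path=None):
    # pvleopard.create allocates native resources that must be freed
    # explicitly with delete(), just like the finally block in the diff.
    leopard = pvleopard.create(access_key=access_key, model_path=model_path)
    try:
        yield leopard
    finally:
        leopard.delete()


# Usage sketch:
# with leopard_session('YOUR_ACCESS_KEY') as leopard:
#     transcript, words = leopard.process_file('audio.wav')
```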