Google Translate API supports a maximum of 2000 characters per API call.

Therefore the input text/HTML is split into chunks of at most that size, and each chunk is translated separately.
This commit is contained in:
Fabio Manganiello 2020-07-05 21:04:54 +02:00
parent f1c9554b1b
commit 590d416682

View file

@ -1,5 +1,5 @@
import os import os
from typing import Optional from typing import Optional, List
# noinspection PyPackageRequirements # noinspection PyPackageRequirements
from google.cloud import translate_v2 as translate from google.cloud import translate_v2 as translate
@ -30,6 +30,7 @@ class GoogleTranslatePlugin(Plugin):
""" """
_maximum_text_length = 2000
default_credentials_file = os.path.join(os.path.expanduser('~'), '.credentials', 'platypush', 'google', default_credentials_file = os.path.join(os.path.expanduser('~'), '.credentials', 'platypush', 'google',
'translate.json') 'translate.json')
@ -54,6 +55,33 @@ class GoogleTranslatePlugin(Plugin):
if self.credentials_file: if self.credentials_file:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_file os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_file
@staticmethod
def _nearest_delimiter_index(text: str, pos: int) -> int:
    """
    Scan ``text`` backwards from ``pos`` looking for a position to split at.

    :param text: Text to be scanned.
    :param pos: Index where the backwards scan starts. It is clamped to the
        last valid index of ``text``.
    :return: The index of the first whitespace, comma, dot or closing
        bracket met during the scan, or the index right before the first
        opening bracket met (never lower than 0). 0 is also returned when
        nothing is found, including when ``text`` is empty.
    """
    split_after = ' \t,.)>'
    split_before = '(<'
    start = min(pos, len(text) - 1)

    for idx in reversed(range(start + 1)):
        char = text[idx]
        if char in split_after:
            return idx
        if char in split_before:
            # Point before the opening bracket so that it isn't included
            # in the text preceding the split.
            return max(idx - 1, 0)

    return 0
@classmethod
def _split_text(cls, text: str, length: int = _maximum_text_length) -> List[str]:
    """
    Split ``text`` into chunks of at most ``length`` characters.

    Chunks are cut at the delimiter nearest to the limit, as reported by
    ``_nearest_delimiter_index``. When no usable delimiter exists within the
    limit, the text is cut at exactly ``length`` characters.

    :param text: Text to be split.
    :param length: Maximum number of characters per chunk. Values lower than
        1 disable the splitting.
    :return: The chunks, in their original order. A text that already fits
        is returned as a single, unmodified item. Chunks produced by a split
        are stripped of the surrounding whitespace, and empty ones are
        dropped.
    """
    if not text:
        return []

    # Nothing to split: don't cut (nor alter) a text that already fits.
    if length < 1 or len(text) <= length:
        return [text]

    parts = []
    remaining = text

    while len(remaining) > length:
        # Start the search from the last index that still fits, so that the
        # chunk (which includes the delimiter itself) never exceeds length.
        i = cls._nearest_delimiter_index(remaining, length - 1)
        # 0 means that no usable delimiter was found: fall back to a hard
        # cut instead of emitting an oversized chunk.
        cut = i + 1 if i > 0 else length
        chunk = remaining[:cut].strip()
        if chunk:
            parts.append(chunk)
        remaining = remaining[cut:]

    tail = remaining.strip()
    if tail:
        parts.append(tail)

    return parts
# noinspection PyShadowingBuiltins # noinspection PyShadowingBuiltins
@action @action
def translate(self, text: str, target_language: Optional[str] = None, source_language: Optional[str] = None, def translate(self, text: str, target_language: Optional[str] = None, source_language: Optional[str] = None,
@ -76,11 +104,20 @@ class GoogleTranslatePlugin(Plugin):
if source_language: if source_language:
args['source_language'] = source_language args['source_language'] = source_language
result = client.translate(text, format_=format, **args) inputs = self._split_text(text)
# noinspection PyUnresolvedReferences result = {}
for input in inputs:
response = client.translate(input, format_=format, **args)
if not result:
result = response
else:
# noinspection PyTypeChecker
result['translatedText'] += ' ' + response['translatedText']
return TranslateResponse( return TranslateResponse(
translated_text=result.get('translatedText'), translated_text=result.get('translatedText'),
source_text=result.get('input'), source_text=text,
detected_source_language=result.get('detectedSourceLanguage'), detected_source_language=result.get('detectedSourceLanguage'),
) )