forked from platypush/platypush
Rewrote the http.webpage
plugin.
This commit is contained in:
parent
669f2eb2d2
commit
b6c0ff799b
1 changed file with 276 additions and 77 deletions
|
@ -1,13 +1,66 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
import datetime
|
import datetime
|
||||||
|
from enum import Enum
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import textwrap
|
||||||
|
from typing import Iterable, Optional, Union
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from platypush.plugins import action
|
from platypush.plugins import Plugin, action
|
||||||
from platypush.plugins.http.request import Plugin
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class OutputFormat:
    """
    Definition of a supported output format.

    Instances are used as the values of :class:`OutputFormats` members, so
    they are frozen to prevent accidental modification of those shared
    constants.
    """

    # Canonical, lower-case name of the format.
    name: str
    # Format identifier handed to the parser command line.
    cmd_fmt: str
    # File extensions (without the leading dot) associated with the format.
    extensions: Iterable[str] = ()


class OutputFormats(Enum):
    """
    Supported output formats.
    """

    HTML = OutputFormat('html', extensions=('html', 'htm'), cmd_fmt='html')
    # PDF will first be exported to HTML and then converted to PDF
    PDF = OutputFormat('pdf', extensions=('pdf',), cmd_fmt='html')
    TEXT = OutputFormat('text', extensions=('txt',), cmd_fmt='text')
    MARKDOWN = OutputFormat('markdown', extensions=('md',), cmd_fmt='markdown')

    @classmethod
    def parse(
        cls,
        type: Union[str, "OutputFormats"],  # pylint: disable=redefined-builtin
        outfile: Optional[str] = None,
    ) -> "OutputFormats":
        """
        Parse the format given a type argument and an output file name.

        :param type: Format name (case-insensitive) or an already resolved
            :class:`OutputFormats` member.
        :param outfile: Optional output file name. When its extension is
            associated with a known format, that format takes precedence
            over ``type``.
        :return: The resolved :class:`OutputFormats` member.
        :raises AssertionError: If ``type`` is a string that doesn't name a
            supported format, or if no format can be resolved from either
            ``type`` or ``outfile``.
        """
        unsupported = (
            f'Unsupported output format: {type}. Supported formats: '
            + f'{[f.name for f in cls]}'
        )

        fmt = type
        if isinstance(type, str):
            try:
                # Member-only lookup: unlike getattr() it can never resolve
                # to a non-member attribute of the enum class.
                fmt = cls[type.upper()]
            except KeyError as e:
                raise AssertionError(unsupported) from e

        if outfile:
            by_extension = {
                ext.lower(): f for f in cls for ext in f.value.extensions
            }
            # splitext() keeps the leading dot, hence the [1:]
            fmt_by_ext = by_extension.get(os.path.splitext(outfile)[1].lower()[1:])
            if fmt_by_ext is not None:
                return fmt_by_ext

        # Reached with e.g. type=None and no recognizable file extension:
        # fail here with a clear message rather than returning a non-member.
        if not isinstance(fmt, cls):
            raise AssertionError(unsupported)

        return fmt
|
||||||
|
|
||||||
|
|
||||||
class HttpWebpagePlugin(Plugin):
|
class HttpWebpagePlugin(Plugin):
|
||||||
|
@ -24,34 +77,71 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
_mercury_script = os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse(proc):
|
def _parse(proc):
|
||||||
|
"""
|
||||||
|
Runs the mercury-parser script and returns the result as a string.
|
||||||
|
"""
|
||||||
with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
|
with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
|
||||||
return parser.communicate()[0].decode()
|
return parser.communicate()[0].decode()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _fix_relative_links(markdown: str, url: str) -> str:
|
def _fix_relative_links(markdown: str, url: str) -> str:
|
||||||
url = urlparse(url)
|
"""
|
||||||
base_url = f'{url.scheme}://{url.netloc}'
|
Fix relative links to match the base URL of the page (Markdown only).
|
||||||
|
"""
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
|
||||||
return re.sub(r'(\[.+?])\((/.+?)\)', fr'\1({base_url}\2)', markdown)
|
return re.sub(r'(\[.+?])\((/.+?)\)', fr'\1({base_url}\2)', markdown)
|
||||||
|
|
||||||
# noinspection PyShadowingBuiltins
|
|
||||||
@action
|
@action
|
||||||
def simplify(self, url, type='html', html=None, outfile=None):
|
def simplify(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
type: Union[ # pylint: disable=redefined-builtin
|
||||||
|
str, OutputFormats
|
||||||
|
] = OutputFormats.HTML,
|
||||||
|
html: Optional[str] = None,
|
||||||
|
outfile: Optional[str] = None,
|
||||||
|
font_size: str = '19px',
|
||||||
|
font_family: Union[str, Iterable[str]] = (
|
||||||
|
'-apple-system',
|
||||||
|
'Segoe UI',
|
||||||
|
'Roboto',
|
||||||
|
'Oxygen',
|
||||||
|
'Ubuntu',
|
||||||
|
'Cantarell',
|
||||||
|
"Fira Sans",
|
||||||
|
'Open Sans',
|
||||||
|
'Droid Sans',
|
||||||
|
'Helvetica Neue',
|
||||||
|
'Helvetica',
|
||||||
|
'Arial',
|
||||||
|
'sans-serif',
|
||||||
|
),
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Parse the readable content of a web page removing any extra HTML elements using Mercury.
|
Parse the readable content of a web page removing any extra HTML elements using Mercury.
|
||||||
|
|
||||||
:param url: URL to parse.
|
:param url: URL to parse.
|
||||||
:param type: Output format. Supported types: ``html``, ``markdown``, ``text`` (default: ``html``).
|
:param type: Output format. Supported types: ``html``, ``markdown``,
|
||||||
:param html: Set this parameter if you want to parse some HTML content already fetched. Note
|
``text``, ``pdf`` (default: ``html``).
|
||||||
that URL is still required by Mercury to properly style the output, but it won't be used
|
:param html: Set this parameter if you want to parse some HTML content
|
||||||
to actually fetch the content.
|
already fetched. Note that URL is still required by Mercury to
|
||||||
|
properly style the output, but it won't be used to actually fetch
|
||||||
:param outfile: If set then the output will be written to the specified file. If the file extension
|
the content.
|
||||||
is ``.pdf`` then the content will be exported in PDF format. If the output ``type`` is not
|
:param outfile: If set then the output will be written to the specified
|
||||||
specified then it can also be inferred from the extension of the output file.
|
file. If the file extension is ``.pdf`` then the content will be
|
||||||
|
exported in PDF format. If the output ``type`` is not specified
|
||||||
|
then it can also be inferred from the extension of the output file.
|
||||||
|
:param font_size: Font size to use for the output (default: 19px).
|
||||||
|
:param font_family: Custom font family (or list of font families, in
|
||||||
|
decreasing order) to use for the output. It only applies to HTML
|
||||||
|
and PDF.
|
||||||
:return: dict
|
:return: dict
|
||||||
|
|
||||||
Example if outfile is not specified::
|
Example if outfile is not specified::
|
||||||
|
@ -74,26 +164,14 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.logger.info('Parsing URL {}'.format(url))
|
self.logger.info('Parsing URL %s', url)
|
||||||
wants_pdf = False
|
fmt = OutputFormats.parse(type=type, outfile=outfile)
|
||||||
|
proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
|
||||||
if outfile:
|
tmp_file = None
|
||||||
wants_pdf = outfile.lower().endswith('.pdf')
|
|
||||||
if (
|
|
||||||
wants_pdf # HTML will be exported to PDF
|
|
||||||
or outfile.lower().split('.')[-1].startswith('htm')
|
|
||||||
):
|
|
||||||
type = 'html'
|
|
||||||
elif outfile.lower().endswith('.md'):
|
|
||||||
type = 'markdown'
|
|
||||||
elif outfile.lower().endswith('.txt'):
|
|
||||||
type = 'text'
|
|
||||||
|
|
||||||
proc = ['node', self._mercury_script, url, type]
|
|
||||||
f = None
|
|
||||||
|
|
||||||
if html:
|
if html:
|
||||||
f = tempfile.NamedTemporaryFile('w+', delete=False)
|
with tempfile.NamedTemporaryFile('w+', delete=False) as f:
|
||||||
|
tmp_file = f.name
|
||||||
f.write(html)
|
f.write(html)
|
||||||
f.flush()
|
f.flush()
|
||||||
proc.append(f.name)
|
proc.append(f.name)
|
||||||
|
@ -101,21 +179,31 @@ class HttpWebpagePlugin(Plugin):
|
||||||
try:
|
try:
|
||||||
response = self._parse(proc)
|
response = self._parse(proc)
|
||||||
finally:
|
finally:
|
||||||
if f:
|
if tmp_file:
|
||||||
os.unlink(f.name)
|
os.unlink(tmp_file)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = json.loads(response.strip())
|
response = json.loads(response.strip())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
|
raise RuntimeError(
|
||||||
|
f'Could not parse JSON: {e}. Response: {response}'
|
||||||
|
) from e
|
||||||
|
|
||||||
if type == 'markdown':
|
if fmt == OutputFormats.MARKDOWN:
|
||||||
response['content'] = self._fix_relative_links(response['content'], url)
|
response['content'] = self._fix_relative_links(response['content'], url)
|
||||||
|
|
||||||
self.logger.debug('Got response from Mercury API: {}'.format(response))
|
self.logger.debug('Got response from Mercury API: %s', response)
|
||||||
title = response.get('title', '{} on {}'.format(
|
title = response.get(
|
||||||
'Published' if response.get('date_published') else 'Generated',
|
'title',
|
||||||
response.get('date_published', datetime.datetime.now().isoformat())))
|
(
|
||||||
|
('Published' if response.get('date_published') else 'Generated')
|
||||||
|
+ ' on '
|
||||||
|
+ (
|
||||||
|
response.get('date_published')
|
||||||
|
or datetime.datetime.now().isoformat()
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
content = response.get('content', '')
|
content = response.get('content', '')
|
||||||
|
|
||||||
|
@ -126,46 +214,134 @@ class HttpWebpagePlugin(Plugin):
|
||||||
'content': content,
|
'content': content,
|
||||||
}
|
}
|
||||||
|
|
||||||
outfile = os.path.abspath(os.path.expanduser(outfile))
|
return self._process_outfile(
|
||||||
style = '''
|
url=url,
|
||||||
body {
|
fmt=fmt,
|
||||||
font-size: 22px;
|
title=title,
|
||||||
font-family: 'Merriweather', Georgia, 'Times New Roman', Times, serif;
|
content=content,
|
||||||
|
outfile=outfile,
|
||||||
|
font_size=font_size,
|
||||||
|
font_family=tuple(
|
||||||
|
font_family,
|
||||||
|
)
|
||||||
|
if isinstance(font_family, str)
|
||||||
|
else tuple(font_family),
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
def _style_by_format(
    fmt: OutputFormats,
    font_size: str,
    font_family: Iterable[str],
) -> str:
    """
    Build the CSS style to be used for the given output format.

    :param fmt: Output format. Extra layout rules are appended for
        ``OutputFormats.HTML``.
    :param font_size: Font size applied to the content container.
    :param font_family: Font family, or font families in decreasing order
        of preference, applied to the content container.
    :return: The CSS style to be used for the given output format.
    """
    # Generic families are CSS keywords: once quoted they are treated as
    # the name of an actual font and no longer work as a fallback.
    # NOTE(review): the two vendor-specific system font aliases are kept
    # unquoted as well, following common usage - confirm if needed.
    unquoted_families = {
        'serif',
        'sans-serif',
        'monospace',
        'cursive',
        'fantasy',
        'system-ui',
        'ui-serif',
        'ui-sans-serif',
        'ui-monospace',
        'ui-rounded',
        'emoji',
        'math',
        'fangsong',
        '-apple-system',
        'blinkmacsystemfont',
    }

    def css_family(family: str) -> str:
        """Return the CSS representation of a single font family."""
        if family.lower() in unquoted_families:
            return family
        # Escape the characters that would terminate the quoted string.
        escaped = family.replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    if isinstance(font_family, str):
        # A bare string would otherwise be iterated character by character.
        font_family = (font_family,)

    families = ', '.join(css_family(f) for f in font_family)

    style = textwrap.dedent(
        f'''
        ._parsed-content-container {{
            font-size: {font_size};
            font-family: {families};
        }}

        ._parsed-content {{
            text-align: justify;
        }}

        pre {{
            white-space: pre-wrap;
        }}
        '''
    )

    if fmt == OutputFormats.HTML:
        style += textwrap.dedent(
            '''
            ._parsed-content-container {
                margin: 1em;
                display: flex;
                flex-direction: column;
                align-items: center;
            }

            ._parsed-content {
                max-width: 800px;
            }

            h1 {
                max-width: 800px;
            }
            '''
        )

    return style
|
||||||
content = (
|
|
||||||
'''
|
@classmethod
|
||||||
|
def _process_outfile(
|
||||||
|
cls,
|
||||||
|
url: str,
|
||||||
|
fmt: OutputFormats,
|
||||||
|
title: str,
|
||||||
|
content: str,
|
||||||
|
outfile: str,
|
||||||
|
font_size: str,
|
||||||
|
font_family: Iterable[str],
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Process the output file.
|
||||||
|
|
||||||
|
:param url: URL to parse.
|
||||||
|
:param fmt: Output format. Supported types: ``html``, ``markdown``,
|
||||||
|
``text``, ``pdf`` (default: ``html``).
|
||||||
|
:param title: Page title.
|
||||||
|
:param content: Page content.
|
||||||
|
:param outfile: Output file path.
|
||||||
|
:param font_size: Font size to use for the output (default: 19px).
|
||||||
|
:param font_family: Custom font family (or list of font families, in
|
||||||
|
decreasing order) to use for the output. It only applies to HTML
|
||||||
|
and PDF.
|
||||||
|
:return: dict
|
||||||
|
"""
|
||||||
|
outfile = os.path.abspath(os.path.expanduser(outfile))
|
||||||
|
style = cls._style_by_format(fmt, font_size, font_family)
|
||||||
|
|
||||||
|
if fmt in {OutputFormats.HTML, OutputFormats.PDF}:
|
||||||
|
content = textwrap.dedent(
|
||||||
|
f'''
|
||||||
|
<div class="_parsed-content-container">
|
||||||
<h1><a href="{url}" target="_blank">{title}</a></h1>
|
<h1><a href="{url}" target="_blank">{title}</a></h1>
|
||||||
<div class="_parsed-content">{content}</div>
|
<div class="_parsed-content">{content}</div>
|
||||||
'''.format(title=title, url=url, content=content)
|
</div>
|
||||||
|
'''
|
||||||
)
|
)
|
||||||
|
|
||||||
if not wants_pdf:
|
if fmt == OutputFormats.PDF:
|
||||||
content = '''<html>
|
content = textwrap.dedent(
|
||||||
|
f'''<html>
|
||||||
<head>
|
<head>
|
||||||
<title>{title}</title>
|
|
||||||
<style>{style}</style>
|
<style>{style}</style>
|
||||||
</head>'''.format(title=title, style=style) + \
|
<title>{title}</title>
|
||||||
'<body>{{' + content + '}}</body></html>'
|
</head>
|
||||||
elif type == 'markdown':
|
<body>
|
||||||
content = '# [{title}]({url})\n\n{content}'.format(
|
{content}
|
||||||
title=title, url=url, content=content
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
content = textwrap.dedent(
|
||||||
|
f'''
|
||||||
|
<style>
|
||||||
|
{style}
|
||||||
|
</style>
|
||||||
|
{content}
|
||||||
|
'''
|
||||||
|
)
|
||||||
|
elif fmt == OutputFormats.MARKDOWN:
|
||||||
|
content = f'# [{title}]({url})\n\n{content}'
|
||||||
|
|
||||||
if wants_pdf:
|
if fmt == OutputFormats.PDF:
|
||||||
import weasyprint
|
cls._process_pdf(content, outfile, style)
|
||||||
try:
|
|
||||||
from weasyprint.fonts import FontConfiguration
|
|
||||||
except ImportError:
|
|
||||||
from weasyprint.document import FontConfiguration
|
|
||||||
|
|
||||||
font_config = FontConfiguration()
|
|
||||||
css = [weasyprint.CSS('https://fonts.googleapis.com/css?family=Merriweather'),
|
|
||||||
weasyprint.CSS(string=style, font_config=font_config)]
|
|
||||||
|
|
||||||
weasyprint.HTML(string=content).write_pdf(outfile, stylesheets=css)
|
|
||||||
else:
|
else:
|
||||||
with open(outfile, 'w', encoding='utf-8') as f:
|
with open(outfile, 'w', encoding='utf-8') as f:
|
||||||
f.write(content)
|
f.write(content)
|
||||||
|
@ -176,5 +352,28 @@ class HttpWebpagePlugin(Plugin):
|
||||||
'outfile': outfile,
|
'outfile': outfile,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@staticmethod
def _process_pdf(content: str, outfile: str, style: str):
    """
    Convert the given HTML content to a PDF document.

    :param content: Page content.
    :param outfile: Output file path.
    :param style: CSS style to use for the output.
    """
    # WeasyPrint is imported here rather than at module level.
    import weasyprint

    # FontConfiguration is looked up in ``weasyprint.fonts`` first, with
    # ``weasyprint.document`` as a fallback (presumably its location in
    # other WeasyPrint releases - TODO confirm).
    try:
        from weasyprint.fonts import FontConfiguration
    except ImportError:
        from weasyprint.document import FontConfiguration

    stylesheet = weasyprint.CSS(string=style, font_config=FontConfiguration())
    document = weasyprint.HTML(string=content)
    document.write_pdf(outfile, stylesheets=[stylesheet])
|
||||||
|
|
||||||
|
|
||||||
# vim:sw=4:ts=4:et:
|
# vim:sw=4:ts=4:et:
|
||||||
|
|
Loading…
Reference in a new issue