forked from platypush/platypush
Rewrote the http.webpage
plugin.
This commit is contained in:
parent
669f2eb2d2
commit
b6c0ff799b
1 changed files with 276 additions and 77 deletions
|
@ -1,13 +1,66 @@
|
|||
from dataclasses import dataclass
|
||||
import datetime
|
||||
from enum import Enum
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import textwrap
|
||||
from typing import Iterable, Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.http.request import Plugin
|
||||
from platypush.plugins import Plugin, action
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OutputFormat:
    """
    Definition of a supported output format.

    Instances are used as shared constants, so the dataclass is frozen:
    this prevents accidental mutation and makes instances hashable (as long
    as ``extensions`` is itself hashable, e.g. a tuple).
    """

    # Canonical name of the format (e.g. ``html``).
    name: str
    # Format identifier passed as an argument to the parser command.
    cmd_fmt: str
    # File extensions (without the leading dot) associated with the format.
    extensions: Iterable[str] = ()
|
||||
|
||||
|
||||
class OutputFormats(Enum):
    """
    Supported output formats.
    """

    HTML = OutputFormat('html', extensions=('html', 'htm'), cmd_fmt='html')
    # PDF will first be exported to HTML and then converted to PDF
    PDF = OutputFormat('pdf', extensions=('pdf',), cmd_fmt='html')
    TEXT = OutputFormat('text', extensions=('txt',), cmd_fmt='text')
    MARKDOWN = OutputFormat('markdown', extensions=('md',), cmd_fmt='markdown')

    @classmethod
    def parse(
        cls,
        type: Union[str, "OutputFormats"],  # pylint: disable=redefined-builtin
        outfile: Optional[str] = None,
    ) -> "OutputFormats":
        """
        Resolve the output format from a type argument and an optional
        output file name.

        :param type: Either a format name (case-insensitive) or an
            ``OutputFormats`` member, which is used as it is.
        :param outfile: Optional output file. When its extension matches one
            of the supported formats, that format takes precedence over
            ``type``.
        :return: The resolved format.
        :raises AssertionError: If ``type`` is a string that doesn't name a
            supported format.
        """
        requested = type
        if isinstance(type, str):
            try:
                requested = cls[type.upper()]
            except KeyError as e:
                raise AssertionError(
                    f'Unsupported output format: {type}. Supported formats: '
                    + f'{[f.name for f in OutputFormats]}'
                ) from e

        if not outfile:
            return requested

        # Extension of the output file, lower-cased and without the dot.
        extension = os.path.splitext(outfile)[1].lower()[1:]
        formats_by_extension = {}
        for candidate in cls:
            for ext in candidate.value.extensions:
                formats_by_extension[ext.lower()] = candidate

        return formats_by_extension.get(extension) or requested
|
||||
|
||||
|
||||
class HttpWebpagePlugin(Plugin):
|
||||
|
@ -24,34 +77,71 @@ class HttpWebpagePlugin(Plugin):
|
|||
|
||||
"""
|
||||
|
||||
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
||||
_mercury_script = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _parse(proc):
|
||||
"""
|
||||
Runs the mercury-parser script and returns the result as a string.
|
||||
"""
|
||||
with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
|
||||
return parser.communicate()[0].decode()
|
||||
|
||||
@staticmethod
|
||||
def _fix_relative_links(markdown: str, url: str) -> str:
|
||||
url = urlparse(url)
|
||||
base_url = f'{url.scheme}://{url.netloc}'
|
||||
"""
|
||||
Fix relative links to match the base URL of the page (Markdown only).
|
||||
"""
|
||||
parsed_url = urlparse(url)
|
||||
base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
|
||||
return re.sub(r'(\[.+?])\((/.+?)\)', fr'\1({base_url}\2)', markdown)
|
||||
|
||||
# noinspection PyShadowingBuiltins
|
||||
@action
|
||||
def simplify(self, url, type='html', html=None, outfile=None):
|
||||
def simplify(
|
||||
self,
|
||||
url: str,
|
||||
type: Union[ # pylint: disable=redefined-builtin
|
||||
str, OutputFormats
|
||||
] = OutputFormats.HTML,
|
||||
html: Optional[str] = None,
|
||||
outfile: Optional[str] = None,
|
||||
font_size: str = '19px',
|
||||
font_family: Union[str, Iterable[str]] = (
|
||||
'-apple-system',
|
||||
'Segoe UI',
|
||||
'Roboto',
|
||||
'Oxygen',
|
||||
'Ubuntu',
|
||||
'Cantarell',
|
||||
"Fira Sans",
|
||||
'Open Sans',
|
||||
'Droid Sans',
|
||||
'Helvetica Neue',
|
||||
'Helvetica',
|
||||
'Arial',
|
||||
'sans-serif',
|
||||
),
|
||||
):
|
||||
"""
|
||||
Parse the readable content of a web page removing any extra HTML elements using Mercury.
|
||||
|
||||
:param url: URL to parse.
|
||||
:param type: Output format. Supported types: ``html``, ``markdown``, ``text`` (default: ``html``).
|
||||
:param html: Set this parameter if you want to parse some HTML content already fetched. Note
|
||||
that URL is still required by Mercury to properly style the output, but it won't be used
|
||||
to actually fetch the content.
|
||||
|
||||
:param outfile: If set then the output will be written to the specified file. If the file extension
|
||||
is ``.pdf`` then the content will be exported in PDF format. If the output ``type`` is not
|
||||
specified then it can also be inferred from the extension of the output file.
|
||||
:param type: Output format. Supported types: ``html``, ``markdown``,
|
||||
``text``, ``pdf`` (default: ``html``).
|
||||
:param html: Set this parameter if you want to parse some HTML content
|
||||
already fetched. Note that URL is still required by Mercury to
|
||||
properly style the output, but it won't be used to actually fetch
|
||||
the content.
|
||||
:param outfile: If set then the output will be written to the specified
|
||||
file. If the file extension is ``.pdf`` then the content will be
|
||||
exported in PDF format. If the output ``type`` is not specified
|
||||
then it can also be inferred from the extension of the output file.
|
||||
:param font_size: Font size to use for the output (default: 19px).
|
||||
:param font_family: Custom font family (or list of font families, in
|
||||
decreasing order) to use for the output. It only applies to HTML
|
||||
and PDF.
|
||||
:return: dict
|
||||
|
||||
Example if outfile is not specified::
|
||||
|
@ -74,26 +164,14 @@ class HttpWebpagePlugin(Plugin):
|
|||
|
||||
"""
|
||||
|
||||
self.logger.info('Parsing URL {}'.format(url))
|
||||
wants_pdf = False
|
||||
|
||||
if outfile:
|
||||
wants_pdf = outfile.lower().endswith('.pdf')
|
||||
if (
|
||||
wants_pdf # HTML will be exported to PDF
|
||||
or outfile.lower().split('.')[-1].startswith('htm')
|
||||
):
|
||||
type = 'html'
|
||||
elif outfile.lower().endswith('.md'):
|
||||
type = 'markdown'
|
||||
elif outfile.lower().endswith('.txt'):
|
||||
type = 'text'
|
||||
|
||||
proc = ['node', self._mercury_script, url, type]
|
||||
f = None
|
||||
self.logger.info('Parsing URL %s', url)
|
||||
fmt = OutputFormats.parse(type=type, outfile=outfile)
|
||||
proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
|
||||
tmp_file = None
|
||||
|
||||
if html:
|
||||
f = tempfile.NamedTemporaryFile('w+', delete=False)
|
||||
with tempfile.NamedTemporaryFile('w+', delete=False) as f:
|
||||
tmp_file = f.name
|
||||
f.write(html)
|
||||
f.flush()
|
||||
proc.append(f.name)
|
||||
|
@ -101,21 +179,31 @@ class HttpWebpagePlugin(Plugin):
|
|||
try:
|
||||
response = self._parse(proc)
|
||||
finally:
|
||||
if f:
|
||||
os.unlink(f.name)
|
||||
if tmp_file:
|
||||
os.unlink(tmp_file)
|
||||
|
||||
try:
|
||||
response = json.loads(response.strip())
|
||||
except Exception as e:
|
||||
raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
|
||||
raise RuntimeError(
|
||||
f'Could not parse JSON: {e}. Response: {response}'
|
||||
) from e
|
||||
|
||||
if type == 'markdown':
|
||||
if fmt == OutputFormats.MARKDOWN:
|
||||
response['content'] = self._fix_relative_links(response['content'], url)
|
||||
|
||||
self.logger.debug('Got response from Mercury API: {}'.format(response))
|
||||
title = response.get('title', '{} on {}'.format(
|
||||
'Published' if response.get('date_published') else 'Generated',
|
||||
response.get('date_published', datetime.datetime.now().isoformat())))
|
||||
self.logger.debug('Got response from Mercury API: %s', response)
|
||||
title = response.get(
|
||||
'title',
|
||||
(
|
||||
('Published' if response.get('date_published') else 'Generated')
|
||||
+ ' on '
|
||||
+ (
|
||||
response.get('date_published')
|
||||
or datetime.datetime.now().isoformat()
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
content = response.get('content', '')
|
||||
|
||||
|
@ -126,46 +214,134 @@ class HttpWebpagePlugin(Plugin):
|
|||
'content': content,
|
||||
}
|
||||
|
||||
outfile = os.path.abspath(os.path.expanduser(outfile))
|
||||
style = '''
|
||||
body {
|
||||
font-size: 22px;
|
||||
font-family: 'Merriweather', Georgia, 'Times New Roman', Times, serif;
|
||||
return self._process_outfile(
|
||||
url=url,
|
||||
fmt=fmt,
|
||||
title=title,
|
||||
content=content,
|
||||
outfile=outfile,
|
||||
font_size=font_size,
|
||||
font_family=tuple(
|
||||
font_family,
|
||||
)
|
||||
if isinstance(font_family, str)
|
||||
else tuple(font_family),
|
||||
)
|
||||
|
||||
@staticmethod
def _style_by_format(
    fmt: OutputFormats,
    font_size: str,
    font_family: Iterable[str],
) -> str:
    """
    Build the CSS style for the given output format.

    :param fmt: Output format. Extra layout rules are added for HTML.
    :param font_size: Font size to use for the output.
    :param font_family: Font family, or font families in decreasing order
        of preference. A single string is treated as one family.
    :return: The CSS style to be used for the given output format.
    """
    # A bare string is a single family, not an iterable of characters.
    if isinstance(font_family, str):
        font_family = (font_family,)

    # Generic family keywords must not be quoted: a quoted "sans-serif" is
    # treated as a font literally named that way, which defeats the
    # fallback. The vendor system-font keywords are assumed to follow the
    # same rule.
    unquoted_keywords = {
        'serif',
        'sans-serif',
        'monospace',
        'cursive',
        'fantasy',
        'system-ui',
        'ui-serif',
        'ui-sans-serif',
        'ui-monospace',
        'ui-rounded',
        'emoji',
        'math',
        'fangsong',
        '-apple-system',
        'blinkmacsystemfont',
    }

    def css_family(family: str) -> str:
        if family.lower() in unquoted_keywords:
            return family
        escaped = family.replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    families = ', '.join(css_family(f) for f in font_family)
    style = textwrap.dedent(
        f'''
        ._parsed-content-container {{
            font-size: {font_size};
            font-family: {families};
        }}

        ._parsed-content {{
            text-align: justify;
        }}

        pre {{
            white-space: pre-wrap;
        }}
        '''
    )

    if fmt == OutputFormats.HTML:
        # Standalone HTML pages get a centered, width-limited layout.
        style += textwrap.dedent(
            '''
            ._parsed-content-container {
                margin: 1em;
                display: flex;
                flex-direction: column;
                align-items: center;
            }

            ._parsed-content {
                max-width: 800px;
            }

            h1 {
                max-width: 800px;
            }
            '''
        )

    return style
|
||||
|
||||
@classmethod
|
||||
def _process_outfile(
|
||||
cls,
|
||||
url: str,
|
||||
fmt: OutputFormats,
|
||||
title: str,
|
||||
content: str,
|
||||
outfile: str,
|
||||
font_size: str,
|
||||
font_family: Iterable[str],
|
||||
):
|
||||
"""
|
||||
Process the output file.
|
||||
|
||||
:param url: URL to parse.
|
||||
:param fmt: Output format. Supported types: ``html``, ``markdown``,
|
||||
``text``, ``pdf`` (default: ``html``).
|
||||
:param title: Page title.
|
||||
:param content: Page content.
|
||||
:param outfile: Output file path.
|
||||
:param font_size: Font size to use for the output (default: 19px).
|
||||
:param font_family: Custom font family (or list of font families, in
|
||||
decreasing order) to use for the output. It only applies to HTML
|
||||
and PDF.
|
||||
:return: dict
|
||||
"""
|
||||
outfile = os.path.abspath(os.path.expanduser(outfile))
|
||||
style = cls._style_by_format(fmt, font_size, font_family)
|
||||
|
||||
if fmt in {OutputFormats.HTML, OutputFormats.PDF}:
|
||||
content = textwrap.dedent(
|
||||
f'''
|
||||
<div class="_parsed-content-container">
|
||||
<h1><a href="{url}" target="_blank">{title}</a></h1>
|
||||
<div class="_parsed-content">{content}</div>
|
||||
'''.format(title=title, url=url, content=content)
|
||||
</div>
|
||||
'''
|
||||
)
|
||||
|
||||
if not wants_pdf:
|
||||
content = '''<html>
|
||||
if fmt == OutputFormats.PDF:
|
||||
content = textwrap.dedent(
|
||||
f'''<html>
|
||||
<head>
|
||||
<title>{title}</title>
|
||||
<style>{style}</style>
|
||||
</head>'''.format(title=title, style=style) + \
|
||||
'<body>{{' + content + '}}</body></html>'
|
||||
elif type == 'markdown':
|
||||
content = '# [{title}]({url})\n\n{content}'.format(
|
||||
title=title, url=url, content=content
|
||||
<title>{title}</title>
|
||||
</head>
|
||||
<body>
|
||||
{content}
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
)
|
||||
else:
|
||||
content = textwrap.dedent(
|
||||
f'''
|
||||
<style>
|
||||
{style}
|
||||
</style>
|
||||
{content}
|
||||
'''
|
||||
)
|
||||
elif fmt == OutputFormats.MARKDOWN:
|
||||
content = f'# [{title}]({url})\n\n{content}'
|
||||
|
||||
if wants_pdf:
|
||||
import weasyprint
|
||||
try:
|
||||
from weasyprint.fonts import FontConfiguration
|
||||
except ImportError:
|
||||
from weasyprint.document import FontConfiguration
|
||||
|
||||
font_config = FontConfiguration()
|
||||
css = [weasyprint.CSS('https://fonts.googleapis.com/css?family=Merriweather'),
|
||||
weasyprint.CSS(string=style, font_config=font_config)]
|
||||
|
||||
weasyprint.HTML(string=content).write_pdf(outfile, stylesheets=css)
|
||||
if fmt == OutputFormats.PDF:
|
||||
cls._process_pdf(content, outfile, style)
|
||||
else:
|
||||
with open(outfile, 'w', encoding='utf-8') as f:
|
||||
f.write(content)
|
||||
|
@ -176,5 +352,28 @@ class HttpWebpagePlugin(Plugin):
|
|||
'outfile': outfile,
|
||||
}
|
||||
|
||||
@staticmethod
def _process_pdf(content: str, outfile: str, style: str):
    """
    Convert the given HTML content to a PDF document.

    :param content: Page content.
    :param outfile: Output file path.
    :param style: CSS style to use for the output.
    """
    import weasyprint

    # FontConfiguration lives in different modules depending on the
    # installed WeasyPrint release, so try the known locations in turn.
    try:
        from weasyprint.text.fonts import FontConfiguration
    except ImportError:
        try:
            from weasyprint.fonts import FontConfiguration
        except ImportError:
            from weasyprint.document import FontConfiguration

    font_config = FontConfiguration()
    css = [
        weasyprint.CSS(string=style, font_config=font_config),
    ]

    # The same font configuration has to be shared between the stylesheets
    # and the rendering call for fonts declared in the CSS to be applied.
    weasyprint.HTML(string=content).write_pdf(
        outfile, stylesheets=css, font_config=font_config
    )
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
||||
|
|
Loading…
Reference in a new issue