forked from platypush/platypush
Support for output format type on http.webpage.simplify even when outfile is not specified
This commit is contained in:
parent
f9f9c38a8b
commit
848b736d6e
3 changed files with 51 additions and 18 deletions
|
@ -18,7 +18,7 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
* **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
|
* **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
|
||||||
* **node** and **npm** installed on your system (to use the mercury-parser interface)
|
* **node** and **npm** installed on your system (to use the mercury-parser interface)
|
||||||
* The mercury-parser library installed (``npm install @postlight/mercury-parser``)
|
* The mercury-parser library installed (``npm install -g @postlight/mercury-parser``)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -33,17 +33,17 @@ class HttpWebpagePlugin(Plugin):
|
||||||
@action
|
@action
|
||||||
def simplify(self, url, type='html', html=None, outfile=None):
|
def simplify(self, url, type='html', html=None, outfile=None):
|
||||||
"""
|
"""
|
||||||
Parse the content of a web page removing any extra elements using Mercury
|
Parse the readable content of a web page removing any extra HTML elements using Mercury.
|
||||||
|
|
||||||
:param url: URL to parse.
|
:param url: URL to parse.
|
||||||
:param type: Input type. Supported types: html, markdown, text (default: html).
|
:param type: Output format. Supported types: ``html``, ``markdown``, ``text`` (default: ``html``).
|
||||||
:param html: Set this parameter if you want to parse some HTML content already fetched. Note
|
:param html: Set this parameter if you want to parse some HTML content already fetched. Note
|
||||||
that URL is still required by Mercury to properly style the output, but it won't be used
|
that URL is still required by Mercury to properly style the output, but it won't be used
|
||||||
to actually fetch the content.
|
to actually fetch the content.
|
||||||
|
|
||||||
:param outfile: If set then the output will be written to the specified file
|
:param outfile: If set then the output will be written to the specified file. If the file extension
|
||||||
(supported formats: pdf, html, plain (default)). The plugin will guess
|
is ``.pdf`` then the content will be exported in PDF format. If the output ``type`` is not
|
||||||
the format from the extension
|
specified then it can also be inferred from the extension of the output file.
|
||||||
:return: dict
|
:return: dict
|
||||||
|
|
||||||
Example if outfile is not specified::
|
Example if outfile is not specified::
|
||||||
|
@ -67,6 +67,20 @@ class HttpWebpagePlugin(Plugin):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.logger.info('Parsing URL {}'.format(url))
|
self.logger.info('Parsing URL {}'.format(url))
|
||||||
|
wants_pdf = False
|
||||||
|
|
||||||
|
if outfile:
|
||||||
|
wants_pdf = outfile.lower().endswith('.pdf')
|
||||||
|
if (
|
||||||
|
wants_pdf # HTML will be exported to PDF
|
||||||
|
or outfile.lower().split('.')[-1].startswith('htm')
|
||||||
|
):
|
||||||
|
type = 'html'
|
||||||
|
elif outfile.lower().endswith('.md'):
|
||||||
|
type = 'markdown'
|
||||||
|
elif outfile.lower().endswith('.txt'):
|
||||||
|
type = 'text'
|
||||||
|
|
||||||
proc = ['node', self._mercury_script, url, type]
|
proc = ['node', self._mercury_script, url, type]
|
||||||
f = None
|
f = None
|
||||||
|
|
||||||
|
@ -102,9 +116,6 @@ class HttpWebpagePlugin(Plugin):
|
||||||
}
|
}
|
||||||
|
|
||||||
outfile = os.path.abspath(os.path.expanduser(outfile))
|
outfile = os.path.abspath(os.path.expanduser(outfile))
|
||||||
content = '''<h1>{title}</h1><div class="_parsed-content">{content}</div>'''.\
|
|
||||||
format(title=title, content=content)
|
|
||||||
|
|
||||||
style = '''
|
style = '''
|
||||||
body {
|
body {
|
||||||
font-size: 22px;
|
font-size: 22px;
|
||||||
|
@ -112,7 +123,27 @@ class HttpWebpagePlugin(Plugin):
|
||||||
}
|
}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
if outfile.lower().endswith('.pdf'):
|
if type == 'html':
|
||||||
|
content = (
|
||||||
|
'''
|
||||||
|
<h1><a href="{url}" target="_blank">{title}</a></h1>
|
||||||
|
<div class="_parsed-content">{content}</div>
|
||||||
|
'''.format(title=title, url=url, content=content)
|
||||||
|
)
|
||||||
|
|
||||||
|
if not wants_pdf:
|
||||||
|
content = '''<html>
|
||||||
|
<head>
|
||||||
|
<title>{title}</title>
|
||||||
|
<style>{style}</style>
|
||||||
|
</head>'''.format(title=title, style=style) + \
|
||||||
|
'<body>{{' + content + '}}</body></html>'
|
||||||
|
elif type == 'markdown':
|
||||||
|
content = '# [{title}]({url})\n\n{content}'.format(
|
||||||
|
title=title, url=url, content=content
|
||||||
|
)
|
||||||
|
|
||||||
|
if wants_pdf:
|
||||||
import weasyprint
|
import weasyprint
|
||||||
try:
|
try:
|
||||||
from weasyprint.fonts import FontConfiguration
|
from weasyprint.fonts import FontConfiguration
|
||||||
|
@ -125,13 +156,6 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
weasyprint.HTML(string=content).write_pdf(outfile, stylesheets=css)
|
weasyprint.HTML(string=content).write_pdf(outfile, stylesheets=css)
|
||||||
else:
|
else:
|
||||||
content = '''<html>
|
|
||||||
<head>
|
|
||||||
<title>{title}</title>
|
|
||||||
<style>{style}</style>
|
|
||||||
</head>'''.format(title=title, style=style) + \
|
|
||||||
'<body>{{' + content + '}}</body></html>'
|
|
||||||
|
|
||||||
with open(outfile, 'w', encoding='utf-8') as f:
|
with open(outfile, 'w', encoding='utf-8') as f:
|
||||||
f.write(content)
|
f.write(content)
|
||||||
|
|
||||||
|
|
|
@ -3,5 +3,14 @@ manifest:
|
||||||
install:
|
install:
|
||||||
pip:
|
pip:
|
||||||
- weasyprint
|
- weasyprint
|
||||||
|
apt:
|
||||||
|
- sudo
|
||||||
|
- nodejs
|
||||||
|
- npm
|
||||||
|
pacman:
|
||||||
|
- sudo
|
||||||
|
- npm
|
||||||
|
exec:
|
||||||
|
- sudo npm install -g @postlight/mercury-parser
|
||||||
package: platypush.plugins.http.webpage
|
package: platypush.plugins.http.webpage
|
||||||
type: plugin
|
type: plugin
|
||||||
|
|
|
@ -10,7 +10,7 @@ const fs = require('fs');
|
||||||
const Mercury = require('@postlight/mercury-parser');
|
const Mercury = require('@postlight/mercury-parser');
|
||||||
|
|
||||||
if (process.argv.length < 3) {
|
if (process.argv.length < 3) {
|
||||||
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched HTML content file]');
|
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html|text] [Pre-fetched HTML content file]');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue