platypush/platypush/plugins/http/webpage/__init__.py

114 lines
3.6 KiB
Python
Raw Normal View History

import datetime
import json
2019-03-29 03:57:16 +01:00
import os
import subprocess
2019-03-29 03:57:16 +01:00
from platypush.plugins import action
from platypush.plugins.http.request import Plugin
class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages.

    It used to use the Mercury Reader web API, but now that the API is discontinued this plugin is basically a
    wrapper around the `mercury-parser <https://github.com/postlight/mercury-parser>`_ JavaScript library.

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
        * **node** and **npm** installed on your system (to use the mercury-parser interface)
        * The mercury-parser library installed (``npm install @postlight/mercury-parser``)

    """

    # Path of the Node.js helper script, resolved relative to this module's directory.
    _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')

    @action
    def simplify(self, url, outfile=None):
        """
        Parse the content of a web page removing any extra elements using Mercury

        :param url: URL to parse
        :param outfile: If set then the output will be written to the specified file.
            If the file name ends with ``.pdf`` (case-insensitive) the page is rendered
            to PDF through weasyprint; with any other extension the generated HTML
            document is written as UTF-8 text.
        :return: dict

        Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>
            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>
            }

        :raises RuntimeError: if the output of the parser script is not valid JSON
            or is not a JSON object
        """
        # Local import, in the same style as the optional weasyprint import below.
        import html

        self.logger.info('Parsing URL {}'.format(url))

        # The context manager closes the pipe and waits for the child process, so no
        # zombie process or open file descriptor is left behind after the call.
        with subprocess.Popen(['node', self._mercury_script, url],
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as parser:
            response = parser.communicate()[0].decode()

        try:
            response = json.loads(response)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass. The raw output is included in
            # the message because stderr is merged into stdout and usually explains the failure.
            raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response)) from e

        if not isinstance(response, dict):
            # e.g. the script printed ``null``: fail with a clear message instead of an
            # AttributeError on ``.get`` below.
            raise RuntimeError('Unexpected response from the Mercury parser: {}'.format(response))

        self.logger.info('Got response from Mercury API: {}'.format(response))

        # The keys may be present with a null value, so fall back on falsy values
        # rather than only on missing keys (avoids titles like "Generated on None").
        date_published = response.get('date_published')
        title = response.get('title') or '{} on {}'.format(
            'Published' if date_published else 'Generated',
            date_published or datetime.datetime.now().isoformat())
        content = response.get('content') or ''

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        # The title comes from a remote page: escape it before embedding it in the markup.
        # ``content`` is the HTML produced by the parser and is embedded as-is.
        # The stylesheet uses an absolute https:// URL because a protocol-relative one cannot
        # be resolved from a local file or from a string document that has no base URL.
        document = '''<html>
    <head>
        <link rel="stylesheet" type="text/css" href="https://fonts.googleapis.com/css?family=Merriweather" />
        <title>{title}</title>
        <style>
            body {{
                font-size: 22px;
                font-family: 'Merriweather', Georgia, 'Times New Roman', Times, serif;
            }}
        </style>
    </head>
    <body>
        <h1>{title}</h1>
        {content}
    </body>
</html>'''.format(title=html.escape(str(title)), content=content)

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Optional dependency: imported only when a PDF is actually requested.
            import weasyprint
            weasyprint.HTML(string=document).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(document)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }
# vim:sw=4:ts=4:et: