2019-07-24 19:02:53 +02:00
|
|
|
import datetime
|
|
|
|
import json
|
2019-03-29 03:57:16 +01:00
|
|
|
import os
|
2019-07-24 19:02:53 +02:00
|
|
|
import subprocess
|
2019-03-29 03:57:16 +01:00
|
|
|
|
|
|
|
from platypush.plugins import action
|
|
|
|
from platypush.plugins.http.request import Plugin
|
|
|
|
|
|
|
|
|
|
|
|
class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages.
    It used to use the Mercury Reader web API, but now that the API is discontinued this plugin is basically a
    wrapper around the `mercury-parser <https://github.com/postlight/mercury-parser>`_ JavaScript library.

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
        * **node** and **npm** installed on your system (to use the mercury-parser interface)
        * The mercury-parser library installed (``npm install @postlight/mercury-parser``)

    """

    # Node.js helper script, expected to sit next to this module.
    _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')

    @action
    def simplify(self, url, outfile=None):
        """
        Parse the content of a web page removing any extra elements using Mercury

        The page is parsed by running ``node <mercury-parser.js> <url>`` and decoding the JSON that the
        process prints.

        :param url: URL to parse
        :param outfile: If set then the output will be written to the specified file. If the file name ends
            with ``.pdf`` (case-insensitive) the page is rendered to PDF through ``weasyprint``; any other
            name gets the generated HTML document written as UTF-8 text.
        :return: dict
        :raises RuntimeError: if the output of the parser process is not a JSON object

        Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>
            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>
            }

        """

        self.logger.info('Parsing URL {}'.format(url))

        # stderr is merged into stdout so that whatever node prints on failure ends up in the
        # RuntimeError message below. The context manager + communicate() make sure the pipe is
        # closed and the child process is reaped instead of being left behind as a zombie.
        with subprocess.Popen(['node', self._mercury_script, url],
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as parser:
            output, _ = parser.communicate()

        response = output.decode()

        try:
            response = json.loads(response)
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response)) from e

        # Valid JSON that is not an object (list, string, number...) can't be used below.
        if not isinstance(response, dict):
            raise RuntimeError('Unexpected response from the Mercury parser: {}'.format(response))

        self.logger.info('Got response from Mercury API: {}'.format(response))

        # Use ``or`` rather than dict.get() defaults: the keys may be present with a null value,
        # in which case get() would hand back None instead of the fallback.
        date_published = response.get('date_published')
        title = response.get('title') or '{} on {}'.format(
            'Published' if date_published else 'Generated',
            date_published or datetime.datetime.now().isoformat())

        content = response.get('content') or ''

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        # Wrap the parsed fragment into a standalone, styled HTML document.
        content = '''<html>
    <head>
        <link rel="stylesheet" type="text/css" href="//fonts.googleapis.com/css?family=Merriweather" />
        <title>{title}</title>
        <style>
            body {{
                font-size: 22px;
                font-family: 'Merriweather', Georgia, 'Times New Roman', Times, serif;
            }}
        </style>
    </head>
    <body>
        <h1>{title}</h1>
        {content}
    </body>
</html>'''.format(title=title, content=content)

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Imported lazily: weasyprint is an optional dependency only needed for PDF output.
            import weasyprint
            weasyprint.HTML(string=content).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(content)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }
|
|
|
|
|
|
|
|
|
|
|
|
# vim:sw=4:ts=4:et:
|