platypush/platypush/plugins/http/webpage.py
2019-04-13 08:06:40 +00:00

93 lines
2.9 KiB
Python

import html
import os
import time

import requests

from platypush.plugins import action
from platypush.plugins.http.request import Plugin
class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
    """

    # Upper bound (seconds) on the Mercury API call. Without a timeout,
    # requests waits indefinitely on a stalled connection.
    _mercury_timeout = 60

    def __init__(self, mercury_api_key=None, **kwargs):
        """
        :param mercury_api_key: If set then Mercury will be used to parse web pages content
        :type mercury_api_key: str
        """
        super().__init__(**kwargs)
        self.mercury_api_key = mercury_api_key

    @action
    def simplify(self, url, outfile=None):
        """
        Parse the content of a web page removing any extra elements using Mercury

        :param url: URL to parse
        :param outfile: If set then the output will be written to the specified file.
            A ``.pdf`` extension (case-insensitive) renders the page through
            weasyprint; any other extension receives the simplified HTML markup.
        :return: dict. Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>
            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>
            }

        :raises RuntimeError: if ``mercury_api_key`` is not set, if the Mercury
            request fails, or if the response carries no parsed content.
        """
        if not self.mercury_api_key:
            raise RuntimeError("mercury_api_key not set")

        response = requests.get('https://mercury.postlight.com/parser',
                                params={'url': url},
                                headers={'x-api-key': self.mercury_api_key},
                                timeout=self._mercury_timeout)

        if not response.ok:
            raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))

        self.logger.info('Got response from Mercury API: {}'.format(response))

        # Decode the JSON payload once and reuse it for both fields.
        payload = response.json()

        # ``or`` also covers an explicit null/empty title, which a plain
        # ``.get`` default would let through as None.
        title = payload.get('title') or 'No_title_{}'.format(int(time.time()))

        parsed = payload.get('content')
        if parsed is None:
            raise RuntimeError("Unable to parse content for {}: no content returned".format(url))

        # The title is plain text coming from a remote page: escape it before
        # embedding it in markup. The parsed content is already HTML.
        markup = '<body style="{body_style}"><h1>{title}</h1>{content}</body>'.format(
            title=html.escape(title),
            content=parsed,
            body_style='font-size: 22px; font-family: Tahoma, Geneva, sans-serif')

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': markup,
            }

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Imported lazily: weasyprint is an optional dependency that is
            # only needed for PDF output.
            import weasyprint
            weasyprint.HTML(string=markup).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(markup)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }
# vim:sw=4:ts=4:et: