Added plugin to parse web pages

This commit is contained in:
Fabio Manganiello 2019-03-29 03:57:16 +01:00
parent 24d395ce49
commit b8e9adadbe

View file

@@ -0,0 +1,90 @@
import os
import requests
from platypush.plugins import action
from platypush.plugins.http.request import Plugin
class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages.

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion

    """

    # Endpoint of the Mercury parser web service queried by :meth:`simplify`.
    _MERCURY_API_URL = 'https://mercury.postlight.com/parser'

    def __init__(self, mercury_api_key=None, **kwargs):
        """
        :param mercury_api_key: API key sent to the Mercury parser service. It is
            optional at construction time, but :meth:`simplify` raises a
            ``RuntimeError`` if it is not set.
        :type mercury_api_key: str
        """

        super().__init__(**kwargs)
        self.mercury_api_key = mercury_api_key

    @action
    def simplify(self, url, outfile=None, timeout=60.0):
        """
        Parse the content of a web page removing any extra elements using Mercury.

        :param url: URL to parse
        :type url: str

        :param outfile: If set then the output will be written to the specified file.
            The format is guessed from the extension: ``.pdf`` (case-insensitive)
            produces a PDF through weasyprint, any other name gets the simplified
            HTML written as UTF-8 text.
        :type outfile: str

        :param timeout: Timeout in seconds passed to the HTTP request, so that an
            unresponsive server cannot block the action forever. ``None`` disables
            the timeout (default: 60 seconds).
        :type timeout: float

        :return: dict. Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>
            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>
            }

        :raises RuntimeError: if ``mercury_api_key`` is not set, if the parser
            service answers with an error status, or if its response contains
            neither a title nor a content.
        """

        if not self.mercury_api_key:
            raise RuntimeError("mercury_api_key not set")

        response = requests.get(self._MERCURY_API_URL,
                                params={'url': url},
                                headers={'x-api-key': self.mercury_api_key},
                                timeout=timeout)

        # Response truthiness is the same as ``response.ok``, so one check suffices.
        if not response.ok:
            raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))

        # Decode the JSON payload once instead of once per field.
        data = response.json()
        title = data.get('title')
        body = data.get('content') or ''

        if not title and not body:
            raise RuntimeError(
                "Unable to parse content for {}: no title or content in the parser response".format(url))

        # Skip the heading when there is no title, so that the literal text
        # "None" never ends up in the generated document.
        heading = '<h1>{title}</h1>'.format(title=title) if title else ''
        content = heading + body

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Imported lazily: weasyprint is only needed for PDF output.
            import weasyprint
            weasyprint.HTML(string=content).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(content)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }
# vim:sw=4:ts=4:et: