Added plugin to parse web pages
This commit is contained in:
parent
24d395ce49
commit
b8e9adadbe
1 changed file with 90 additions and 0 deletions
90
platypush/plugins/http/webpage.py
Normal file
90
platypush/plugins/http/webpage.py
Normal file
|
@ -0,0 +1,90 @@
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from platypush.plugins import action
|
||||||
|
from platypush.plugins.http.request import Plugin
|
||||||
|
|
||||||
|
|
||||||
|
class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
    """

    # Seconds to wait for the Mercury parser before giving up. Without a
    # timeout ``requests`` waits indefinitely on a stalled connection.
    _REQUEST_TIMEOUT = 30

    def __init__(self, mercury_api_key=None, **kwargs):
        """
        :param mercury_api_key: If set then Mercury will be used to parse web pages content
        :type mercury_api_key: str
        """

        super().__init__(**kwargs)
        self.mercury_api_key = mercury_api_key

    @action
    def simplify(self, url, outfile=None):
        """
        Parse the content of a web page removing any extra elements using Mercury

        :param url: URL to parse
        :param outfile: If set then the output will be written to the specified file.
            If the file name ends with ``.pdf`` (case-insensitive) the simplified page
            is rendered to PDF through weasyprint; with any other extension the
            simplified HTML is written to the file as UTF-8 text
        :return: dict. Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>
            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>
            }

        :raises RuntimeError: if ``mercury_api_key`` is not set or if the Mercury
            parser replies with an error status
        """

        if not self.mercury_api_key:
            raise RuntimeError("mercury_api_key not set")

        response = requests.get('https://mercury.postlight.com/parser',
                                params={'url': url},
                                headers={'x-api-key': self.mercury_api_key},
                                timeout=self._REQUEST_TIMEOUT)

        # ``response.ok`` is False for any 4xx/5xx status
        if not response.ok:
            raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))

        # Decode the JSON payload once and reuse it for both fields
        payload = response.json()
        title = payload['title']
        content = '<h1>{title}</h1>{content}'.format(title=title, content=payload['content'])

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Imported lazily: weasyprint is only an optional dependency
            import weasyprint
            weasyprint.HTML(string=content).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(content)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# vim:sw=4:ts=4:et:
|
Loading…
Reference in a new issue