Added plugin to parse web pages
This commit is contained in:
parent
24d395ce49
commit
b8e9adadbe
1 changed file with 90 additions and 0 deletions
90
platypush/plugins/http/webpage.py
Normal file
90
platypush/plugins/http/webpage.py
Normal file
|
@@ -0,0 +1,90 @@
|
|||
import os
|
||||
import requests
|
||||
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.http.request import Plugin
|
||||
|
||||
|
||||
class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
    """

    def __init__(self, mercury_api_key=None, **kwargs):
        """
        :param mercury_api_key: API key for the Mercury web parser. It is required
            by :meth:`simplify`, which raises ``RuntimeError`` when it is not set
        :type mercury_api_key: str
        """

        super().__init__(**kwargs)
        self.mercury_api_key = mercury_api_key

    @action
    def simplify(self, url, outfile=None, timeout=60):
        """
        Parse the content of a web page removing any extra elements using Mercury

        :param url: URL to parse
        :type url: str

        :param outfile: If set then the output will be written to the specified file.
            If the file name ends with ``.pdf`` (case-insensitive) the page is
            rendered to PDF through weasyprint; with any other extension the
            simplified HTML markup is written to the file as UTF-8 text
        :type outfile: str

        :param timeout: Timeout in seconds for the request to the Mercury API
            (default: 60). Pass ``None`` to wait indefinitely
        :type timeout: float

        :return: dict. Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>
            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>
            }

        :raises RuntimeError: if ``mercury_api_key`` is not set or if the
            Mercury API returns an unsuccessful response
        """

        if not self.mercury_api_key:
            raise RuntimeError("mercury_api_key not set")

        # Without a timeout a stalled endpoint would block the action forever
        response = requests.get('https://mercury.postlight.com/parser',
                                params={'url': url},
                                headers={'x-api-key': self.mercury_api_key},
                                timeout=timeout)

        if not response.ok:
            raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))

        # Decode the JSON payload only once and reuse it for both fields
        data = response.json()
        title = data['title']
        content = '<h1>{title}</h1>{content}'.format(title=title, content=data['content'])

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        outfile = os.path.abspath(os.path.expanduser(outfile))

        # Compare the extension case-insensitively so that e.g. "page.PDF"
        # is rendered as a PDF instead of receiving raw HTML
        if outfile.lower().endswith('.pdf'):
            # Imported lazily: weasyprint is an optional dependency
            import weasyprint
            weasyprint.HTML(string=content).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(content)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
Loading…
Reference in a new issue