# Reconstructed from the patch that added platypush/plugins/http/webpage.py
# (commit b8e9adadbe11cbdb4a2b62443cf63608c03c73e6, "Added plugin to parse web
# pages", Fabio Manganiello, 2019-03-29).

import html
import os

import requests

from platypush.plugins import action
from platypush.plugins.http.request import Plugin


class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
    """

    # Endpoint of the Mercury parser service queried by :meth:`simplify`
    _MERCURY_URL = 'https://mercury.postlight.com/parser'

    def __init__(self, mercury_api_key=None, mercury_timeout=30, **kwargs):
        """
        :param mercury_api_key: If set then Mercury will be used to parse web pages content
        :type mercury_api_key: str

        :param mercury_timeout: Timeout in seconds for the requests sent to Mercury
            (default: 30). Set it to ``None`` to wait indefinitely
        :type mercury_timeout: float
        """

        super().__init__(**kwargs)
        self.mercury_api_key = mercury_api_key
        self.mercury_timeout = mercury_timeout

    @action
    def simplify(self, url, outfile=None):
        """
        Parse the content of a web page removing any extra elements using Mercury

        :param url: URL to parse
        :param outfile: If set then the output will be written to the specified file.
            If the file name ends with ``.pdf`` (case-insensitive) the page is
            rendered to PDF through weasyprint, otherwise the simplified HTML is
            written to the file as UTF-8 text
        :return: dict. Example if outfile is not specified::

            {
                "url": "<url>",
                "title": "<page title>",
                "content": "<simplified HTML of the page>"
            }

        Example if outfile is specified::

            {
                "url": "<url>",
                "title": "<page title>",
                "outfile": "<absolute path of the output file>"
            }

        :raises RuntimeError: if ``mercury_api_key`` is not set, if the parser
            replies with an error status, or if the reply contains no content
        """

        if not self.mercury_api_key:
            raise RuntimeError("mercury_api_key not set")

        # Without a timeout requests would block forever on an unresponsive server
        response = requests.get(self._MERCURY_URL,
                                params={'url': url},
                                headers={'x-api-key': self.mercury_api_key},
                                timeout=self.mercury_timeout)

        # The truthiness of a Response is its ``ok`` flag, so one check is enough
        if not response.ok:
            raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))

        # Decode the JSON payload only once
        data = response.json()
        title = data.get('title')
        body = data.get('content')

        if body is None:
            raise RuntimeError(
                "Unable to parse content for {}: no content returned".format(url))

        # NOTE(review): the markup of this template was stripped from the patch
        # text this file was recovered from; only the {title} and {content}
        # placeholders survived. A heading followed by the content is the
        # minimal layout consistent with what is left - confirm against the
        # upstream repository.
        # The title is plain text coming from an arbitrary web page, so it is
        # escaped; the content is already HTML and is inserted verbatim.
        content = '<h1>{title}</h1>{content}'.format(
            title=html.escape(title or ''), content=body)

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Imported lazily: weasyprint is an optional dependency
            import weasyprint
            weasyprint.HTML(string=content).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(content)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }


# vim:sw=4:ts=4:et: