Added plugin to parse web pages

2019-03-29 03:57:16 +01:00 · 2019-03-29 03:57:16 +01:00 · b8e9adadbe
commit b8e9adadbe
parent 24d395ce49
1 changed files with 90 additions and 0 deletions
--- a/platypush/plugins/http/webpage.py
+++ b/platypush/plugins/http/webpage.py
@ -0,0 +1,90 @@
+import os
+import requests
+
+from platypush.plugins import action
+from platypush.plugins.http.request import Plugin
+
+
+class HttpWebpagePlugin(Plugin):
+    """
+    Plugin to handle and parse/simplify web pages
+
+    Requires:
+
+        * **requests** (``pip install requests``)
+        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
+    """
+
+    def __init__(self, mercury_api_key=None, **kwargs):
+        """
+        :param mercury_api_key: Mercury API key; ``simplify`` raises if it is not set
+        :type mercury_api_key: str
+        """
+        super().__init__(**kwargs)
+        self.mercury_api_key = mercury_api_key
+
+    @action
+    def simplify(self, url, outfile=None):
+        """
+        Parse the content of a web page removing any extra elements using Mercury
+
+        :param url: URL to parse
+        :param outfile: If set then the output will be written to the specified file.
+            A name ending in ``.pdf`` is rendered with weasyprint; any other name
+            receives the simplified HTML as UTF-8 text
+        :raises RuntimeError: if no API key is set or the parser request is not successful
+        :return: dict. Example if outfile is not specified::
+
+            {
+                "url": <url>,
+                "title": <page title>,
+                "content": <page parsed content>
+            }
+
+        Example if outfile is specified::
+
+            {
+                "url": <url>,
+                "title": <page title>,
+                "outfile": <output file absolute path>
+            }
+        """
+        if not self.mercury_api_key:
+            raise RuntimeError("mercury_api_key not set")
+
+        response = requests.get('https://mercury.postlight.com/parser',
+                                params={'url': url},
+                                headers={'x-api-key': self.mercury_api_key})
+
+        if not response.ok:  # a requests Response is truthy exactly when ``ok`` is
+            raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))
+
+        # Decode the JSON body once and reuse it for both fields
+        page = response.json()
+        title = page['title']
+        content = '<h1>{title}</h1>{content}'.format(title=title, content=page['content'])
+
+        if not outfile:
+            return {
+                'url': url,
+                'title': title,
+                'content': content,
+            }
+
+        outfile = os.path.abspath(os.path.expanduser(outfile))
+
+        if outfile.endswith('.pdf'):
+            import weasyprint  # optional dependency, only needed for PDF output
+            weasyprint.HTML(string=content).write_pdf(outfile)
+        else:
+            with open(outfile, 'w', encoding='utf-8') as f:
+                f.write(content)
+
+        return {
+            'url': url,
+            'title': title,
+            'outfile': outfile,
+        }
+
+
+# vim:sw=4:ts=4:et: