platypush/platypush/plugins/http/webpage/__init__.py

import datetime
import json
import os
import subprocess

from platypush.plugins import action
from platypush.plugins.http.request import Plugin


class HttpWebpagePlugin(Plugin):
    """
    Plugin to handle and parse/simplify web pages.
    It used to rely on the Mercury Reader web API; since that API has been discontinued, this plugin is a
    wrapper around the `mercury-parser <https://github.com/postlight/mercury-parser>`_ JavaScript library,
    which is executed through ``node``.

    Requires:

        * **requests** (``pip install requests``)
        * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
        * **node** and **npm** installed on your system (to use the mercury-parser interface)
        * The mercury-parser library installed (``npm install @postlight/mercury-parser``)
    """

    # Node.js wrapper script, expected to sit next to this module
    _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')

    @action
    def simplify(self, url, outfile=None):
        """
        Parse the content of a web page removing any extra elements using Mercury

        :param url: URL to parse
        :param outfile: If set then the output will be written to the specified file.
            The format is guessed from the extension: ``.pdf`` (case-insensitive) produces
            a PDF through weasyprint, any other extension gets the generated HTML document
        :return: dict
        :raises RuntimeError: if the parser output is not valid JSON or is not a JSON object

        Example if outfile is not specified::

            {
                "url": <url>,
                "title": <page title>,
                "content": <page parsed content>

            }

        Example if outfile is specified::

            {
                "url": <url>,
                "title": <page title>,
                "outfile": <output file absolute path>

            }

        """

        # Local import: only needed here, keeps the module-level imports untouched
        import html

        self.logger.info('Parsing URL {}'.format(url))

        # stderr is merged into stdout so that parser errors end up in the reported response.
        # communicate() + the context manager make sure the child is reaped and the pipe closed.
        with subprocess.Popen(['node', self._mercury_script, url],
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as parser:
            raw_output = parser.communicate()[0].decode()

        try:
            response = json.loads(raw_output)
        except ValueError as e:
            raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), raw_output)) from e

        if not isinstance(response, dict):
            # Valid JSON, but not the object the rest of the method relies on (e.g. null or a string)
            raise RuntimeError('Unexpected response from the parser: {}'.format(raw_output))

        self.logger.info('Got response from Mercury API: {}'.format(response))

        # Use `or` rather than the dict.get() default: a field may be present but null/empty,
        # and dict.get() only falls back when the key is missing
        date_published = response.get('date_published')
        title = response.get('title') or '{} on {}'.format(
            'Published' if date_published else 'Generated',
            date_published or datetime.datetime.now().isoformat())

        content = response.get('content') or ''

        if not outfile:
            return {
                'url': url,
                'title': title,
                'content': content,
            }

        # The HTML/CSS wrapper is only generated when writing to a file.
        # The title comes from the remote page, so it is escaped before being embedded;
        # the content is the parsed HTML and is embedded as-is.
        content = '''<html>
            <head>
                <link rel="stylesheet" type="text/css" href="//fonts.googleapis.com/css?family=Merriweather" />
                <title>{title}</title>
                <style>
                    body {{
                        font-size: 22px;
                        font-family: 'Merriweather', Georgia, 'Times New Roman', Times, serif;
                    }}
                </style>
            </head>
            <body>
                <h1>{title}</h1>
                {content}
            </body>
        </html>'''.format(title=html.escape(str(title)), content=content)

        outfile = os.path.abspath(os.path.expanduser(outfile))

        if outfile.lower().endswith('.pdf'):
            # Optional dependency, imported only when a PDF is actually requested
            import weasyprint
            weasyprint.HTML(string=content).write_pdf(outfile)
        else:
            with open(outfile, 'w', encoding='utf-8') as f:
                f.write(content)

        return {
            'url': url,
            'title': title,
            'outfile': outfile,
        }


# vim:sw=4:ts=4:et:
#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`import datetime`
			`import json`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`import os`
#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`import subprocess`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00
			`from platypush.plugins import action`
			`from platypush.plugins.http.request import Plugin`


			`class HttpWebpagePlugin(Plugin):`
			`"""`
#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`Plugin to handle and parse/simplify web pages.`
			`It used to use the Mercury Reader web API, but now that the API is discontinued this plugin is basically a`
			wrapper around the `mercury-parser <https://github.com/postlight/mercury-parser>`_ JavaScript library.
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00
			`Requires:`

			* requests (``pip install requests``)
			* weasyprint (``pip install weasyprint``), optional, for HTML->PDF conversion
#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`* node and npm installed on your system (to use the mercury-parser interface)`
			* The mercury-parser library installed (``npm install @postlight/mercury-parser``)
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`"""`

#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00
			`@action`
			`def simplify(self, url, outfile=None):`
			`"""`
			`Parse the content of a web page removing any extra elements using Mercury`

			`:param url: URL to parse`
			`:param outfile: If set then the output will be written to the specified file`
			`(supported formats: pdf, html, plain (default)). The plugin will guess`
			`the format from the extension`
Replaced references to in pydoc with format, as doesn't seem to work in RTD 2019-07-16 23:00:20 +02:00			`:return: dict`
Fixed documentation and added missing docs to the index 2019-07-16 20:28:00 +02:00
Replaced references to in pydoc with format, as doesn't seem to work in RTD 2019-07-16 23:00:20 +02:00			`Example if outfile is not specified::`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00
			`{`
			`"url": <url>,`
			`"title": <page title>,`
			`"content": <page parsed content>`
Fixed documentation and added missing docs to the index 2019-07-16 20:28:00 +02:00
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`}`

Replaced references to in pydoc with format, as doesn't seem to work in RTD 2019-07-16 23:00:20 +02:00			`Example if outfile is specified::`
Fixed documentation and added missing docs to the index 2019-07-16 20:28:00 +02:00
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`{`
			`"url": <url>,`
			`"title": <page title>,`
			`"outfile": <output file absolute path>`
Fixed documentation and added missing docs to the index 2019-07-16 20:28:00 +02:00
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`}`
Fixed documentation and added missing docs to the index 2019-07-16 20:28:00 +02:00
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`"""`

Log URL being parsed 2019-04-13 10:17:45 +02:00			`self.logger.info('Parsing URL {}'.format(url))`
Redirect mercury-parser.js stderr to stdout. Moreover, if an error occurs while parsing a link from an RSS feed we should report the link that raised the error, not the URL of the parent feed 2019-07-25 18:34:00 +02:00			`parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)`
Log exception and parser output if it returns invalid JSON 2019-07-25 01:31:27 +02:00			`response = parser.stdout.read().decode()`

			`try:`
			`response = json.loads(response)`
			`except Exception as e:`
			`raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00
#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`self.logger.info('Got response from Mercury API: {}'.format(response))`
			`title = response.get('title', '{} on {}'.format(`
			`'Published' if response.get('date_published') else 'Generated',`
			`response.get('date_published', datetime.datetime.now().isoformat())))`
Raise runtime error if mercury API responds with empty response 2019-04-13 10:15:08 +02:00
Don't generate HTML/CSS wrapper for the content unless the output is a destination file (HTML/PDF). 2019-07-25 18:08:18 +02:00			`content = response.get('content', '')`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00
			`if not outfile:`
			`return {`
			`'url': url,`
			`'title': title,`
			`'content': content,`
			`}`

Improved font in http.webpage.simplify for HTML and PDF output 2019-10-05 22:48:07 +02:00			`content = '''<html>`
			`<head>`
			`<link rel="stylesheet" type="text/css" href="//fonts.googleapis.com/css?family=Merriweather" />`
			`<title>{title}</title>`
			`<style>`
			`body {{`
			`font-size: 22px;`
			`font-family: 'Merriweather', Georgia, 'Times New Roman', Times, serif;`
			`}}`
			`</style>`
			`</head>`
			`<body>`
			`<h1>{title}</h1>`
			`{content}`
			`</body>`
			`</html>'''.format(title=title, content=content)`
Don't generate HTML/CSS wrapper for the content unless the output is a destination file (HTML/PDF). 2019-07-25 18:08:18 +02:00
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`outfile = os.path.abspath(os.path.expanduser(outfile))`

#73: Implemented wrapper plugin for the new Node.js mercury-parser. As the Mercury reader web API is deprecated, and the only available implementation is the open source mercury-parser, node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for the `simplify` action). 2019-07-24 19:02:53 +02:00			`if outfile.lower().endswith('.pdf'):`
Added plugin to parse web pages 2019-03-29 03:57:16 +01:00			`import weasyprint`
			`weasyprint.HTML(string=content).write_pdf(outfile)`
			`else:`
			`with open(outfile, 'w', encoding='utf-8') as f:`
			`f.write(content)`

			`return {`
			`'url': url,`
			`'title': title,`
			`'outfile': outfile,`
			`}`


			`# vim:sw=4:ts=4:et:`