From f50ad767e08aa11a2279fb66868207f3bee7f294 Mon Sep 17 00:00:00 2001 From: Fabio Manganiello Date: Tue, 23 Jun 2020 01:54:32 +0200 Subject: [PATCH] Support for more arguments passed to the Mercury parser --- platypush/plugins/http/webpage/__init__.py | 13 +++++++++---- platypush/plugins/http/webpage/mercury-parser.js | 11 +++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/platypush/plugins/http/webpage/__init__.py b/platypush/plugins/http/webpage/__init__.py index 82bd1e8f6..02262e772 100644 --- a/platypush/plugins/http/webpage/__init__.py +++ b/platypush/plugins/http/webpage/__init__.py @@ -24,11 +24,16 @@ class HttpWebpagePlugin(Plugin): _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js') @action - def simplify(self, url, outfile=None): + def simplify(self, url, type='html', html=None, outfile=None): """ Parse the content of a web page removing any extra elements using Mercury - :param url: URL to parse + :param url: URL to parse. + :param type: Input type. Supported types: html, markdown, text (default: html). + :param html: Set this parameter if you want to parse some HTML content already fetched. Note + that URL is still required by Mercury to properly style the output, but it won't be used + to actually fetch the content. + :param outfile: If set then the output will be written to the specified file (supported formats: pdf, html, plain (default)). The plugin will guess the format from the extension @@ -55,8 +60,8 @@ class HttpWebpagePlugin(Plugin): """ self.logger.info('Parsing URL {}'.format(url)) - parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - response = parser.stdout.read().decode() + parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + response = parser.stdout.read().decode().strip() try: response = json.loads(response) diff --git a/platypush/plugins/http/webpage/mercury-parser.js b/platypush/plugins/http/webpage/mercury-parser.js index 286b2e25e..200f9d1a6 100755 --- a/platypush/plugins/http/webpage/mercury-parser.js +++ b/platypush/plugins/http/webpage/mercury-parser.js @@ -9,12 +9,19 @@ const parser = require('@postlight/mercury-parser'); if (process.argv.length < 3) { - console.error('Usage: ' + process.argv[1] + ' '); + console.error('Usage: ' + process.argv[1] + ' [markdown|html] [Pre-fetched content]'); process.exit(1); } const url = process.argv[2]; -parser.parse(url).then(result => { +const type = process.argv[3] || 'html'; +const content = process.argv[4]; +const args = { + contentType: type, + html: content, +}; + +parser.parse(url, args).then(result => { console.log(JSON.stringify(result)); });