From 96716dc872b0e3577f92c3117188e2191e451b1b Mon Sep 17 00:00:00 2001
From: Fabio Manganiello
Date: Thu, 25 Jun 2020 01:37:59 +0200
Subject: [PATCH] Support for pre-fetched HTML/Markdown content.

HTML output can be too large for the process called over the command line.
HTML data exchange now happens through an intermediate temporary file.
---
 platypush/plugins/http/webpage/__init__.py | 29 +++++++++++++++---
 .../plugins/http/webpage/mercury-parser.js | 30 ++++++++++++++-----
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/platypush/plugins/http/webpage/__init__.py b/platypush/plugins/http/webpage/__init__.py
index 02262e77..e1721e9f 100644
--- a/platypush/plugins/http/webpage/__init__.py
+++ b/platypush/plugins/http/webpage/__init__.py
@@ -2,6 +2,7 @@ import datetime
 import json
 import os
 import subprocess
+import tempfile
 
 from platypush.plugins import action
 from platypush.plugins.http.request import Plugin
@@ -23,6 +24,14 @@ class HttpWebpagePlugin(Plugin):
 
     _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
 
+    def _parse(self, proc):
+        output = ''
+
+        with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
+            output = parser.communicate()[0].decode()
+
+        return output
+
     @action
     def simplify(self, url, type='html', html=None, outfile=None):
         """
@@ -60,15 +69,27 @@ class HttpWebpagePlugin(Plugin):
         """
 
         self.logger.info('Parsing URL {}'.format(url))
-        parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
-        response = parser.stdout.read().decode().strip()
+        proc = ['node', self._mercury_script, url, type]
+        f = None
+
+        if html:
+            f = tempfile.NamedTemporaryFile('w+', delete=False)
+            f.write(html)
+            f.flush()
+            proc.append(f.name)
 
         try:
-            response = json.loads(response)
+            response = self._parse(proc)
+        finally:
+            if f:
+                os.unlink(f.name)
+
+        try:
+            response = json.loads(response.strip())
         except Exception as e:
             raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
 
-        self.logger.info('Got response from Mercury API: {}'.format(response))
+        self.logger.debug('Got response from Mercury API: {}'.format(response))
         title = response.get('title', '{} on {}'.format(
             'Published' if response.get('date_published') else 'Generated',
             response.get('date_published', datetime.datetime.now().isoformat())))
diff --git a/platypush/plugins/http/webpage/mercury-parser.js b/platypush/plugins/http/webpage/mercury-parser.js
index 200f9d1a..f8e7c9e3 100755
--- a/platypush/plugins/http/webpage/mercury-parser.js
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@@ -6,22 +6,38 @@
 
 'use strict';
 
-const parser = require('@postlight/mercury-parser');
+const fs = require('fs');
+const Mercury = require('@postlight/mercury-parser');
 
 if (process.argv.length < 3) {
-    console.error('Usage: ' + process.argv[1] + ' [markdown|html] [Pre-fetched content]');
+    console.error('Usage: ' + process.argv[1] + ' [markdown|html] [Pre-fetched HTML content file]');
     process.exit(1);
 }
 
 const url = process.argv[2];
 const type = process.argv[3] || 'html';
-const content = process.argv[4];
+const contentFile = process.argv[4];
 
 const args = {
     contentType: type,
-    html: content,
 };
 
-parser.parse(url, args).then(result => {
-    console.log(JSON.stringify(result));
-});
+const parse = (url, args) => {
+    Mercury.parse(url, args).then(result => {
+        console.log(JSON.stringify(result));
+    });
+};
+
+if (contentFile) {
+    fs.readFile(contentFile, 'utf8', (err, data) => {
+        if (err) {
+            console.error(err);
+            process.exit(1);
+        }
+
+        args.html = data;
+        parse(url, args);
+    });
+} else {
+    parse(url, args);
+}