Support for pre-fetched HTML/Markdown content.

The pre-fetched HTML content can be too large to be passed to the parser process as a command-line argument. The HTML is now handed over through an intermediate temporary file instead.
2020-06-25 01:37:59 +02:00 · 2020-06-25 01:37:59 +02:00 · 96716dc872
commit 96716dc872
parent f50ad767e0
2 changed files with 48 additions and 11 deletions
--- a/platypush/plugins/http/webpage/__init__.py
+++ b/platypush/plugins/http/webpage/__init__.py
@@ -2,6 +2,7 @@ import datetime
 import json
 import os
 import subprocess
+import tempfile

 from platypush.plugins import action
 from platypush.plugins.http.request import Plugin
@@ -23,6 +24,14 @@ class HttpWebpagePlugin(Plugin):

    _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')

+    def _parse(self, proc):
+        """Run the command ``proc`` and return its decoded standard output."""
+        # stderr=None leaves the child's stderr attached to this process' stderr.
+        with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as child:
+            raw_output, _ = child.communicate()
+
+        return raw_output.decode()
+
    @action
    def simplify(self, url, type='html', html=None, outfile=None):
        """
@@ -60,15 +69,27 @@ class HttpWebpagePlugin(Plugin):
        """

        self.logger.info('Parsing URL {}'.format(url))
-        parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
-        response = parser.stdout.read().decode().strip()
+        proc = ['node', self._mercury_script, url, type]
+        f = None
+
+        if html:
+            f = tempfile.NamedTemporaryFile('w+', delete=False)
+            f.write(html)
+            f.flush()
+            proc.append(f.name)

        try:
-            response = json.loads(response)
+            response = self._parse(proc)
+        finally:
+            if f:
+                os.unlink(f.name)
+
+        try:
+            response = json.loads(response.strip())
        except Exception as e:
            raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))

-        self.logger.info('Got response from Mercury API: {}'.format(response))
+        self.logger.debug('Got response from Mercury API: {}'.format(response))
        title = response.get('title', '{} on {}'.format(
            'Published' if response.get('date_published') else 'Generated',
            response.get('date_published', datetime.datetime.now().isoformat())))
--- a/platypush/plugins/http/webpage/mercury-parser.js
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@@ -6,22 +6,38 @@

 'use strict';

-const parser = require('@postlight/mercury-parser');
+const fs = require('fs');
+const Mercury = require('@postlight/mercury-parser');

 if (process.argv.length < 3) {
-    console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched content]');
+    console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched HTML content file]');
    process.exit(1);
 }

 const url = process.argv[2];
 const type = process.argv[3] || 'html';
-const content = process.argv[4];
+const contentFile = process.argv[4];
 const args = {
    contentType: type,
-    html: content,
 };

-parser.parse(url, args).then(result => {
-    console.log(JSON.stringify(result));
-});
+// Print the error and exit non-zero so that the caller can detect the failure.
+const fail = err => { console.error(err); process.exit(1); };
+
+// Parse the page and print the result as JSON; a rejected promise is fatal too.
+const parse = (url, args) => Mercury.parse(url, args)
+    .then(result => console.log(JSON.stringify(result)))
+    .catch(fail);
+
+if (contentFile) {
+    // Pre-fetched content is read from the file whose path was passed in argv[4].
+    fs.readFile(contentFile, 'utf8', (err, data) => {
+        if (err) return fail(err);
+
+        args.html = data;
+        parse(url, args);
+    });
+} else {
+    parse(url, args);
+}