Support for pre-fetched HTML/Markdown content.

Pre-fetched HTML content can be too large to pass to the parser process as a
command-line argument. HTML data exchange now happens through an intermediate
temporary file.
This commit is contained in:
Fabio Manganiello 2020-06-25 01:37:59 +02:00
parent f50ad767e0
commit 96716dc872
2 changed files with 48 additions and 11 deletions

View file

@ -2,6 +2,7 @@ import datetime
import json
import os
import subprocess
import tempfile
from platypush.plugins import action
from platypush.plugins.http.request import Plugin
@ -23,6 +24,14 @@ class HttpWebpagePlugin(Plugin):
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
def _parse(self, proc):
output = ''
with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
output = parser.communicate()[0].decode()
return output
@action
def simplify(self, url, type='html', html=None, outfile=None):
"""
@ -60,15 +69,27 @@ class HttpWebpagePlugin(Plugin):
"""
self.logger.info('Parsing URL {}'.format(url))
parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
response = parser.stdout.read().decode().strip()
proc = ['node', self._mercury_script, url, type]
f = None
if html:
f = tempfile.NamedTemporaryFile('w+', delete=False)
f.write(html)
f.flush()
proc.append(f.name)
try:
response = json.loads(response)
response = self._parse(proc)
finally:
if f:
os.unlink(f.name)
try:
response = json.loads(response.strip())
except Exception as e:
raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
self.logger.info('Got response from Mercury API: {}'.format(response))
self.logger.debug('Got response from Mercury API: {}'.format(response))
title = response.get('title', '{} on {}'.format(
'Published' if response.get('date_published') else 'Generated',
response.get('date_published', datetime.datetime.now().isoformat())))

View file

@ -6,22 +6,38 @@
'use strict';
const parser = require('@postlight/mercury-parser');
const fs = require('fs');
const Mercury = require('@postlight/mercury-parser');
if (process.argv.length < 3) {
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched content]');
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched HTML content file]');
process.exit(1);
}
const url = process.argv[2];
const type = process.argv[3] || 'html';
const content = process.argv[4];
const contentFile = process.argv[4];
const args = {
contentType: type,
html: content,
};
parser.parse(url, args).then(result => {
/**
 * Parse a page with Mercury and print the result to stdout as a JSON string.
 *
 * If parsing (or serializing the result) fails, the error is written to
 * stderr and the process exits with status 1, so the caller gets a clear
 * failure instead of an unhandled promise rejection and empty output.
 *
 * @param {string} url - URL handed to `Mercury.parse`.
 * @param {object} args - Options object passed through to `Mercury.parse`.
 * @returns {Promise<void>} Settles once the result has been printed.
 */
const parse = (url, args) => {
  return Mercury.parse(url, args)
    .then((result) => {
      console.log(JSON.stringify(result));
    })
    .catch((err) => {
      console.error(err);
      process.exit(1);
    });
};
// Entry point: when no content file was given, Mercury is called with the URL
// only; otherwise the file's text is loaded first and passed along as `html`.
if (!contentFile) {
  parse(url, args);
} else {
  const onContentRead = (err, data) => {
    if (err) {
      // The content file could not be read: report it and stop.
      console.error(err);
      process.exit(1);
    }

    args.html = data;
    parse(url, args);
  };

  fs.readFile(contentFile, 'utf8', onContentRead);
}