Support for pre-fetched HTML/Markdown content.
Pre-fetched HTML can be too large to pass to the parser process as a command-line argument, so HTML data exchange now happens through an intermediate temporary file.
This commit is contained in:
parent
f50ad767e0
commit
96716dc872
2 changed files with 48 additions and 11 deletions
|
@ -2,6 +2,7 @@ import datetime
|
|||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from platypush.plugins import action
|
||||
from platypush.plugins.http.request import Plugin
|
||||
|
@ -23,6 +24,14 @@ class HttpWebpagePlugin(Plugin):
|
|||
|
||||
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
||||
|
||||
def _parse(self, proc):
|
||||
output = ''
|
||||
|
||||
with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
|
||||
output = parser.communicate()[0].decode()
|
||||
|
||||
return output
|
||||
|
||||
@action
|
||||
def simplify(self, url, type='html', html=None, outfile=None):
|
||||
"""
|
||||
|
@ -60,15 +69,27 @@ class HttpWebpagePlugin(Plugin):
|
|||
"""
|
||||
|
||||
self.logger.info('Parsing URL {}'.format(url))
|
||||
parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
|
||||
response = parser.stdout.read().decode().strip()
|
||||
proc = ['node', self._mercury_script, url, type]
|
||||
f = None
|
||||
|
||||
if html:
|
||||
f = tempfile.NamedTemporaryFile('w+', delete=False)
|
||||
f.write(html)
|
||||
f.flush()
|
||||
proc.append(f.name)
|
||||
|
||||
try:
|
||||
response = json.loads(response)
|
||||
response = self._parse(proc)
|
||||
finally:
|
||||
if f:
|
||||
os.unlink(f.name)
|
||||
|
||||
try:
|
||||
response = json.loads(response.strip())
|
||||
except Exception as e:
|
||||
raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
|
||||
|
||||
self.logger.info('Got response from Mercury API: {}'.format(response))
|
||||
self.logger.debug('Got response from Mercury API: {}'.format(response))
|
||||
title = response.get('title', '{} on {}'.format(
|
||||
'Published' if response.get('date_published') else 'Generated',
|
||||
response.get('date_published', datetime.datetime.now().isoformat())))
|
||||
|
|
|
@ -6,22 +6,38 @@
|
|||
|
||||
'use strict';
|
||||
|
||||
const parser = require('@postlight/mercury-parser');
|
||||
const fs = require('fs');
|
||||
const Mercury = require('@postlight/mercury-parser');
|
||||
|
||||
if (process.argv.length < 3) {
|
||||
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched content]');
|
||||
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched HTML content file]');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const url = process.argv[2];
|
||||
const type = process.argv[3] || 'html';
|
||||
const content = process.argv[4];
|
||||
const contentFile = process.argv[4];
|
||||
const args = {
|
||||
contentType: type,
|
||||
html: content,
|
||||
};
|
||||
|
||||
parser.parse(url, args).then(result => {
|
||||
console.log(JSON.stringify(result));
|
||||
});
|
||||
const parse = (url, args) => {
|
||||
Mercury.parse(url, args).then(result => {
|
||||
console.log(JSON.stringify(result));
|
||||
});
|
||||
};
|
||||
|
||||
if (contentFile) {
|
||||
fs.readFile(contentFile, 'utf8', (err, data) => {
|
||||
if (err) {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
args.html = data;
|
||||
parse(url, args);
|
||||
});
|
||||
} else {
|
||||
parse(url, args);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue