Support for pre-fetched HTML/Markdown content.
Pre-fetched HTML can be too large to pass as a command-line argument to the parser process, so the HTML is now exchanged through an intermediate temporary file whose path is passed to the script instead.
This commit is contained in:
parent
f50ad767e0
commit
96716dc872
2 changed files with 48 additions and 11 deletions
|
@ -2,6 +2,7 @@ import datetime
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
from platypush.plugins import action
|
from platypush.plugins import action
|
||||||
from platypush.plugins.http.request import Plugin
|
from platypush.plugins.http.request import Plugin
|
||||||
|
@ -23,6 +24,14 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
||||||
|
|
||||||
|
def _parse(self, proc):
|
||||||
|
output = ''
|
||||||
|
|
||||||
|
with subprocess.Popen(proc, stdout=subprocess.PIPE, stderr=None) as parser:
|
||||||
|
output = parser.communicate()[0].decode()
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
@action
|
@action
|
||||||
def simplify(self, url, type='html', html=None, outfile=None):
|
def simplify(self, url, type='html', html=None, outfile=None):
|
||||||
"""
|
"""
|
||||||
|
@ -60,15 +69,27 @@ class HttpWebpagePlugin(Plugin):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.logger.info('Parsing URL {}'.format(url))
|
self.logger.info('Parsing URL {}'.format(url))
|
||||||
parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
|
proc = ['node', self._mercury_script, url, type]
|
||||||
response = parser.stdout.read().decode().strip()
|
f = None
|
||||||
|
|
||||||
|
if html:
|
||||||
|
f = tempfile.NamedTemporaryFile('w+', delete=False)
|
||||||
|
f.write(html)
|
||||||
|
f.flush()
|
||||||
|
proc.append(f.name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = json.loads(response)
|
response = self._parse(proc)
|
||||||
|
finally:
|
||||||
|
if f:
|
||||||
|
os.unlink(f.name)
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = json.loads(response.strip())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
|
raise RuntimeError('Could not parse JSON: {}. Response: {}'.format(str(e), response))
|
||||||
|
|
||||||
self.logger.info('Got response from Mercury API: {}'.format(response))
|
self.logger.debug('Got response from Mercury API: {}'.format(response))
|
||||||
title = response.get('title', '{} on {}'.format(
|
title = response.get('title', '{} on {}'.format(
|
||||||
'Published' if response.get('date_published') else 'Generated',
|
'Published' if response.get('date_published') else 'Generated',
|
||||||
response.get('date_published', datetime.datetime.now().isoformat())))
|
response.get('date_published', datetime.datetime.now().isoformat())))
|
||||||
|
|
|
@ -6,22 +6,38 @@
|
||||||
|
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
const parser = require('@postlight/mercury-parser');
|
const fs = require('fs');
|
||||||
|
const Mercury = require('@postlight/mercury-parser');
|
||||||
|
|
||||||
if (process.argv.length < 3) {
|
if (process.argv.length < 3) {
|
||||||
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched content]');
|
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched HTML content file]');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const url = process.argv[2];
|
const url = process.argv[2];
|
||||||
const type = process.argv[3] || 'html';
|
const type = process.argv[3] || 'html';
|
||||||
const content = process.argv[4];
|
const contentFile = process.argv[4];
|
||||||
const args = {
|
const args = {
|
||||||
contentType: type,
|
contentType: type,
|
||||||
html: content,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
parser.parse(url, args).then(result => {
|
const parse = (url, args) => {
|
||||||
|
Mercury.parse(url, args).then(result => {
|
||||||
console.log(JSON.stringify(result));
|
console.log(JSON.stringify(result));
|
||||||
});
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
if (contentFile) {
|
||||||
|
fs.readFile(contentFile, 'utf8', (err, data) => {
|
||||||
|
if (err) {
|
||||||
|
console.error(err);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
args.html = data;
|
||||||
|
parse(url, args);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
parse(url, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue