Support for more arguments passed to the Mercury parser
This commit is contained in:
parent
4d650da3e5
commit
f50ad767e0
2 changed files with 18 additions and 6 deletions
|
@ -24,11 +24,16 @@ class HttpWebpagePlugin(Plugin):
|
||||||
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
||||||
|
|
||||||
@action
|
@action
|
||||||
def simplify(self, url, outfile=None):
|
def simplify(self, url, type='html', html=None, outfile=None):
|
||||||
"""
|
"""
|
||||||
Parse the content of a web page removing any extra elements using Mercury
|
Parse the content of a web page removing any extra elements using Mercury
|
||||||
|
|
||||||
:param url: URL to parse
|
:param url: URL to parse.
|
||||||
|
:param type: Output format of the parsed content. Supported types: html, markdown, text (default: html).
|
||||||
|
:param html: Set this parameter if you want to parse some HTML content already fetched. Note
|
||||||
|
that the URL is still required by Mercury to properly style the output, but it won't be used
|
||||||
|
to actually fetch the content.
|
||||||
|
|
||||||
:param outfile: If set then the output will be written to the specified file
|
:param outfile: If set then the output will be written to the specified file
|
||||||
(supported formats: pdf, html, plain (default)). The plugin will guess
|
(supported formats: pdf, html, plain (default)). The plugin will guess
|
||||||
the format from the extension
|
the format from the extension
|
||||||
|
@ -55,8 +60,8 @@ class HttpWebpagePlugin(Plugin):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.logger.info('Parsing URL {}'.format(url))
|
self.logger.info('Parsing URL {}'.format(url))
|
||||||
parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
|
||||||
response = parser.stdout.read().decode()
|
response = parser.stdout.read().decode().strip()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = json.loads(response)
|
response = json.loads(response)
|
||||||
|
|
|
@ -9,12 +9,19 @@
|
||||||
const parser = require('@postlight/mercury-parser');
|
const parser = require('@postlight/mercury-parser');
|
||||||
|
|
||||||
if (process.argv.length < 3) {
|
if (process.argv.length < 3) {
|
||||||
console.error('Usage: ' + process.argv[1] + ' <url to parse>');
|
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched content]');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const url = process.argv[2];
|
const url = process.argv[2];
|
||||||
parser.parse(url).then(result => {
|
const type = process.argv[3] || 'html';
|
||||||
|
const content = process.argv[4];
|
||||||
|
const args = {
|
||||||
|
contentType: type,
|
||||||
|
html: content,
|
||||||
|
};
|
||||||
|
|
||||||
|
parser.parse(url, args).then(result => {
|
||||||
console.log(JSON.stringify(result));
|
console.log(JSON.stringify(result));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue