Support for more arguments passed to the Mercury parser
This commit is contained in:
parent
4d650da3e5
commit
f50ad767e0
2 changed files with 18 additions and 6 deletions
|
@ -24,11 +24,16 @@ class HttpWebpagePlugin(Plugin):
|
|||
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
||||
|
||||
@action
|
||||
def simplify(self, url, outfile=None):
|
||||
def simplify(self, url, type='html', html=None, outfile=None):
|
||||
"""
|
||||
Parse the content of a web page removing any extra elements using Mercury
|
||||
|
||||
:param url: URL to parse
|
||||
:param url: URL to parse.
|
||||
:param type: Output content type. Supported types: html, markdown, text (default: html).
|
||||
:param html: Set this parameter if you want to parse some HTML content already fetched. Note
|
||||
that URL is still required by Mercury to properly style the output, but it won't be used
|
||||
to actually fetch the content.
|
||||
|
||||
:param outfile: If set then the output will be written to the specified file
|
||||
(supported formats: pdf, html, plain (default)). The plugin will guess
|
||||
the format from the extension
|
||||
|
@ -55,8 +60,8 @@ class HttpWebpagePlugin(Plugin):
|
|||
"""
|
||||
|
||||
self.logger.info('Parsing URL {}'.format(url))
|
||||
parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
response = parser.stdout.read().decode()
|
||||
parser = subprocess.Popen(['node', self._mercury_script, url, type, html], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
|
||||
response = parser.stdout.read().decode().strip()
|
||||
|
||||
try:
|
||||
response = json.loads(response)
|
||||
|
|
|
@ -9,12 +9,19 @@
|
|||
const parser = require('@postlight/mercury-parser');
|
||||
|
||||
if (process.argv.length < 3) {
|
||||
console.error('Usage: ' + process.argv[1] + ' <url to parse>');
|
||||
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html] [Pre-fetched content]');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const url = process.argv[2];
|
||||
parser.parse(url).then(result => {
|
||||
const type = process.argv[3] || 'html';
|
||||
const content = process.argv[4];
|
||||
const args = {
|
||||
contentType: type,
|
||||
html: content,
|
||||
};
|
||||
|
||||
parser.parse(url, args).then(result => {
|
||||
console.log(JSON.stringify(result));
|
||||
});
|
||||
|
||||
|
|
Loading…
Reference in a new issue