From 09413bc0cca4929382ab59dd146ff0a838080205 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello
Date: Wed, 6 Nov 2024 21:22:59 +0100
Subject: [PATCH] [http.webpage] Added headers option.

A `headers` parameter has been added both to the `http.webpage` plugin
configuration and to the `http.webpage.simplify` action.

It can be used to pass extra headers to the Mercury API (e.g.
`User-Agent` or `Cookie`).

Moreover, the default `User-Agent` sent by Mercury has been changed to
that of an iPhone to increase the success rate of the scraping process.
---
 platypush/plugins/http/webpage/__init__.py    | 34 ++++++--
 .../plugins/http/webpage/mercury-parser.js    | 81 ++++++++++++++-----
 2 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/platypush/plugins/http/webpage/__init__.py b/platypush/plugins/http/webpage/__init__.py
index 8aea5cc237..d5e6152ba7 100644
--- a/platypush/plugins/http/webpage/__init__.py
+++ b/platypush/plugins/http/webpage/__init__.py
@@ -79,6 +79,21 @@ class HttpWebpagePlugin(Plugin):
         os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
     )
 
+    _default_headers = {
+        'User-Agent': (
+            # Default user agent: mobile Safari on iPhone
+            'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 '
+            '(KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
+        ),
+    }
+
+    def __init__(self, *args, headers: Optional[dict] = None, **kwargs):
+        """
+        :param headers: Custom headers to be sent to the Mercury API.
+        """
+        super().__init__(*args, **kwargs)
+        self._headers = {**self._default_headers, **(headers or {})}
+
     @staticmethod
     def _parse(proc):
         """
@@ -104,6 +119,7 @@ class HttpWebpagePlugin(Plugin):
             str, OutputFormats
         ] = OutputFormats.HTML,
         html: Optional[str] = None,
+        headers: Optional[dict] = None,
         outfile: Optional[str] = None,
         font_size: str = '19px',
         font_family: Union[str, Iterable[str]] = (
@@ -132,6 +148,7 @@ class HttpWebpagePlugin(Plugin):
             already fetched. Note that URL is still required by Mercury to
             properly style the output, but it won't be used to actually fetch
             the content.
+        :param headers: Custom headers to be sent to the Mercury API.
         :param outfile: If set then the output will be written to the
             specified file. If the file extension is ``.pdf`` then the content
             will be exported in PDF format. If the output ``type`` is not specified
@@ -163,6 +180,11 @@ class HttpWebpagePlugin(Plugin):
         self.logger.info('Parsing URL %s', url)
         fmt = OutputFormats.parse(type=type, outfile=outfile)
         proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
+        headers = {**self._headers, **(headers or {})}
+
+        for k, v in headers.items():
+            proc.extend((f'--{k}', str(v)))
+
         tmp_file = None
 
         if html:
@@ -217,11 +239,13 @@ class HttpWebpagePlugin(Plugin):
             content=content,
             outfile=outfile,
             font_size=font_size,
-            font_family=tuple(
-                font_family,
-            )
-            if isinstance(font_family, str)
-            else tuple(font_family),
+            font_family=(
+                tuple(
+                    font_family,
+                )
+                if isinstance(font_family, str)
+                else tuple(font_family)
+            ),
         )
 
     @staticmethod
diff --git a/platypush/plugins/http/webpage/mercury-parser.js b/platypush/plugins/http/webpage/mercury-parser.js
index d05c8a8f0d..e2e10842d7 100755
--- a/platypush/plugins/http/webpage/mercury-parser.js
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@@ -9,35 +9,72 @@ const fs = require('fs');
 
 const Mercury = require('@postlight/mercury-parser');
 
-if (process.argv.length < 3) {
-    console.error('Usage: ' + process.argv[1] + ' [markdown|html|text] [Pre-fetched HTML content file]');
-    process.exit(1);
-}
+const usage = () => {
+  console.error(
+    'Usage: ' + process.argv[1] + ' [--user-agent "some-user-agent"] ' +
+    '[--cookie "some-cookie"] [--some-header "some-value"] URL [markdown|html|text] [Pre-fetched HTML content file]'
+  );
 
-const url = process.argv[2];
-const type = process.argv[3] || 'html';
-const contentFile = process.argv[4];
-const args = {
-    contentType: type,
+  process.exit(1);
+};
+
+const parseArgs = (args) => {
+  const result = {
+    headers: {},
+  };
+
+  let pos = 0;
+
+  for (let i = 1; i < args.length; i++) {
+    const arg = args[i];
+    if (arg.startsWith('--') && i < args.length - 1 && !args[i + 1].startsWith('--')) {
+      const key = arg.substring(2).toLowerCase();
+      const value = args[++i];
+      result.headers[key === 'user-agent' ? 'User-Agent' : key] = value;
+    } else if (pos == 0 && arg.match(/^https?:\/\//)) {
+      result.url = arg;
+      pos++;
+    } else if (pos == 1) {
+      result.contentType = arg;
+      pos++;
+    } else if (pos == 2) {
+      result.contentFile = arg;
+      pos++;
+    }
+  }
+
+  if (!result.url?.length) {
+    usage();
+  }
+
+  result.contentType = result.contentType || 'html';
+  result.headers['User-Agent'] = result.headers['User-Agent'] || 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1';
+  return result;
 };
 
 const parse = (url, args) => {
-    Mercury.parse(url, args).then(result => {
-        console.log(JSON.stringify(result));
-    });
+  Mercury.parse(url, args).then(result => {
+    console.log(JSON.stringify(result));
+  });
 };
 
+const args = parseArgs(process.argv);
+const contentFile = args.contentFile;
+const url = args.url;
+delete args.url;
+
 if (contentFile) {
-    fs.readFile(contentFile, 'utf8', (err, data) => {
-        if (err) {
-            console.error(err);
-            process.exit(1);
-        }
+  delete args.contentFile;
 
-        args.html = data;
-        parse(url, args);
-    });
-} else {
+  fs.readFile(contentFile, 'utf8', (err, data) => {
+    if (err) {
+      console.error(err);
+      process.exit(1);
+    }
+
+    args.html = data;
     parse(url, args);
+  });
+} else {
+  parse(url, args);
 }
-