[http.webpage] Added headers option.

A `headers` parameter has been added to both the `http.webpage` plugin configuration and the `http.webpage.simplify` action. It can be used to pass extra headers to the Mercury API (e.g. `User-Agent` or `Cookie`). Moreover, the default `User-Agent` sent by Mercury has been changed to an iPhone user agent to increase the success rate of the scraping process.
2024-11-06 21:22:59 +01:00 · 2024-11-06 21:22:59 +01:00 · 09413bc0cc
commit 09413bc0cc
parent c3766ee423
2 changed files with 88 additions and 27 deletions
--- a/platypush/plugins/http/webpage/__init__.py
+++ b/platypush/plugins/http/webpage/__init__.py
@ -79,6 +79,21 @@ class HttpWebpagePlugin(Plugin):
        os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
    )
    # Baseline headers for Mercury requests; ``__init__`` merges the
    # user-supplied ``headers`` on top of these.
    _default_headers = {
        'User-Agent': (
            # Default user agent: mobile Safari on an iPhone (iOS 10.3.1)
            'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 '
            '(KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        ),
    }
    def __init__(self, *args, headers: Optional[dict] = None, **kwargs):
        """
        :param headers: Extra headers to be sent to the Mercury API. They are
            layered on top of the plugin defaults, so an entry passed here
            replaces the default with the same key (e.g. ``User-Agent``).
        """
        super().__init__(*args, **kwargs)
        # Start from a copy of the class-level defaults so they are never
        # mutated, then apply the configured overrides (if any).
        merged_headers = dict(self._default_headers)
        if headers:
            merged_headers.update(headers)
        self._headers = merged_headers
    @staticmethod
    def _parse(proc):
        """
@ -104,6 +119,7 @@ class HttpWebpagePlugin(Plugin):
            str, OutputFormats
        ] = OutputFormats.HTML,
        html: Optional[str] = None,
        headers: Optional[dict] = None,
        outfile: Optional[str] = None,
        font_size: str = '19px',
        font_family: Union[str, Iterable[str]] = (
@ -132,6 +148,7 @@ class HttpWebpagePlugin(Plugin):
            already fetched. Note that URL is still required by Mercury to
            properly style the output, but it won't be used to actually fetch
            the content.
        :param headers: Custom headers to be sent to the Mercury API.
        :param outfile: If set then the output will be written to the specified
            file. If the file extension is ``.pdf`` then the content will be
            exported in PDF format. If the output ``type`` is not specified
@ -163,6 +180,11 @@ class HttpWebpagePlugin(Plugin):
        self.logger.info('Parsing URL %s', url)
        fmt = OutputFormats.parse(type=type, outfile=outfile)
        proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
        headers = {**self._headers, **(headers or {})}
        for k, v in headers.items():
            proc.extend((f'--{k}', v))
        tmp_file = None
        if html:
@ -217,11 +239,13 @@ class HttpWebpagePlugin(Plugin):
            content=content,
            outfile=outfile,
            font_size=font_size,
-            font_family=tuple(
+            font_family=(
                tuple(
                    font_family,
                )
                if isinstance(font_family, str)
-            else tuple(font_family),
+                else tuple(font_family)
            ),
        )
    @staticmethod
--- a/platypush/plugins/http/webpage/mercury-parser.js
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@ -9,16 +9,47 @@
 const fs = require('fs');
 const Mercury = require('@postlight/mercury-parser');
-if (process.argv.length < 3) {
+const usage = () => {
-    console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html|text] [Pre-fetched HTML content file]');
+  console.error(
-    process.exit(1);
+    'Usage: ' + process.argv[1] + ' <url to parse> [--user-agent "some-user-agent"] ' +
-}
+    '[--cookie "some-cookie"] [--some-header "some-value"] [markdown|html|text] [Pre-fetched HTML content file]'
  );
-const url = process.argv[2];
+  process.exit(1);
-const type = process.argv[3] || 'html';
+};
-const contentFile = process.argv[4];
+
-const args = {
+const parseArgs = (args) => {
-    contentType: type,
+  const result = {
    headers: {},
  };
  let pos = 0;
  for (let i = 1; i < args.length; i++) {
    const arg = args[i];
    if (arg.startsWith('--') && i < args.length - 1 && !args[i + 1].startsWith('--')) {
      const key = arg.substring(2).toLowerCase();
      const value = args[++i];
      result.headers[key] = value;
    } else if (pos == 0 && arg.match(/^https?:\/\//)) {
      result.url = arg;
      pos++;
    } else if (pos == 1) {
      result.contentType = arg;
      pos++;
    } else if (pos == 2) {
      result.contentFile = arg;
      pos++;
    }
  }
  if (!result.url?.length) {
    usage();
  }
  result.contentType = result.contentType || 'html';
  result.headers['User-Agent'] = result.headers['User-Agent'] || 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1';
  return result;
 };
 const parse = (url, args) => {
@ -27,7 +58,14 @@ const parse = (url, args) => {
  });
 };
 const args = parseArgs(process.argv);
 const contentFile = args.contentFile;
 const url = args.url;
 delete args.url;
 if (contentFile) {
  delete args.contentFile;
  fs.readFile(contentFile, 'utf8', (err, data) => {
    if (err) {
      console.error(err);
@ -40,4 +78,3 @@ if (contentFile) {
 } else {
  parse(url, args);
 }