[http.webpage] Added headers option.

A `headers` parameter has been added both to the `http.webpage` plugin configuration and to the `http.webpage.simplify` action. It can be used to pass extra headers to the Mercury API (e.g. `User-Agent` or `Cookie`). Moreover, the default `User-Agent` sent by Mercury has been changed to an iPhone user agent string to increase the success rate of the scraping process.
2024-11-06 21:22:59 +01:00 · 2024-11-06 21:22:59 +01:00 · 09413bc0cc
commit 09413bc0cc
parent c3766ee423
2 changed files with 88 additions and 27 deletions
--- a/platypush/plugins/http/webpage/init.py
+++ b/platypush/plugins/http/webpage/init.py
@ -79,6 +79,21 @@ class HttpWebpagePlugin(Plugin):
        os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
    )

+    _default_headers = {
+        'User-Agent': (
+            # Default user agent: mobile Safari on an iPhone (iOS 10.3.1)
+            'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 '
+            '(KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
+        ),
+    }
+
+    def __init__(self, *args, headers: Optional[dict] = None, **kwargs):
+        """
+        :param headers: Custom headers to be sent to the Mercury API.
+        """
+        super().__init__(*args, **kwargs)
+        self._headers = {**self._default_headers, **(headers or {})}
+
    @staticmethod
    def _parse(proc):
        """
@ -104,6 +119,7 @@ class HttpWebpagePlugin(Plugin):
            str, OutputFormats
        ] = OutputFormats.HTML,
        html: Optional[str] = None,
+        headers: Optional[dict] = None,
        outfile: Optional[str] = None,
        font_size: str = '19px',
        font_family: Union[str, Iterable[str]] = (
@ -132,6 +148,7 @@ class HttpWebpagePlugin(Plugin):
            already fetched. Note that URL is still required by Mercury to
            properly style the output, but it won't be used to actually fetch
            the content.
+        :param headers: Custom headers to be sent to the Mercury API.
        :param outfile: If set then the output will be written to the specified
            file. If the file extension is ``.pdf`` then the content will be
            exported in PDF format. If the output ``type`` is not specified
@ -163,6 +180,11 @@ class HttpWebpagePlugin(Plugin):
        self.logger.info('Parsing URL %s', url)
        fmt = OutputFormats.parse(type=type, outfile=outfile)
        proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
+        headers = {**self._headers, **(headers or {})}
+
+        for k, v in headers.items():
+            proc.extend((f'--{k}', v))
+
        tmp_file = None

        if html:
@ -217,11 +239,13 @@ class HttpWebpagePlugin(Plugin):
            content=content,
            outfile=outfile,
            font_size=font_size,
-            font_family=tuple(
+            font_family=(
+                tuple(
                    font_family,
                )
                if isinstance(font_family, str)
-            else tuple(font_family),
+                else tuple(font_family)
+            ),
        )

    @staticmethod
--- a/platypush/plugins/http/webpage/mercury-parser.js
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@ -9,16 +9,47 @@
 const fs = require('fs');
 const Mercury = require('@postlight/mercury-parser');

-if (process.argv.length < 3) {
-    console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html|text] [Pre-fetched HTML content file]');
+const usage = () => {
+  console.error(
+    'Usage: ' + process.argv[1] + ' <url to parse> [--user-agent "some-user-agent"] ' +
+    '[--cookie "some-cookie"] [--some-header "some-value"] [markdown|html|text] [Pre-fetched HTML content file]'
+  );
+
  process.exit(1);
+};
+
+const parseArgs = (args) => {
+  const result = {
+    headers: {},
+  };
+
+  let pos = 0;
+
+  for (let i = 1; i < args.length; i++) {
+    const arg = args[i];
+    if (arg.startsWith('--') && i < args.length - 1 && !args[i + 1].startsWith('--')) {
+      const key = arg.substring(2).toLowerCase();
+      const value = args[++i];
+      result.headers[key] = value;
+    } else if (pos == 0 && arg.match(/^https?:\/\//)) {
+      result.url = arg;
+      pos++;
+    } else if (pos == 1) {
+      result.contentType = arg;
+      pos++;
+    } else if (pos == 2) {
+      result.contentFile = arg;
+      pos++;
+    }
  }

-const url = process.argv[2];
-const type = process.argv[3] || 'html';
-const contentFile = process.argv[4];
-const args = {
-    contentType: type,
+  if (!result.url?.length) {
+    usage();
+  }
+
+  result.contentType = result.contentType || 'html';
+  result.headers['User-Agent'] = result.headers['User-Agent'] || 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1';
+  return result;
 };

 const parse = (url, args) => {
@ -27,7 +58,14 @@ const parse = (url, args) => {
  });
 };

+const args = parseArgs(process.argv);
+const contentFile = args.contentFile;
+const url = args.url;
+delete args.url;
+
 if (contentFile) {
+  delete args.contentFile;
+
  fs.readFile(contentFile, 'utf8', (err, data) => {
    if (err) {
      console.error(err);
@ -40,4 +78,3 @@ if (contentFile) {
 } else {
  parse(url, args);
 }
-