From f50ad767e08aa11a2279fb66868207f3bee7f294 Mon Sep 17 00:00:00 2001
From: Fabio Manganiello <blacklight86@gmail.com>
Date: Tue, 23 Jun 2020 01:54:32 +0200
Subject: [PATCH] Support for more arguments passed to the Mercury parser

---
 platypush/plugins/http/webpage/__init__.py       | 13 +++++++++----
 platypush/plugins/http/webpage/mercury-parser.js | 11 +++++++++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/platypush/plugins/http/webpage/__init__.py b/platypush/plugins/http/webpage/__init__.py
index 82bd1e8f..02262e77 100644
--- a/platypush/plugins/http/webpage/__init__.py
+++ b/platypush/plugins/http/webpage/__init__.py
@@ -24,11 +24,16 @@ class HttpWebpagePlugin(Plugin):
     _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
 
     @action
-    def simplify(self, url, outfile=None):
+    def simplify(self, url, type='html', html=None, outfile=None):
         """
         Parse the content of a web page removing any extra elements using Mercury
 
-        :param url: URL to parse
+        :param url: URL to parse.
+        :param type: Output content type. Supported types: html, markdown, text (default: html).
+        :param html: Set this parameter if you want to parse some HTML content already fetched. Note
+            that URL is still required by Mercury to properly style the output, but it won't be used
+            to actually fetch the content.
+
         :param outfile: If set then the output will be written to the specified file
             (supported formats: pdf, html, plain (default)). The plugin will guess
             the format from the extension
@@ -55,8 +60,8 @@ class HttpWebpagePlugin(Plugin):
         """
 
         self.logger.info('Parsing URL {}'.format(url))
-        parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-        response = parser.stdout.read().decode()
+        parser = subprocess.Popen(['node', self._mercury_script, url, type] + ([html] if html else []), stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)  # html is optional: argv entries must be strings, so omit it when unset
+        response = parser.stdout.read().decode().strip()
 
         try:
             response = json.loads(response)
diff --git a/platypush/plugins/http/webpage/mercury-parser.js b/platypush/plugins/http/webpage/mercury-parser.js
index 286b2e25..200f9d1a 100755
--- a/platypush/plugins/http/webpage/mercury-parser.js
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@@ -9,12 +9,19 @@
 const parser = require('@postlight/mercury-parser');
 
 if (process.argv.length < 3) {
-    console.error('Usage: ' + process.argv[1] + ' <url to parse>');
+    console.error('Usage: ' + process.argv[1] + ' <url to parse> [html|markdown|text] [Pre-fetched content]');
     process.exit(1);
 }
 
 const url = process.argv[2];
-parser.parse(url).then(result => {
+const type = process.argv[3] || 'html';
+const content = process.argv[4];
+const args = {
+    contentType: type,
+    html: content,
+};
+
+parser.parse(url, args).then(result => {
     console.log(JSON.stringify(result));
 });