forked from platypush/platypush
[http.webpage] Added headers option.
A `headers` parameter has been added both to the `http.webpage` plugin configuration and to the `http.webpage.simplify` action. It can be used to pass extra headers to the Mercury API (e.g. `User-Agent` or `Cookie`). Moreover, the default `User-Agent` sent by Mercury has been changed to that of an iPhone (mobile Safari) browser to increase the success rate of the scraping process.
This commit is contained in:
parent
c3766ee423
commit
09413bc0cc
2 changed files with 88 additions and 27 deletions
|
@ -79,6 +79,21 @@ class HttpWebpagePlugin(Plugin):
|
|||
os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
|
||||
)
|
||||
|
||||
_default_headers = {
|
||||
'User-Agent': (
|
||||
# Default user agent: mobile Safari on an iPhone
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 '
|
||||
'(KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
|
||||
),
|
||||
}
|
||||
|
||||
def __init__(self, *args, headers: Optional[dict] = None, **kwargs):
|
||||
"""
|
||||
:param headers: Custom headers to be sent to the Mercury API.
|
||||
"""
|
||||
super().__init__(*args, **kwargs)
|
||||
self._headers = {**self._default_headers, **(headers or {})}
|
||||
|
||||
@staticmethod
|
||||
def _parse(proc):
|
||||
"""
|
||||
|
@ -104,6 +119,7 @@ class HttpWebpagePlugin(Plugin):
|
|||
str, OutputFormats
|
||||
] = OutputFormats.HTML,
|
||||
html: Optional[str] = None,
|
||||
headers: Optional[dict] = None,
|
||||
outfile: Optional[str] = None,
|
||||
font_size: str = '19px',
|
||||
font_family: Union[str, Iterable[str]] = (
|
||||
|
@ -132,6 +148,7 @@ class HttpWebpagePlugin(Plugin):
|
|||
already fetched. Note that URL is still required by Mercury to
|
||||
properly style the output, but it won't be used to actually fetch
|
||||
the content.
|
||||
:param headers: Custom headers to be sent to the Mercury API.
|
||||
:param outfile: If set then the output will be written to the specified
|
||||
file. If the file extension is ``.pdf`` then the content will be
|
||||
exported in PDF format. If the output ``type`` is not specified
|
||||
|
@ -163,6 +180,11 @@ class HttpWebpagePlugin(Plugin):
|
|||
self.logger.info('Parsing URL %s', url)
|
||||
fmt = OutputFormats.parse(type=type, outfile=outfile)
|
||||
proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
|
||||
headers = {**self._headers, **(headers or {})}
|
||||
|
||||
for k, v in headers.items():
|
||||
proc.extend((f'--{k}', v))
|
||||
|
||||
tmp_file = None
|
||||
|
||||
if html:
|
||||
|
@ -217,11 +239,13 @@ class HttpWebpagePlugin(Plugin):
|
|||
content=content,
|
||||
outfile=outfile,
|
||||
font_size=font_size,
|
||||
font_family=tuple(
|
||||
font_family,
|
||||
)
|
||||
if isinstance(font_family, str)
|
||||
else tuple(font_family),
|
||||
font_family=(
|
||||
tuple(
|
||||
font_family,
|
||||
)
|
||||
if isinstance(font_family, str)
|
||||
else tuple(font_family)
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -9,35 +9,72 @@
|
|||
const fs = require('fs');
|
||||
const Mercury = require('@postlight/mercury-parser');
|
||||
|
||||
if (process.argv.length < 3) {
|
||||
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html|text] [Pre-fetched HTML content file]');
|
||||
process.exit(1);
|
||||
}
|
||||
const usage = () => {
|
||||
console.error(
|
||||
'Usage: ' + process.argv[1] + ' <url to parse> [--user-agent "some-user-agent"] ' +
|
||||
'[--cookie "some-cookie"] [--some-header "some-value"] [markdown|html|text] [Pre-fetched HTML content file]'
|
||||
);
|
||||
|
||||
const url = process.argv[2];
|
||||
const type = process.argv[3] || 'html';
|
||||
const contentFile = process.argv[4];
|
||||
const args = {
|
||||
contentType: type,
|
||||
process.exit(1);
|
||||
};
|
||||
|
||||
/**
 * Parse the command-line arguments of the script.
 *
 * Expected layout (same as `process.argv`): `args[0]` is skipped, and
 * `args[1]` (the script path) is ignored because it is neither a `--option`
 * nor an http(s) URL. The remaining arguments are interpreted as follows:
 *
 *   - `--some-header value` pairs are collected into `headers`, with the
 *     header name lower-cased. They may appear anywhere on the command line.
 *   - The first argument that looks like an http(s) URL becomes `url`.
 *   - The next two positional arguments become `contentType` and
 *     `contentFile` respectively.
 *
 * If no URL is found, `usage()` is invoked.
 *
 * @param {string[]} args - Argument vector, in `process.argv` format.
 * @returns {{headers: Object<string, string>, url: (string|undefined),
 *     contentType: string, contentFile: (string|undefined)}} The parsed
 *     arguments. `contentType` defaults to `'html'`, and `headers` always
 *     contains exactly one `User-Agent` entry.
 */
const parseArgs = (args) => {
    const defaultUserAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1';
    const result = {
        headers: {},
    };

    // Index of the next positional argument to fill: 0=url, 1=type, 2=file
    let pos = 0;

    for (let i = 1; i < args.length; i++) {
        const arg = args[i];
        if (arg.startsWith('--') && i < args.length - 1 && !args[i + 1].startsWith('--')) {
            // `--name value` pair: consume the value together with the name
            const key = arg.substring(2).toLowerCase();
            const value = args[++i];
            result.headers[key] = value;
        } else if (pos === 0 && arg.match(/^https?:\/\//)) {
            result.url = arg;
            pos++;
        } else if (pos === 1) {
            result.contentType = arg;
            pos++;
        } else if (pos === 2) {
            result.contentFile = arg;
            pos++;
        }
    }

    if (!result.url?.length) {
        usage();
    }

    result.contentType = result.contentType || 'html';

    // Header names were lower-cased above, so a caller-supplied user agent
    // lives under 'user-agent'. Move it to the 'User-Agent' key (the key the
    // default has always been stored under), so that the custom value is
    // honoured and only a single user-agent entry is ever emitted. An empty
    // value falls back to the default.
    const userAgent = result.headers['user-agent'] || defaultUserAgent;
    delete result.headers['user-agent'];
    result.headers['User-Agent'] = userAgent;

    return result;
};
|
||||
|
||||
const parse = (url, args) => {
|
||||
Mercury.parse(url, args).then(result => {
|
||||
console.log(JSON.stringify(result));
|
||||
});
|
||||
Mercury.parse(url, args).then(result => {
|
||||
console.log(JSON.stringify(result));
|
||||
});
|
||||
};
|
||||
|
||||
const args = parseArgs(process.argv);
|
||||
const contentFile = args.contentFile;
|
||||
const url = args.url;
|
||||
delete args.url;
|
||||
|
||||
if (contentFile) {
|
||||
fs.readFile(contentFile, 'utf8', (err, data) => {
|
||||
if (err) {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
delete args.contentFile;
|
||||
|
||||
args.html = data;
|
||||
parse(url, args);
|
||||
});
|
||||
} else {
|
||||
fs.readFile(contentFile, 'utf8', (err, data) => {
|
||||
if (err) {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
args.html = data;
|
||||
parse(url, args);
|
||||
});
|
||||
} else {
|
||||
parse(url, args);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue