[http.webpage] Added headers option.

A `headers` parameter has been added both to the `http.webpage` plugin
configuration and to the `http.webpage.simplify` action.

It can be used to pass extra headers to the Mercury API (e.g.
`User-Agent` or `Cookie`).

Moreover, the default `User-Agent` sent by Mercury has been changed to
that of an iPhone browser, to increase the success rate of the scraping process.
This commit is contained in:
Fabio Manganiello 2024-11-06 21:22:59 +01:00
parent c3766ee423
commit 09413bc0cc
Signed by untrusted user: blacklight
GPG key ID: D90FBA7F76362774
2 changed files with 88 additions and 27 deletions

View file

@ -79,6 +79,21 @@ class HttpWebpagePlugin(Plugin):
os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
)
# Baseline request headers. ``__init__`` merges any user-supplied headers on
# top of these, so an entry here is used only when the caller does not
# provide the same key.
_default_headers = {
'User-Agent': (
# Default user agent: Mobile Safari on an iPhone (not a desktop browser)
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 '
'(KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
),
}
def __init__(self, *args, headers: Optional[dict] = None, **kwargs):
    """
    :param headers: Custom headers to be sent to the Mercury API. They are
        layered on top of ``_default_headers``; on a key clash the custom
        value wins.
    """
    super().__init__(*args, **kwargs)
    # Start from a copy of the class-level defaults so they are never mutated.
    merged_headers = dict(self._default_headers)
    merged_headers.update(headers or {})
    self._headers = merged_headers
@staticmethod
def _parse(proc):
"""
@ -104,6 +119,7 @@ class HttpWebpagePlugin(Plugin):
str, OutputFormats
] = OutputFormats.HTML,
html: Optional[str] = None,
headers: Optional[dict] = None,
outfile: Optional[str] = None,
font_size: str = '19px',
font_family: Union[str, Iterable[str]] = (
@ -132,6 +148,7 @@ class HttpWebpagePlugin(Plugin):
already fetched. Note that URL is still required by Mercury to
properly style the output, but it won't be used to actually fetch
the content.
:param headers: Custom headers to be sent to the Mercury API.
:param outfile: If set then the output will be written to the specified
file. If the file extension is ``.pdf`` then the content will be
exported in PDF format. If the output ``type`` is not specified
@ -163,6 +180,11 @@ class HttpWebpagePlugin(Plugin):
self.logger.info('Parsing URL %s', url)
fmt = OutputFormats.parse(type=type, outfile=outfile)
proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
headers = {**self._headers, **(headers or {})}
for k, v in headers.items():
proc.extend((f'--{k}', v))
tmp_file = None
if html:
@ -217,11 +239,13 @@ class HttpWebpagePlugin(Plugin):
content=content,
outfile=outfile,
font_size=font_size,
font_family=tuple(
font_family,
)
if isinstance(font_family, str)
else tuple(font_family),
font_family=(
tuple(
font_family,
)
if isinstance(font_family, str)
else tuple(font_family)
),
)
@staticmethod

View file

@ -9,35 +9,72 @@
const fs = require('fs');
const Mercury = require('@postlight/mercury-parser');
if (process.argv.length < 3) {
console.error('Usage: ' + process.argv[1] + ' <url to parse> [markdown|html|text] [Pre-fetched HTML content file]');
process.exit(1);
}
const usage = () => {
console.error(
'Usage: ' + process.argv[1] + ' <url to parse> [--user-agent "some-user-agent"] ' +
'[--cookie "some-cookie"] [--some-header "some-value"] [markdown|html|text] [Pre-fetched HTML content file]'
);
const url = process.argv[2];
const type = process.argv[3] || 'html';
const contentFile = process.argv[4];
const args = {
contentType: type,
process.exit(1);
};
/**
 * Parse the script's command-line arguments.
 *
 * Two kinds of arguments are recognized, in any interleaving:
 *   - `--<header-name> <value>` pairs, collected as request headers. Header
 *     names are lower-cased. A `--name` that is the last argument, or that is
 *     followed by another `--...` token, is not treated as a header.
 *   - Positional arguments, in this order: the URL (must start with
 *     `http://` or `https://`), the content type (defaults to `html`) and the
 *     path of a file with pre-fetched HTML content.
 *
 * The user agent is always returned under the single key `User-Agent`: the
 * value passed via `--user-agent` (any letter case) if present, otherwise a
 * default iPhone user agent.
 *
 * If no URL is found, `usage()` is called.
 *
 * @param {string[]} args - Full argument vector (e.g. `process.argv`). The
 *   first element is skipped.
 * @returns {{url: string, contentType: string, contentFile: (string|undefined),
 *   headers: Object<string, string>}} The parsed options.
 */
const parseArgs = (args) => {
  const DEFAULT_USER_AGENT =
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1';

  const result = {
    headers: {},
  };

  // Index of the next positional argument to fill: 0=url, 1=type, 2=file.
  let pos = 0;

  for (let i = 1; i < args.length; i++) {
    const arg = args[i];
    const hasValue = i < args.length - 1 && !args[i + 1].startsWith('--');

    if (arg.startsWith('--') && hasValue) {
      // Consume the following token as the header value.
      result.headers[arg.substring(2).toLowerCase()] = args[++i];
    } else if (pos === 0 && /^https?:\/\//.test(arg)) {
      result.url = arg;
      pos++;
    } else if (pos === 1) {
      result.contentType = arg;
      pos++;
    } else if (pos === 2) {
      result.contentFile = arg;
      pos++;
    }
  }

  if (!result.url?.length) {
    usage();
  }

  result.contentType = result.contentType || 'html';

  // Header names were lower-cased above, so a custom user agent lives under
  // `user-agent`. Move it to the canonical `User-Agent` key (falling back to
  // the default) so that exactly one user-agent entry is returned, instead of
  // a custom `user-agent` coexisting with a default `User-Agent`.
  const { 'user-agent': customUserAgent, ...otherHeaders } = result.headers;
  result.headers = {
    ...otherHeaders,
    'User-Agent': customUserAgent || DEFAULT_USER_AGENT,
  };

  return result;
};
const parse = (url, args) => {
Mercury.parse(url, args).then(result => {
console.log(JSON.stringify(result));
});
Mercury.parse(url, args).then(result => {
console.log(JSON.stringify(result));
});
};
const args = parseArgs(process.argv);
const contentFile = args.contentFile;
const url = args.url;
delete args.url;
if (contentFile) {
fs.readFile(contentFile, 'utf8', (err, data) => {
if (err) {
console.error(err);
process.exit(1);
}
delete args.contentFile;
args.html = data;
parse(url, args);
});
} else {
fs.readFile(contentFile, 'utf8', (err, data) => {
if (err) {
console.error(err);
process.exit(1);
}
args.html = data;
parse(url, args);
});
} else {
parse(url, args);
}