[http.webpage] Added headers option.

A `headers` parameter has been added both to the `http.webpage` plugin
configuration and to the `http.webpage.simplify` action.

It can be used to pass extra headers to the Mercury API (e.g.
`User-Agent` or `Cookie`).

Moreover, the default `User-Agent` sent by Mercury has been changed to
that of an iPhone browser, to increase the success rate of the scraping process.
This commit is contained in:
Fabio Manganiello 2024-11-06 21:22:59 +01:00
parent c3766ee423
commit 09413bc0cc
Signed by untrusted user: blacklight
GPG key ID: D90FBA7F76362774
2 changed files with 88 additions and 27 deletions

View file

@ -79,6 +79,21 @@ class HttpWebpagePlugin(Plugin):
os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js' os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js'
) )
# Headers sent on every request unless overridden by the plugin
# configuration or by the arguments of an individual action call.
_default_headers = {
'User-Agent': (
# Default user agent: Safari on an iPhone (iOS 10.3.1), i.e. a mobile
# browser rather than a desktop one.
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 '
'(KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
),
}
def __init__(self, *args, headers: Optional[dict] = None, **kwargs):
    """
    :param headers: Custom headers to be sent to the Mercury API. They are
        layered on top of ``_default_headers``, so an entry given here
        replaces the default entry with the same key.
    """
    super().__init__(*args, **kwargs)

    # Copy the class-level defaults first so that they are never mutated,
    # then let the user-provided entries take precedence.
    merged_headers = dict(self._default_headers)
    merged_headers.update(headers or {})
    self._headers = merged_headers
@staticmethod @staticmethod
def _parse(proc): def _parse(proc):
""" """
@ -104,6 +119,7 @@ class HttpWebpagePlugin(Plugin):
str, OutputFormats str, OutputFormats
] = OutputFormats.HTML, ] = OutputFormats.HTML,
html: Optional[str] = None, html: Optional[str] = None,
headers: Optional[dict] = None,
outfile: Optional[str] = None, outfile: Optional[str] = None,
font_size: str = '19px', font_size: str = '19px',
font_family: Union[str, Iterable[str]] = ( font_family: Union[str, Iterable[str]] = (
@ -132,6 +148,7 @@ class HttpWebpagePlugin(Plugin):
already fetched. Note that URL is still required by Mercury to already fetched. Note that URL is still required by Mercury to
properly style the output, but it won't be used to actually fetch properly style the output, but it won't be used to actually fetch
the content. the content.
:param headers: Custom headers to be sent to the Mercury API.
:param outfile: If set then the output will be written to the specified :param outfile: If set then the output will be written to the specified
file. If the file extension is ``.pdf`` then the content will be file. If the file extension is ``.pdf`` then the content will be
exported in PDF format. If the output ``type`` is not specified exported in PDF format. If the output ``type`` is not specified
@ -163,6 +180,11 @@ class HttpWebpagePlugin(Plugin):
self.logger.info('Parsing URL %s', url) self.logger.info('Parsing URL %s', url)
fmt = OutputFormats.parse(type=type, outfile=outfile) fmt = OutputFormats.parse(type=type, outfile=outfile)
proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt] proc = ['node', self._mercury_script, url, fmt.value.cmd_fmt]
headers = {**self._headers, **(headers or {})}
for k, v in headers.items():
proc.extend((f'--{k}', v))
tmp_file = None tmp_file = None
if html: if html:
@ -217,11 +239,13 @@ class HttpWebpagePlugin(Plugin):
content=content, content=content,
outfile=outfile, outfile=outfile,
font_size=font_size, font_size=font_size,
font_family=tuple( font_family=(
tuple(
font_family, font_family,
) )
if isinstance(font_family, str) if isinstance(font_family, str)
else tuple(font_family), else tuple(font_family)
),
) )
@staticmethod @staticmethod

View file

@ -9,16 +9,47 @@
const fs = require('fs'); const fs = require('fs');
const Mercury = require('@postlight/mercury-parser'); const Mercury = require('@postlight/mercury-parser');
/**
 * Print the command-line synopsis of this script to stderr and terminate
 * the process with exit code 1.
 */
const usage = () => {
  const script = process.argv[1];
  const message =
    `Usage: ${script} <url to parse> [--user-agent "some-user-agent"] ` +
    '[--cookie "some-cookie"] [--some-header "some-value"] [markdown|html|text] [Pre-fetched HTML content file]';

  console.error(message);
  process.exit(1);
};
const contentFile = process.argv[4];
/**
 * Parse the command-line arguments of the script.
 *
 * Recognized arguments, in any relative order:
 *   - `--<header-name> <value>` pairs, collected into `headers`. Header names
 *     are matched case-insensitively (they are lower-cased on input).
 *   - Up to three positional arguments: the URL (must start with `http://` or
 *     `https://`), the output content type and the path of a file with
 *     pre-fetched HTML content.
 *
 * @param {string[]} args - The full argument vector, laid out like
 *   `process.argv` (the first element is skipped).
 * @returns {{url: string, contentType: string, contentFile: (string|undefined), headers: Object<string, string>}}
 *   The parsed arguments. `contentType` defaults to `'html'`, and `headers`
 *   always contains exactly one `User-Agent` entry: the one supplied on the
 *   command line if any, otherwise a default iPhone user agent.
 *   If no URL is found then `usage()` is invoked.
 */
const parseArgs = (args) => {
  const defaultUserAgent =
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1';

  const result = {
    headers: {},
  };

  // Index of the next positional argument to fill (0: url, 1: type, 2: file).
  let pos = 0;

  for (let i = 1; i < args.length; i++) {
    const arg = args[i];

    if (arg.startsWith('--')) {
      const hasValue = i < args.length - 1 && !args[i + 1].startsWith('--');
      if (hasValue) {
        result.headers[arg.substring(2).toLowerCase()] = args[++i];
      }

      // An option without a value is ignored: it must never be mistaken for
      // the content type or the content file.
      continue;
    }

    if (pos === 0) {
      // Anything before the URL that doesn't look like a URL is skipped
      // (this includes the script path when `args` is `process.argv`).
      if (/^https?:\/\//.test(arg)) {
        result.url = arg;
        pos++;
      }
    } else if (pos === 1) {
      result.contentType = arg;
      pos++;
    } else if (pos === 2) {
      result.contentFile = arg;
      pos++;
    }
  }

  if (!result.url?.length) {
    usage();
  }

  result.contentType = result.contentType || 'html';

  // Header names were lower-cased above, so a user-supplied user agent lives
  // under `user-agent`. Fold it into a single canonical `User-Agent` entry
  // instead of emitting both the custom and the default one.
  const userAgent = result.headers['user-agent'] || defaultUserAgent;
  delete result.headers['user-agent'];
  result.headers['User-Agent'] = userAgent;

  return result;
};
const parse = (url, args) => { const parse = (url, args) => {
@ -27,7 +58,14 @@ const parse = (url, args) => {
}); });
}; };
const args = parseArgs(process.argv);
const contentFile = args.contentFile;
const url = args.url;
delete args.url;
if (contentFile) { if (contentFile) {
delete args.contentFile;
fs.readFile(contentFile, 'utf8', (err, data) => { fs.readFile(contentFile, 'utf8', (err, data) => {
if (err) { if (err) {
console.error(err); console.error(err);
@ -40,4 +78,3 @@ if (contentFile) {
} else { } else {
parse(url, args); parse(url, args);
} }