#73: Implemented wrapper plugin for the new Node.js mercury-parser.
The Mercury Reader web API is deprecated and the only available implementation is now the open-source mercury-parser library, so node, npm and @postlight/mercury-parser have to be added as dependencies for the http.webpage plugin (or at least for its `simplify` action).
This commit is contained in:
parent
43ca3a6f94
commit
5293f5b203
5 changed files with 71 additions and 88 deletions
|
@ -2,11 +2,9 @@ import datetime
|
||||||
import enum
|
import enum
|
||||||
import feedparser
|
import feedparser
|
||||||
import os
|
import os
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
|
|
||||||
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
|
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
|
||||||
Enum, UniqueConstraint, ForeignKey
|
Enum, ForeignKey
|
||||||
|
|
||||||
from sqlalchemy.orm import sessionmaker, scoped_session
|
from sqlalchemy.orm import sessionmaker, scoped_session
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
@ -14,6 +12,7 @@ from sqlalchemy.sql.expression import func
|
||||||
|
|
||||||
from platypush.backend.http.request import HttpRequest
|
from platypush.backend.http.request import HttpRequest
|
||||||
from platypush.config import Config
|
from platypush.config import Config
|
||||||
|
from platypush.context import get_plugin
|
||||||
from platypush.message.event.http.rss import NewFeedEvent
|
from platypush.message.event.http.rss import NewFeedEvent
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
@ -23,21 +22,26 @@ Session = scoped_session(sessionmaker())
|
||||||
class RssUpdates(HttpRequest):
|
class RssUpdates(HttpRequest):
|
||||||
""" Gets new items in an RSS feed """
|
""" Gets new items in an RSS feed """
|
||||||
|
|
||||||
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
|
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' + \
|
||||||
|
'Chrome/62.0.3202.94 Safari/537.36'
|
||||||
|
|
||||||
def __init__(self, url, title=None, headers=None, params=None, max_entries=None,
|
def __init__(self, url, title=None, headers=None, params=None, max_entries=None,
|
||||||
mercury_api_key=None, digest_format=None, *argv, **kwargs):
|
extract_content=None, digest_format=None, *argv, **kwargs):
|
||||||
self.workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')
|
self.workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')
|
||||||
self.dbfile = os.path.join(self.workdir, 'rss.db')
|
self.dbfile = os.path.join(self.workdir, 'rss.db')
|
||||||
self.url = url
|
self.url = url
|
||||||
self.title = title
|
self.title = title
|
||||||
self.max_entries = max_entries
|
self.max_entries = max_entries
|
||||||
self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link
|
|
||||||
|
# If true, then the http.webpage plugin will be used to parse the content
|
||||||
|
self.extract_content = extract_content
|
||||||
|
|
||||||
self.digest_format = digest_format.lower() if digest_format else None # Supported formats: html, pdf
|
self.digest_format = digest_format.lower() if digest_format else None # Supported formats: html, pdf
|
||||||
|
|
||||||
os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True)
|
os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True)
|
||||||
|
|
||||||
if headers is None: headers = {}
|
if headers is None:
|
||||||
|
headers = {}
|
||||||
headers['User-Agent'] = self.user_agent
|
headers['User-Agent'] = self.user_agent
|
||||||
|
|
||||||
request_args = {
|
request_args = {
|
||||||
|
@ -58,52 +62,21 @@ class RssUpdates(HttpRequest):
|
||||||
session.commit()
|
session.commit()
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def _get_latest_update(self, session, source_id):
|
def _get_latest_update(session, source_id):
|
||||||
return session.query(func.max(FeedEntry.published)).filter_by(source_id=source_id).scalar()
|
return session.query(func.max(FeedEntry.published)).filter_by(source_id=source_id).scalar()
|
||||||
|
|
||||||
|
|
||||||
def _parse_entry_content(self, link):
|
def _parse_entry_content(self, link):
|
||||||
response = None
|
parser = get_plugin('http.webpage')
|
||||||
err = None
|
response = parser.simplify(link).output
|
||||||
n_tries = 5
|
errors = parser.simplify(link).errors
|
||||||
|
|
||||||
for _ in range(0, n_tries):
|
if not response:
|
||||||
try:
|
self.logger.warning('Mercury parser error: '.format(errors or '[unknown error]'))
|
||||||
self.logger.info('Parsing content for {}'.format(link))
|
|
||||||
response = requests.get('https://mercury.postlight.com/parser',
|
|
||||||
params = {'url': link},
|
|
||||||
headers = {'x-api-key': self.mercury_api_key })
|
|
||||||
except Exception as e:
|
|
||||||
err = e
|
|
||||||
|
|
||||||
if response.text:
|
|
||||||
err = None
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if err:
|
|
||||||
raise err
|
|
||||||
|
|
||||||
if not response.text:
|
|
||||||
self.logger.warning('No response from Mercury API for URL {} after {} tries'.format(link, n_tries))
|
|
||||||
return
|
|
||||||
|
|
||||||
if not response.ok:
|
|
||||||
self.logger.warning('Mercury API call failed with status {}'.format(response.status_code))
|
|
||||||
return
|
|
||||||
|
|
||||||
response = response.json()
|
|
||||||
error = response.get('error')
|
|
||||||
|
|
||||||
if error:
|
|
||||||
self.logger.warning('Mercury API error: {}'.format(error))
|
|
||||||
return
|
return
|
||||||
|
|
||||||
return response.get('content')
|
return response.get('content')
|
||||||
|
|
||||||
|
|
||||||
def get_new_items(self, response):
|
def get_new_items(self, response):
|
||||||
engine = create_engine('sqlite:///{}'.format(self.dbfile),
|
engine = create_engine('sqlite:///{}'.format(self.dbfile),
|
||||||
connect_args={'check_same_thread': False})
|
connect_args={'check_same_thread': False})
|
||||||
|
@ -128,8 +101,7 @@ class RssUpdates(HttpRequest):
|
||||||
<h1 style="margin-top: 30px">{}</h1>
|
<h1 style="margin-top: 30px">{}</h1>
|
||||||
<h2 style="margin-top: 10px; page-break-after: always">
|
<h2 style="margin-top: 10px; page-break-after: always">
|
||||||
Feeds digest generated on {} </h2>'''.format(self.title,
|
Feeds digest generated on {} </h2>'''.format(self.title,
|
||||||
datetime.datetime.now().strftime('%d %B %Y, %H:%M')
|
datetime.datetime.now().strftime('%d %B %Y, %H:%M'))
|
||||||
)
|
|
||||||
|
|
||||||
self.logger.info('Parsed {:d} items from RSS feed <{}>'
|
self.logger.info('Parsed {:d} items from RSS feed <{}>'
|
||||||
.format(len(feed.entries), self.url))
|
.format(len(feed.entries), self.url))
|
||||||
|
@ -146,7 +118,7 @@ class RssUpdates(HttpRequest):
|
||||||
self.logger.info('Processed new item from RSS feed <{}>'.format(self.url))
|
self.logger.info('Processed new item from RSS feed <{}>'.format(self.url))
|
||||||
entry.summary = entry.summary if hasattr(entry, 'summary') else None
|
entry.summary = entry.summary if hasattr(entry, 'summary') else None
|
||||||
|
|
||||||
if self.mercury_api_key:
|
if self.extract_content:
|
||||||
entry.content = self._parse_entry_content(entry.link)
|
entry.content = self._parse_entry_content(entry.link)
|
||||||
elif hasattr(entry, 'summary'):
|
elif hasattr(entry, 'summary'):
|
||||||
entry.content = entry.summary
|
entry.content = entry.summary
|
||||||
|
@ -168,7 +140,8 @@ class RssUpdates(HttpRequest):
|
||||||
|
|
||||||
entries.append(e)
|
entries.append(e)
|
||||||
session.add(FeedEntry(**e))
|
session.add(FeedEntry(**e))
|
||||||
if self.max_entries and len(entries) > self.max_entries: break
|
if self.max_entries and len(entries) > self.max_entries:
|
||||||
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning('Exception encountered while parsing RSS ' +
|
self.logger.warning('Exception encountered while parsing RSS ' +
|
||||||
'RSS feed {}: {}'.format(self.url, str(e)))
|
'RSS feed {}: {}'.format(self.url, str(e)))
|
||||||
|
@ -259,6 +232,4 @@ class FeedDigest(Base):
|
||||||
filename = Column(String, nullable=False)
|
filename = Column(String, nullable=False)
|
||||||
created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)
|
created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
# vim:sw=4:ts=4:et:
|
# vim:sw=4:ts=4:et:
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from platypush.message.event import Event
|
from platypush.message.event import Event
|
||||||
|
|
||||||
|
|
||||||
class HttpEvent(Event):
|
class HttpEvent(Event):
|
||||||
"""
|
"""
|
||||||
Event triggered upon HTTP request/response cycle completion
|
Event triggered upon HTTP request/response cycle completion
|
||||||
|
@ -11,7 +12,7 @@ class HttpEvent(Event):
|
||||||
:type request: dict
|
:type request: dict
|
||||||
|
|
||||||
:param response: The server response
|
:param response: The server response
|
||||||
:type response: dict
|
:type response: dict or list
|
||||||
"""
|
"""
|
||||||
|
|
||||||
super().__init__(request=request, response=response, *args, **kwargs)
|
super().__init__(request=request, response=response, *args, **kwargs)
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
from platypush.message.event.http import HttpEvent
|
from platypush.message.event.http import HttpEvent
|
||||||
|
|
||||||
|
|
||||||
class NewFeedEvent(HttpEvent):
|
class NewFeedEvent(HttpEvent):
|
||||||
"""
|
"""
|
||||||
Event triggered when a monitored RSS feed has some new content
|
Event triggered when a monitored RSS feed has some new content
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, request, response, source_id=None, title=None,
|
def __init__(self, request, response: list, source_id=None, title=None,
|
||||||
digest_format=None, digest_filename=None, *args, **kwargs):
|
digest_format=None, digest_filename=None, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
:param request: Original request
|
:param request: Original request
|
||||||
|
@ -22,4 +23,3 @@ class NewFeedEvent(HttpEvent):
|
||||||
|
|
||||||
|
|
||||||
# vim:sw=4:ts=4:et:
|
# vim:sw=4:ts=4:et:
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import requests
|
import subprocess
|
||||||
import time
|
|
||||||
|
|
||||||
from platypush.plugins import action
|
from platypush.plugins import action
|
||||||
from platypush.plugins.http.request import Plugin
|
from platypush.plugins.http.request import Plugin
|
||||||
|
@ -8,22 +9,19 @@ from platypush.plugins.http.request import Plugin
|
||||||
|
|
||||||
class HttpWebpagePlugin(Plugin):
|
class HttpWebpagePlugin(Plugin):
|
||||||
"""
|
"""
|
||||||
Plugin to handle and parse/simplify web pages
|
Plugin to handle and parse/simplify web pages.
|
||||||
|
It used to use the Mercury Reader web API, but now that the API is discontinued this plugin is basically a
|
||||||
|
wrapper around the `mercury-parser <https://github.com/postlight/mercury-parser>`_ JavaScript library.
|
||||||
|
|
||||||
Requires:
|
Requires:
|
||||||
|
|
||||||
* **requests** (``pip install requests``)
|
* **requests** (``pip install requests``)
|
||||||
* **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
|
* **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
|
||||||
|
* **node** and **npm** installed on your system (to use the mercury-parser interface)
|
||||||
|
* The mercury-parser library installed (``npm install @postlight/mercury-parser``)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, mercury_api_key=None, **kwargs):
|
_mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
|
||||||
"""
|
|
||||||
:param mercury_api_key: If set then Mercury will be used to parse web pages content
|
|
||||||
:type mercury_api_key: str
|
|
||||||
"""
|
|
||||||
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self.mercury_api_key = mercury_api_key
|
|
||||||
|
|
||||||
@action
|
@action
|
||||||
def simplify(self, url, outfile=None):
|
def simplify(self, url, outfile=None):
|
||||||
|
@ -56,24 +54,17 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not self.mercury_api_key:
|
|
||||||
raise RuntimeError("mercury_api_key not set")
|
|
||||||
|
|
||||||
self.logger.info('Parsing URL {}'.format(url))
|
self.logger.info('Parsing URL {}'.format(url))
|
||||||
response = requests.get('https://mercury.postlight.com/parser',
|
parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE)
|
||||||
params={'url': url},
|
response = json.loads(parser.stdout.read().decode())
|
||||||
headers={'x-api-key': self.mercury_api_key})
|
|
||||||
|
|
||||||
if not response or not response.ok:
|
self.logger.info('Got response from Mercury API: {}'.format(response))
|
||||||
raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))
|
title = response.get('title', '{} on {}'.format(
|
||||||
|
'Published' if response.get('date_published') else 'Generated',
|
||||||
|
response.get('date_published', datetime.datetime.now().isoformat())))
|
||||||
|
|
||||||
if not len(response.text):
|
|
||||||
raise RuntimeError("Empty response from Mercury API for URL {}".format(url))
|
|
||||||
|
|
||||||
self.logger.info('Got response from Mercury API: {}'.format(response.json()))
|
|
||||||
title = response.json().get('title', 'No_title_{}'.format(int(time.time())))
|
|
||||||
content = '<body style="{body_style}"><h1>{title}</h1>{content}</body>'.\
|
content = '<body style="{body_style}"><h1>{title}</h1>{content}</body>'.\
|
||||||
format(title=title, content=response.json()['content'],
|
format(title=title, content=response.get('content', '[No content available]'),
|
||||||
body_style='font-size: 22px; font-family: Tahoma, Geneva, sans-serif')
|
body_style='font-size: 22px; font-family: Tahoma, Geneva, sans-serif')
|
||||||
|
|
||||||
if not outfile:
|
if not outfile:
|
||||||
|
@ -85,7 +76,7 @@ class HttpWebpagePlugin(Plugin):
|
||||||
|
|
||||||
outfile = os.path.abspath(os.path.expanduser(outfile))
|
outfile = os.path.abspath(os.path.expanduser(outfile))
|
||||||
|
|
||||||
if outfile.endswith('.pdf'):
|
if outfile.lower().endswith('.pdf'):
|
||||||
import weasyprint
|
import weasyprint
|
||||||
weasyprint.HTML(string=content).write_pdf(outfile)
|
weasyprint.HTML(string=content).write_pdf(outfile)
|
||||||
else:
|
else:
|
20
platypush/plugins/http/webpage/mercury-parser.js
Executable file
20
platypush/plugins/http/webpage/mercury-parser.js
Executable file
|
@ -0,0 +1,20 @@
|
||||||
|
#!/usr/bin/env node

// This script parses the content and title of a web page using the
// mercury-parser JavaScript library (https://github.com/postlight/mercury-parser)
// and prints a JSON object with the extracted information to stdout.
//
// Usage: mercury-parser.js <url to parse>
//
// Exit status is 0 on success and 1 on a usage or parsing error. Errors are
// written to stderr so that stdout only ever carries the JSON result.

'use strict';

const parser = require('@postlight/mercury-parser');

if (process.argv.length < 3) {
  console.error(`Usage: ${process.argv[1]} <url to parse>`);
  process.exit(1);
}

const url = process.argv[2];

parser
  .parse(url)
  .then((result) => {
    console.log(JSON.stringify(result));
  })
  .catch((err) => {
    // Report the failure explicitly instead of leaving an unhandled rejection,
    // so the calling process gets a diagnostic and a non-zero exit status.
    const reason = err && err.message ? err.message : String(err);
    console.error(`Unable to parse ${url}: ${reason}`);
    // Set exitCode rather than calling process.exit() so stderr can flush.
    process.exitCode = 1;
  });
|
||||||
|
|
Loading…
Reference in a new issue