diff --git a/platypush/backend/http/request/rss/__init__.py b/platypush/backend/http/request/rss/__init__.py
index 39062283..66f61556 100644
--- a/platypush/backend/http/request/rss/__init__.py
+++ b/platypush/backend/http/request/rss/__init__.py
@@ -2,11 +2,9 @@ import datetime
import enum
import feedparser
import os
-import requests
-import time
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
- Enum, UniqueConstraint, ForeignKey
+ Enum, ForeignKey
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
@@ -14,6 +12,7 @@ from sqlalchemy.sql.expression import func
from platypush.backend.http.request import HttpRequest
from platypush.config import Config
+from platypush.context import get_plugin
from platypush.message.event.http.rss import NewFeedEvent
Base = declarative_base()
@@ -23,21 +22,26 @@ Session = scoped_session(sessionmaker())
class RssUpdates(HttpRequest):
""" Gets new items in an RSS feed """
- user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
+ user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' + \
+ 'Chrome/62.0.3202.94 Safari/537.36'
def __init__(self, url, title=None, headers=None, params=None, max_entries=None,
- mercury_api_key=None, digest_format=None, *argv, **kwargs):
+ extract_content=None, digest_format=None, *argv, **kwargs):
self.workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')
self.dbfile = os.path.join(self.workdir, 'rss.db')
self.url = url
self.title = title
self.max_entries = max_entries
- self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link
+
+ # If true, then the http.webpage plugin will be used to parse the content
+ self.extract_content = extract_content
+
self.digest_format = digest_format.lower() if digest_format else None # Supported formats: html, pdf
os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True)
- if headers is None: headers = {}
+ if headers is None:
+ headers = {}
headers['User-Agent'] = self.user_agent
request_args = {
@@ -58,55 +62,24 @@ class RssUpdates(HttpRequest):
session.commit()
return record
-
- def _get_latest_update(self, session, source_id):
+ @staticmethod
+ def _get_latest_update(session, source_id):
return session.query(func.max(FeedEntry.published)).filter_by(source_id=source_id).scalar()
-
def _parse_entry_content(self, link):
- response = None
- err = None
- n_tries = 5
+        result = get_plugin('http.webpage').simplify(link)
+        response = result.output
+        errors = result.errors
- for _ in range(0, n_tries):
- try:
- self.logger.info('Parsing content for {}'.format(link))
- response = requests.get('https://mercury.postlight.com/parser',
- params = {'url': link},
- headers = {'x-api-key': self.mercury_api_key })
- except Exception as e:
- err = e
-
- if response.text:
- err = None
- break
- else:
- time.sleep(1)
-
- if err:
- raise err
-
- if not response.text:
- self.logger.warning('No response from Mercury API for URL {} after {} tries'.format(link, n_tries))
- return
-
- if not response.ok:
- self.logger.warning('Mercury API call failed with status {}'.format(response.status_code))
- return
-
- response = response.json()
- error = response.get('error')
-
- if error:
- self.logger.warning('Mercury API error: {}'.format(error))
+ if not response:
+            self.logger.warning('Mercury parser error: {}'.format(errors or '[unknown error]'))
return
return response.get('content')
-
def get_new_items(self, response):
engine = create_engine('sqlite:///{}'.format(self.dbfile),
- connect_args = { 'check_same_thread': False })
+ connect_args={'check_same_thread': False})
Base.metadata.create_all(engine)
Session.configure(bind=engine)
@@ -128,11 +101,10 @@ class RssUpdates(HttpRequest):
{}
Feeds digest generated on {}
'''.format(self.title,
- datetime.datetime.now().strftime('%d %B %Y, %H:%M')
- )
+ datetime.datetime.now().strftime('%d %B %Y, %H:%M'))
self.logger.info('Parsed {:d} items from RSS feed <{}>'
- .format(len(feed.entries), self.url))
+ .format(len(feed.entries), self.url))
for entry in feed.entries:
if not entry.published_parsed:
@@ -146,7 +118,7 @@ class RssUpdates(HttpRequest):
self.logger.info('Processed new item from RSS feed <{}>'.format(self.url))
entry.summary = entry.summary if hasattr(entry, 'summary') else None
- if self.mercury_api_key:
+ if self.extract_content:
entry.content = self._parse_entry_content(entry.link)
elif hasattr(entry, 'summary'):
entry.content = entry.summary
@@ -168,7 +140,8 @@ class RssUpdates(HttpRequest):
entries.append(e)
session.add(FeedEntry(**e))
- if self.max_entries and len(entries) > self.max_entries: break
+ if self.max_entries and len(entries) > self.max_entries:
+ break
except Exception as e:
self.logger.warning('Exception encountered while parsing RSS ' +
'RSS feed {}: {}'.format(self.url, str(e)))
@@ -196,11 +169,11 @@ class RssUpdates(HttpRequest):
weasyprint.HTML(string=digest).write_pdf(digest_filename)
else:
raise RuntimeError('Unsupported format: {}. Supported formats: ' +
- 'html or pdf'.format(self.digest_format))
+ 'html or pdf'.format(self.digest_format))
digest_entry = FeedDigest(source_id=source_record.id,
- format=self.digest_format,
- filename=digest_filename)
+ format=self.digest_format,
+ filename=digest_filename)
session.add(digest_entry)
self.logger.info('{} digest ready: {}'.format(self.digest_format, digest_filename))
@@ -218,7 +191,7 @@ class FeedSource(Base):
""" Models the FeedSource table, containing RSS sources to be parsed """
__tablename__ = 'FeedSource'
- __table_args__ = ({ 'sqlite_autoincrement': True })
+ __table_args__ = ({'sqlite_autoincrement': True})
id = Column(Integer, primary_key=True)
title = Column(String)
@@ -230,7 +203,7 @@ class FeedEntry(Base):
""" Models the FeedEntry table, which contains RSS entries """
__tablename__ = 'FeedEntry'
- __table_args__ = ({ 'sqlite_autoincrement': True })
+ __table_args__ = ({'sqlite_autoincrement': True})
id = Column(Integer, primary_key=True)
entry_id = Column(String)
@@ -251,7 +224,7 @@ class FeedDigest(Base):
pdf = 2
__tablename__ = 'FeedDigest'
- __table_args__ = ({ 'sqlite_autoincrement': True })
+ __table_args__ = ({'sqlite_autoincrement': True})
id = Column(Integer, primary_key=True)
source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
@@ -259,6 +232,4 @@ class FeedDigest(Base):
filename = Column(String, nullable=False)
created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)
-
# vim:sw=4:ts=4:et:
-
diff --git a/platypush/message/event/http/__init__.py b/platypush/message/event/http/__init__.py
index d8801512..513bb69a 100644
--- a/platypush/message/event/http/__init__.py
+++ b/platypush/message/event/http/__init__.py
@@ -1,5 +1,6 @@
from platypush.message.event import Event
+
class HttpEvent(Event):
"""
Event triggered upon HTTP request/response cycle completion
@@ -11,7 +12,7 @@ class HttpEvent(Event):
:type request: dict
:param response: The server response
- :type response: dict
+ :type response: dict or list
"""
super().__init__(request=request, response=response, *args, **kwargs)
diff --git a/platypush/message/event/http/rss.py b/platypush/message/event/http/rss.py
index c4bb92b5..aaf9fa52 100644
--- a/platypush/message/event/http/rss.py
+++ b/platypush/message/event/http/rss.py
@@ -1,11 +1,12 @@
from platypush.message.event.http import HttpEvent
+
class NewFeedEvent(HttpEvent):
"""
Event triggered when a monitored RSS feed has some new content
"""
- def __init__(self, request, response, source_id=None, title=None,
+ def __init__(self, request, response: list, source_id=None, title=None,
digest_format=None, digest_filename=None, *args, **kwargs):
"""
:param request: Original request
@@ -22,4 +23,3 @@ class NewFeedEvent(HttpEvent):
# vim:sw=4:ts=4:et:
-
diff --git a/platypush/plugins/http/webpage.py b/platypush/plugins/http/webpage/__init__.py
similarity index 63%
rename from platypush/plugins/http/webpage.py
rename to platypush/plugins/http/webpage/__init__.py
index 38fad1e5..0f230c00 100644
--- a/platypush/plugins/http/webpage.py
+++ b/platypush/plugins/http/webpage/__init__.py
@@ -1,6 +1,7 @@
+import datetime
+import json
import os
-import requests
-import time
+import subprocess
from platypush.plugins import action
from platypush.plugins.http.request import Plugin
@@ -8,22 +9,19 @@ from platypush.plugins.http.request import Plugin
class HttpWebpagePlugin(Plugin):
"""
- Plugin to handle and parse/simplify web pages
+ Plugin to handle and parse/simplify web pages.
+ It used to use the Mercury Reader web API, but now that the API is discontinued this plugin is basically a
+    wrapper around the `mercury-parser <https://github.com/postlight/mercury-parser>`_ JavaScript library.
Requires:
* **requests** (``pip install requests``)
* **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion
+ * **node** and **npm** installed on your system (to use the mercury-parser interface)
+ * The mercury-parser library installed (``npm install @postlight/mercury-parser``)
"""
- def __init__(self, mercury_api_key=None, **kwargs):
- """
- :param mercury_api_key: If set then Mercury will be used to parse web pages content
- :type mercury_api_key: str
- """
-
- super().__init__(**kwargs)
- self.mercury_api_key = mercury_api_key
+ _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js')
@action
def simplify(self, url, outfile=None):
@@ -56,24 +54,17 @@ class HttpWebpagePlugin(Plugin):
"""
- if not self.mercury_api_key:
- raise RuntimeError("mercury_api_key not set")
-
self.logger.info('Parsing URL {}'.format(url))
- response = requests.get('https://mercury.postlight.com/parser',
- params={'url': url},
- headers={'x-api-key': self.mercury_api_key})
+ parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE)
+ response = json.loads(parser.stdout.read().decode())
- if not response or not response.ok:
- raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason))
+ self.logger.info('Got response from Mercury API: {}'.format(response))
+ title = response.get('title', '{} on {}'.format(
+ 'Published' if response.get('date_published') else 'Generated',
+ response.get('date_published', datetime.datetime.now().isoformat())))
- if not len(response.text):
- raise RuntimeError("Empty response from Mercury API for URL {}".format(url))
-
- self.logger.info('Got response from Mercury API: {}'.format(response.json()))
- title = response.json().get('title', 'No_title_{}'.format(int(time.time())))
content = '{title}
{content}'.\
- format(title=title, content=response.json()['content'],
+ format(title=title, content=response.get('content', '[No content available]'),
body_style='font-size: 22px; font-family: Tahoma, Geneva, sans-serif')
if not outfile:
@@ -85,7 +76,7 @@ class HttpWebpagePlugin(Plugin):
outfile = os.path.abspath(os.path.expanduser(outfile))
- if outfile.endswith('.pdf'):
+ if outfile.lower().endswith('.pdf'):
import weasyprint
weasyprint.HTML(string=content).write_pdf(outfile)
else:
diff --git a/platypush/plugins/http/webpage/mercury-parser.js b/platypush/plugins/http/webpage/mercury-parser.js
new file mode 100755
index 00000000..286b2e25
--- /dev/null
+++ b/platypush/plugins/http/webpage/mercury-parser.js
@@ -0,0 +1,20 @@
+#!/usr/bin/env node
+
+// This script will parse the content and title of a webpage using the
+// mercury-parser JavaScript library (https://github.com/postlight/mercury-parser)
+// and print a JSON object with the extracted information.
+
+'use strict';
+
+const parser = require('@postlight/mercury-parser');
+
+if (process.argv.length < 3) {
+  console.error('Usage: ' + process.argv[1] + ' <url>');
+ process.exit(1);
+}
+
+const url = process.argv[2];
+parser.parse(url).then(result => {
+ console.log(JSON.stringify(result));
+});
+