diff --git a/platypush/backend/http/request/rss/__init__.py b/platypush/backend/http/request/rss/__init__.py index 39062283..66f61556 100644 --- a/platypush/backend/http/request/rss/__init__.py +++ b/platypush/backend/http/request/rss/__init__.py @@ -2,11 +2,9 @@ import datetime import enum import feedparser import os -import requests -import time from sqlalchemy import create_engine, Column, Integer, String, DateTime, \ - Enum, UniqueConstraint, ForeignKey + Enum, ForeignKey from sqlalchemy.orm import sessionmaker, scoped_session from sqlalchemy.ext.declarative import declarative_base @@ -14,6 +12,7 @@ from sqlalchemy.sql.expression import func from platypush.backend.http.request import HttpRequest from platypush.config import Config +from platypush.context import get_plugin from platypush.message.event.http.rss import NewFeedEvent Base = declarative_base() @@ -23,21 +22,26 @@ Session = scoped_session(sessionmaker()) class RssUpdates(HttpRequest): """ Gets new items in an RSS feed """ - user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36' + user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' + \ + 'Chrome/62.0.3202.94 Safari/537.36' def __init__(self, url, title=None, headers=None, params=None, max_entries=None, - mercury_api_key=None, digest_format=None, *argv, **kwargs): + extract_content=None, digest_format=None, *argv, **kwargs): self.workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds') self.dbfile = os.path.join(self.workdir, 'rss.db') self.url = url self.title = title self.max_entries = max_entries - self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link + + # If true, then the http.webpage plugin will be used to parse the content + self.extract_content = extract_content + self.digest_format = digest_format.lower() if digest_format else None # Supported formats: html, pdf 
os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True) - if headers is None: headers = {} + if headers is None: + headers = {} headers['User-Agent'] = self.user_agent request_args = { @@ -58,55 +62,24 @@ class RssUpdates(HttpRequest): session.commit() return record - - def _get_latest_update(self, session, source_id): + @staticmethod + def _get_latest_update(session, source_id): return session.query(func.max(FeedEntry.published)).filter_by(source_id=source_id).scalar() - def _parse_entry_content(self, link): - response = None - err = None - n_tries = 5 + parser = get_plugin('http.webpage') + result = parser.simplify(link) + response, errors = result.output, result.errors - for _ in range(0, n_tries): - try: - self.logger.info('Parsing content for {}'.format(link)) - response = requests.get('https://mercury.postlight.com/parser', - params = {'url': link}, - headers = {'x-api-key': self.mercury_api_key }) - except Exception as e: - err = e - - if response.text: - err = None - break - else: - time.sleep(1) - - if err: - raise err - - if not response.text: - self.logger.warning('No response from Mercury API for URL {} after {} tries'.format(link, n_tries)) - return - - if not response.ok: - self.logger.warning('Mercury API call failed with status {}'.format(response.status_code)) - return - - response = response.json() - error = response.get('error') - - if error: - self.logger.warning('Mercury API error: {}'.format(error)) + if not response: + self.logger.warning('Mercury parser error: {}'.format(errors or '[unknown error]')) return return response.get('content') - def get_new_items(self, response): engine = create_engine('sqlite:///{}'.format(self.dbfile), - connect_args = { 'check_same_thread': False }) + connect_args={'check_same_thread': False}) Base.metadata.create_all(engine) Session.configure(bind=engine) @@ -128,11 +101,10 @@ class RssUpdates(HttpRequest):

{}

Feeds digest generated on {}

'''.format(self.title, - datetime.datetime.now().strftime('%d %B %Y, %H:%M') - ) + datetime.datetime.now().strftime('%d %B %Y, %H:%M')) self.logger.info('Parsed {:d} items from RSS feed <{}>' - .format(len(feed.entries), self.url)) + .format(len(feed.entries), self.url)) for entry in feed.entries: if not entry.published_parsed: @@ -146,7 +118,7 @@ class RssUpdates(HttpRequest): self.logger.info('Processed new item from RSS feed <{}>'.format(self.url)) entry.summary = entry.summary if hasattr(entry, 'summary') else None - if self.mercury_api_key: + if self.extract_content: entry.content = self._parse_entry_content(entry.link) elif hasattr(entry, 'summary'): entry.content = entry.summary @@ -168,7 +140,8 @@ class RssUpdates(HttpRequest): entries.append(e) session.add(FeedEntry(**e)) - if self.max_entries and len(entries) > self.max_entries: break + if self.max_entries and len(entries) > self.max_entries: + break except Exception as e: self.logger.warning('Exception encountered while parsing RSS ' + 'RSS feed {}: {}'.format(self.url, str(e))) @@ -196,11 +169,11 @@ class RssUpdates(HttpRequest): weasyprint.HTML(string=digest).write_pdf(digest_filename) else: raise RuntimeError('Unsupported format: {}. 
Supported formats: ' + - 'html or pdf'.format(self.digest_format)) + 'html or pdf'.format(self.digest_format)) digest_entry = FeedDigest(source_id=source_record.id, - format=self.digest_format, - filename=digest_filename) + format=self.digest_format, + filename=digest_filename) session.add(digest_entry) self.logger.info('{} digest ready: {}'.format(self.digest_format, digest_filename)) @@ -218,7 +191,7 @@ class FeedSource(Base): """ Models the FeedSource table, containing RSS sources to be parsed """ __tablename__ = 'FeedSource' - __table_args__ = ({ 'sqlite_autoincrement': True }) + __table_args__ = ({'sqlite_autoincrement': True}) id = Column(Integer, primary_key=True) title = Column(String) @@ -230,7 +203,7 @@ class FeedEntry(Base): """ Models the FeedEntry table, which contains RSS entries """ __tablename__ = 'FeedEntry' - __table_args__ = ({ 'sqlite_autoincrement': True }) + __table_args__ = ({'sqlite_autoincrement': True}) id = Column(Integer, primary_key=True) entry_id = Column(String) @@ -251,7 +224,7 @@ class FeedDigest(Base): pdf = 2 __tablename__ = 'FeedDigest' - __table_args__ = ({ 'sqlite_autoincrement': True }) + __table_args__ = ({'sqlite_autoincrement': True}) id = Column(Integer, primary_key=True) source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False) @@ -259,6 +232,4 @@ class FeedDigest(Base): filename = Column(String, nullable=False) created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow) - # vim:sw=4:ts=4:et: - diff --git a/platypush/message/event/http/__init__.py b/platypush/message/event/http/__init__.py index d8801512..513bb69a 100644 --- a/platypush/message/event/http/__init__.py +++ b/platypush/message/event/http/__init__.py @@ -1,5 +1,6 @@ from platypush.message.event import Event + class HttpEvent(Event): """ Event triggered upon HTTP request/response cycle completion @@ -11,7 +12,7 @@ class HttpEvent(Event): :type request: dict :param response: The server response - :type response: dict + :type 
response: dict or list """ super().__init__(request=request, response=response, *args, **kwargs) diff --git a/platypush/message/event/http/rss.py b/platypush/message/event/http/rss.py index c4bb92b5..aaf9fa52 100644 --- a/platypush/message/event/http/rss.py +++ b/platypush/message/event/http/rss.py @@ -1,11 +1,12 @@ from platypush.message.event.http import HttpEvent + class NewFeedEvent(HttpEvent): """ Event triggered when a monitored RSS feed has some new content """ - def __init__(self, request, response, source_id=None, title=None, + def __init__(self, request, response: list, source_id=None, title=None, digest_format=None, digest_filename=None, *args, **kwargs): """ :param request: Original request @@ -22,4 +23,3 @@ class NewFeedEvent(HttpEvent): # vim:sw=4:ts=4:et: - diff --git a/platypush/plugins/http/webpage.py b/platypush/plugins/http/webpage/__init__.py similarity index 63% rename from platypush/plugins/http/webpage.py rename to platypush/plugins/http/webpage/__init__.py index 38fad1e5..0f230c00 100644 --- a/platypush/plugins/http/webpage.py +++ b/platypush/plugins/http/webpage/__init__.py @@ -1,6 +1,7 @@ +import datetime +import json import os -import requests -import time +import subprocess from platypush.plugins import action from platypush.plugins.http.request import Plugin @@ -8,22 +9,19 @@ from platypush.plugins.http.request import Plugin class HttpWebpagePlugin(Plugin): """ - Plugin to handle and parse/simplify web pages + Plugin to handle and parse/simplify web pages. + It used to use the Mercury Reader web API, but now that the API is discontinued this plugin is basically a + wrapper around the `mercury-parser `_ JavaScript library. 
Requires: * **requests** (``pip install requests``) * **weasyprint** (``pip install weasyprint``), optional, for HTML->PDF conversion + * **node** and **npm** installed on your system (to use the mercury-parser interface) + * The mercury-parser library installed (``npm install @postlight/mercury-parser``) """ - def __init__(self, mercury_api_key=None, **kwargs): - """ - :param mercury_api_key: If set then Mercury will be used to parse web pages content - :type mercury_api_key: str - """ - - super().__init__(**kwargs) - self.mercury_api_key = mercury_api_key + _mercury_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mercury-parser.js') @action def simplify(self, url, outfile=None): @@ -56,24 +54,17 @@ class HttpWebpagePlugin(Plugin): """ - if not self.mercury_api_key: - raise RuntimeError("mercury_api_key not set") - self.logger.info('Parsing URL {}'.format(url)) - response = requests.get('https://mercury.postlight.com/parser', - params={'url': url}, - headers={'x-api-key': self.mercury_api_key}) + parser = subprocess.Popen(['node', self._mercury_script, url], stdout=subprocess.PIPE) + response = json.loads(parser.stdout.read().decode()) - if not response or not response.ok: - raise RuntimeError("Unable to parse content for {}: {}".format(url, response.reason)) + self.logger.info('Got response from Mercury API: {}'.format(response)) + title = response.get('title', '{} on {}'.format( + 'Published' if response.get('date_published') else 'Generated', + response.get('date_published', datetime.datetime.now().isoformat()))) - if not len(response.text): - raise RuntimeError("Empty response from Mercury API for URL {}".format(url)) - - self.logger.info('Got response from Mercury API: {}'.format(response.json())) - title = response.json().get('title', 'No_title_{}'.format(int(time.time()))) content = '

{title}

{content}'.\ - format(title=title, content=response.json()['content'], + format(title=title, content=response.get('content', '[No content available]'), body_style='font-size: 22px; font-family: Tahoma, Geneva, sans-serif') if not outfile: @@ -85,7 +76,7 @@ class HttpWebpagePlugin(Plugin): outfile = os.path.abspath(os.path.expanduser(outfile)) - if outfile.endswith('.pdf'): + if outfile.lower().endswith('.pdf'): import weasyprint weasyprint.HTML(string=content).write_pdf(outfile) else: diff --git a/platypush/plugins/http/webpage/mercury-parser.js b/platypush/plugins/http/webpage/mercury-parser.js new file mode 100755 index 00000000..286b2e25 --- /dev/null +++ b/platypush/plugins/http/webpage/mercury-parser.js @@ -0,0 +1,20 @@ +#!/usr/bin/env node + +// This script will parse the content and title of a webpage using the +// mercury-parser JavaScript library (https://github.com/postlight/mercury-parser) +// and print a JSON object with the extracted information. + +'use strict'; + +const parser = require('@postlight/mercury-parser'); + +if (process.argv.length < 3) { + console.error('Usage: ' + process.argv[1] + ' <url>'); + process.exit(1); +} + +const url = process.argv[2]; +parser.parse(url).then(result => { + console.log(JSON.stringify(result)); +}); +