diff --git a/platypush/backend/http/request/rss/__init__.py b/platypush/backend/http/request/rss/__init__.py index 515f32a7d..412e28928 100644 --- a/platypush/backend/http/request/rss/__init__.py +++ b/platypush/backend/http/request/rss/__init__.py @@ -1,4 +1,5 @@ import datetime +import enum import feedparser import logging import os @@ -6,7 +7,7 @@ import requests import time from sqlalchemy import create_engine, Column, Integer, String, DateTime, \ - UniqueConstraint, ForeignKey + Enum, UniqueConstraint, ForeignKey from sqlalchemy.orm import sessionmaker, scoped_session from sqlalchemy.ext.declarative import declarative_base @@ -19,18 +20,21 @@ Base = declarative_base() Session = scoped_session(sessionmaker()) -class GetRssUpdates(HttpRequest): +class RssUpdates(HttpRequest): """ Gets new items in an RSS feed """ - dbfile = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds', 'rss.db') - - def __init__(self, url, headers=None, params=None, dbfile=None, - mercury_api_key=None, *args, **kwargs): + workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds') + dbfile = os.path.join(workdir, 'rss.db') + user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36' + def __init__(self, url, title=None, headers=None, params=None, max_entries=None, + mercury_api_key=None, digest_format='html', *args, **kwargs): self.url = url + self.title = title + self.max_entries = max_entries self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link + self.digest_format = digest_format.lower() if digest_format else 'html' - if dbfile: self.dbfile = dbfile os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True) self.engine = create_engine('sqlite:///{}'.format(self.dbfile)) @@ -38,10 +42,13 @@ class GetRssUpdates(HttpRequest): Session.configure(bind=self.engine) self._get_or_create_source(session=Session()) + headers = dict(headers or {}) 
+ headers['User-Agent'] = self.user_agent + request_args = { 'method': 'get', 'url': self.url, - 'headers': headers or {}, + 'headers': headers, 'params': params or {}, } @@ -79,22 +86,36 @@ class GetRssUpdates(HttpRequest): parse_start_time = datetime.datetime.utcnow() entries = [] - if source_record.title != feed.feed['title']: - source_record.title = feed.feed['title'] + if not self.title and 'title' in feed.feed: + self.title = feed.feed['title'] + source_record.title = self.title + + digest = u''' +

{}

+

+ Feeds digest generated on {}

'''.format(self.title, + datetime.datetime.now().strftime('%d %B %Y, %H:%M') + ) for entry in feed.entries: entry_timestamp = datetime.datetime(*entry.published_parsed[:6]) if source_record.last_updated_at is None \ or entry_timestamp > source_record.last_updated_at: - entry.content = self._parse_entry_content(entry.link) \ - if self.mercury_api_key else None + if self.mercury_api_key: + entry.content = self._parse_entry_content(entry.link) + elif hasattr(entry, 'summary'): + entry.content = entry.summary + else: + entry.content = None + + digest += '

{}

{}' \ + .format(entry.title, entry.content) e = { 'entry_id': entry.id, 'title': entry.title, 'link': entry.link, - 'summary': entry.summary, 'content': entry.content, 'source_id': source_record.id, 'published': entry_timestamp, @@ -102,18 +123,48 @@ class GetRssUpdates(HttpRequest): entries.append(e) session.add(FeedEntry(**e)) + if self.max_entries and len(entries) >= self.max_entries: break source_record.last_updated_at = parse_start_time - session.commit() + digest_filename = None if entries: logging.info('Parsed {} new entries from the RSS feed {}'.format( - len(entries), source_record.title)) + len(entries), self.title)) - return NewFeedEvent(dict(self), entries) + digest_filename = os.path.join(self.workdir, 'cache', '{}_{}.{}'.format( + datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), + self.title, self.digest_format)) + + os.makedirs(os.path.dirname(digest_filename), exist_ok=True) + + if self.digest_format == 'html': + with open(digest_filename, 'w', encoding='utf-8') as f: + f.write(digest) + elif self.digest_format == 'pdf': + import weasyprint + weasyprint.HTML(string=digest).write_pdf(digest_filename) + else: + raise RuntimeError(('Unsupported format: {}. 
Supported formats: ' + + 'html or pdf').format(self.digest_format)) + + digest_entry = FeedDigest(source_id=source_record.id, + format=self.digest_format, + filename=digest_filename) + + session.add(digest_entry) + + session.commit() + + return NewFeedEvent(request=dict(self), response=entries, + source_id=source_record.id, title=self.title, + digest_format=self.digest_format, + digest_filename=digest_filename) class FeedSource(Base): + """ Models the FeedSource table, containing RSS sources to be parsed """ + __tablename__ = 'FeedSource' __table_args__ = ({ 'sqlite_autoincrement': True }) @@ -124,6 +175,8 @@ class FeedSource(Base): class FeedEntry(Base): + """ Models the FeedEntry table, which contains RSS entries """ + __tablename__ = 'FeedEntry' __table_args__ = ({ 'sqlite_autoincrement': True }) @@ -132,10 +185,27 @@ class FeedEntry(Base): source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False) title = Column(String) link = Column(String) - summary = Column(String) content = Column(String) published = Column(DateTime) +class FeedDigest(Base): + """ Models the FeedDigest table, containing feed digests either in HTML + or PDF format """ + + class DigestFormat(enum.Enum): + html = 1 + pdf = 2 + + __tablename__ = 'FeedDigest' + __table_args__ = ({ 'sqlite_autoincrement': True }) + + id = Column(Integer, primary_key=True) + source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False) + format = Column(Enum(DigestFormat), nullable=False) + filename = Column(String, nullable=False) + created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow) + + # vim:sw=4:ts=4:et: diff --git a/platypush/message/event/http/rss.py b/platypush/message/event/http/rss.py index ebb554964..93d80a1bd 100644 --- a/platypush/message/event/http/rss.py +++ b/platypush/message/event/http/rss.py @@ -1,8 +1,12 @@ from platypush.message.event.http import HttpEvent class NewFeedEvent(HttpEvent): - def __init__(self, request, response, *args, **kwargs): 
- super().__init__(request=request, response=response, *args, **kwargs) + def __init__(self, request, response, source_id=None, title=None, + digest_format=None, digest_filename=None, *args, **kwargs): + + super().__init__(request=request, response=response, source_id=source_id, + digest_format=digest_format, title=title, + digest_filename=digest_filename, *args, **kwargs) # vim:sw=4:ts=4:et: diff --git a/platypush/plugins/google/mail.py b/platypush/plugins/google/mail.py index 1240912ad..59489a64a 100644 --- a/platypush/plugins/google/mail.py +++ b/platypush/plugins/google/mail.py @@ -1,6 +1,7 @@ import base64 import httplib2 import mimetypes +import os from apiclient import discovery diff --git a/requirements.txt b/requirements.txt index e4b08745e..5c4bfadd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,9 @@ python-dateutil # RSS feeds support feedparser +# PDF generation support +weasyprint + # Philips Hue plugin support phue diff --git a/setup.py b/setup.py index b1a02c517..367f2b28e 100755 --- a/setup.py +++ b/setup.py @@ -67,6 +67,7 @@ setup( 'Support for HTTP poll backend': ['frozendict'], 'Support for database plugin': ['sqlalchemy'], 'Support for RSS feeds': ['feedparser'], + 'Support for PDF generation': ['weasyprint'], 'Support for Philips Hue plugin': ['phue'], 'Support for MPD/Mopidy music server plugin': ['python-mpd2'], 'Support for Belkin WeMo Switch plugin': ['ouimeaux'],