diff --git a/platypush/backend/http/request/rss/__init__.py b/platypush/backend/http/request/rss/__init__.py
index 515f32a7d..412e28928 100644
--- a/platypush/backend/http/request/rss/__init__.py
+++ b/platypush/backend/http/request/rss/__init__.py
@@ -1,4 +1,5 @@
import datetime
+import enum
import feedparser
import logging
import os
@@ -6,7 +7,7 @@ import requests
import time
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
- UniqueConstraint, ForeignKey
+ Enum, UniqueConstraint, ForeignKey
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
@@ -19,18 +20,21 @@ Base = declarative_base()
Session = scoped_session(sessionmaker())
-class GetRssUpdates(HttpRequest):
+class RssUpdates(HttpRequest):
""" Gets new items in an RSS feed """
- dbfile = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds', 'rss.db')
-
- def __init__(self, url, headers=None, params=None, dbfile=None,
- mercury_api_key=None, *args, **kwargs):
+ workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')  # holds the feeds database and the digest cache directory
+ dbfile = os.path.join(workdir, 'rss.db')
+ user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'  # sent as the User-Agent header of every feed request
+ def __init__(self, url, title=None, headers=None, params=None, max_entries=None,
+ mercury_api_key=None, digest_format='html', *args, **kwargs):
self.url = url
+ self.title = title  # optional override; when falsy the title advertised by the feed is used
+ self.max_entries = max_entries  # limit on new entries processed per poll; a falsy value disables the limit
self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link
+ self.digest_format = digest_format.lower() if digest_format else 'html'  # normalised to lower case; falsy values fall back to 'html'
- if dbfile: self.dbfile = dbfile
os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True)
self.engine = create_engine('sqlite:///{}'.format(self.dbfile))
@@ -38,10 +42,13 @@ class GetRssUpdates(HttpRequest):
Session.configure(bind=self.engine)
self._get_or_create_source(session=Session())
+ headers = dict(headers or {})  # work on a copy: the caller's dict must not be mutated
+ headers['User-Agent'] = self.user_agent  # always request the feed with the class-level User-Agent
+
request_args = {
'method': 'get',
'url': self.url,
- 'headers': headers or {},
+ 'headers': headers,
'params': params or {},
}
@@ -79,22 +86,36 @@ class GetRssUpdates(HttpRequest):
parse_start_time = datetime.datetime.utcnow()
entries = []
- if source_record.title != feed.feed['title']:
- source_record.title = feed.feed['title']
+ if not self.title and 'title' in feed.feed:  # no explicit title configured: use the one advertised by the feed
+ self.title = feed.feed['title']
+ source_record.title = self.title  # keep the stored source title aligned with the title used for the digest
+
+ digest = u'''
+
{}
+
+ Feeds digest generated on {}
'''.format(self.title,
+ datetime.datetime.now().strftime('%d %B %Y, %H:%M')
+ )
for entry in feed.entries:
entry_timestamp = datetime.datetime(*entry.published_parsed[:6])
if source_record.last_updated_at is None \
or entry_timestamp > source_record.last_updated_at:
- entry.content = self._parse_entry_content(entry.link) \
- if self.mercury_api_key else None
+ if self.mercury_api_key:  # an API key is configured: derive the content from the linked page
+ entry.content = self._parse_entry_content(entry.link)
+ elif hasattr(entry, 'summary'):  # otherwise fall back to the summary shipped with the feed entry
+ entry.content = entry.summary
+ else:  # neither source is available
+ entry.content = None
+
+ digest += '{}
{}' \
+ .format(entry.title, entry.content)
e = {
'entry_id': entry.id,
'title': entry.title,
'link': entry.link,
- 'summary': entry.summary,
'content': entry.content,
'source_id': source_record.id,
'published': entry_timestamp,
@@ -102,18 +123,48 @@ class GetRssUpdates(HttpRequest):
entries.append(e)
session.add(FeedEntry(**e))
+ if self.max_entries and len(entries) >= self.max_entries: break  # stop once exactly max_entries entries were collected
source_record.last_updated_at = parse_start_time
- session.commit()
+ digest_filename = None
if entries:
logging.info('Parsed {} new entries from the RSS feed {}'.format(
- len(entries), source_record.title))
+ len(entries), self.title))
- return NewFeedEvent(dict(self), entries)
+ digest_filename = os.path.join(self.workdir, 'cache', '{}_{}.{}'.format(
+ datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
+ str(self.title).replace(os.sep, '_'), self.digest_format))  # the title is remote input: strip path separators so the file stays inside the cache directory
+
+ os.makedirs(os.path.dirname(digest_filename), exist_ok=True)
+
+ if self.digest_format == 'html':
+ with open(digest_filename, 'w', encoding='utf-8') as f:
+ f.write(digest)
+ elif self.digest_format == 'pdf':
+ import weasyprint
+ weasyprint.HTML(string=digest).write_pdf(digest_filename)
+ else:
+ raise RuntimeError(('Unsupported format: {}. Supported formats: ' +
+ 'html or pdf').format(self.digest_format))  # parenthesised so format() applies to the whole message
+
+ digest_entry = FeedDigest(source_id=source_record.id,
+ format=self.digest_format,
+ filename=digest_filename)
+
+ session.add(digest_entry)
+
+ session.commit()
+
+ return NewFeedEvent(request=dict(self), response=entries,
+ source_id=source_record.id, title=self.title,
+ digest_format=self.digest_format,
+ digest_filename=digest_filename)
class FeedSource(Base):
+ """ Models the FeedSource table, containing RSS sources to be parsed """
+
__tablename__ = 'FeedSource'
__table_args__ = ({ 'sqlite_autoincrement': True })
@@ -124,6 +175,8 @@ class FeedSource(Base):
class FeedEntry(Base):
+ """ Models the FeedEntry table, which contains RSS entries """
+
__tablename__ = 'FeedEntry'
__table_args__ = ({ 'sqlite_autoincrement': True })
@@ -132,10 +185,27 @@ class FeedEntry(Base):
source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
title = Column(String)
link = Column(String)
- summary = Column(String)
content = Column(String)
published = Column(DateTime)
+class FeedDigest(Base):
+ """ Models the FeedDigest table, containing feed digests either in HTML
+ or PDF format """
+
+ class DigestFormat(enum.Enum):  # member names mirror the digest_format strings ('html', 'pdf') handled by RssUpdates
+ html = 1
+ pdf = 2
+
+ __tablename__ = 'FeedDigest'
+ __table_args__ = ({ 'sqlite_autoincrement': True })
+
+ id = Column(Integer, primary_key=True)
+ source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)  # FeedSource row the digest was generated for
+ format = Column(Enum(DigestFormat), nullable=False)  # RssUpdates passes the plain string here - NOTE(review): confirm the Enum type accepts member names as strings
+ filename = Column(String, nullable=False)  # path of the generated digest file
+ created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)  # defaults to datetime.utcnow, i.e. a naive UTC timestamp
+
+
# vim:sw=4:ts=4:et:
diff --git a/platypush/message/event/http/rss.py b/platypush/message/event/http/rss.py
index ebb554964..93d80a1bd 100644
--- a/platypush/message/event/http/rss.py
+++ b/platypush/message/event/http/rss.py
@@ -1,8 +1,12 @@
from platypush.message.event.http import HttpEvent
class NewFeedEvent(HttpEvent):
- def __init__(self, request, response, *args, **kwargs):
- super().__init__(request=request, response=response, *args, **kwargs)
+ def __init__(self, request, response, source_id=None, title=None,
+ digest_format=None, digest_filename=None, *args, **kwargs):
+ """ Event carrying the entries parsed from an RSS feed; the digest_* arguments describe the generated digest file, if any """
+ super().__init__(request=request, response=response, source_id=source_id,
+ digest_format=digest_format, title=title,
+ digest_filename=digest_filename, *args, **kwargs)
# vim:sw=4:ts=4:et:
diff --git a/platypush/plugins/google/mail.py b/platypush/plugins/google/mail.py
index 1240912ad..59489a64a 100644
--- a/platypush/plugins/google/mail.py
+++ b/platypush/plugins/google/mail.py
@@ -1,6 +1,7 @@
import base64
import httplib2
import mimetypes
+import os
from apiclient import discovery
diff --git a/requirements.txt b/requirements.txt
index e4b08745e..5c4bfadd8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,6 +23,9 @@ python-dateutil
# RSS feeds support
feedparser
+# PDF generation support
+weasyprint
+
# Philips Hue plugin support
phue
diff --git a/setup.py b/setup.py
index b1a02c517..367f2b28e 100755
--- a/setup.py
+++ b/setup.py
@@ -67,6 +67,7 @@ setup(
'Support for HTTP poll backend': ['frozendict'],
'Support for database plugin': ['sqlalchemy'],
'Support for RSS feeds': ['feedparser'],
+ 'Support for PDF generation': ['weasyprint'],
'Support for Philips Hue plugin': ['phue'],
'Support for MPD/Mopidy music server plugin': ['python-mpd2'],
'Support for Belkin WeMo Switch plugin': ['ouimeaux'],