Support for RSS digest generation in either HTML or PDF format
This commit is contained in:
parent
91dd975413
commit
55eb689121
5 changed files with 98 additions and 19 deletions
|
@ -1,4 +1,5 @@
|
|||
import datetime
|
||||
import enum
|
||||
import feedparser
|
||||
import logging
|
||||
import os
|
||||
|
@ -6,7 +7,7 @@ import requests
|
|||
import time
|
||||
|
||||
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
|
||||
UniqueConstraint, ForeignKey
|
||||
Enum, UniqueConstraint, ForeignKey
|
||||
|
||||
from sqlalchemy.orm import sessionmaker, scoped_session
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
@ -19,18 +20,21 @@ Base = declarative_base()
|
|||
Session = scoped_session(sessionmaker())
|
||||
|
||||
|
||||
class GetRssUpdates(HttpRequest):
|
||||
class RssUpdates(HttpRequest):
|
||||
""" Gets new items in an RSS feed """
|
||||
|
||||
dbfile = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds', 'rss.db')
|
||||
|
||||
def __init__(self, url, headers=None, params=None, dbfile=None,
|
||||
mercury_api_key=None, *args, **kwargs):
|
||||
workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')
|
||||
dbfile = os.path.join(workdir, 'rss.db')
|
||||
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
|
||||
|
||||
def __init__(self, url, title=None, headers=None, params=None, max_entries=None,
|
||||
mercury_api_key=None, digest_format='html', *args, **kwargs):
|
||||
self.url = url
|
||||
self.title = title
|
||||
self.max_entries = max_entries
|
||||
self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link
|
||||
self.digest_format = digest_format.lower() if digest_format else 'html'
|
||||
|
||||
if dbfile: self.dbfile = dbfile
|
||||
os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True)
|
||||
|
||||
self.engine = create_engine('sqlite:///{}'.format(self.dbfile))
|
||||
|
@ -38,10 +42,13 @@ class GetRssUpdates(HttpRequest):
|
|||
Session.configure(bind=self.engine)
|
||||
self._get_or_create_source(session=Session())
|
||||
|
||||
if headers is None: headers = {}
|
||||
headers['User-Agent'] = self.user_agent
|
||||
|
||||
request_args = {
|
||||
'method': 'get',
|
||||
'url': self.url,
|
||||
'headers': headers or {},
|
||||
'headers': headers,
|
||||
'params': params or {},
|
||||
}
|
||||
|
||||
|
@ -79,22 +86,36 @@ class GetRssUpdates(HttpRequest):
|
|||
parse_start_time = datetime.datetime.utcnow()
|
||||
entries = []
|
||||
|
||||
if source_record.title != feed.feed['title']:
|
||||
source_record.title = feed.feed['title']
|
||||
if not self.title and 'title' in feed.feed:
|
||||
self.title = feed.feed['title']
|
||||
source_record.title = self.title
|
||||
|
||||
digest = u'''
|
||||
<h1 style="margin-top: 30px">{}</h1>
|
||||
<h2 style="margin-top: 10px; page-break-after: always">
|
||||
Feeds digest generated on {} </h2>'''.format(self.title,
|
||||
datetime.datetime.now().strftime('%d %B %Y, %H:%M')
|
||||
)
|
||||
|
||||
for entry in feed.entries:
|
||||
entry_timestamp = datetime.datetime(*entry.published_parsed[:6])
|
||||
|
||||
if source_record.last_updated_at is None \
|
||||
or entry_timestamp > source_record.last_updated_at:
|
||||
entry.content = self._parse_entry_content(entry.link) \
|
||||
if self.mercury_api_key else None
|
||||
if self.mercury_api_key:
|
||||
entry.content = self._parse_entry_content(entry.link)
|
||||
elif hasattr(entry, 'summary'):
|
||||
entry.content = entry.summary
|
||||
else:
|
||||
entry.content = None
|
||||
|
||||
digest += '<h1 style="page-break-before: always">{}</h1>{}' \
|
||||
.format(entry.title, entry.content)
|
||||
|
||||
e = {
|
||||
'entry_id': entry.id,
|
||||
'title': entry.title,
|
||||
'link': entry.link,
|
||||
'summary': entry.summary,
|
||||
'content': entry.content,
|
||||
'source_id': source_record.id,
|
||||
'published': entry_timestamp,
|
||||
|
@ -102,18 +123,48 @@ class GetRssUpdates(HttpRequest):
|
|||
|
||||
entries.append(e)
|
||||
session.add(FeedEntry(**e))
|
||||
if self.max_entries and len(entries) > self.max_entries: break
|
||||
|
||||
source_record.last_updated_at = parse_start_time
|
||||
session.commit()
|
||||
digest_filename = None
|
||||
|
||||
if entries:
|
||||
logging.info('Parsed {} new entries from the RSS feed {}'.format(
|
||||
len(entries), source_record.title))
|
||||
len(entries), self.title))
|
||||
|
||||
return NewFeedEvent(dict(self), entries)
|
||||
digest_filename = os.path.join(self.workdir, 'cache', '{}_{}.{}'.format(
|
||||
datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
|
||||
self.title, self.digest_format))
|
||||
|
||||
os.makedirs(os.path.dirname(digest_filename), exist_ok=True)
|
||||
|
||||
if self.digest_format == 'html':
|
||||
with open(digest_filename, 'w', encoding='utf-8') as f:
|
||||
f.write(digest)
|
||||
elif self.digest_format == 'pdf':
|
||||
import weasyprint
|
||||
weasyprint.HTML(string=digest).write_pdf(digest_filename)
|
||||
else:
|
||||
raise RuntimeError('Unsupported format: {}. Supported formats: ' +
|
||||
'html or pdf'.format(self.digest_format))
|
||||
|
||||
digest_entry = FeedDigest(source_id=source_record.id,
|
||||
format=self.digest_format,
|
||||
filename=digest_filename)
|
||||
|
||||
session.add(digest_entry)
|
||||
|
||||
session.commit()
|
||||
|
||||
return NewFeedEvent(request=dict(self), response=entries,
|
||||
source_id=source_record.id, title=self.title,
|
||||
digest_format=self.digest_format,
|
||||
digest_filename=digest_filename)
|
||||
|
||||
|
||||
class FeedSource(Base):
|
||||
""" Models the FeedSource table, containing RSS sources to be parsed """
|
||||
|
||||
__tablename__ = 'FeedSource'
|
||||
__table_args__ = ({ 'sqlite_autoincrement': True })
|
||||
|
||||
|
@ -124,6 +175,8 @@ class FeedSource(Base):
|
|||
|
||||
|
||||
class FeedEntry(Base):
|
||||
""" Models the FeedEntry table, which contains RSS entries """
|
||||
|
||||
__tablename__ = 'FeedEntry'
|
||||
__table_args__ = ({ 'sqlite_autoincrement': True })
|
||||
|
||||
|
@ -132,10 +185,27 @@ class FeedEntry(Base):
|
|||
source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
|
||||
title = Column(String)
|
||||
link = Column(String)
|
||||
summary = Column(String)
|
||||
content = Column(String)
|
||||
published = Column(DateTime)
|
||||
|
||||
|
||||
class FeedDigest(Base):
    """ Models the FeedDigest table, which holds the feed digests rendered
        either as HTML or as PDF """

    class DigestFormat(enum.Enum):
        # Formats accepted by the `format` column below
        html = 1
        pdf = 2

    __tablename__ = 'FeedDigest'
    __table_args__ = {'sqlite_autoincrement': True}

    # Primary key of the digest row
    id = Column(Integer, primary_key=True)

    # Feed the digest belongs to (foreign key on FeedSource.id)
    source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)

    # Digest format, constrained to the DigestFormat members
    format = Column(Enum(DigestFormat), nullable=False)

    # Name of the file the digest is stored in
    filename = Column(String, nullable=False)

    # Creation timestamp; defaults to datetime.datetime.utcnow
    created_at = Column(
        DateTime,
        nullable=False,
        default=datetime.datetime.utcnow,
    )
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
||||
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
from platypush.message.event.http import HttpEvent
|
||||
|
||||
class NewFeedEvent(HttpEvent):
    """ HTTP event that carries the outcome of a feed poll together with
        the details of the digest built from it """

    def __init__(self, request, response, source_id=None, title=None,
                 digest_format=None, digest_filename=None, *args, **kwargs):
        """
        All the named arguments are forwarded as keyword arguments to the
        HttpEvent constructor, together with any extra *args and **kwargs.

        :param request: Request that originated the event
        :param response: Response associated to the request
        :param source_id: ID of the feed source, if any
        :param title: Title of the feed, if any
        :param digest_format: Format of the generated digest, if any
        :param digest_filename: File name of the generated digest, if any
        """

        # The earlier two-argument constructor was shadowed by this one
        # and never ran, so only this definition is kept.
        super().__init__(request=request, response=response, source_id=source_id,
                         digest_format=digest_format, title=title,
                         digest_filename=digest_filename, *args, **kwargs)
|
||||
|
||||
|
||||
# vim:sw=4:ts=4:et:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import base64
|
||||
import httplib2
|
||||
import mimetypes
|
||||
import os
|
||||
|
||||
from apiclient import discovery
|
||||
|
||||
|
|
|
@ -23,6 +23,9 @@ python-dateutil
|
|||
# RSS feeds support
|
||||
feedparser
|
||||
|
||||
# PDF generation support
|
||||
weasyprint
|
||||
|
||||
# Philips Hue plugin support
|
||||
phue
|
||||
|
||||
|
|
1
setup.py
1
setup.py
|
@ -67,6 +67,7 @@ setup(
|
|||
'Support for HTTP poll backend': ['frozendict'],
|
||||
'Support for database plugin': ['sqlalchemy'],
|
||||
'Support for RSS feeds': ['feedparser'],
|
||||
'Support for PDF generation': ['weasyprint'],
|
||||
'Support for Philips Hue plugin': ['phue'],
|
||||
'Support for MPD/Mopidy music server plugin': ['python-mpd2'],
|
||||
'Support for Belkin WeMo Switch plugin': ['ouimeaux'],
|
||||
|
|
Loading…
Add table
Reference in a new issue