import datetime
import enum
import os
import time

import feedparser
import requests

from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
    Enum, UniqueConstraint, ForeignKey
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import func

from platypush.backend.http.request import HttpRequest
from platypush.config import Config
from platypush.message.event.http.rss import NewFeedEvent

Base = declarative_base()
Session = scoped_session(sessionmaker())


class RssUpdates(HttpRequest):
    """
    Polls an RSS feed for new items and triggers a ``NewFeedEvent`` with the
    new entries. If ``digest_format`` is set, the new items are also rendered
    into an HTML or PDF digest.
    """

    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36')
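
    # Hypothetical configuration sketch (illustrative, not verbatim from the
    # platypush docs): this request is normally wired up through the HTTP poll
    # backend in config.yaml, roughly like:
    #
    #   backend.http.poll:
    #       requests:
    #           - type: platypush.backend.http.request.rss.RssUpdates
    #             url: https://feeds.bbci.co.uk/news/rss.xml
    #             title: BBC News
    #             digest_format: pdf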

    def __init__(self, url, title=None, headers=None, params=None, max_entries=None,
                 mercury_api_key=None, digest_format=None, *argv, **kwargs):
        self.workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')
        self.dbfile = os.path.join(self.workdir, 'rss.db')
        self.url = url
        self.title = title
        self.max_entries = max_entries
        self.mercury_api_key = mercury_api_key  # Mercury Reader API key, used to extract the content of the links
        self.digest_format = digest_format.lower() if digest_format else None  # Supported formats: html, pdf

        os.makedirs(os.path.dirname(self.dbfile), exist_ok=True)

        if headers is None:
            headers = {}
        headers['User-Agent'] = self.user_agent

        request_args = {
            'method': 'get',
            'url': self.url,
            'headers': headers,
            'params': params or {},
        }

        super().__init__(skip_first_call=False, args=request_args, *argv, **kwargs)

    def _get_or_create_source(self, session):
        record = session.query(FeedSource).filter_by(url=self.url).first()
        if record is None:
            record = FeedSource(url=self.url, title=self.title)
            session.add(record)

        session.commit()
        return record

    def _get_latest_update(self, session, source_id):
        return session.query(func.max(FeedEntry.published)).\
            filter_by(source_id=source_id).scalar()

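    # For reference, the query in _get_latest_update() roughly compiles to the
    # following SQL (illustrative):
    #
    #   SELECT max("FeedEntry".published) FROM "FeedEntry" WHERE source_id = ?
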
    def _parse_entry_content(self, link):
        response = None
        err = None
        n_tries = 5

        for _ in range(n_tries):
            try:
                self.logger.info('Parsing content for {}'.format(link))
                response = requests.get('https://mercury.postlight.com/parser',
                                        params={'url': link},
                                        headers={'x-api-key': self.mercury_api_key})
            except Exception as e:
                err = e

            # Guard against a failed request: response may still be None here
            if response is not None and response.text:
                err = None
                break

            time.sleep(1)

        if err:
            raise err

        if response is None or not response.text:
            raise RuntimeError('No response from the Mercury API for the URL {} after {} tries'
                               .format(link, n_tries))

        return response.json().get('content') if response.ok else None

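    # For reference (illustrative only, the schema belongs to the API): a
    # successful response from the Mercury parser used above is a JSON object
    # whose 'content' attribute holds the extracted article body, e.g.:
    #
    #   {"title": "...", "content": "<div>...</div>", "url": "..."}
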
    def get_new_items(self, response):
        engine = create_engine('sqlite:///{}'.format(self.dbfile),
                               connect_args={'check_same_thread': False})

        Base.metadata.create_all(engine)
        Session.configure(bind=engine)

        feed = feedparser.parse(response.text)
        session = Session()
        source_record = self._get_or_create_source(session=session)
        session.add(source_record)
        parse_start_time = datetime.datetime.utcnow()
        entries = []
        latest_update = self._get_latest_update(session, source_record.id)

        if not self.title and 'title' in feed.feed:
            self.title = feed.feed['title']
            source_record.title = self.title

        digest = u'''
            <h1 style="margin-top: 30px">{}</h1>
            <h2 style="margin-top: 10px; page-break-after: always">
                Feeds digest generated on {}</h2>'''.format(
                    self.title, datetime.datetime.now().strftime('%d %B %Y, %H:%M'))

        self.logger.info('Parsed {:d} items from the RSS feed <{}>'
                         .format(len(feed.entries), self.url))

        for entry in feed.entries:
            if not getattr(entry, 'published_parsed', None):
                continue

            try:
                entry_timestamp = datetime.datetime(*entry.published_parsed[:6])
                if latest_update is None or entry_timestamp > latest_update:
                    self.logger.info('Processing new item from the RSS feed <{}>'.format(self.url))
                    entry.summary = entry.summary if hasattr(entry, 'summary') else None

                    if self.mercury_api_key:
                        entry.content = self._parse_entry_content(entry.link)
                    elif hasattr(entry, 'summary'):
                        entry.content = entry.summary
                    else:
                        entry.content = None

                    digest += '<h1 style="page-break-before: always">{}</h1>{}' \
                        .format(entry.title, entry.content)

                    e = {
                        'entry_id': entry.id,
                        'title': entry.title,
                        'link': entry.link,
                        'summary': entry.summary,
                        'content': entry.content,
                        'source_id': source_record.id,
                        'published': entry_timestamp,
                    }

                    entries.append(e)
                    session.add(FeedEntry(**e))

                    # >= rather than >: stop once max_entries items have been collected
                    if self.max_entries and len(entries) >= self.max_entries:
                        break
            except Exception as e:
                self.logger.warning('Exception encountered while parsing the RSS '
                                    'feed {}: {}'.format(self.url, str(e)))

        source_record.last_updated_at = parse_start_time
        digest_filename = None

        if entries:
            self.logger.info('Parsed {} new entries from the RSS feed {}'.format(
                len(entries), self.title))

            if self.digest_format:
                digest_filename = os.path.join(self.workdir, 'cache', '{}_{}.{}'.format(
                    datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                    self.title, self.digest_format))

                os.makedirs(os.path.dirname(digest_filename), exist_ok=True)

                if self.digest_format == 'html':
                    with open(digest_filename, 'w', encoding='utf-8') as f:
                        f.write(digest)
                elif self.digest_format == 'pdf':
                    import weasyprint
                    weasyprint.HTML(string=digest).write_pdf(digest_filename)
                else:
                    raise RuntimeError('Unsupported digest format: {}. Supported formats: '
                                       'html or pdf'.format(self.digest_format))

                digest_entry = FeedDigest(source_id=source_record.id,
                                          format=self.digest_format,
                                          filename=digest_filename)

                session.add(digest_entry)
                self.logger.info('{} digest ready: {}'.format(self.digest_format, digest_filename))

        session.commit()
        self.logger.info('Parsing RSS feed {}: completed'.format(self.title))

        return NewFeedEvent(request=dict(self), response=entries,
                            source_id=source_record.id, title=self.title,
                            digest_format=self.digest_format,
                            digest_filename=digest_filename)

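# A hypothetical hook sketch (names and structure are illustrative assumptions,
# not part of this module): a config.yaml event hook can react to the
# NewFeedEvent returned by get_new_items() above, e.g.:
#
#   event.hook.OnNewFeedItems:
#       if:
#           type: platypush.message.event.http.rss.NewFeedEvent
#       then:
#           - action: shell.exec
#             args:
#                 cmd: 'notify-send "New RSS items available"'
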

class FeedSource(Base):
    """ Models the FeedSource table, containing RSS sources to be parsed """

    __tablename__ = 'FeedSource'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)
    title = Column(String)
    url = Column(String, unique=True)
    last_updated_at = Column(DateTime)


class FeedEntry(Base):
    """ Models the FeedEntry table, which contains RSS entries """

    __tablename__ = 'FeedEntry'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)
    entry_id = Column(String)
    source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
    title = Column(String)
    link = Column(String)
    summary = Column(String)
    content = Column(String)
    published = Column(DateTime)


class FeedDigest(Base):
    """ Models the FeedDigest table, containing feed digests either in HTML
        or PDF format """

    class DigestFormat(enum.Enum):
        html = 1
        pdf = 2

    __tablename__ = 'FeedDigest'
    __table_args__ = {'sqlite_autoincrement': True}

    id = Column(Integer, primary_key=True)
    source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
    format = Column(Enum(DigestFormat), nullable=False)
    filename = Column(String, nullable=False)
    created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)

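
# A minimal, hypothetical usage sketch (an addition for illustration, not part
# of the original module): inspect the local feed database through the models
# above. The default dbfile path is an assumption; pass your own as the first
# command-line argument.
if __name__ == '__main__':
    import sys

    dbfile = sys.argv[1] if len(sys.argv) > 1 else os.path.join(
        os.path.expanduser('~/.local/share/platypush'), 'feeds', 'rss.db')

    engine = create_engine('sqlite:///{}'.format(dbfile))
    Base.metadata.create_all(engine)
    Session.configure(bind=engine)
    session = Session()

    # Print the last ten entries of each source, newest first
    for source in session.query(FeedSource).all():
        print('Feed: {} <{}>'.format(source.title, source.url))
        for entry in session.query(FeedEntry).filter_by(source_id=source.id).\
                order_by(FeedEntry.published.desc()).limit(10):
            print('  [{}] {} <{}>'.format(entry.published, entry.title, entry.link))
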
# vim:sw=4:ts=4:et: