A more robust logic for spotting new RSS items

This commit is contained in:
Fabio Manganiello 2018-05-01 10:13:37 +02:00
parent d12ebe8810
commit dca41ea86e
1 changed files with 9 additions and 3 deletions

View File

@ -11,6 +11,7 @@ from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
from sqlalchemy.orm import sessionmaker, scoped_session from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import func
from platypush.backend.http.request import HttpRequest from platypush.backend.http.request import HttpRequest
from platypush.config import Config from platypush.config import Config
@ -52,13 +53,17 @@ class RssUpdates(HttpRequest):
def _get_or_create_source(self, session): def _get_or_create_source(self, session):
record = session.query(FeedSource).filter_by(url=self.url).first() record = session.query(FeedSource).filter_by(url=self.url).first()
if record is None: if record is None:
record = FeedSource(url=self.url) record = FeedSource(url=self.url, title=self.title)
session.add(record) session.add(record)
session.commit() session.commit()
return record return record
def _get_latest_update(self, session, source_id):
return session.query(func.max(FeedEntry.published)).filter_by(source_id=source_id).scalar()
def _parse_entry_content(self, link): def _parse_entry_content(self, link):
response = None response = None
@ -86,6 +91,7 @@ class RssUpdates(HttpRequest):
session.add(source_record) session.add(source_record)
parse_start_time = datetime.datetime.utcnow() parse_start_time = datetime.datetime.utcnow()
entries = [] entries = []
latest_update = self._get_latest_update(session, source_record.id)
if not self.title and 'title' in feed.feed: if not self.title and 'title' in feed.feed:
self.title = feed.feed['title'] self.title = feed.feed['title']
@ -104,8 +110,8 @@ class RssUpdates(HttpRequest):
for entry in feed.entries: for entry in feed.entries:
entry_timestamp = datetime.datetime(*entry.published_parsed[:6]) entry_timestamp = datetime.datetime(*entry.published_parsed[:6])
if source_record.last_updated_at is None \ if latest_update is None \
or entry_timestamp > source_record.last_updated_at: or entry_timestamp > latest_update:
logging.info('Processed new item from RSS feed <{}>: "{}"' logging.info('Processed new item from RSS feed <{}>: "{}"'
.format(self.url, entry.title)) .format(self.url, entry.title))