From 14afbcad3a7894c47e08db2b12a962ee2748580a Mon Sep 17 00:00:00 2001 From: Fabio Manganiello Date: Sat, 27 Jan 2018 04:31:09 +0100 Subject: [PATCH] Support for RSS feeds update events, solves #48 --- platypush/backend/http/request/__init__.py | 10 +- .../backend/http/request/rss/__init__.py | 147 ++++++++++++++++++ platypush/config/__init__.py | 10 +- platypush/message/event/http/rss.py | 9 ++ platypush/utils/__init__.py | 12 +- requirements.txt | 3 + setup.py | 3 +- 7 files changed, 180 insertions(+), 14 deletions(-) create mode 100644 platypush/backend/http/request/rss/__init__.py create mode 100644 platypush/message/event/http/rss.py diff --git a/platypush/backend/http/request/__init__.py b/platypush/backend/http/request/__init__.py index 2b203a22b4..e5b273c40f 100644 --- a/platypush/backend/http/request/__init__.py +++ b/platypush/backend/http/request/__init__.py @@ -24,12 +24,14 @@ class HttpRequest(object): self.kwargs = kwargs - def __init__(self, args, bus=None, poll_seconds=None, timeout=None, **kwargs): + def __init__(self, args, bus=None, poll_seconds=None, timeout=None, + skip_first_call=True, **kwargs): super().__init__() self.poll_seconds = poll_seconds or self.poll_seconds self.timeout = timeout or self.timeout self.bus = bus + self.skip_first_call = skip_first_call self.last_request_timestamp = 0 if isinstance(args, self.HttpRequestArguments): @@ -62,8 +64,10 @@ class HttpRequest(object): else: event = HttpEvent(dict(self), new_items) - if new_items and not is_first_call and self.bus: - self.bus.post(event) + if new_items and self.bus: + if not self.skip_first_call or ( + self.skip_first_call and not is_first_call): + self.bus.post(event) response.raise_for_status() diff --git a/platypush/backend/http/request/rss/__init__.py b/platypush/backend/http/request/rss/__init__.py new file mode 100644 index 0000000000..94c1483d8d --- /dev/null +++ b/platypush/backend/http/request/rss/__init__.py @@ -0,0 +1,147 @@ +import datetime +import feedparser +import logging +import os +import requests +import time + +from sqlalchemy import create_engine, Column, Integer, String, DateTime, \ + UniqueConstraint, ForeignKey + +from sqlalchemy.orm import sessionmaker, scoped_session +from sqlalchemy.ext.declarative import declarative_base + +from platypush.backend.http.request import HttpRequest +from platypush.config import Config +from platypush.message.event.http.rss import NewFeedEvent +from platypush.utils import mkdir_p + +Base = declarative_base() +Session = scoped_session(sessionmaker()) + + +class GetRssUpdates(HttpRequest): + """ Gets new items in an RSS feed """ + + dbfile = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds', 'rss.db') + + def __init__(self, url, headers=None, params=None, dbfile=None, + mercury_api_key=None, *args, **kwargs): + + self.url = url + self.mercury_api_key = mercury_api_key # Mercury Reader API used to parse the content of the link + + if dbfile: self.dbfile = dbfile + mkdir_p(os.path.dirname(self.dbfile)) + + self.engine = create_engine('sqlite:///{}'.format(self.dbfile)) + Base.metadata.create_all(self.engine) + Session.configure(bind=self.engine) + self._get_or_create_source(session=Session()) + + request_args = { + 'method': 'get', + 'url': self.url, + 'headers': headers or {}, + 'params': params or {}, + } + + super().__init__(*args, skip_first_call=False, args=request_args, **kwargs) + + + def _get_or_create_source(self, session): + record = session.query(FeedSource).filter_by(url=self.url).first() + if record is None: + record = FeedSource(url=self.url) + session.add(record) + + session.commit() + return record + + + def _parse_entry_content(self, link): + response = None + + try: + logging.info('Parsing content for {}'.format(link)) + response = requests.get('https://mercury.postlight.com/parser', + params = {'url': link}, + headers = {'x-api-key': self.mercury_api_key }) + except Exception as e: logging.exception(e) + + return response.json()['content'] if response and response.ok else None + + + def get_new_items(self, response): + feed = feedparser.parse(response.text) + session = Session() + source_record = self._get_or_create_source(session=session) + session.add(source_record) + parse_start_time = datetime.datetime.utcnow() + + entries = [] + last_updated_at = source_record.last_updated_at + + if source_record.title != feed.feed['title']: + source_record.title = feed.feed['title'] + + for entry in feed.entries: + entry_timestamp = datetime.datetime(*entry.published_parsed[:6]) + + if last_updated_at is None: last_updated_at = entry_timestamp + if source_record.last_updated_at is None \ + or entry_timestamp > source_record.last_updated_at: + last_updated_at = max(entry_timestamp, last_updated_at) + + entry.content = self._parse_entry_content(entry.link) \ + if self.mercury_api_key else None + + e = { + 'entry_id': entry.id, + 'title': entry.title, + 'link': entry.link, + 'summary': entry.summary, + 'content': entry.content, + 'source_id': source_record.id, + 'published': entry_timestamp, + } + + entries.append(e) + session.add(FeedEntry(**e)) + + source_record.last_updated_at = parse_start_time + session.commit() + + if entries: + logging.info('Parsed {} new entries from the RSS feed {}'.format( + len(entries), source_record.title)) + + return NewFeedEvent(dict(self), entries) + + +class FeedSource(Base): + __tablename__ = 'FeedSource' + __table_args__ = ({ 'sqlite_autoincrement': True }) + + id = Column(Integer, primary_key=True) + title = Column(String) + url = Column(String, unique=True) + last_updated_at = Column(DateTime) + + +class FeedEntry(Base): + __tablename__ = 'FeedEntry' + __table_args__ = ({ 'sqlite_autoincrement': True }) + + id = Column(Integer, primary_key=True) + entry_id = Column(String) + source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False) + title = Column(String) + link = Column(String) + summary = Column(String) + content = Column(String) + published = Column(DateTime) + + +# vim:sw=4:ts=4:et: + diff --git a/platypush/config/__init__.py b/platypush/config/__init__.py index a8fe8011b3..45820318a2 100644 --- a/platypush/config/__init__.py +++ b/platypush/config/__init__.py @@ -1,11 +1,13 @@ import datetime -import errno import logging import os import socket import time import yaml +from platypush.utils import mkdir_p + + """ Config singleton instance """ _default_config_instance = None @@ -237,12 +239,6 @@ class Config(object): _default_config_instance._config[key] = key -def mkdir_p(path): - try: os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST and os.path.isdir(path): pass - else: raise - # vim:sw=4:ts=4:et: diff --git a/platypush/message/event/http/rss.py b/platypush/message/event/http/rss.py new file mode 100644 index 0000000000..ebb5549647 --- /dev/null +++ b/platypush/message/event/http/rss.py @@ -0,0 +1,9 @@ +from platypush.message.event.http import HttpEvent + +class NewFeedEvent(HttpEvent): + def __init__(self, request, response, *args, **kwargs): + super().__init__(request=request, response=response, *args, **kwargs) + + +# vim:sw=4:ts=4:et: + diff --git a/platypush/utils/__init__.py b/platypush/utils/__init__.py index 61178fd4f5..ada4492ada 100644 --- a/platypush/utils/__init__.py +++ b/platypush/utils/__init__.py @@ -1,10 +1,9 @@ +import errno import importlib import logging +import os import signal -from platypush.config import Config -from platypush.message import Message - def get_module_and_method_from_action(action): """ Input : action=music.mpd.play @@ -65,5 +64,12 @@ def clear_timeout(): signal.alarm(0) +def mkdir_p(path): + try: os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): pass + else: raise + + # vim:sw=4:ts=4:et: diff --git a/requirements.txt b/requirements.txt index 2330a91c28..e4b08745e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,9 @@ sqlalchemy # Dates support python-dateutil +# RSS feeds support +feedparser + # Philips Hue plugin support phue diff --git a/setup.py b/setup.py index ca36eb7844..70be82b83d 100755 --- a/setup.py +++ b/setup.py @@ -65,7 +65,8 @@ setup( 'Support for Pushbullet backend': ['requests', 'websocket-client'], 'Support for HTTP backend': ['flask'], 'Support for HTTP poll backend': ['frozendict'], - 'Support for database plugin': ['sqlalchemy', 'python-dateutil'], + 'Support for database plugin': ['sqlalchemy'], + 'Support for RSS feeds': ['feedparser'], 'Support for Philips Hue plugin': ['phue'], 'Support for MPD/Mopidy music server plugin': ['python-mpd2'], 'Support for Belkin WeMo Switch plugin': ['ouimeaux'],