Merge pull request 'Add support for OPML import and export in the RSS plugin' (#220) from 219-opml-import-export into master

Reviewed-on: platypush/platypush#220
Fabio Manganiello 2022-09-02 00:24:37 +02:00
commit e77d6a4ad4
3 changed files with 225 additions and 52 deletions
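
For context, a minimal usage sketch of what this change enables (the feed and OPML locations below are placeholders, and a running Platypush context is assumed):

    from platypush.context import get_plugin

    # With this change, the `subscriptions` parameter of the `rss` plugin can
    # mix plain feed URLs with OPML resources, e.g.:
    #
    #   - https://example.org/feed.xml             an individual RSS feed
    #   - https://example.org/subscriptions.opml   a remote OPML list
    #   - ~/subscriptions.opml                     a local OPML file
    #
    # OPML resources are expanded into individual feeds when the plugin starts.

    rss = get_plugin('rss')

    # Actions return a response object whose payload lives in `.output`, just
    # as the worker code in the diff below does with `parse_feed`.
    for entry in rss.parse_feed('https://example.org/feed.xml').output:
        print(entry['published'], entry['title'])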

platypush/plugins/rss/__init__.py

@@ -1,8 +1,13 @@
 import datetime
+import os
 import queue
+import re
 import threading
 import time
-from typing import Optional, Collection
+from dateutil.tz import tzutc
+from typing import Iterable, Optional, Collection, Set
+from xml.etree import ElementTree
 import dateutil.parser
 import requests
 
@@ -24,56 +29,67 @@ class RssPlugin(RunnablePlugin):
     Requires:
 
         * **feedparser** (``pip install feedparser``)
+        * **defusedxml** (``pip install defusedxml``)
 
     """
 
-    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' + \
-        'Chrome/62.0.3202.94 Safari/537.36'
+    user_agent = (
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
+        + 'Chrome/62.0.3202.94 Safari/537.36'
+    )
 
     def __init__(
-        self, subscriptions: Optional[Collection[str]] = None, poll_seconds: int = 300,
-        user_agent: str = user_agent, **kwargs
+        self,
+        subscriptions: Optional[Collection[str]] = None,
+        poll_seconds: int = 300,
+        user_agent: str = user_agent,
+        **kwargs,
     ):
         """
         :param subscriptions: List of feeds to monitor for updates, as URLs.
+            OPML URLs/local files are also supported.
         :param poll_seconds: How often we should check for updates (default: 300 seconds).
         :param user_agent: Custom user agent to use for the requests.
         """
         super().__init__(**kwargs)
-        self.subscriptions = subscriptions or []
         self.poll_seconds = poll_seconds
         self.user_agent = user_agent
-        self._latest_timestamps = self._get_latest_timestamps()
+        self._feeds_metadata = {}
         self._feed_worker_queues = [queue.Queue()] * 5
         self._feed_response_queue = queue.Queue()
         self._feed_workers = []
         self._latest_entries = []
+        self.subscriptions = list(self._parse_subscriptions(subscriptions or []))
+        self._latest_timestamps = self._get_latest_timestamps()
 
     @staticmethod
     def _get_feed_latest_timestamp_varname(url: str) -> str:
         return f'LATEST_FEED_TIMESTAMP[{url}]'
 
     @classmethod
     def _get_feed_latest_timestamp(cls, url: str) -> Optional[datetime.datetime]:
-        t = get_plugin('variable').get(
-            cls._get_feed_latest_timestamp_varname(url)
-        ).output.get(cls._get_feed_latest_timestamp_varname(url))
+        t = (
+            get_plugin('variable')
+            .get(cls._get_feed_latest_timestamp_varname(url))
+            .output.get(cls._get_feed_latest_timestamp_varname(url))
+        )
 
         if t:
             return dateutil.parser.isoparse(t)
 
     def _get_latest_timestamps(self) -> dict:
-        return {
-            url: self._get_feed_latest_timestamp(url)
-            for url in self.subscriptions
-        }
+        return {url: self._get_feed_latest_timestamp(url) for url in self.subscriptions}
 
     def _update_latest_timestamps(self) -> None:
         variable = get_plugin('variable')
-        variable.set(**{
-            self._get_feed_latest_timestamp_varname(url): latest_timestamp
-            for url, latest_timestamp in self._latest_timestamps.items()
-        })
+        variable.set(
+            **{
+                self._get_feed_latest_timestamp_varname(url): latest_timestamp
+                for url, latest_timestamp in self._latest_timestamps.items()
+            }
+        )
 
     @staticmethod
     def _parse_content(entry) -> Optional[str]:
@@ -96,23 +112,30 @@ class RssPlugin(RunnablePlugin):
         """
         import feedparser
 
-        feed = feedparser.parse(requests.get(url, headers={'User-Agent': self.user_agent}).text)
+        feed = feedparser.parse(
+            requests.get(url, headers={'User-Agent': self.user_agent}).text
+        )
         return RssFeedEntrySchema().dump(
-            sorted([
-                {
-                    'feed_url': url,
-                    'feed_title': getattr(feed.feed, 'title', None),
-                    'id': getattr(entry, 'id', None),
-                    'url': entry.link,
-                    'published': datetime.datetime.fromtimestamp(time.mktime(entry.published_parsed)),
-                    'title': entry.title,
-                    'summary': getattr(entry, 'summary', None),
-                    'content': self._parse_content(entry),
-                }
-                for entry in feed.entries
-                if getattr(entry, 'published_parsed', None)
-            ], key=lambda e: e['published']),
-            many=True
+            sorted(
+                [
+                    {
+                        'feed_url': url,
+                        'feed_title': getattr(feed.feed, 'title', None),
+                        'id': getattr(entry, 'id', None),
+                        'url': entry.link,
+                        'published': datetime.datetime.fromtimestamp(
+                            time.mktime(entry.published_parsed)
+                        ),
+                        'title': entry.title,
+                        'summary': getattr(entry, 'summary', None),
+                        'content': self._parse_content(entry),
+                    }
+                    for entry in feed.entries
+                    if getattr(entry, 'published_parsed', None)
+                ],
+                key=lambda e: e['published'],
+            ),
+            many=True,
         )
 
     @action
@@ -123,7 +146,9 @@ class RssPlugin(RunnablePlugin):
         :param limit: Maximum number of entries to return (default: 20).
         :return: .. schema:: rss.RssFeedEntrySchema(many=True)
         """
-        return sorted(self._latest_entries, key=lambda e: e['published'], reverse=True)[:limit]
+        return sorted(self._latest_entries, key=lambda e: e['published'], reverse=True)[
+            :limit
+        ]
 
     def _feed_worker(self, q: queue.Queue):
         while not self.should_stop():
@@ -133,18 +158,157 @@ class RssPlugin(RunnablePlugin):
                 continue
 
             try:
-                self._feed_response_queue.put({
-                    'url': url,
-                    'content': self.parse_feed(url).output,
-                })
+                self._feed_response_queue.put(
+                    {
+                        'url': url,
+                        'content': self.parse_feed(url).output,
+                    }
+                )
             except Exception as e:
-                self._feed_response_queue.put({
-                    'url': url,
-                    'error': e,
-                })
+                self._feed_response_queue.put(
+                    {
+                        'url': url,
+                        'error': e,
+                    }
+                )
 
         self._feed_response_queue.put(None)
 
+    def _parse_opml_lists(self, subs: Iterable[str]) -> Set[str]:
+        from defusedxml import ElementTree
+
+        feeds = set()
+        subs = set(subs)
+        content_by_sub = {}
+        urls = {sub for sub in subs if re.search(r'^https?://', sub)}
+        files = {os.path.expanduser(sub) for sub in subs if sub not in urls}
+
+        for url in urls:
+            try:
+                content_by_sub[url] = requests.get(
+                    url,
+                    headers={
+                        'User-Agent': self.user_agent,
+                    },
+                ).text
+            except Exception as e:
+                self.logger.warning('Could not retrieve subscription %s: %s', url, e)
+
+        for file in files:
+            try:
+                with open(file, 'r') as f:
+                    content_by_sub[file] = f.read()
+            except Exception as e:
+                self.logger.warning('Could not open file %s: %s', file, e)
+
+        for sub, content in content_by_sub.items():
+            root = ElementTree.fromstring(content.strip())
+            if root.tag != 'opml':
+                self.logger.warning('%s is not a valid OPML resource', sub)
+                continue
+
+            feeds.update(self._parse_feeds_from_outlines(root.findall('body/outline')))
+
+        return feeds
+
+    def _parse_feeds_from_outlines(
+        self,
+        outlines: Iterable[ElementTree.Element],
+    ) -> Set[str]:
+        feeds = set()
+        outlines = list(outlines)
+
+        while outlines:
+            outline = outlines.pop(0)
+            if 'xmlUrl' in outline.attrib:
+                url = outline.attrib['xmlUrl']
+                feeds.add(url)
+                self._feeds_metadata[url] = {
+                    **self._feeds_metadata.get(url, {}),
+                    'title': outline.attrib.get('title'),
+                    'description': outline.attrib.get('text'),
+                    'url': outline.attrib.get('htmlUrl'),
+                }
+
+            for i, child in enumerate(outline.iter()):
+                if i > 0:
+                    outlines.append(child)
+
+        return feeds
+
+    def _parse_subscriptions(self, subs: Iterable[str]) -> Iterable[str]:
+        import feedparser
+
+        self.logger.info('Parsing feed subscriptions')
+        feeds = set()
+        lists = set()
+
+        for sub in subs:
+            try:
+                # Check if it's an OPML list of feeds or an individual feed
+                feed = feedparser.parse(sub)
+                if feed.feed.get('opml'):
+                    lists.add(sub)
+                else:
+                    channel = feed.get('channel', {})
+                    self._feeds_metadata[sub] = {
+                        **self._feeds_metadata.get(sub, {}),
+                        'title': channel.get('title'),
+                        'description': channel.get('description'),
+                        'url': channel.get('link'),
+                    }
+                    feeds.add(sub)
+            except Exception as e:
+                self.logger.warning('Could not parse %s: %s', sub, e)
+
+        feeds.update(self._parse_opml_lists(lists))
+        return feeds
+
+    @staticmethod
+    def _datetime_to_string(dt: datetime.datetime) -> str:
+        return dt.replace(tzinfo=tzutc()).strftime('%a, %d %b %Y %H:%M:%S %Z')
+
+    @action
+    def export_to_opml(self) -> str:
+        """
+        Export the list of subscriptions into OPML format.
+
+        :return: The list of subscriptions as a string in OPML format.
+        """
+        root = ElementTree.Element('opml', {'version': '2.0'})
+
+        head = ElementTree.Element('head')
+        title = ElementTree.Element('title')
+        title.text = 'Platypush feed subscriptions'
+        created = ElementTree.Element('dateCreated')
+        created.text = self._datetime_to_string(datetime.datetime.utcnow())
+        head.append(title)
+        head.append(created)
+
+        body = ElementTree.Element('body')
+        feeds = ElementTree.Element('outline', {'text': 'Feeds'})
+
+        for sub in self.subscriptions:
+            metadata = self._feeds_metadata.get(sub, {})
+            feed = ElementTree.Element(
+                'outline',
+                {
+                    'xmlUrl': sub,
+                    'text': metadata.get('description', metadata.get('title', sub)),
+                    **({'htmlUrl': metadata['url']} if metadata.get('url') else {}),
+                    **({'title': metadata['title']} if metadata.get('title') else {}),
+                },
+            )
+            feeds.append(feed)
+
+        body.append(feeds)
+        root.append(head)
+        root.append(body)
+        return ElementTree.tostring(root, encoding='utf-8', method='xml').decode()
+
     def main(self):
         self._feed_workers = [
             threading.Thread(target=self._feed_worker, args=(q,))
@@ -154,12 +318,16 @@ class RssPlugin(RunnablePlugin):
         for worker in self._feed_workers:
             worker.start()
 
-        self.logger.info(f'Initialized RSS plugin with {len(self.subscriptions)} subscriptions')
+        self.logger.info(
+            f'Initialized RSS plugin with {len(self.subscriptions)} subscriptions'
+        )
 
         while not self.should_stop():
             responses = {}
             for i, url in enumerate(self.subscriptions):
-                worker_queue = self._feed_worker_queues[i % len(self._feed_worker_queues)]
+                worker_queue = self._feed_worker_queues[
+                    i % len(self._feed_worker_queues)
+                ]
                 worker_queue.put(url)
 
             time_start = time.time()
@@ -168,12 +336,14 @@ class RssPlugin(RunnablePlugin):
             new_entries = []
 
             while (
-                not self.should_stop() and
-                len(responses) < len(self.subscriptions) and
-                time.time() - time_start <= timeout
+                not self.should_stop()
+                and len(responses) < len(self.subscriptions)
+                and time.time() - time_start <= timeout
             ):
                 try:
-                    response = self._feed_response_queue.get(block=True, timeout=max_time-time_start)
+                    response = self._feed_response_queue.get(
+                        block=True, timeout=max_time - time_start
+                    )
                 except queue.Empty:
                     self.logger.warning('RSS parse timeout')
                     break
@@ -189,7 +359,9 @@ class RssPlugin(RunnablePlugin):
                 else:
                     responses[url] = response['content']
 
-            responses = {k: v for k, v in responses.items() if not isinstance(v, Exception)}
+            responses = {
+                k: v for k, v in responses.items() if not isinstance(v, Exception)
+            }
 
             for url, response in responses.items():
                 latest_timestamp = self._latest_timestamps.get(url)
@@ -205,7 +377,7 @@ class RssPlugin(RunnablePlugin):
             self._update_latest_timestamps()
             self._latest_entries = new_entries
 
-            time.sleep(self.poll_seconds)
+            self.wait_stop(self.poll_seconds)
 
     def stop(self):
         super().stop()
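
The exporter above emits an OPML 2.0 document whose feed outlines carry the same xmlUrl/text/htmlUrl/title attributes that the importer reads back, so export and import round-trip. A sanity check might look like this (a sketch, assuming a running Platypush context; defusedxml is the same parser the import path uses):

    from defusedxml import ElementTree

    from platypush.context import get_plugin

    # Export the current subscriptions, then re-parse them with the same
    # defusedxml-based parser that _parse_opml_lists relies on.
    opml = get_plugin('rss').export_to_opml().output
    root = ElementTree.fromstring(opml.strip())
    assert root.tag == 'opml'  # _parse_opml_lists rejects anything else

    # Feed outlines sit under the top-level 'Feeds' outline.
    for outline in root.findall('body/outline/outline'):
        print(outline.attrib.get('title'), outline.attrib['xmlUrl'])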

platypush/plugins/rss/manifest.yaml

@@ -4,5 +4,6 @@ manifest:
   install:
     pip:
       - feedparser
+      - defusedxml
   package: platypush.plugins.rss
   type: plugin

setup.py

@@ -86,7 +86,7 @@ setup(
         # Support for MQTT backends
         'mqtt': ['paho-mqtt'],
         # Support for RSS feeds parser
-        'rss': ['feedparser'],
+        'rss': ['feedparser', 'defusedxml'],
         # Support for PDF generation
         'pdf': ['weasyprint'],
         # Support for Philips Hue plugin
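
With the extras change above, installing the RSS extra (e.g. pip install 'platypush[rss]') now pulls in defusedxml alongside feedparser, matching the plugin manifest.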