platypush/platypush/backend/http/request/rss/__init__.py

import datetime
import enum
import os

from sqlalchemy import (
    create_engine,
    Column,
    Integer,
    String,
    DateTime,
    Enum,
    ForeignKey,
)

from sqlalchemy.orm import sessionmaker, scoped_session, declarative_base
from sqlalchemy.sql.expression import func

from platypush.backend.http.request import HttpRequest
from platypush.config import Config
from platypush.context import get_plugin
from platypush.message.event.http.rss import NewFeedEvent

# Declarative base shared by the ORM models defined at the bottom of this
# module (FeedSource, FeedEntry, FeedDigest).
Base = declarative_base()
# Scoped session registry. It is created unbound here; RssUpdates.get_new_items
# binds it to the SQLite engine through Session.configure() before using it.
Session = scoped_session(sessionmaker())


class RssUpdates(HttpRequest):
    """
    Gets new items in an RSS feed. You can use this type of object within the context of the
    :class:`platypush.backend.http.poll.HttpPollBackend` backend. Example:

      .. code-block:: yaml

        backend.http.poll:
            requests:
                - type: platypush.backend.http.request.rss.RssUpdates
                  url: https://www.technologyreview.com/feed/
                  title: MIT Technology Review
                  poll_seconds: 86400  # Poll once a day
                  digest_format: html  # Generate an HTML feed with the new items

    Parsed entries and generated digests are tracked in a SQLite database stored under
    ``<workdir>/feeds/rss.db``, so that only entries published after the latest stored one are reported.

    Triggers:

        - :class:`platypush.message.event.http.rss.NewFeedEvent` when new items are parsed from a feed or a new digest
          is available.

    Requires:

        * **feedparser** (``pip install feedparser``)

    """

    user_agent = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
        + 'Chrome/62.0.3202.94 Safari/537.36'
    )

    def __init__(
        self,
        url,
        title=None,
        headers=None,
        params=None,
        max_entries=None,
        extract_content=False,
        digest_format=None,
        user_agent: str = user_agent,
        body_style: str = 'font-size: 22px; '
        + 'font-family: "Merriweather", Georgia, "Times New Roman", Times, serif;',
        title_style: str = 'margin-top: 30px',
        subtitle_style: str = 'margin-top: 10px; page-break-after: always',
        article_title_style: str = 'page-break-before: always',
        article_link_style: str = 'color: #555; text-decoration: none; border-bottom: 1px dotted',
        article_content_style: str = '',
        *argv,
        **kwargs,
    ):
        """
        :param url: URL to the RSS feed to be monitored.
        :param title: Optional title for the feed. If not set, the title reported by the feed itself is used.
        :param headers: Extra headers to be passed to the request. The ``User-Agent`` header is always
            overridden with ``user_agent``.
        :param params: Extra GET parameters to be appended to the URL.
        :param max_entries: Maximum number of entries that will be returned in a single
            :class:`platypush.message.event.http.rss.NewFeedEvent` event.
        :param extract_content: Whether the content should also be extracted (through the
            :class:`platypush.plugins.http.webpage.HttpWebpagePlugin` plugin) (default: ``False``).
        :param digest_format: Format of the digest output file (default: None, no digest file is generated).
            Supported types: ``html`` and ``pdf`` (requires the ``weasyprint`` module installed).
        :param user_agent: User agent string to be passed on the request.
        :param body_style: CSS style for the body.
        :param title_style: CSS style for the feed title.
        :param subtitle_style: CSS style for the feed subtitle.
        :param article_title_style: CSS style for the article titles.
        :param article_link_style: CSS style for the article link.
        :param article_content_style: CSS style for the article content.
        :param argv: Extra positional arguments forwarded to the parent constructor.
        :param kwargs: Extra keyword arguments forwarded to the parent constructor.
        """
        self.workdir = os.path.join(os.path.expanduser(Config.get('workdir')), 'feeds')
        self.dbfile = os.path.join(self.workdir, 'rss.db')
        self.url = url
        self.title = title
        self.max_entries = max_entries
        self.user_agent = user_agent
        self.body_style = body_style
        self.title_style = title_style
        self.subtitle_style = subtitle_style
        self.article_title_style = article_title_style
        self.article_link_style = article_link_style
        self.article_content_style = article_content_style

        # If true, then the http.webpage plugin will be used to parse the content
        self.extract_content = extract_content

        self.digest_format = (
            digest_format.lower() if digest_format else None
        )  # Supported formats: html, pdf

        os.makedirs(os.path.expanduser(os.path.dirname(self.dbfile)), exist_ok=True)

        # Work on a copy, so the dictionary passed by the caller is never mutated
        headers = dict(headers or {})
        headers['User-Agent'] = self.user_agent

        request_args = {
            'method': 'get',
            'url': self.url,
            'headers': headers,
            'params': params or {},
        }

        super().__init__(*argv, skip_first_call=False, args=request_args, **kwargs)

    def _get_or_create_source(self, session):
        """
        Return the ``FeedSource`` record matching ``self.url``, creating it if it doesn't exist yet.

        The session is committed before returning, so a newly created record has its ``id`` assigned.

        :param session: Database session used for the lookup/insert.
        :return: The ``FeedSource`` record for this feed.
        """
        record = session.query(FeedSource).filter_by(url=self.url).first()
        if record is None:
            record = FeedSource(url=self.url, title=self.title)
            session.add(record)

        session.commit()
        return record

    @staticmethod
    def _get_latest_update(session, source_id):
        """
        :param session: Database session used for the query.
        :param source_id: ID of the ``FeedSource`` record.
        :return: The most recent ``published`` timestamp among the stored entries of the source,
            or ``None`` if no entries have been stored yet.
        """
        return (
            session.query(func.max(FeedEntry.published))
            .filter_by(source_id=source_id)
            .scalar()
        )

    def _parse_entry_content(self, link):
        """
        Extract the content of an article through the ``http.webpage`` plugin.

        :param link: URL of the article.
        :return: The ``content`` field of the plugin output, or ``None`` if the plugin returned no output.
        """
        self.logger.info('Extracting content from {}'.format(link))

        parser = get_plugin('http.webpage')
        response = parser.simplify(link)
        output = response.output
        errors = response.errors

        if not output:
            self.logger.warning(
                'Mercury parser error: {}'.format(errors or '[unknown error]')
            )
            return None

        return output.get('content')

    def _render_article(self, title, link, content):
        """
        Render the HTML digest snippet for a single article.

        :param title: Article title.
        :param link: Article URL.
        :param content: Article content/summary. A missing value is rendered as an empty body
            instead of the literal string ``None``.
        :return: The HTML snippet as a string.
        """
        return u'''
            <h1 style="{article_title_style}">
                <a href="{link}" target="_blank" style="{article_link_style}">{title}</a>
            </h1>
            <div class="_parsed-content" style="{article_content_style}">{content}</div>'''.format(
            article_title_style=self.article_title_style,
            article_link_style=self.article_link_style,
            article_content_style=self.article_content_style,
            link=link,
            title=title,
            content=content or '',
        )

    def _collect_new_entries(self, session, feed, source_id, latest_update):
        """
        Store the feed entries published after ``latest_update`` and render their digest snippets.

        Entries without a parsed publication date are skipped. An error on a single entry is logged
        and doesn't prevent the remaining entries from being processed.

        :param session: Database session the new ``FeedEntry`` records are added to (not committed here).
        :param feed: Parsed feed, as returned by ``feedparser.parse``.
        :param source_id: ID of the ``FeedSource`` record the entries belong to.
        :param latest_update: Timestamp of the most recent stored entry, or ``None`` if there is none.
        :return: A tuple ``(entries, articles)`` with the list of new entries (as dictionaries) and the
            list of the matching HTML snippets. At most ``max_entries`` items are returned, if set.
        """
        entries = []
        articles = []

        for entry in feed.entries:
            # The attribute may be missing altogether on entries with no publication date
            published = getattr(entry, 'published_parsed', None)
            if not published:
                continue

            try:
                entry_timestamp = datetime.datetime(*published[:6])
                if latest_update is not None and entry_timestamp <= latest_update:
                    continue

                self.logger.info(
                    'Processed new item from RSS feed <{}>'.format(self.url)
                )

                summary = getattr(entry, 'summary', None)
                if self.extract_content:
                    content = self._parse_entry_content(entry.link)
                else:
                    content = summary

                # Build the record before the HTML snippet, so that a malformed entry
                # never ends up in the digest without being stored
                record = {
                    'entry_id': entry.id,
                    'title': entry.title,
                    'link': entry.link,
                    'summary': summary,
                    'content': content,
                    'source_id': source_id,
                    'published': entry_timestamp,
                }

                article = self._render_article(entry.title, entry.link, content)
                session.add(FeedEntry(**record))
                entries.append(record)
                articles.append(article)
            except Exception as ex:
                # The link itself may be the missing attribute: don't fail again while reporting
                self.logger.warning(
                    'Exception encountered while parsing RSS feed entry %s: %s',
                    getattr(entry, 'link', None),
                    ex,
                )
                self.logger.exception(ex)

            if self.max_entries and len(entries) >= self.max_entries:
                break

        return entries, articles

    def _write_digest(self, content):
        """
        Write the digest file under ``<workdir>/cache`` in the configured ``digest_format``.

        :param content: HTML content of the digest (feed header plus article snippets).
        :return: Path of the generated file.
        :raises RuntimeError: If ``digest_format`` is neither ``html`` nor ``pdf``.
        """
        if self.digest_format not in ('html', 'pdf'):
            raise RuntimeError(
                f'Unsupported format: {self.digest_format}. Supported formats: html, pdf'
            )

        digest_filename = os.path.join(
            self.workdir,
            'cache',
            '{}_{}.{}'.format(
                datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                self.title,
                self.digest_format,
            ),
        )

        os.makedirs(os.path.dirname(digest_filename), exist_ok=True)

        if self.digest_format == 'html':
            content = '''
                <html>
                    <head>
                        <title>{title}</title>
                    </head>
                    <body style="{body_style}">{content}</body>
                </html>
            '''.format(
                title=self.title, body_style=self.body_style, content=content
            )

            with open(digest_filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            from weasyprint import HTML, CSS

            # The location of FontConfiguration depends on the installed weasyprint version
            try:
                from weasyprint.fonts import FontConfiguration
            except ImportError:
                from weasyprint.document import FontConfiguration

            body_style = 'body { ' + self.body_style + ' }'
            font_config = FontConfiguration()
            css = [
                CSS('https://fonts.googleapis.com/css?family=Merriweather'),
                CSS(string=body_style, font_config=font_config),
            ]

            HTML(string=content).write_pdf(digest_filename, stylesheets=css)

        return digest_filename

    def _process_feed(self, session, feed):
        """
        Store the new entries of a parsed feed, generate the digest (if configured) and build the event.

        :param session: Database session bound to the feeds database.
        :param feed: Parsed feed, as returned by ``feedparser.parse``.
        :return: The :class:`platypush.message.event.http.rss.NewFeedEvent` describing the new entries.
        """
        source_record = self._get_or_create_source(session=session)
        parse_start_time = datetime.datetime.utcnow()
        latest_update = self._get_latest_update(session, source_record.id)

        # Fall back to the title advertised by the feed if none was configured
        if not self.title and 'title' in feed.feed:
            self.title = feed.feed['title']
            source_record.title = self.title

        header = u'''
            <h1 style="{title_style}">{title}</h1>
            <h2 style="{subtitle_style}">Feeds digest generated on {creation_date}</h2>'''.format(
            title_style=self.title_style,
            title=self.title,
            subtitle_style=self.subtitle_style,
            creation_date=datetime.datetime.now().strftime('%d %B %Y, %H:%M'),
        )

        self.logger.info(
            'Parsed {:d} items from RSS feed <{}>'.format(len(feed.entries), self.url)
        )

        entries, articles = self._collect_new_entries(
            session, feed, source_record.id, latest_update
        )

        source_record.last_updated_at = parse_start_time
        digest_filename = None

        if entries:
            self.logger.info(
                'Parsed {} new entries from the RSS feed {}'.format(
                    len(entries), self.title
                )
            )

            if self.digest_format:
                digest_filename = self._write_digest(header + ''.join(articles))
                session.add(
                    FeedDigest(
                        source_id=source_record.id,
                        format=self.digest_format,
                        filename=digest_filename,
                    )
                )

                self.logger.info(
                    '{} digest ready: {}'.format(self.digest_format, digest_filename)
                )

        session.commit()
        self.logger.info('Parsing RSS feed {}: completed'.format(self.title))

        return NewFeedEvent(
            request=dict(self),
            response=entries,
            source_id=source_record.id,
            source_title=source_record.title,
            title=self.title,
            digest_format=self.digest_format,
            digest_filename=digest_filename,
        )

    def get_new_items(self, response):
        """
        Parse the feed returned by the HTTP request and report the entries published since the last poll.

        :param response: HTTP response of the feed request. Its ``text`` attribute is parsed as the feed.
        :return: A :class:`platypush.message.event.http.rss.NewFeedEvent` with the new entries and,
            if a digest format is configured and new entries were found, the path of the digest file.
        :raises RuntimeError: If new entries were found and ``digest_format`` is not supported.
        """
        import feedparser

        engine = create_engine(
            'sqlite:///{}'.format(self.dbfile),
            connect_args={'check_same_thread': False},
        )

        Base.metadata.create_all(engine)
        Session.configure(bind=engine)
        session = Session()

        try:
            return self._process_feed(session, feedparser.parse(response.text))
        except Exception:
            # The scoped session outlives this call: discard the pending rows, otherwise
            # they would be flushed by the next poll without any event being generated
            session.rollback()
            raise


class FeedSource(Base):
    """ORM model for the ``FeedSource`` table: one row per monitored RSS feed."""

    __tablename__ = 'FeedSource'
    __table_args__ = dict(sqlite_autoincrement=True)

    # Auto-incremented primary key
    id = Column(Integer, primary_key=True)
    # Title of the feed
    title = Column(String)
    # URL of the feed; at most one source row per URL
    url = Column(String, unique=True)
    # Timestamp of the latest parsing run of the feed
    last_updated_at = Column(DateTime)


class FeedEntry(Base):
    """ORM model for the ``FeedEntry`` table: one row per parsed RSS entry."""

    __tablename__ = 'FeedEntry'
    __table_args__ = dict(sqlite_autoincrement=True)

    # Auto-incremented primary key
    id = Column(Integer, primary_key=True)
    # Identifier of the entry as reported by the feed
    entry_id = Column(String)
    # Feed the entry belongs to (mandatory)
    source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
    # Title, URL, summary and content of the entry
    title = Column(String)
    link = Column(String)
    summary = Column(String)
    content = Column(String)
    # Publication timestamp of the entry
    published = Column(DateTime)


class FeedDigest(Base):
    """ORM model for the ``FeedDigest`` table: one row per generated digest file
    (either in HTML or PDF format)."""

    class DigestFormat(enum.Enum):
        """Supported digest output formats."""

        html = 1
        pdf = 2

    __tablename__ = 'FeedDigest'
    __table_args__ = dict(sqlite_autoincrement=True)

    # Auto-incremented primary key
    id = Column(Integer, primary_key=True)
    # Feed the digest was generated from (mandatory)
    source_id = Column(Integer, ForeignKey('FeedSource.id'), nullable=False)
    # Output format of the digest, one of the DigestFormat members
    format = Column(Enum(DigestFormat), nullable=False)
    # Path of the digest file
    filename = Column(String, nullable=False)
    # Creation timestamp, filled in with the current UTC time by default
    created_at = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)


# vim:sw=4:ts=4:et: