blog/app/latex.py

"""
Licensed under Public Domain Mark 1.0.
See https://creativecommons.org/publicdomain/mark/1.0/
Author: Justin Bruce Van Horne <justinvh@gmail.com>

Python-Markdown LaTeX Extension
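Renders inline math written between backslash-parenthesis delimiters and
display math that is opened by "$$" at the end of a line and closed by a
line containing only "$$"; multiline equations are supported.
A page opts in with the marker line "[//]: # (latex: 1)".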
The actual image generation is done via LaTeX/DVI output.
It encodes data as base64 so there is no need for images directly.
All the work is done in the preprocessor.
"""

import base64
import hashlib
import html
import json
import os
import re
import tempfile
from subprocess import DEVNULL, PIPE, call as rawcall

import markdown


def call(*popen_args, **popen_kwargs):
    """
    Thin pass-through to subprocess.call().

    Every positional and keyword argument (``timeout`` included) is handed
    over untouched, and the child's return code is returned as-is.
    """
    exit_status = rawcall(*popen_args, **popen_kwargs)
    return exit_status


# HTML template for an inline expression. The four %s slots are, in order:
# the suffix of the "math-" CSS class, the alt text, the element id and the
# base64-encoded PNG payload of the data URI.
img_expr = '<img class="latex inline math-%s" alt="%s" id="%s" src="data:image/png;base64,%s">'

# HTML template for a multiline (display) expression: the same four slots as
# img_expr, with the image wrapped in a centering <div>.
multiline_img_expr = '''<div class="multiline-wrapper">
<img class="latex multiline math-%s" alt="%s" id="%s" src="data:image/png;base64,%s"></div>'''

# Default stylesheet for the generated images; LaTeXPostprocessor.run()
# prepends it to the rendered document.
img_css = """<style scoped>
.multiline-wrapper {
    width: 100%;
    text-align: center;
}

img.latex.multiline {
    height: 65%;
}

img.latex.inline {
    height: .9em;
    vertical-align: middle;
}
</style>"""

# Working directory (inside the system temp dir) for the intermediate LaTeX
# files, and the JSON file in it that caches rendered expressions.
tmpdir = tempfile.gettempdir() + '/markdown-latex'
cache_file = tmpdir + '/latex.cache'


class LaTeXPreprocessor(markdown.preprocessors.Preprocessor):
    r"""Replace LaTeX math in a Markdown page with embedded PNG images.

    Only pages containing the marker line ``[//]: # (latex: 1)`` are
    touched. Inline math is written ``\(...\)``; display math starts with
    ``$$`` followed by a newline and ends with a line holding only ``$$``.
    Each expression is compiled with ``latex``, rasterised with ``dvipng``
    and emitted as an ``<img>`` whose ``src`` is a base64 data URI. Rendered
    images are cached in ``latex.cache``, keyed by the SHA-1 of the
    expression source.
    """

    # Rendered expressions (hash -> base64 text) as stored in latex.cache.
    # __init__ replaces this with a per-instance dict.
    cached = {}

    # Basic LaTeX setup. The user preamble and \begin{document} are added
    # per render by _document_header(); this attribute is never mutated.
    tex_preamble = r"""\documentclass[14pt]{article}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{graphicx}
\usepackage[usenames,dvipsnames]{color}
\pagestyle{empty}
"""

    # Math TeX extraction regex: (text before)(expression)(inline)(multiline)
    # (rest). The leading group is `.*?` (may be empty) so an expression at
    # the very start of the text, or directly after the previous expression,
    # is still picked up.
    math_extract_regex = re.compile(r'(.*?)((\\\(.+?\\\))|(\$\$\n.+?\n\$\$\n))(.*)', re.MULTILINE | re.DOTALL)

    # Math TeX matching regex: group 1 is set for inline math, group 2 for
    # multiline math.
    math_match_regex = re.compile(r'\s*(\\\(.+?\\\))|(\$\$\n.+?\n\$\$\n)\s*', re.MULTILINE | re.DOTALL)

    def __init__(self, *_, **__):
        """Prepare the working directory, the on-disk cache and the config.

        Extra arguments are accepted for backward compatibility; only the
        first positional one (the Markdown instance, when given) is handed
        to the base class.
        """
        super().__init__(*_[:1])

        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(tmpdir, exist_ok=True)

        try:
            with open(cache_file, 'r') as f:
                cached = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or corrupt cache file: start from scratch.
            # (json.JSONDecodeError is a ValueError subclass.)
            cached = {}
        # Valid JSON that is not an object would break the lookups in run().
        self.cached = cached if isinstance(cached, dict) else {}

        self.config = {
            ("general", "preamble"): "",
            ("dvipng", "args"): "-q -T tight -bg Transparent -z 9 -D 200",
            ("delimiters", "text"): "%",
            ("delimiters", "math"): "$",
            ("delimiters", "preamble"): "%%"}

    def _document_header(self):
        """Return everything that precedes the expression in the .tex file."""
        return (self.tex_preamble
                + self.config[("general", "preamble")]
                + "\n\\begin{document}\n")

    def _latex_to_base64(self, tex):
        """Render ``tex`` to a PNG and return its base64 encoding as bytes.

        Raises:
            Exception: if ``latex`` or ``dvipng`` exits with a non-zero
                status. The ``.log`` file is kept for inspection.
            subprocess.TimeoutExpired: if ``latex`` runs longer than 10 s.
            OSError: if a tool cannot be started or a file cannot be read.
        """
        # Generate the temporary file
        tmp_file_fd, path = tempfile.mkstemp(dir=tmpdir)
        dvi = "%s.dvi" % path
        png_path = "%s.png" % path

        try:
            with os.fdopen(tmp_file_fd, "w") as tmp_file:
                tmp_file.write(self._document_header())
                tmp_file.write(tex)
                tmp_file.write('\n\\end{document}')

            # Compile the LaTeX document; a DVI file is created. The command
            # is a real argument list so paths containing spaces survive,
            # and output goes to DEVNULL because an unread PIPE can block
            # the child once the pipe buffer fills up.
            status = call(['latex', '-halt-on-error',
                           '-output-directory={:s}'.format(tmpdir), path],
                          stdout=DEVNULL, timeout=10)
            if status:
                raise Exception("Couldn't compile LaTeX document. "
                                "Please read '%s.log' for more detail." % path)

            # Run dvipng on the generated DVI file, with the options taken
            # from the ("dvipng", "args") config entry.
            status = call(['dvipng']
                          + self.config[("dvipng", "args")].split()
                          + [dvi, '-o', png_path],
                          stdout=DEVNULL)
            if status:
                raise Exception("Couldn't convert LaTeX to image. "
                                "Please read '%s.log' for more detail." % path)

            # Read the png and encode the data
            with open(png_path, "rb") as png_file:
                encoded = base64.b64encode(png_file.read())
        except Exception:
            # Covers tool failures, timeouts and missing binaries alike:
            # drop the intermediates but keep the log, then re-raise.
            self._cleanup(path, err=True)
            raise

        self._cleanup(path)
        return encoded

    @staticmethod
    def _cleanup(path, err=False):
        """Delete the temp file ``path`` and its LaTeX by-products.

        With ``err=True`` the ``.log`` file is left in place so the failure
        can be investigated. Files that do not exist are skipped silently.
        """
        extensions = ["", ".aux", ".dvi", ".png"]
        if not err:
            extensions.append(".log")

        for extension in extensions:
            try:
                os.remove("%s%s" % (path, extension))
            except OSError:
                pass

    def run(self, lines):
        """Return ``lines`` with every LaTeX expression replaced by an image.

        Pages without the ``[//]: # (latex: 1)`` marker line are returned
        unchanged.
        """
        # Checks for the LaTeX header
        if '[//]: # (latex: 1)' not in lines:
            return lines

        # Re-creates the entire page so we can parse in a multiline env.
        page = "\n".join(lines)

        # No sense in doing the extra work
        if not self.math_extract_regex.search(page):
            return page.split("\n")

        new_cache = {}
        pieces = []  # output fragments, joined once at the end
        n_multiline_expressions = 0

        while page:
            m = self.math_extract_regex.match(page)
            if not m:
                pieces.append(page)
                break

            pieces.append(m.group(1))
            expr = m.group(2)
            math_match = self.math_match_regex.match(expr)
            if not math_match:
                pieces.append(expr)
            else:
                is_multiline = math_match.group(2) is not None
                tex_hash = self.hash(expr)

                # Also consult new_cache so an expression repeated within
                # this page is rendered only once.
                data = self.cached.get(tex_hash, new_cache.get(tex_hash))
                if data is None:
                    data = self._latex_to_base64(expr).decode()
                    new_cache[tex_hash] = data

                if is_multiline and n_multiline_expressions > 0:
                    pieces.append('</p>')

                template = multiline_img_expr if is_multiline else img_expr
                # The source goes into an HTML attribute; escape quotes,
                # angle brackets and ampersands so it cannot break the tag.
                pieces.append(template % (
                    'true', html.escape(expr, quote=True), tex_hash, data))

                if is_multiline:
                    pieces.append('<p>')
                    n_multiline_expressions += 1

            page = m.group(5)

        if n_multiline_expressions > 0:
            pieces.append('</p>')

        # Cache our data; skip the write when nothing new was rendered.
        if new_cache:
            self.cached.update(new_cache)
            with open(cache_file, 'w') as f:
                json.dump(self.cached, f)

        # Make sure to re-split the lines
        return "".join(pieces).split("\n")

    @staticmethod
    def hash(tex: str) -> str:
        """Return the SHA-1 hex digest of ``tex``, used as cache key and id."""
        return hashlib.sha1(tex.encode()).hexdigest()


class LaTeXPostprocessor(markdown.postprocessors.Postprocessor):
    """Postprocessor hook that runs on the fully rendered document text.

    Its only job at the moment is to put the default LaTeX image stylesheet
    in front of the output.
    """

    # noinspection PyMethodMayBeStatic
    def run(self, text):
        """Return ``text`` with the ``img_css`` style block prepended."""
        styled = img_css + text
        return styled


class MarkdownLatex(markdown.Extension):
    """Markdown extension that installs the LaTeX pre- and postprocessor."""

    def extendMarkdown(self, md):
        """Register both processors on the Markdown instance ``md``.

        ``Registry.register()`` is used because the location-string based
        ``Registry.add()`` is deprecated in Python-Markdown 3.x and was
        later removed. Priority 15 keeps the previous ordering: directly
        after the built-in ``html_block`` preprocessor and the built-in
        ``amp_substitute`` postprocessor (both presumably registered at 20
        in Markdown 3.x -- verify against the installed version).
        """
        # Processors take the Markdown instance, not the extension object.
        md.preprocessors.register(LaTeXPreprocessor(md), 'latex', 15)
        md.postprocessors.register(LaTeXPostprocessor(md), 'latex', 15)