CoCalc -- markdown2Mathjax.py

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/smc_pyutil/smc_pyutil/markdown2Mathjax.py
Views: ²⁸⁵
1
# -*- coding: utf-8 -*-
2

3
from __future__ import absolute_import, print_function
4
__version_info__ = (0, 3, 9)
5
__version__ = '.'.join(map(str, __version_info__))
6
__author__ = "Matthew Young"
7

8
import re
9
from markdown2 import markdown
10

11

12
def break_tie(inline, equation):
    """Decide which block wins when both delimiters match at the same position.

    This happens when one delimiter is a prefix of the other (for example
    ``$`` and ``$$``).  The rule is that the longer matched delimiter takes
    priority.

    ``inline`` and ``equation`` are regex match objects for the inline and
    equation opening delimiters.  Returns 1 when the inline match is strictly
    longer, otherwise 2 (so equal lengths favour the equation block).
    """
    inline_width = inline.end() - inline.start()
    equation_width = equation.end() - equation.start()
    if inline_width > equation_width:
        return 1
    return 2
16

17

18
def markdown_safe(placeholder):
    """Report whether markdown leaves ``placeholder`` untouched.

    The placeholder is rendered with ``markdown`` and is considered safe only
    when the output starts with a single ``<p>...</p>`` paragraph followed by
    a newline whose content is exactly the placeholder.  Anything else means
    markdown altered it, so it cannot be used as a placeholder.
    """
    rendered = markdown(placeholder)
    paragraph = re.match("<p>(.*)</p>\n", rendered)
    return paragraph is not None and paragraph.group(1) == placeholder
26

27

28
def mathdown(text):
    """Convert markdown containing math to html in one step.

    The math blocks are first swapped for placeholders by sanitizeInput, the
    remaining text is rendered by markdown, and reconstructMath then puts the
    math back using the default delimiters and placeholder.
    """
    sanitized, codeblocks = sanitizeInput(text)
    return reconstructMath(markdown(sanitized), codeblocks)
32

33

34
def sanitizeInput(string,
                  inline_delims=["$", "$"],
                  equation_delims=["$$", "$$"],
                  placeholder="$0$"):
    """Replace every math block in ``string`` with ``placeholder``.

    The result can be passed safely through markdown and then handed to
    reconstructMath, which puts the math back.

    ``inline_delims`` and ``equation_delims`` each hold the left and right
    delimiter for inline and equation math.  A delimiter preceded by a
    backslash is treated as escaped and ignored.  ``placeholder`` must come
    through markdown unchanged; otherwise ValueError is raised.

    Returns a tuple ``(sanitized_string, codeblocks)``.  Each codeblock is a
    string whose first character is the block kind ("0" for a pre-existing
    occurrence of the placeholder in the text, "1" for inline math, "2" for
    equation math) followed by the stripped content.  Pre-existing
    placeholders are left in the text and recorded as a "0" entry holding the
    placeholder itself.

    If an opening delimiter has no matching closing delimiter, the rest of
    the text from that opening delimiter onwards is appended verbatim and
    processing stops.
    """
    if not markdown_safe(placeholder):
        raise ValueError("Placeholder %s altered by markdown processing." %
                         placeholder)

    def unescaped(token):
        # Match ``token`` literally, but only when no backslash precedes it.
        return re.compile("(?<!\\\\)" + re.escape(token))

    inline_open_re = unescaped(inline_delims[0])
    inline_close_re = unescaped(inline_delims[1])
    equation_open_re = unescaped(equation_delims[0])
    equation_close_re = unescaped(equation_delims[1])
    placeholder_re = unescaped(placeholder)

    # Block kinds: 0 = placeholder already in the text, 1 = inline math,
    # 2 = equation math.  Each scanner keeps its own position in ``string``.
    openers = [
        placeholder_re.scanner(string),
        inline_open_re.scanner(string),
        equation_open_re.scanner(string),
    ]
    closers = {
        1: inline_close_re.scanner(string),
        2: equation_close_re.scanner(string),
    }

    end_of_text = len(string)
    # Next candidate opening match for each kind (None once exhausted).
    pending = [scanner.search() for scanner in openers]
    # Everything before ``cursor`` has already been emitted or stripped.
    cursor = 0
    # Start of the most recent closing match; shared by both closer scanners.
    last_close = -1
    pieces = []
    codeblocks = []

    while True:
        # Drop candidate openings that lie inside text already consumed.
        for kind, scanner in enumerate(openers):
            while pending[kind] and pending[kind].start() < cursor:
                pending[kind] = scanner.search()
        starts = [
            found.start() if found else end_of_text for found in pending
        ]
        existing, inline_open, equation_open = pending

        # A pre-existing placeholder wins whenever it is the next thing.
        if existing and min(starts) == starts[0]:
            codeblocks.append('0' + placeholder)
            placeholder_end = starts[0] + len(placeholder)
            # Keep the text up to and including the placeholder itself.
            pieces.append(string[cursor:placeholder_end])
            cursor = placeholder_end
            continue

        if inline_open is None and equation_open is None:
            # Nothing left to strip: emit the tail and finish.
            pieces.append(string[cursor:])
            return (''.join(pieces), codeblocks)

        if inline_open is None:
            kind = 2
        elif equation_open is None:
            kind = 1
        elif starts[1] < starts[2]:
            kind = 1
        elif starts[1] > starts[2]:
            kind = 2
        else:
            # Both delimiters start at the same place.
            kind = break_tie(inline_open, equation_open)

        # Emit the plain text that precedes the opening delimiter.
        pieces.append(string[cursor:starts[kind]])
        cursor = pending[kind].end()

        # Advance the closing scanner until it reaches the block content.
        while last_close < cursor:
            closing = closers[kind].search()
            if closing is None:
                # Unterminated block: keep the remainder verbatim and stop.
                pieces.append(string[starts[kind]:])
                return (''.join(pieces), codeblocks)
            last_close = closing.start()

        codeblocks.append(str(kind) + string[cursor:closing.start()])
        pieces.append(placeholder)
        cursor = closing.end()
133

134

135
def reconstructMath(processedString,
                    codeblocks,
                    inline_delims=["$", "$"],
                    equation_delims=["$$", "$$"],
                    placeholder="$0$",
                    htmlSafe=False):
    """Put stripped math blocks back into markdown-processed text.

    ``processedString`` is usually the sanitized string returned by
    sanitizeInput after it has been passed through markdown, and
    ``codeblocks`` is the list returned alongside it.  The delimiters and
    placeholder should match those used when the text was sanitized.

    Each codeblock's first character selects the delimiters wrapped around
    it: "0" for none, "1" for ``inline_delims``, "2" for ``equation_delims``.
    Codeblocks replace unescaped occurrences of the placeholder in order.
    If there are more codeblocks than placeholders, a warning is printed for
    each surplus block and it is skipped.  Surplus placeholders are left in
    the text.

    When ``htmlSafe`` is True, "<", ">" and "&" inside the math are replaced
    by html entities.  An "&" that already starts "&amp;", "&lt;" or "&gt;"
    is left alone so pre-escaped text is not escaped twice.  The escaping is
    applied to a copy; the ``codeblocks`` argument is never modified.

    Returns the reconstructed string, suitable for use with mathjax.
    """
    delims = [['', ''], inline_delims, equation_delims]
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    if htmlSafe:
        safeAmp = re.compile("&(?!(?:amp;|lt;|gt;))")
        # Build a new list instead of assigning into the argument, so the
        # caller's codeblocks are not altered and any sequence is accepted.
        codeblocks = [
            safeAmp.sub("&amp;", block).replace("<", "&lt;").replace(
                ">", "&gt;") for block in codeblocks
        ]
    # Replace the next placeholder occurrence with each codeblock in turn.
    pieces = []
    scan = placeholder_re.scanner(processedString)
    post = 0
    for block in codeblocks:
        inBlock = int(block[0])
        match = scan.search()
        if not match:
            print(
                "WARNING: More codeblocks given than valid placeholders in text."
            )
            continue  # we make this error non-fatal: see https://github.com/sagemathinc/cocalc/issues/506
        pieces.append(processedString[post:match.start()])
        pieces.append(delims[inBlock][0])
        pieces.append(block[1:])
        pieces.append(delims[inBlock][1])
        post = match.end()
    # Whatever follows the last replaced placeholder is kept as is.
    pieces.append(processedString[post:])
    return ''.join(pieces)
176

177

178
def findBoundaries(string):
    """Deprecated.  Locate unescaped ``$`` math delimiters with a naive scan.

    Walks ``string`` one character at a time.  A ``$`` counts as a delimiter
    unless the previous character is a backslash.  Returns a tuple
    ``(oned, twod)`` of index lists: ``oned`` holds the positions of single
    ``$`` delimiters, ``twod`` holds the position of the second ``$`` of each
    ``$$`` delimiter.
    """
    single_marks = []
    double_marks = []
    # True when the previous character was a delimiter ``$`` whose role
    # (single, or first half of a double) is not decided yet.
    pending = False
    in_single = False
    in_double = False
    previous = ''
    for index, current in enumerate(string):
        is_delimiter = current == "$" and previous != '\\'
        if is_delimiter and in_single:
            # Closing delimiter of an inline block.
            single_marks.append(index)
            in_single = False
        elif is_delimiter and in_double:
            if pending:
                # Second ``$`` in a row: the equation block closes here.
                double_marks.append(index)
                in_double = False
            # First ``$`` arms the flag, the second one clears it.
            pending = not pending
        elif is_delimiter and pending:
            # Two delimiters in a row outside any block open an equation.
            double_marks.append(index)
            in_double = True
            pending = False
        elif is_delimiter:
            pending = True
        elif pending:
            # The previous character was a lone delimiter.
            if in_single:
                print("THIS SHOULD NEVER HAPPEN!")
            elif not in_double:
                # It opened an inline block one position back.
                single_marks.append(index - 1)
                in_single = True
            pending = False
        previous = current
    # A trailing lone delimiter outside any block is recorded too.
    if pending and not (in_single or in_double):
        single_marks.append(index)
    return (single_marks, double_marks)
226

227
Product

Resources

Company