# CoCalc -- markdown2Mathjax.py
#
# GitHub Repository: sagemathinc/cocalc
# Path: blob/master/src/smc_sagews/smc_sagews/markdown2Mathjax.py
# 5537 views
# The __future__ import must be the first statement in the module.
from __future__ import absolute_import

# Version metadata: the tuple is the source of truth, the dotted string is
# derived from it.
__version_info__ = (0, 3, 9)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Matthew Young"

import re
from markdown2 import markdown
9

10

11
def break_tie(inline, equation):
    """Decide which block type wins when both start at the same position.

    If one delimiter is a prefix of the other (e.g. ``$`` and ``$$``) both can
    match at the same location.  The longer match takes priority.

    Args:
        inline: match object for the inline-math opening delimiter.
        equation: match object for the equation-math opening delimiter.

    Returns:
        1 if the inline match is strictly longer than the equation match,
        otherwise 2 (the equation block wins, including when the lengths are
        equal).
    """
    inline_len = inline.end() - inline.start()
    equation_len = equation.end() - equation.start()
    return 1 if inline_len > equation_len else 2
15

16

17
def markdown_safe(placeholder):
    """Return True if markdown leaves *placeholder* unchanged.

    The placeholder is rendered with ``markdown``; it is accepted only when
    the output starts with a ``<p>...</p>`` paragraph (followed by a newline)
    whose content is exactly the placeholder.  Anything else means markdown
    altered it, so it is not a valid placeholder.
    """
    rendered = markdown(placeholder)
    paragraph = re.match("<p>(.*)</p>\n", rendered)
    return paragraph is not None and paragraph.group(1) == placeholder
25

26

27
def mathdown(text):
    """Run the full pipeline: strip math, render markdown, restore math.

    Convenience wrapper that calls ``sanitizeInput``, passes the sanitized
    text through ``markdown`` and then puts the math back with
    ``reconstructMath``, using the default delimiters and placeholder.
    """
    sanitized, codeblocks = sanitizeInput(text)
    return reconstructMath(markdown(sanitized), codeblocks)
31

32

33
def sanitizeInput(string,
                  inline_delims=("$", "$"),
                  equation_delims=("$$", "$$"),
                  placeholder="$0$"):
    """Replace every math block in *string* with a markdown-safe placeholder.

    The content of each inline and equation math block is stripped out and
    replaced by *placeholder*, which MUST be left untouched by markdown.  Any
    pre-existing occurrence of the placeholder in the text is "replaced" with
    itself and gets a dummy entry in the returned code blocks, so that
    placeholders and code blocks stay in one-to-one correspondence.  The
    sanitized string can then be passed safely through markdown and restored
    with ``reconstructMath``.

    A delimiter (or placeholder) preceded by a backslash is ignored.

    Args:
        string: the raw text that will later be passed to markdown.
        inline_delims: (left, right) delimiters of inline math.
        equation_delims: (left, right) delimiters of equation math.
        placeholder: text substituted for every math block.

    Returns:
        A tuple ``(sanitized_string, codeblocks)``.  Each code block is a
        string whose first character is the block type ('0' placeholder,
        '1' inline, '2' equation) followed by the stripped content.  If a
        block is opened but never closed, the remainder of the text is copied
        through unchanged and scanning stops.

    Raises:
        ValueError: if markdown would alter *placeholder*.
    """
    # Check placeholder is valid.
    if not markdown_safe(placeholder):
        raise ValueError("Placeholder %s altered by markdown processing." %
                         placeholder)

    def unescaped(token):
        # Pattern matching `token` only when not preceded by a backslash.
        return re.compile("(?<!\\\\)" + re.escape(token))

    # Really what we want is a reverse markdown function, but as that's too
    # much work, this will do.  Each scanner walks `string` independently and
    # remembers its own position.  Index 0 is the placeholder, 1 is inline
    # math, 2 is equation math.
    openers = [
        unescaped(placeholder).scanner(string),
        unescaped(inline_delims[0]).scanner(string),
        unescaped(equation_delims[0]).scanner(string),
    ]
    closers = [
        None,  # a placeholder has no closing delimiter
        unescaped(inline_delims[1]).scanner(string),
        unescaped(equation_delims[1]).scanner(string),
    ]

    stlen = len(string)
    startmatches = [scanner.search() for scanner in openers]
    pieces = []       # fragments of the sanitized output, joined on return
    codeblocks = []
    post = 0          # index just past the text consumed so far
    terminator = -1   # start of the most recent closing delimiter seen
    while True:
        # Find the next point of interest: drop candidate openers that lie
        # inside text which has already been consumed.
        for kind, scanner in enumerate(openers):
            while startmatches[kind] and startmatches[kind].start() < post:
                startmatches[kind] = scanner.search()
        startpoints = [
            match.start() if match else stlen for match in startmatches
        ]

        # The placeholder always takes precedence if it exists and is next.
        if startmatches[0] and min(startpoints) == startpoints[0]:
            # First add the "stripped" code to the blocks...
            codeblocks.append('0' + placeholder)
            # ...then copy the text up to and including the placeholder.
            placeholder_end = startpoints[0] + len(placeholder)
            pieces.append(string[post:placeholder_end])
            post = placeholder_end
            continue

        if startmatches[1] is None and startmatches[2] is None:
            # No more blocks, add in the rest of string and be done with it.
            pieces.append(string[post:])
            return ("".join(pieces), codeblocks)

        # Pick whichever math block starts first.
        if startmatches[1] is None:
            inBlock = 2
        elif startmatches[2] is None:
            inBlock = 1
        elif startpoints[1] < startpoints[2]:
            inBlock = 1
        elif startpoints[1] > startpoints[2]:
            inBlock = 2
        else:
            inBlock = break_tie(startmatches[1], startmatches[2])

        # Copy the plain text preceding the block, then step inside it.
        pieces.append(string[post:startpoints[inBlock]])
        post = startmatches[inBlock].end()

        # Now find the matching end: the first closing delimiter at or after
        # the end of the opening one.
        while terminator < post:
            endpoint = closers[inBlock].search()
            # If we run out of terminators before ending this loop, we're done
            if endpoint is None:
                # Add the unterminated codeblock to the sanitized string
                pieces.append(string[startpoints[inBlock]:])
                return ("".join(pieces), codeblocks)
            terminator = endpoint.start()

        # We found a matching endpoint: record the block content and put the
        # placeholder in its place.
        codeblocks.append(str(inBlock) + string[post:endpoint.start()])
        pieces.append(placeholder)
        # Fabulous.  Now we can start again once we update post...
        post = endpoint.end()
132

133

134
def reconstructMath(processedString,
                    codeblocks,
                    inline_delims=("$", "$"),
                    equation_delims=("$$", "$$"),
                    placeholder="$0$",
                    htmlSafe=False):
    """Put stripped math blocks back into markdown-processed text.

    The inputs are usually the output of ``sanitizeInput``, after the
    sanitized string has been passed through markdown.  The delimiters and
    placeholder given here should match those used to build the string.

    Each code block replaces the next placeholder that is not preceded by a
    backslash, wrapped in the delimiters for its type (the first character of
    the block: '0' no delimiters, '1' inline, '2' equation).  Placeholders
    left over once the code blocks run out are kept as they are.

    "<", ">" and "&" in math can confuse an HTML interpreter.  If *htmlSafe*
    is true they are replaced by ``&lt;``, ``&gt;`` and ``&amp;`` inside the
    math blocks; an "&" that already starts one of those three entities is
    left alone so pre-escaped text is not escaped twice.

    Args:
        processedString: markdown output that still contains placeholders.
        codeblocks: code blocks as returned by ``sanitizeInput``; the list is
            not modified.
        inline_delims: (left, right) delimiters for inline math.
        equation_delims: (left, right) delimiters for equation math.
        placeholder: the placeholder text used when sanitizing.
        htmlSafe: escape HTML-special characters inside math blocks.

    Returns:
        The string with every consumed placeholder replaced by its math.

    Raises:
        ValueError: if there are more code blocks than placeholders.
    """
    delims = [('', ''), inline_delims, equation_delims]
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    # Make html substitutions on a copy so the caller's list is untouched.
    if htmlSafe:
        safeAmp = re.compile("&(?!(?:amp;|lt;|gt;))")
        codeblocks = [
            safeAmp.sub("&amp;", block).replace("<", "&lt;").replace(
                ">", "&gt;") for block in codeblocks
        ]
    # Step through the codeblocks one at a time and replace the next
    # occurrence of the placeholder.
    pieces = []
    scan = placeholder_re.scanner(processedString)
    post = 0
    for block in codeblocks:
        inBlock = int(block[0])
        match = scan.search()
        if not match:
            raise ValueError(
                "More codeblocks given than valid placeholders in text.")
        pieces.append(processedString[post:match.start()])
        pieces.append(delims[inBlock][0])
        pieces.append(block[1:])
        pieces.append(delims[inBlock][1])
        post = match.end()
    # Add the rest of the string (empty if nothing is left).
    pieces.append(processedString[post:])
    return "".join(pieces)
172

173

174
def findBoundaries(string):
    """Deprecated.  Locate ``$`` and ``$$`` math delimiters character by character.

    A ``$`` counts only when it is not immediately preceded by a backslash.

    Returns:
        A tuple ``(oned, twod)`` of index lists.  ``oned`` holds the
        positions of the ``$`` characters that open and close inline blocks.
        ``twod`` holds, for every ``$$`` that opens or closes an equation
        block, the position of its second ``$``.
    """
    last = ''
    twod = []
    oned = []
    # True when the previous character was a valid `$` whose role (single
    # delimiter or first half of `$$`) is not decided yet.
    boundary = False
    inoned = False
    intwod = False
    for count, char in enumerate(string):
        if char == "$" and last != '\\':
            # We just hit a valid $ character!
            if inoned:
                # Closes the current inline block.
                oned.append(count)
                inoned = False
            elif boundary:
                # Second `$` of a pair: opens an equation block, or closes
                # the one we are in.
                twod.append(count)
                intwod = not intwod
                boundary = False
            else:
                # Possibly the first half of `$$`; decide on the next char.
                boundary = True
        elif boundary:
            # The last character was a valid $, but this one isn't.  Inside
            # an equation block a lone `$` is ignored; otherwise it opens an
            # inline block.  (`boundary` is never set while `inoned` is
            # true, so only `intwod` needs checking.)
            if not intwod:
                oned.append(count - 1)
                inoned = True
            boundary = False
        last = char
    # What if we finished on a boundary character?  Actually doesn't matter,
    # but let's include it for completeness.
    if boundary and not (inoned or intwod):
        oned.append(len(string) - 1)
    return (oned, twod)
222

223
# Product
#
# Resources
#
# Company