Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/smc_sagews/smc_sagews/markdown2Mathjax.py
Views: 286
1
from __future__ import absolute_import
2
3
# Package metadata. The version is stored as a tuple of integers and rendered
# as a dotted string for display.
__author__ = "Matthew Young"
__version_info__ = (0, 3, 9)
__version__ = '.'.join(str(part) for part in __version_info__)
6
7
import re
8
from markdown2 import markdown
9
10
11
def break_tie(inline, equation):
    """Decide which block type wins when both delimiters match at one spot.

    When one delimiter is a prefix of the other (e.g. ``$`` and ``$$``) the
    inline and equation openers can start at the same index.  The longer
    match takes priority.

    Both arguments are regex match objects for the opening delimiters.
    Returns 1 if the inline match is strictly longer, otherwise 2 (the
    equation block wins, including when the lengths are equal).
    """
    inline_width = inline.end() - inline.start()
    equation_width = equation.end() - equation.start()
    if inline_width > equation_width:
        return 1
    return 2
15
16
17
def markdown_safe(placeholder):
    """Report whether markdown leaves the placeholder text untouched.

    The placeholder is rendered on its own; it is considered safe only when
    the output is exactly the placeholder wrapped in a single ``<p>``
    paragraph.  Returns True in that case and False otherwise.
    """
    rendered = markdown(placeholder)
    wrapped = re.match("<p>(.*)</p>\n", rendered)
    return wrapped is not None and wrapped.group(1) == placeholder
25
26
27
def mathdown(text):
    """Run math stripping, markdown and math re-insertion in one call.

    The math blocks are pulled out with sanitizeInput, the remaining text is
    rendered by markdown, and reconstructMath puts the math back.  All three
    steps use their default delimiters and placeholder.
    """
    sanitized, stripped_math = sanitizeInput(text)
    rendered = markdown(sanitized)
    return reconstructMath(rendered, stripped_math)
31
32
33
def sanitizeInput(string,
                  inline_delims=["$", "$"],
                  equation_delims=["$$", "$$"],
                  placeholder="$0$"):
    """Strip math blocks out of text that is about to be passed to markdown.

    The content of each math block is removed and replaced by a placeholder
    which MUST be ignored by markdown. Any pre-existing instance of the
    placeholder is "replaced" with itself and a corresponding dummy entry is
    recorded, so that placeholders and recorded blocks stay in step. The
    sanitized string can then be passed safely through markdown and
    reconstructed with reconstructMath.

    There are potentially four delimiters that can be specified: the left and
    right delimiters for inline and equation mode math. These can potentially
    be anything that isn't already used by markdown and is compatible with
    mathjax (see documentation for both). A delimiter or placeholder preceded
    by a backslash is not matched.

    Parameters:
        string: the raw text to sanitize.
        inline_delims: [left, right] delimiters of inline math.
        equation_delims: [left, right] delimiters of equation math.
        placeholder: text substituted for every stripped block.

    Returns:
        A tuple (sanitizedString, codeblocks). Each codeblocks entry is a
        string whose first character is the block type ('0' placeholder,
        '1' inline, '2' equation) followed by the stripped content (for type
        '0', the placeholder itself). If a block is opened but never closed,
        the remainder of the input starting at its opening delimiter is
        copied through unchanged and no entry is recorded for it.

    Raises:
        ValueError: if markdown_safe reports that the placeholder is altered
            by markdown processing.
    """
    #Check placeholder is valid.
    if not markdown_safe(placeholder):
        raise ValueError("Placeholder %s altered by markdown processing." %
                         placeholder)
    #really what we want is a reverse markdown function, but as that's too much work, this will do
    #Each pattern carries a negative lookbehind so backslash-escaped delimiters are skipped.
    inline_left = re.compile("(?<!\\\\)" + re.escape(inline_delims[0]))
    inline_right = re.compile("(?<!\\\\)" + re.escape(inline_delims[1]))
    equation_left = re.compile("(?<!\\\\)" + re.escape(equation_delims[0]))
    equation_right = re.compile("(?<!\\\\)" + re.escape(equation_delims[1]))
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    #Every scanner walks the same input string, each keeping its own position.
    placeholder_scan = placeholder_re.scanner(string)
    ilscanner = [inline_left.scanner(string), inline_right.scanner(string)]
    eqscanner = [equation_left.scanner(string), equation_right.scanner(string)]
    #Indexed by block type; for types 1 and 2 the entry is a [left, right] pair.
    scanners = [placeholder_scan, ilscanner, eqscanner]
    #There are 3 types of blocks: inline math, equation math and occurrences of the placeholder in the text
    #inBlock is 0 for a placeholder, 1 for inline block, 2 for equation
    inBlock = 0
    #post is the index just past the last consumed text; -1 means nothing consumed yet.
    post = -1
    stlen = len(string)
    #Next candidate opening match of each block type (None once exhausted).
    startmatches = [
        placeholder_scan.search(), ilscanner[0].search(),
        eqscanner[0].search()
    ]
    #Start index of each candidate; stlen stands in for "no further match".
    startpoints = [stlen, stlen, stlen]
    startpoints[0] = startmatches[0].start() if startmatches[0] else stlen
    startpoints[1] = startmatches[1].start() if startmatches[1] else stlen
    startpoints[2] = startmatches[2].start() if startmatches[2] else stlen
    #Start index of the most recent closing-delimiter match examined.
    terminator = -1
    sanitizedString = ''
    codeblocks = []
    while 1:
        #find the next point of interest.
        #Advance each opening scanner past anything that begins before post.
        while startmatches[0] and startmatches[0].start() < post:
            startmatches[0] = placeholder_scan.search()
            startpoints[0] = startmatches[0].start(
            ) if startmatches[0] else stlen
        while startmatches[1] and startmatches[1].start() < post:
            startmatches[1] = ilscanner[0].search()
            startpoints[1] = startmatches[1].start(
            ) if startmatches[1] else stlen
        while startmatches[2] and startmatches[2].start() < post:
            startmatches[2] = eqscanner[0].search()
            startpoints[2] = startmatches[2].start(
            ) if startmatches[2] else stlen
        #Found start of next block of each type
        #Placeholder type always takes precedence if it exists and is next...
        if startmatches[0] and min(startpoints) == startpoints[0]:
            #We can do it all in one!
            #First add the "stripped" code to the blocks
            codeblocks.append('0' + placeholder)
            #Work out where the placeholder ends
            tmp = startpoints[0] + len(placeholder)
            #Add the "sanitized" text up to and including the placeholder
            #(post * (post >= 0) clamps the initial -1 to 0)
            sanitizedString = sanitizedString + string[post * (post >= 0):tmp]
            #Set the new post
            post = tmp
            #Back to start!
            continue
        elif startmatches[1] is None and startmatches[2] is None:
            #No more blocks, add in the rest of string and be done with it...
            sanitizedString = sanitizedString + string[post * (post >= 0):]
            return (sanitizedString, codeblocks)
        elif startmatches[1] is None:
            inBlock = 2
        elif startmatches[2] is None:
            inBlock = 1
        else:
            #1 if the inline opener comes first, 2 if the equation opener does, 0 on a tie
            inBlock = (startpoints[1] <
                       startpoints[2]) + (startpoints[1] > startpoints[2]) * 2
            if not inBlock:
                inBlock = break_tie(startmatches[1], startmatches[2])
        #Magic to ensure minimum index is 0
        sanitizedString = sanitizedString + string[
            (post * (post >= 0)):startpoints[inBlock]]
        post = startmatches[inBlock].end()
        #Now find the matching end...
        #Skip closing-delimiter matches that start before the block content begins.
        while terminator < post:
            endpoint = scanners[inBlock][1].search()
            #If we run out of terminators before ending this loop, we're done
            if endpoint is None:
                #Add the unterminated codeblock to the sanitized string
                sanitizedString = sanitizedString + string[
                    startpoints[inBlock]:]
                return (sanitizedString, codeblocks)
            terminator = endpoint.start()
        #We found a matching endpoint, add the bit to the appropriate codeblock...
        codeblocks.append(str(inBlock) + string[post:endpoint.start()])
        #Now add in the appropriate placeholder
        sanitizedString = sanitizedString + placeholder
        #Fabulous. Now we can start again once we update post...
        post = endpoint.end()
132
133
134
def reconstructMath(processedString,
                    codeblocks,
                    inline_delims=["$", "$"],
                    equation_delims=["$$", "$$"],
                    placeholder="$0$",
                    htmlSafe=False):
    """Put stripped math blocks back into markdown-processed text.

    This is normally fed the output of sanitizeInput after the sanitized
    string has been passed through markdown. The delimiters and placeholder
    given here should match those used when the string was sanitized.

    Parameters:
        processedString: the text containing placeholders.
        codeblocks: the stripped blocks; the first character of each entry
            is the block type ('0' literal placeholder, '1' inline math,
            '2' equation math) and the rest is the content to re-insert.
        inline_delims: [left, right] delimiters wrapped around type-1 blocks.
        equation_delims: [left, right] delimiters wrapped around type-2
            blocks.
        placeholder: the placeholder text to look for (occurrences preceded
            by a backslash are skipped).
        htmlSafe: "<", ">" and "&" in math can confuse an html interpreter.
            When True they are replaced by "&lt;", "&gt;" and "&amp;" in
            every block; an "&" that already begins "&amp;", "&lt;" or
            "&gt;" is left alone so escaped text is not escaped twice.

    Returns:
        A string of html suitable for use with mathjax. Placeholders beyond
        the number of codeblocks are left in the text as they are.

    Raises:
        ValueError: if there are more codeblocks than placeholders.

    The caller's ``codeblocks`` list is never modified.
    """
    # Index 0 is the literal-placeholder block type, which gets no delimiters.
    delims = [['', ''], inline_delims, equation_delims]
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    #Make html substitutions.
    if htmlSafe:
        safeAmp = re.compile("&(?!(?:amp;|lt;|gt;))")
        # Escape "&" first so the entities introduced for "<" and ">" are not
        # escaped again. A new list is built so the caller's data is untouched.
        codeblocks = [
            safeAmp.sub("&amp;", block).replace("<", "&lt;").replace(
                ">", "&gt;") for block in codeblocks
        ]
    #Step through the codeblocks one at a time and replace the next occurrence of the placeholder. Extra placeholders are invalid math blocks and ignored...
    pieces = []
    scan = placeholder_re.scanner(processedString)
    post = 0
    for block in codeblocks:
        inBlock = int(block[0])
        match = scan.search()
        if not match:
            raise ValueError(
                "More codeblocks given than valid placeholders in text.")
        # Text since the previous placeholder, then the delimited block.
        pieces.append(processedString[post:match.start()])
        pieces.append(delims[inBlock][0] + block[1:] + delims[inBlock][1])
        post = match.end()
    #Add the rest of the string (an empty slice if nothing is left)
    pieces.append(processedString[post:])
    return ''.join(pieces)
172
173
174
def findBoundaries(string):
    """Deprecated: locate math delimiters with a character-by-character scan.

    A "$" counts as a delimiter unless the character before it is a
    backslash.  Returns a tuple ``(oned, twod)`` of index lists: ``oned``
    holds the positions of single-"$" delimiters and ``twod`` holds the
    position of the second "$" of each "$$" delimiter.
    """
    single_marks = []
    double_marks = []
    pending_dollar = False  # previous character was an unresolved valid "$"
    in_single = False
    in_double = False
    previous = ''
    for index, current in enumerate(string):
        is_dollar = current == "$" and previous != '\\'
        previous = current
        if is_dollar:
            if in_single:
                # Closing "$" of a single-dollar block.
                single_marks.append(index)
                in_single = False
            elif in_double:
                if pending_dollar:
                    # Second "$" in a row closes the double-dollar block.
                    double_marks.append(index)
                    in_double = False
                    pending_dollar = False
                else:
                    pending_dollar = True
            elif pending_dollar:
                # Two valid "$" in a row open a double-dollar block.
                double_marks.append(index)
                in_double = True
                pending_dollar = False
            else:
                # First "$" seen; wait for the next character to classify it.
                pending_dollar = True
            continue
        if not pending_dollar:
            continue
        # The previous character was a valid "$" but this one is not.
        pending_dollar = False
        if in_single:
            print("THIS SHOULD NEVER HAPPEN!")
        elif not in_double:
            # A lone "$" outside any block opens a single-dollar block.
            single_marks.append(index - 1)
            in_single = True
        # Inside a double-dollar block a lone "$" is simply ignored.
    # A trailing unresolved "$" outside any block is recorded for completeness.
    if pending_dollar and not (in_single or in_double):
        single_marks.append(index)
    return (single_marks, double_marks)
222
223