Path: blob/master/src/smc_sagews/smc_sagews/markdown2Mathjax.py
Views: 286
from __future__ import absolute_import

__version_info__ = (0, 3, 9)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Matthew Young"

import re
from markdown2 import markdown


def break_tie(inline, equation):
    """Decide which block type wins when both start at the same location.

    If one of the delimiters is a substring of the other (e.g., $ and $$) it
    is possible that the two will begin at the same location. The longer of
    the two matched delimiters takes priority (for example, $$ over $).

    inline and equation are regex match objects for the opening inline and
    equation delimiters. Returns 1 if the inline block takes precedence
    (its delimiter match is strictly longer), otherwise 2 for the equation
    block.
    """
    inline_len = inline.end() - inline.start()
    equation_len = equation.end() - equation.start()
    return 1 if inline_len > equation_len else 2


def markdown_safe(placeholder):
    """Is the placeholder left unchanged by markdown?

    The placeholder is run through markdown; it is only a valid placeholder
    if the output is exactly the placeholder wrapped in a single paragraph.
    Returns True if it is unchanged, False otherwise.
    """
    mdstrip = re.compile("<p>(.*)</p>\n")
    mdp = mdstrip.match(markdown(placeholder))
    return bool(mdp and mdp.group(1) == placeholder)


def mathdown(text):
    """Convenience function which runs the basic markdown and mathjax processing sequentially."""
    sanitized, codeblocks = sanitizeInput(text)
    return reconstructMath(markdown(sanitized), codeblocks)


def sanitizeInput(string,
                  inline_delims=["$", "$"],
                  equation_delims=["$$", "$$"],
                  placeholder="$0$"):
    """Strip the math blocks out of a string and replace them by a placeholder.

    Given a string that will be passed to markdown, the content of the
    different math blocks is stripped out and replaced by a placeholder which
    MUST be ignored by markdown. A tuple (sanitized_text, codeblocks) is
    returned, containing the text with placeholders and a list of the stripped
    out equations. Each codeblock entry is a string whose first character is
    the block type ('0' placeholder, '1' inline, '2' equation) followed by the
    stripped content. Note that any pre-existing instances of the placeholder
    are "replaced" with themselves and a corresponding dummy entry is placed
    in the returned codeblocks. The sanitized string can then be passed safely
    through markdown and then reconstructed with reconstructMath.

    There are potentially four delimiters that can be specified: the left and
    right delimiters for inline and equation mode math. These can potentially
    be anything that isn't already used by markdown and is compatible with
    mathjax (see documentation for both). A delimiter or placeholder preceded
    by a backslash is not matched.

    Raises ValueError if the placeholder is altered by markdown processing.
    """
    #Check placeholder is valid.
    if not markdown_safe(placeholder):
        raise ValueError("Placeholder %s altered by markdown processing." %
                         placeholder)
    #really what we want is a reverse markdown function, but as that's too much work, this will do
    inline_left = re.compile("(?<!\\\\)" + re.escape(inline_delims[0]))
    inline_right = re.compile("(?<!\\\\)" + re.escape(inline_delims[1]))
    equation_left = re.compile("(?<!\\\\)" + re.escape(equation_delims[0]))
    equation_right = re.compile("(?<!\\\\)" + re.escape(equation_delims[1]))
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    #There are 3 types of blocks: inline math, equation math and occurrences of the placeholder in the text.
    #The block type is 0 for a placeholder, 1 for an inline block, 2 for an equation;
    #the lists below are all indexed by block type.
    start_scanners = [
        placeholder_re.scanner(string),
        inline_left.scanner(string),
        equation_left.scanner(string)
    ]
    #A placeholder has no separate terminator, so slot 0 is never used.
    end_scanners = [
        None,
        inline_right.scanner(string),
        equation_right.scanner(string)
    ]
    #post is the index up to which the input has been consumed (-1 = nothing yet)
    post = -1
    stlen = len(string)
    startmatches = [scanner.search() for scanner in start_scanners]
    startpoints = [m.start() if m else stlen for m in startmatches]
    terminator = -1
    pieces = []
    codeblocks = []
    while 1:
        #find the next point of interest: skip any opening match that lies in already consumed text
        for kind, scanner in enumerate(start_scanners):
            while startmatches[kind] and startmatches[kind].start() < post:
                startmatches[kind] = scanner.search()
            startpoints[kind] = startmatches[kind].start(
            ) if startmatches[kind] else stlen
        #Found start of next block of each type
        #Placeholder type always takes precedence if it exists and is next...
        if startmatches[0] and min(startpoints) == startpoints[0]:
            #We can do it all in one!
            #First add the "stripped" code to the blocks
            codeblocks.append('0' + placeholder)
            #Work out where the placeholder ends
            tmp = startpoints[0] + len(placeholder)
            #Add the "sanitized" text up to and including the placeholder
            pieces.append(string[max(post, 0):tmp])
            #Set the new post
            post = tmp
            #Back to start!
            continue
        elif startmatches[1] is None and startmatches[2] is None:
            #No more blocks, add in the rest of string and be done with it...
            pieces.append(string[max(post, 0):])
            return (''.join(pieces), codeblocks)
        elif startmatches[1] is None:
            inBlock = 2
        elif startmatches[2] is None:
            inBlock = 1
        elif startpoints[1] < startpoints[2]:
            inBlock = 1
        elif startpoints[1] > startpoints[2]:
            inBlock = 2
        else:
            #Both block types start at the same place
            inBlock = break_tie(startmatches[1], startmatches[2])
        #Add the text before the block (max ensures the minimum index is 0)
        pieces.append(string[max(post, 0):startpoints[inBlock]])
        post = startmatches[inBlock].end()
        #Now find the matching end...
        while terminator < post:
            endpoint = end_scanners[inBlock].search()
            #If we run out of terminators before ending this loop, we're done
            if endpoint is None:
                #Add the unterminated codeblock to the sanitized string
                pieces.append(string[startpoints[inBlock]:])
                return (''.join(pieces), codeblocks)
            terminator = endpoint.start()
        #We found a matching endpoint, add the bit to the appropriate codeblock...
        codeblocks.append(str(inBlock) + string[post:endpoint.start()])
        #Now add in the appropriate placeholder
        pieces.append(placeholder)
        #Fabulous. Now we can start again once we update post...
        post = endpoint.end()


def reconstructMath(processedString,
                    codeblocks,
                    inline_delims=["$", "$"],
                    equation_delims=["$$", "$$"],
                    placeholder="$0$",
                    htmlSafe=False):
    """Put the stripped math blocks back into markdown-processed text.

    processedString and codeblocks are usually the output of sanitizeInput,
    after having passed the output string through markdown. The delimiters
    and placeholder given to this function should match those used to
    construct the string to begin with.

    This will output a string containing html suitable to use with mathjax.

    "<", ">" and "&" symbols in math can confuse the html interpreter because
    they mark the beginning and end of definition blocks. To avoid issues, if
    htmlSafe is set to True these symbols are replaced by the html entities
    "&lt;", "&gt;" and "&amp;" in the math blocks. An "&" that already starts
    one of those three entities is left alone, so already escaped text is not
    escaped twice. The codeblocks list passed in is not modified.

    Raises ValueError if there are more codeblocks than placeholders in
    processedString.
    """
    delims = [['', ''], inline_delims, equation_delims]
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    #If we've defined some "new" special characters we'll have to process any escapes of them here
    #Make html substitutions.
    if htmlSafe:
        safeAmp = re.compile("&(?!(?:amp;|lt;|gt;))")
        #Build a new list so the caller's codeblocks are left untouched
        codeblocks = [
            safeAmp.sub("&amp;", block).replace("<", "&lt;").replace(
                ">", "&gt;") for block in codeblocks
        ]
    #Step through the codeblocks one at a time and replace the next occurrence of the placeholder.
    #Extra placeholders are invalid math blocks and ignored...
    pieces = []
    scan = placeholder_re.scanner(processedString)
    post = 0
    for block in codeblocks:
        #The first character of each block is its type, the rest is its content
        inBlock = int(block[0])
        match = scan.search()
        if not match:
            raise ValueError(
                "More codeblocks given than valid placeholders in text.")
        pieces.append(processedString[post:match.start()])
        pieces.append(delims[inBlock][0] + block[1:] + delims[inBlock][1])
        post = match.end()
    #Add the rest of the string (if we need to)
    if post < len(processedString):
        pieces.append(processedString[post:])
    return ''.join(pieces)


def findBoundaries(string):
    """A deprecated function. Finds the location of string boundaries in a stupid way.

    Walks the string one character at a time looking for "$" characters that
    are not preceded by a backslash. Returns a tuple (oned, twod) of lists of
    indices recorded for single "$" and double "$$" delimiters respectively.
    """
    last = ''
    twod = []
    oned = []
    boundary = False
    inoned = False
    intwod = False
    for count, char in enumerate(string):
        if char == "$" and last != '\\':
            #We just hit a valid $ character!
            if inoned:
                oned.append(count)
                inoned = False
            elif intwod:
                if boundary:
                    twod.append(count)
                    intwod = False
                    boundary = False
                else:
                    boundary = True
            elif boundary:
                #This means the last character was also a valid $
                twod.append(count)
                intwod = True
                boundary = False
            else:
                #This means the last character was NOT a useable $
                boundary = True
        elif boundary:
            #The last character was a valid $, but this one isn't...
            if inoned:
                print("THIS SHOULD NEVER HAPPEN!")
            elif intwod:
                #ignore it...
                pass
            else:
                oned.append(count - 1)
                inoned = True
            boundary = False
        last = char
    #What if we finished on a boundary character? Actually doesn't matter, but let's include it for completeness
    if boundary:
        if not (inoned or intwod):
            oned.append(count)
            inoned = True
    return (oned, twod)