Path: blob/master/src/smc_pyutil/smc_pyutil/markdown2Mathjax.py
Views: 285
# -*- coding: utf-8 -*-
"""Shield math blocks from markdown so they can be rendered by MathJax.

``sanitizeInput`` swaps every math block for a placeholder, the result is
run through markdown, and ``reconstructMath`` puts the math back in place of
the placeholders. ``mathdown`` runs the three steps in sequence.
"""

from __future__ import absolute_import, print_function

__version_info__ = (0, 3, 9)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Matthew Young"

import re

try:
    from markdown2 import markdown
except ImportError:
    # reconstructMath, break_tie and findBoundaries only need ``re``; keep
    # them importable without markdown2 and fail, with the same exception
    # type, only when markdown processing is actually requested.
    def markdown(text, *args, **kwargs):
        """Stand-in used when markdown2 is not installed; always raises."""
        raise ImportError(
            "markdown2 is required by markdown_safe(), mathdown() and "
            "sanitizeInput()")


def _unescaped(delimiter):
    """Compile a pattern matching `delimiter` when no backslash precedes it."""
    return re.compile("(?<!\\\\)" + re.escape(delimiter))


def break_tie(inline, equation):
    """Decide which block type wins when both start at the same position.

    If one delimiter is a prefix of the other (e.g. ``$`` and ``$$``) both
    can match at the same location. The longer match takes priority, so
    ``$$`` beats ``$``.

    `inline` and `equation` are the match objects of the two opening
    delimiters. Returns 1 if the inline block takes precedence (its match is
    strictly longer) and 2 if the equation block does (including equal
    lengths).
    """
    inline_len = inline.end() - inline.start()
    equation_len = equation.end() - equation.start()
    return 1 if inline_len > equation_len else 2


def markdown_safe(placeholder):
    """Return True if markdown leaves `placeholder` untouched.

    The placeholder is run through markdown on its own; it is considered
    safe only when the output is exactly ``<p>placeholder</p>`` followed by a
    newline. Anything else means markdown altered it, so it is not a valid
    placeholder.
    """
    mdstrip = re.compile("<p>(.*)</p>\n")
    mdp = mdstrip.match(markdown(placeholder))
    return bool(mdp and mdp.group(1) == placeholder)


def mathdown(text):
    """Run sanitizeInput, markdown and reconstructMath on `text` in turn.

    Uses the default delimiters and placeholder of both functions and returns
    the resulting HTML string.
    """
    sanitized, codeblocks = sanitizeInput(text)
    return reconstructMath(markdown(sanitized), codeblocks)


def sanitizeInput(string,
                  inline_delims=("$", "$"),
                  equation_delims=("$$", "$$"),
                  placeholder="$0$"):
    """Replace every math block in `string` by `placeholder`.

    The returned text can be passed safely through markdown and then be
    restored with reconstructMath.

    `inline_delims` and `equation_delims` are (left, right) delimiter pairs
    for inline and equation math. They can be anything that is not already
    used by markdown and is compatible with MathJax. A delimiter preceded by
    a backslash is not treated as a delimiter.

    `placeholder` MUST be ignored by markdown. Occurrences of the placeholder
    that are already in the text are "replaced" by themselves and get a dummy
    entry in the returned codeblocks so that reconstructMath stays in step.

    Returns a tuple ``(sanitized_string, codeblocks)``. Each codeblock is a
    string whose first character is the block type ("0" pre-existing
    placeholder, "1" inline math, "2" equation math) followed by the stripped
    content without its delimiters. A math block whose closing delimiter is
    never found is left in the text verbatim and ends the scan.

    Raises ValueError if markdown alters `placeholder`.
    """
    if not markdown_safe(placeholder):
        raise ValueError("Placeholder %s altered by markdown processing." %
                         placeholder)
    # Really what we want is a reverse markdown function, but as that is too
    # much work, this will do.
    # NOTE: Pattern.scanner() is an undocumented but long-standing method of
    # compiled patterns; each search() call resumes after the previous match.
    placeholder_scan = _unescaped(placeholder).scanner(string)
    ilscanner = [
        _unescaped(inline_delims[0]).scanner(string),
        _unescaped(inline_delims[1]).scanner(string),
    ]
    eqscanner = [
        _unescaped(equation_delims[0]).scanner(string),
        _unescaped(equation_delims[1]).scanner(string),
    ]
    # Everything below is indexed by block type:
    # 0 = placeholder already in the text, 1 = inline math, 2 = equation math.
    scanners = [placeholder_scan, ilscanner, eqscanner]
    start_scanners = [placeholder_scan, ilscanner[0], eqscanner[0]]

    stlen = len(string)
    startmatches = [scanner.search() for scanner in start_scanners]
    # A block type with no further start is parked at the end of the string.
    startpoints = [m.start() if m else stlen for m in startmatches]

    post = -1  # index just past the consumed text; -1 until something is
    terminator = -1  # start of the most recent closing delimiter found
    pieces = []  # fragments of the sanitized output
    codeblocks = []
    while True:
        # Find the next point of interest: move every start scanner past the
        # text that has already been consumed.
        for kind, scanner in enumerate(start_scanners):
            while startmatches[kind] and startmatches[kind].start() < post:
                startmatches[kind] = scanner.search()
            startpoints[kind] = (startmatches[kind].start()
                                 if startmatches[kind] else stlen)

        # A pre-existing placeholder always takes precedence if it is next.
        if startmatches[0] and min(startpoints) == startpoints[0]:
            codeblocks.append('0' + placeholder)
            placeholder_end = startpoints[0] + len(placeholder)
            # Copy the text up to and including the placeholder.
            pieces.append(string[max(post, 0):placeholder_end])
            post = placeholder_end
            continue
        elif startmatches[1] is None and startmatches[2] is None:
            # No more blocks: add the rest of the string and be done.
            pieces.append(string[max(post, 0):])
            return (''.join(pieces), codeblocks)
        elif startmatches[1] is None:
            inBlock = 2
        elif startmatches[2] is None:
            inBlock = 1
        elif startpoints[1] < startpoints[2]:
            inBlock = 1
        elif startpoints[1] > startpoints[2]:
            inBlock = 2
        else:
            inBlock = break_tie(startmatches[1], startmatches[2])

        # Copy the plain text in front of the block (max() keeps the initial
        # post of -1 from being used as a slice index).
        pieces.append(string[max(post, 0):startpoints[inBlock]])
        post = startmatches[inBlock].end()
        # Now find the matching end: the first closing delimiter that does
        # not start before the end of the opening one.
        while terminator < post:
            endpoint = scanners[inBlock][1].search()
            if endpoint is None:
                # Out of terminators: keep the unterminated block as text.
                pieces.append(string[startpoints[inBlock]:])
                return (''.join(pieces), codeblocks)
            terminator = endpoint.start()
        # Found a matching end: store the math and emit the placeholder.
        codeblocks.append(str(inBlock) + string[post:endpoint.start()])
        pieces.append(placeholder)
        post = endpoint.end()


def reconstructMath(processedString,
                    codeblocks,
                    inline_delims=("$", "$"),
                    equation_delims=("$$", "$$"),
                    placeholder="$0$",
                    htmlSafe=False):
    """Put the math blocks back into markdown-processed text.

    `processedString` is usually the first item returned by sanitizeInput
    after it has been passed through markdown, and `codeblocks` the second
    item. The delimiters and placeholder should match those given to
    sanitizeInput. Placeholders preceded by a backslash are not replaced.

    Returns a string containing HTML suitable for use with MathJax: the
    placeholders are replaced, in order, by the codeblocks wrapped in the
    delimiters of their type (type "0" blocks get no delimiters).

    "<", ">" and "&" in math can confuse the HTML parser. If `htmlSafe` is
    True they are replaced in the math blocks by "&lt;", "&gt;" and "&amp;".
    An "&" that already starts one of these three entities is left alone so
    that pre-escaped math is not escaped twice. The list passed in as
    `codeblocks` is not modified.

    If there are more codeblocks than placeholders a warning is printed for
    each surplus codeblock and it is dropped. Placeholders beyond the last
    codeblock are left in the text as they are.
    """
    delims = [['', ''], inline_delims, equation_delims]
    placeholder_re = _unescaped(placeholder)
    if htmlSafe:
        # "&" has to be handled first, otherwise the ampersands introduced
        # for "<" and ">" would need special treatment.
        safeAmp = re.compile("&(?!(?:amp;|lt;|gt;))")
        codeblocks = [
            safeAmp.sub("&amp;", block).replace("<", "&lt;").replace(
                ">", "&gt;") for block in codeblocks
        ]
    # Step through the codeblocks one at a time, each replacing the next
    # occurrence of the placeholder.
    pieces = []
    scan = placeholder_re.scanner(processedString)
    post = 0
    for block in codeblocks:
        inBlock = int(block[0])
        match = scan.search()
        if not match:
            # Deliberately non-fatal, see
            # https://github.com/sagemathinc/cocalc/issues/506
            print(
                "WARNING: More codeblocks given than valid placeholders in text."
            )
            continue
        pieces.append(processedString[post:match.start()])
        pieces.append(delims[inBlock][0] + block[1:] + delims[inBlock][1])
        post = match.end()
    # Add the rest of the string (if there is any).
    if post < len(processedString):
        pieces.append(processedString[post:])
    return ''.join(pieces)


def findBoundaries(string):
    """Deprecated. Locate ``$`` and ``$$`` boundaries with a simple scan.

    A "$" counts as a delimiter unless the character before it is a
    backslash. Returns a tuple ``(oned, twod)``: `oned` lists the indices of
    the single-"$" delimiters and `twod` the index of the second "$" of each
    "$$" delimiter.
    """
    last = ''
    twod = []
    oned = []
    boundary = False  # the previous character was a still-unclassified "$"
    inoned = False  # currently inside a $...$ block
    intwod = False  # currently inside a $$...$$ block
    for count, char in enumerate(string):
        if char == "$" and last != '\\':
            # We just hit a valid $ character!
            if inoned:
                oned.append(count)
                inoned = False
            elif intwod:
                if boundary:
                    # Second "$" in a row: the $$ block is closed.
                    twod.append(count)
                    intwod = False
                    boundary = False
                else:
                    boundary = True
            elif boundary:
                # The last character was also a valid $: a $$ block opens.
                twod.append(count)
                intwod = True
                boundary = False
            else:
                # The last character was NOT a usable $.
                boundary = True
        elif boundary:
            # The last character was a valid $, but this one is not.
            if inoned:
                print("THIS SHOULD NEVER HAPPEN!")
            elif intwod:
                # A lone "$" inside a $$ block: ignore it.
                pass
            else:
                oned.append(count - 1)
                inoned = True
            boundary = False
        last = char
    # What if we finished on a boundary character? It does not actually
    # matter, but it is included for completeness.
    if boundary:
        if not (inoned or intwod):
            oned.append(count)
            inoned = True
    return (oned, twod)