# Path: blob/main/converter/textbook-converter/textbook_converter/TextbookExporter.py
# 3855 views
"""nbconvert exporter that rewrites notebook cells as Mathigon-style markdown."""
import re

from nbconvert.exporters import Exporter


INDENT = " "

HERO_IMAGE_START = "![hero:"
hero_regex = re.compile(r"^!\[hero:.*]\((.*)\)")

VUE_COMPONENT_START = "![vue:"
vue_regex = re.compile(r"^!\[vue:(.*)]\(.*\)")

IMAGE_START = "!["
markdown_img_regex = re.compile(r"^!\[.*]\((.*)\)")
html_img_regex = re.compile(r'<img(.+?)src="(.+?)"(.*?)/?>')
mathigon_ximg_regex = re.compile(r'x-img\(src="(.*)"\)')
inline_markdown_img_regex = re.compile(r'!\[(.*?)]\((.+?)\)')

HEADING_START = "#"
tag_id_regex = re.compile(r'(<.*\sid=["\'])(.*)(["\'])')

COMMENT_START = "<!--"
comment_regex = re.compile(r"^<!--\s+(:::.*)\s+-->")

blank_regex = re.compile(r"\[\[(.+?)]]")

inline_code_regex = re.compile(r"`(.+?)`")

CODE_BLOCK_START = "```"


# JavaScript templates filled in with str.format(); literal braces are doubled.
JS_CLICK_GOAL = """
const {elt} = $section.$("{selector}");
if ({elt}) {{
{elt}.on("click", () => {{
$section.score("{id}");
}});
}}
"""

JS_VALUE_GOAL = """
const {elt} = $section.$("{selector}");
if ({elt}) {{
{elt}.on("change keyup input paste", (e) => {{
if ({elt}.value === "{value}" || ("{value}" === "checked" && {elt}.checked)) {{
e.preventDefault();
$section.score("{id}");
}}
}});
}}
"""


def handle_inline_images(line):
    """Convert every inline markdown image in ``line`` to an HTML tag.

    Converts syntax from this:

        ![alt text](path/image)

    to this:

        <img src="path/image" alt="alt text">
    """
    # A callable replacement keeps backslashes in the alt text or link from
    # being interpreted as regex escapes.
    return inline_markdown_img_regex.sub(
        lambda m: f'<img src="{m.group(2)}" alt="{m.group(1)}">', line
    )


def handle_inline_code(line):
    """Tag inline code spans with ``{code}``.

    Converts inline code from:

        `some text`

    to this:

        `{code} some text`

    Spans whose content already starts with ``{`` or a backtick are left as is.
    """
    for match in inline_code_regex.findall(line):
        if not match.startswith(("{", "`")):
            line = line.replace(f"`{match}`", f"`{{code}} {match}`")
    return line


def handle_inline_latex(line):
    r"""Escape ``\{`` and ``\}`` inside inline ``$...$`` equations."""
    if "$" not in line:
        return line
    segments = line.split("$")
    # After splitting on "$", the odd-numbered segments are the ones that sit
    # between a pair of dollar signs.
    for index in range(1, len(segments), 2):
        segments[index] = (
            segments[index].replace(r"\{", r"\\{").replace(r"\}", r"\\}")
        )
    return "$".join(segments)


def handle_block_comment(comment_syntax):
    """Unwrap a block directive hidden in an HTML comment.

    Converts syntax from:

        <!-- ::: block content -->

    to this:

        ::: block content

    Any other input is returned unchanged.
    """
    match = comment_regex.search(comment_syntax.lstrip())
    if match is None:
        return comment_syntax
    return match.group(1)


def handle_vue_component(vue_component_syntax):
    """Convert a vue component placeholder to the bare component name.

    Converts syntax from this:

        ![vue:some-component]()

    to the component name on a line of its own (a newline before and after).
    Input that does not match is returned unchanged.
    """
    match = vue_regex.search(vue_component_syntax.lstrip())
    if match is None:
        return vue_component_syntax
    return f"\n{match.group(1)}\n"


def get_attachment_data(image_source, cell=None):
    """Return the data URI for the given image attachment.

    ``image_source`` is returned unchanged when it is not an ``attachment:``
    reference, when no cell is given, or when the cell holds no ``image/*``
    entry for that attachment.
    """
    prefix = "attachment:"
    if not cell or not image_source.startswith(prefix):
        return image_source

    attachments = (cell["attachments"] if "attachments" in cell else None) or {}
    name = image_source[len(prefix):]
    mime_bundle = (attachments[name] if name in attachments else None) or {}
    for mime_type, payload in mime_bundle.items():
        if mime_type.startswith("image/"):
            return f"data:{mime_type};base64,{payload}"
    return image_source


def handle_attachments(line, cell):
    """Inline attachment images referenced from HTML ``<img>`` tags.

    Converts syntax from this:

        <img src="attachment:file.png">

    to this:

        <img src="data:image/png;base64,...">

    Every ``<img>`` tag on the line is processed.
    """
    for match in html_img_regex.finditer(line):
        img_src = match.group(2)
        img_data = get_attachment_data(img_src, cell)
        if img_data != img_src:
            line = line.replace(img_src, img_data)
    return line


def handle_images(line, cell):
    """Convert a markdown image line to a figure.

    Converts syntax from this:

        ![alt text](path/image)

    to this (surrounded by newlines):

        figure: x-img(src="path/image")

    Attachment references are replaced by their data URI.
    """
    match = markdown_img_regex.search(line.lstrip())
    if match is None:
        return line
    return f'\nfigure: x-img(src="{get_attachment_data(match.group(1), cell)}")\n'


def handle_hero_image(hero_image_syntax):
    """Convert a hero image line to a hero directive.

    Converts syntax from this:

        ![hero:alt text](path/image)

    to this:

        > hero: path/image
    """
    match = hero_regex.search(hero_image_syntax.lstrip())
    if match is None:
        return hero_image_syntax
    return f"> hero: {match.group(1)}"


def handle_heading(heading_syntax, in_block, suffix, section, is_problem_set=False):
    """Increase header level and compute id, level, title and output text.

    Returns a ``(id, level, title, text)`` tuple. Inside a block only the
    title and text are computed and ``id`` and ``level`` are ``None``.
    """
    # partition() copes with a heading that has no space after the hashes,
    # where unpacking split(" ", 1) would raise ValueError.
    header, _, title = heading_syntax.partition(" ")
    title = handle_inline_code(title)
    level = header.count("#")
    if in_block:
        return None, None, title, f"#{heading_syntax}\n"

    heading_body = heading_syntax.split(" ", 1)[-1]
    match = tag_id_regex.search(heading_syntax)

    if match is None:
        heading_id = section if section else re.sub(r"\s", "-", title.strip().lower())
        heading_id = re.sub(r"[^\w-]", "", heading_id)
        if level == 1:
            # Mathigon requires all sections to start with `##`
            text = heading_syntax if is_problem_set else f"#{heading_syntax}\n"
        elif "-0-0" in suffix:
            # Mathigon requires all sections to start with `##`
            text = f"## {heading_body}\n"
        elif level == 2 and is_problem_set:
            heading_id = re.sub(r"\s", "-", heading_body.strip().lower())
            text = f"\n---\n\n> section: {heading_id}\n\n## {heading_body}\n"
        else:
            heading_id = heading_id.split("-", 1)[0][:25] + suffix
            text = f'<h{level}>{title} <a id="{heading_id}"></a>\n</h{level}>\n'
        return heading_id, level, title.strip(), text

    # The heading carries its own tag with an id attribute: reuse that id and
    # take the title from the text in front of the tag.
    title = heading_syntax[: match.start()].split(" ", 1)[-1].strip()
    heading_id = match.group(2)
    if level == 1:
        # Mathigon requires all sections to start with `##`
        text = f"#{heading_syntax}\n"
    elif "-0-0" in suffix:
        # Mathigon requires all sections to start with `##`
        text = f"## {heading_body}\n"
    else:
        text = f"<h{level}>{heading_syntax[level:]}\n</h{level}>\n"
    return heading_id, level, title, text


def handle_markdown_cell(cell, resources, cell_number, is_problem_set=False):
    """Reformat a markdown cell line by line.

    Returns ``(text, resources, headings)`` where ``headings`` is a list of
    ``(id, level, title)`` tuples for the headings found outside of blocks.
    """
    markdown_lines = []
    headings = []
    in_latex = False
    in_block = False
    in_code = False
    indent = ""
    section = (resources.get("textbook") or {}).get("section")

    for count, line in enumerate(cell.source.splitlines()):
        # Inside a `$$` block: copy lines until the closing `$$`.
        if in_latex:
            if line.rstrip(" .").endswith("$$"):
                closing = line.replace("$$", "")
                markdown_lines.append(f"{closing}\n" if closing else closing)
                markdown_lines.append(f"{indent}```\n")
                in_latex = False
            else:
                markdown_lines.append(line)
                markdown_lines.append("\n")
            continue

        # Opening `$$`: becomes a ```latex fence. Everything before the first
        # `$$` is whitespace because the stripped line starts with `$$`.
        if line.lstrip().startswith("$$"):
            indent, rest = line.split("$$", 1)
            markdown_lines.append(f"{indent}```latex\n")
            if rest.rstrip(" .").endswith("$$"):
                rest = rest.replace("$$", "")
                markdown_lines.append(f"{indent}{rest}\n" if rest else rest)
                markdown_lines.append(f"{indent}```\n")
            else:
                markdown_lines.append(f"{indent}{rest}\n" if rest else rest)
                in_latex = True
            continue

        # Fenced code is copied verbatim; a bare opening fence is tagged `code`.
        if in_code:
            if line.lstrip().startswith(CODE_BLOCK_START):
                in_code = False
            markdown_lines.append(line + "\n")
            continue
        if line.lstrip().startswith(CODE_BLOCK_START):
            in_code = True
            if line.rstrip().endswith(CODE_BLOCK_START):
                markdown_lines.append(line.rstrip() + "code\n")
            else:
                markdown_lines.append(line + "\n")
            continue

        line = handle_attachments(line, cell)
        stripped = line.lstrip()

        if stripped.startswith(COMMENT_START):
            converted = handle_block_comment(line)
            if converted.strip().endswith(":::"):
                in_block = False
            elif converted.strip().startswith(":::"):
                in_block = True
            markdown_lines.append(converted)
        elif stripped.startswith(HERO_IMAGE_START):
            markdown_lines.append(handle_hero_image(line))
        elif stripped.startswith(VUE_COMPONENT_START):
            markdown_lines.append(handle_vue_component(line))
        elif stripped.startswith(IMAGE_START):
            markdown_lines.append(handle_images(line, cell))
        elif stripped.startswith(HEADING_START):
            heading_id, level, title, heading_text = handle_heading(
                line, in_block, f"-{cell_number}-{count}", section, is_problem_set
            )
            if not in_block:
                headings.append((heading_id, level, title))
            markdown_lines.append(heading_text)
        else:
            line = handle_inline_latex(line)
            line = handle_inline_code(line)
            line = handle_inline_images(line)
            markdown_lines.append(line.replace("\\%", "\\\\%"))
        markdown_lines.append("\n")

    markdown_lines.append("\n")
    return "".join(markdown_lines), resources, headings


def _as_pre_block(text):
    """Format plain text output as a `pre` element with piped lines."""
    return f"pre \n{INDENT}| " + "".join(text).replace("\n", f"\n{INDENT}| ")


def handle_code_cell_output(cell_output):
    """Return the markup for one code cell output, or None if unsupported.

    Images are preferred, then HTML, then LaTeX, then plain text.
    """
    if "data" in cell_output:
        data = cell_output["data"]
        for mime_type, payload in data.items():
            if "image/svg+xml" in mime_type:
                return "".join(payload)
            if "image/" in mime_type:
                return f'<img src="data:{mime_type};base64,{payload}"/>'
        if "text/html" in data:
            return "".join(data["text/html"])
        if "text/latex" in data:
            return "".join(data["text/latex"]).strip().replace("$$", "")
        if "text/plain" in data:
            return _as_pre_block(data["text/plain"])
    elif "text" in cell_output:
        return _as_pre_block(cell_output["text"])

    return None


def handle_grader_metadata(cell_metada):
    """Parse grader metadata and return code exercise widget syntax.

    The result is ``q-code-exercise(...)``, with grader attributes (and the id
    of the first goal, if any) when the metadata defines a grader.
    """
    grader_attr = None

    if "grader_import" in cell_metada and "grader_function" in cell_metada:
        grader_import = cell_metada["grader_import"]
        grader_function = cell_metada["grader_function"]
        grader_attr = f'grader-import="{grader_import}" grader-function="{grader_function}"'
    elif "grader_id" in cell_metada and "grader_answer" in cell_metada:
        grader_id = cell_metada["grader_id"]
        grader_answer = cell_metada["grader_answer"]
        grader_attr = f'grader-id="{grader_id}" grader-answer="{grader_answer}"'

    if grader_attr:
        goals = cell_metada["goals"] if "goals" in cell_metada else None
        # Key access matches handle_cell_goals and works for plain dicts too;
        # an empty goals list simply adds no goal attribute.
        if goals:
            goal_id = goals[0]["id"]
            grader_attr = f'{grader_attr} goal="{goal_id}"'

    return f"q-code-exercise({grader_attr or ''})"


def handle_code_cell(cell, resources):
    """Wrap a code cell in a code exercise block.

    The source is placed under a ``pre.`` element inside a
    ``::: q-code-exercise(...)`` block, with continuation lines indented.
    Cell outputs are included unless ``include_output`` is False in the cell
    metadata or, failing that, in ``resources["textbook"]``.
    """
    # NOTE(review): the visible source replaced "<" with itself (a no-op),
    # which looks like an HTML entity decoded in transit; the escape is
    # restored here. Confirm against the repository copy.
    formatted_source = (
        cell.source.replace("\n", "\n ")
        .replace("<", "&lt;")
        .replace("[[", "[ [")
        .replace("]]", "] ]")
    )
    formatted_source = re.sub(r'[\^]?\s*# pylint:.*', '', formatted_source)

    grader_widget = handle_grader_metadata(cell.metadata)

    code_lines = [
        f"\n::: {grader_widget}\n",
        " pre.\n ",
        formatted_source,
        "\n\n",
    ]

    textbook = resources.setdefault("textbook", {})

    include_output = (
        cell.metadata["include_output"] if "include_output" in cell.metadata else None
    )
    if include_output is None and "include_output" in textbook:
        include_output = textbook["include_output"]

    if include_output is not False and len(cell.outputs):
        code_lines.append("\n output\n")
        for cell_output in cell.outputs:
            is_latex = "data" in cell_output and "text/latex" in cell_output["data"]
            output = handle_code_cell_output(cell_output) or ""
            if output.startswith("pre"):
                output = f"{INDENT * 2}" + output.replace("\n", f"\n{INDENT * 2}")
                code_lines.append(f"{output}\n\n")
            elif is_latex:
                indented = output.replace("\n", f"\n{INDENT * 3}").strip()
                output = (
                    f"{INDENT * 2}div.md.\n{INDENT * 3}```latex\n{INDENT * 3}"
                    + indented
                    + f"\n{INDENT * 3}```"
                )
                code_lines.append(f"{output}\n\n")
            elif len(output):
                output = f"{INDENT * 2}div.\n{INDENT * 3}" + output.replace(
                    "\n", f"\n{INDENT * 3}"
                )
                code_lines.append(f"{output}\n\n")

    code_lines.append(":::\n")
    return "".join(code_lines), resources


def _merge_cell_metadata(cell, resources, metadata_key, resource_key):
    """Merge a dict from the cell metadata into resources["textbook"]."""
    if resources is None:
        resources = {}
    entries = cell.metadata[metadata_key] if metadata_key in cell.metadata else None
    if entries:
        textbook = resources.setdefault("textbook", {})
        textbook[resource_key] = {**textbook.get(resource_key, {}), **entries}
    return resources


def handle_cell_glossary(cell, resources=None):
    """Gather 'gloss' data into ``resources["textbook"]["glossary"]``."""
    return _merge_cell_metadata(cell, resources, "gloss", "glossary")


def handle_cell_formulas(cell, resources=None):
    """Gather 'formulas' data into ``resources["textbook"]["formulas"]``."""
    return _merge_cell_metadata(cell, resources, "formulas", "formulas")


def handle_cell_goals(id, cell, resources=None):
    """Convert the cell's 'goals' metadata to a javascript function (string).

    The generated function is appended to ``resources["textbook"]["functions"]``.
    Returns ``(goal_ids, resources)``; ids are unique and keep the order in
    which they appear in the metadata, so the output is reproducible.
    """
    if resources is None:
        resources = {}
    goal_ids = []

    goals_meta = cell.metadata["goals"] if "goals" in cell.metadata else None
    if goals_meta:
        actions = [
            f"export function {id}($section: Step) {{ ",
            " setTimeout(() => {",
        ]

        for count, goal in enumerate(goals_meta):
            elt = "elt" + str(count)
            if "click" in goal:
                actions.append(
                    JS_CLICK_GOAL.format(
                        elt=elt, selector=goal["selector"], id=goal["id"]
                    )
                )
            if "value" in goal:
                actions.append(
                    JS_VALUE_GOAL.format(
                        elt=elt,
                        selector=goal["selector"],
                        id=goal["id"],
                        value=goal["value"],
                    )
                )
            goal_ids.append(goal["id"])

        actions.append(" }, 250);")
        actions.append("}\n")

        textbook = resources.setdefault("textbook", {})
        textbook["functions"] = textbook.get("functions", "") + "\n".join(actions)

    # dict.fromkeys() de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(goal_ids)), resources


def handle_index(headers, resources=None):
    """Create an index of the subsections (with max depth of 2).

    ``headers`` is an iterable of ``(id, level, title)`` tuples. The first
    heading names the top section; headings deeper than level 3 are skipped.
    The index is stored in ``resources["textbook"]["index"]`` unless one is
    already present. Returns ``(index, resources)``.
    """
    if resources is None:
        resources = {}
    top_section = ""
    entries = []
    last_level = -1

    for heading_id, level, title in headers:
        if level > 3:
            continue
        if not top_section:
            top_section = heading_id
            continue
        entry = {"id": heading_id, "title": title, "subsections": []}
        if level <= last_level or not entries:
            entries.append(entry)
            last_level = level
        else:
            entries[-1]["subsections"].append(entry)

    index = {top_section: entries}

    textbook = resources.setdefault("textbook", {})
    if "index" not in textbook:
        textbook["index"] = index

    return index, resources


class TextbookExporter(Exporter):
    """Exporter that renders a notebook as Mathigon-style markdown."""

    output_mimetype = "text/markdown"

    def _file_extension_default(self):
        return ".md"

    def from_notebook_node(self, nb, resources=None, **kw):
        """Convert a notebook node to markdown text.

        Reads the optional ``id`` and ``is_problem_set`` entries of
        ``resources["textbook"]`` and returns ``(text, resources)``.
        """
        nb_copy, resources = super().from_notebook_node(nb, resources, **kw)

        markdown_lines = []
        prefix = ""
        is_problem_set = False

        textbook = resources.setdefault("textbook", {})
        if "id" in textbook:
            prefix = re.sub(r"[^a-zA-Z]", "", textbook["id"]).lower()
        if "is_problem_set" in textbook:
            is_problem_set = textbook["is_problem_set"]

        nb_headings = []
        for count, cell in enumerate(nb_copy.cells):
            cell_id = prefix + str(count)
            # Reset per cell so a cell never sees the goals of an earlier one
            # (or an unbound name when the first markdown cell has blanks).
            goals = []

            if cell.cell_type == "markdown":
                resources = handle_cell_glossary(cell, resources)
                resources = handle_cell_formulas(cell, resources)

                blanks = blank_regex.findall(cell.source)
                # NOTE(review): indentation was lost in the visible source; the
                # `else` is attached to the blanks test so that every step that
                # is opened here is also closed below. Confirm.
                if not blanks:
                    goals, resources = handle_cell_goals(cell_id, cell, resources)
                    if goals:
                        markdown_lines.append(f"\n---\n> id: {cell_id}")
                        markdown_lines.append(f'\n> goals: {" ".join(goals)}\n\n')
                else:
                    markdown_lines.append(f"\n---\n> id: {cell_id}\n\n")

                markdown_output, resources, headings = handle_markdown_cell(
                    cell, resources, count, is_problem_set=is_problem_set
                )
                markdown_lines.append(markdown_output)

                if goals or blanks:
                    markdown_lines.append("\n\n---\n")
                if headings:
                    nb_headings += headings
                continue

            if cell.cell_type == "code" and cell.source.strip():
                if "tags" in cell.metadata and "sanity-check" in cell.metadata["tags"]:
                    # Ignore cell
                    continue
                goals, resources = handle_cell_goals(cell_id, cell, resources)
                if goals:
                    markdown_lines.append(f"\n---\n> id: {cell_id}")
                    markdown_lines.append(f'\n> goals: {" ".join(goals)}\n\n')
                code_output, resources = handle_code_cell(cell, resources)
                markdown_lines.append(code_output)

        if nb_headings:
            _, resources = handle_index(nb_headings, resources)

        markdown_lines.append("\n")

        full_text = "".join(markdown_lines)
        if is_problem_set:
            full_text = full_text.replace("\n---\n\n>", "\n\n>", 1)
        return (full_text, resources)