Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
quantum-kittens
GitHub Repository: quantum-kittens/platypus
Path: blob/main/converter/textbook-converter/textbook_converter/TextbookExporter.py
3855 views
1
import re
2
3
from nbconvert.exporters import Exporter
4
5
6
INDENT = " "
7
8
HERO_IMAGE_START = "![hero:"
9
hero_regex = re.compile(r"^!\[hero:.*]\((.*)\)")
10
11
VUE_COMPONENT_START = "![vue:"
12
vue_regex = re.compile(r"^!\[vue:(.*)]\(.*\)")
13
14
IMAGE_START = "!["
15
markdown_img_regex = re.compile(r"^!\[.*]\((.*)\)")
16
html_img_regex = re.compile(r'<img(.+?)src="(.+?)"(.*?)/?>')
17
mathigon_ximg_regex = re.compile(r'x-img\(src="(.*)"\)')
18
inline_markdown_img_regex = re.compile(r'!\[(.*?)]\((.+?)\)')
19
20
HEADING_START = "#"
21
tag_id_regex = re.compile(r'(<.*\sid=["\'])(.*)(["\'])')
22
23
COMMENT_START = "<!--"
24
comment_regex = re.compile(r"^<!--\s+(:::.*)\s+-->")
25
26
blank_regex = re.compile(r"\[\[(.+?)]]")
27
28
inline_code_regex = re.compile(r"`(.+?)`")
29
30
CODE_BLOCK_START = "```"
31
32
33
JS_CLICK_GOAL = """
34
const {elt} = $section.$("{selector}");
35
if ({elt}) {{
36
{elt}.on("click", () => {{
37
$section.score("{id}");
38
}});
39
}}
40
"""
41
42
JS_VALUE_GOAL = """
43
const {elt} = $section.$("{selector}");
44
if ({elt}) {{
45
{elt}.on("change keyup input paste", (e) => {{
46
if ({elt}.value === "{value}" || ("{value}" === "checked" && {elt}.checked)) {{
47
e.preventDefault();
48
$section.score("{id}");
49
}}
50
}});
51
}}
52
"""
53
54
55
def handle_inline_images(line):
    """Rewrite inline markdown images as HTML image tags.

    Every occurrence of::

        ![alt text](path/image)

    found by ``inline_markdown_img_regex`` becomes::

        <img src="path/image" alt="alt text">

    The (possibly unchanged) line is returned.
    """
    found = inline_markdown_img_regex.findall(line)
    for alt_text, target in found:
        if not target:
            continue
        markdown_form = f'![{alt_text}]({target})'
        html_form = f'<img src="{target}" alt="{alt_text}">'
        line = line.replace(markdown_form, html_form)
    return line
71
72
73
def handle_inline_code(line):
    """Tag inline code spans with a ``{code}`` marker.

    A span such as::

        `some text`

    becomes::

        `{code} some text`

    Spans whose content already starts with ``{`` or with a backtick are
    left untouched.
    """
    for snippet in inline_code_regex.findall(line):
        if snippet.startswith("{") or snippet.startswith("`"):
            continue
        plain_span = f"`{snippet}`"
        tagged_span = f"`{{code}} {snippet}`"
        line = line.replace(plain_span, tagged_span)
    return line
86
87
def handle_inline_latex(line):
    r"""Escape ``\{`` and ``\}`` inside inline ``$...$`` equations.

    The line is split on ``$``; the odd-numbered pieces are the ones that
    follow an opening ``$`` and are treated as equation text. In those pieces
    each ``\{`` becomes ``\\{`` and each ``\}`` becomes ``\\}``. Text outside
    equations is returned unchanged, as is any line without a ``$``.

    Args:
        line: A single line of markdown text.

    Returns:
        The line with braces escaped inside inline equations.
    """
    if "$" not in line:
        return line
    segments = line.split("$")
    # Pieces at odd positions sit after an opening "$", i.e. inside an equation.
    for position in range(1, len(segments), 2):
        escaped = segments[position].replace(r"\{", r"\\{")
        segments[position] = escaped.replace(r"\}", r"\\}")
    return "$".join(segments)
100
101
102
def handle_block_comment(comment_syntax):
    """Unwrap a block marker that is hidden inside an HTML comment.

    Input of the form::

        <!-- ::: block content -->

    yields::

        ::: block content

    Anything that does not match ``comment_regex`` is returned as given.
    """
    found = comment_regex.search(comment_syntax.lstrip())
    if found is None:
        return comment_syntax
    return found.group(1)
116
117
118
def handle_vue_component(vue_component_syntax):
    """Convert syntax from this:

        ![vue:some-component]()

    to this (the indentation is required):

        div(data-vue-mount)
          some-component

    The leading whitespace of the input is ignored when matching. If the
    input does not match ``vue_regex`` it is returned unchanged.

    NOTE(review): as visible in this copy, the returned template contains only
    the captured component name surrounded by newlines; the
    ``div(data-vue-mount)`` wrapper and the indentation described above are
    not shown -- confirm against the upstream source.
    """
    match = vue_regex.search(vue_component_syntax.lstrip())
    if match is not None:
        # Group 1 is the text between "![vue:" and the closing "]".
        return f"""
{match.group(1)}
"""
    else:
        return vue_component_syntax
136
137
138
def get_attachment_data(image_source, cell=None):
    """Return the data URI for the given image attachment.

    Args:
        image_source: The image reference, e.g. ``attachment:file.png`` or an
            ordinary path/URL.
        cell: The notebook cell (a mapping) whose ``attachments`` entry maps
            attachment names to ``{mime type: base64 data}`` bundles.

    Returns:
        ``data:<mime>;base64,<data>`` built from the first ``image/*`` entry
        of the referenced attachment. ``image_source`` is returned unchanged
        when it is not an attachment reference, when no cell is given, or
        when the attachment (or an image entry in it) cannot be found.
    """
    prefix = "attachment:"
    if not cell or not image_source.startswith(prefix):
        return image_source

    attachment_name = image_source[len(prefix):]
    try:
        mime_bundle = cell["attachments"][attachment_name]
    except (KeyError, TypeError):
        # No attachments on this cell, or no attachment with that name.
        return image_source
    if not mime_bundle:
        return image_source

    for mime_type, payload in mime_bundle.items():
        if mime_type.startswith("image/"):
            return f"data:{mime_type};base64,{payload}"
    # The attachment exists but carries no image data: keep the reference.
    return image_source
148
149
150
def handle_attachments(line, cell):
    """Inline attachment images referenced by HTML ``<img>`` tags.

    Converts syntax from this::

        <img src="attachment:file.png">

    to this::

        <img src="data:image/png;base64,ajdfjaclencQWInak...">

    Every ``<img>`` tag on the line is processed, not just the first one.
    Sources that ``get_attachment_data`` cannot resolve are left as they are.

    Args:
        line: A single line of markdown text.
        cell: The notebook cell that owns the attachments.

    Returns:
        The line with resolvable attachment sources replaced by data URIs.
    """
    # finditer walks the original string, so rebinding `line` below is safe.
    for match in html_img_regex.finditer(line):
        img_src = match.group(2)
        img_data = get_attachment_data(img_src, cell)
        line = line.replace(img_src, img_data)
    return line
167
168
169
def handle_images(line, cell):
    """Convert syntax from this:

        ![alt text](path/image)

    to this (the indentation is required):

        figure: x-img(src="path/image")

    The leading whitespace of the input is ignored when matching, and the
    image target is passed through ``get_attachment_data`` together with
    ``cell``. If the line does not match ``markdown_img_regex`` it is
    returned unchanged.

    NOTE(review): the indentation the docstring calls required is not visible
    inside the template in this copy -- confirm against the upstream source.
    """
    match = markdown_img_regex.search(line.lstrip())
    if match is not None:
        # Group 1 is the link target between the parentheses.
        return f"""
figure: x-img(src="{get_attachment_data(match.group(1), cell)}")
"""
    else:
        return line
186
187
188
def handle_hero_image(hero_image_syntax):
    """Turn a hero image line into a hero directive.

    Input of the form::

        ![hero:alt text](path/image)

    yields::

        > hero: path/image

    Anything that does not match ``hero_regex`` is returned as given.
    """
    found = hero_regex.search(hero_image_syntax.lstrip())
    if found is None:
        return hero_image_syntax
    image_path = found.group(1)
    return f"> hero: {image_path}"
202
203
204
def handle_heading(heading_syntax, in_block, suffix, section, is_problem_set=False):
    """Increase header level and compute level, title, and id

    Args:
        heading_syntax: The markdown heading line (hashes, a space, the title).
        in_block: True when the heading sits inside a ``:::`` block.
        suffix: Text appended to generated ids; the caller in this file passes
            ``-<cell number>-<line number>``.
        section: Optional id to use instead of one derived from the title.
        is_problem_set: Switches on the problem-set specific output.

    Returns:
        A tuple ``(id, level, title, text)``. ``id`` and ``level`` are ``None``
        for headings inside a block. ``text`` is the markup to emit.

    NOTE(review): the initial ``split(" ", 1)`` raises ``ValueError`` when the
    line contains no space, and for a line with leading whitespace the part
    before the first space is empty, which gives ``level == 0``.
    """
    header, title = heading_syntax.split(" ", 1)
    title = handle_inline_code(title)
    level = header.count("#")
    if in_block:
        # Inside a block only one extra "#" is added; no id or level is reported.
        return None, None, title, f"#{heading_syntax}\n"
    else:
        match = tag_id_regex.search(heading_syntax)
        if match is None:
            # No explicit id in the heading: use `section`, else slug the title.
            id = section if section else re.sub(r"\s", "-", title.strip().lower())
            id = re.sub(r"[^\w-]", "", id)
            if level == 1:
                # Mathigon requires all sections to start with `##`
                text = heading_syntax if is_problem_set else f"#{heading_syntax}\n"
            elif "-0-0" in suffix:
                # Mathigon requires all sections to start with `##`
                text = f'## {heading_syntax.split(" ", 1)[-1]}\n'
            elif level == 2 and is_problem_set:
                # Level-2 headings of a problem set start a new section whose
                # id comes from the raw heading text (not stripped of symbols).
                id = re.sub(r"\s", "-", heading_syntax.split(" ", 1)[-1].strip().lower())
                text = f'\n---\n\n> section: {id}\n\n## {heading_syntax.split(" ", 1)[-1]}\n'
            else:
                # Other headings become HTML headings with an anchor; the id is
                # the first "-"-separated part of the slug (max 25 chars) + suffix.
                id = id.split("-", 1)[0][:25] + suffix
                text = f'<h{level}>{title} <a id="{id}"></a>\n</h{level}>\n'
            return id, level, title.strip(), text
        else:
            # The heading carries its own id: the title is whatever precedes
            # the tag and the id is taken from the attribute value.
            title = heading_syntax[0 : match.start()].split(" ", 1)[-1].strip()
            id = match.group(2)
            if level == 1:
                # Mathigon requires all sections to start with `##`
                text = f"#{heading_syntax}\n"
            elif "-0-0" in suffix:
                # Mathigon requires all sections to start with `##`
                text = f'## {heading_syntax.split(" ", 1)[-1]}\n'
            else:
                text = f"<h{level}>{heading_syntax[level:]}\n</h{level}>\n"
            return id, level, title, text
241
242
243
def handle_markdown_cell(cell, resources, cell_number, is_problem_set=False):
    """Reformat code markdown

    Walks the cell source line by line and rewrites it: ``$$`` display math
    becomes a fenced ``latex`` block, fenced code blocks are passed through
    (a bare opening fence gets the language ``code``), and every other line is
    dispatched on its prefix (comment, hero image, vue component, image,
    heading) or given the inline treatments.

    Args:
        cell: Markdown cell; ``cell.source`` is read and ``cell`` is passed on
            to the attachment helpers.
        resources: Resources dict; ``resources["textbook"]`` is read for an
            optional ``section`` when a heading is encountered.
        cell_number: Index of the cell, used in generated heading ids.
        is_problem_set: Forwarded to ``handle_heading``.

    Returns:
        A tuple ``(text, resources, headings)`` where ``headings`` is a list of
        ``(id, level, title)`` for headings found outside ``:::`` blocks.

    NOTE(review): block nesting below was reconstructed from control flow
    because this copy of the file has lost its indentation -- confirm.
    """
    markdown_lines = []
    lines = cell.source.splitlines()
    in_latex = False  # inside a multi-line $$ ... $$ equation
    in_block = False  # inside a ::: block opened through an HTML comment
    in_code = False  # inside a fenced code block
    headings = []

    for count, line in enumerate(lines):
        if in_latex:
            # Continuation of a display equation; a trailing "$$" (optionally
            # followed by spaces or periods) closes it.
            if line.rstrip(" .").endswith("$$"):
                l = line.replace("$$", "")
                markdown_lines.append(f"{l}\n" if len(l) else l)
                # `indent` was bound by the line that opened this equation.
                markdown_lines.append(f"{indent}```\n")
                in_latex = False
            else:
                markdown_lines.append(line)
                markdown_lines.append("\n")
                in_latex = True
            continue
        if line.lstrip().startswith("$$"):
            # Start of a display equation: open a fenced latex block that keeps
            # the original indentation.
            indent, l = line.split("$$", 1)
            assert not indent or indent.isspace()
            markdown_lines.append(f"{indent}```latex\n")
            if l.rstrip(" .").endswith("$$"):
                # The equation opens and closes on the same line.
                l = l.replace("$$", "")
                markdown_lines.append(f"{indent}{l}\n" if len(l) else l)
                markdown_lines.append(f"{indent}```\n")
                in_latex = False
            else:
                markdown_lines.append(f"{indent}{l}\n" if len(l) else l)
                in_latex = True
            continue

        if in_code:
            # Lines inside a fenced code block are copied verbatim; a fence
            # line ends the block.
            if line.lstrip().startswith(CODE_BLOCK_START):
                in_code = False
                markdown_lines.append(line + "\n")
            else:
                markdown_lines.append(line + "\n")
            continue
        elif line.lstrip().startswith(CODE_BLOCK_START):
            in_code = True
            if line.rstrip().endswith(CODE_BLOCK_START):
                # A line ending in the fence marker gets the language "code".
                markdown_lines.append(line.rstrip() + "code\n")
            else:
                markdown_lines.append(line + "\n")
            continue

        line = handle_attachments(line, cell)

        if line.lstrip().startswith(COMMENT_START):
            l = handle_block_comment(line)
            # A result ending in ":::" closes a block; one that only starts
            # with ":::" opens it.
            if l.strip().endswith(":::"):
                in_block = False
            elif l.strip().startswith(":::"):
                in_block = True
            markdown_lines.append(l)
        elif line.lstrip().startswith(HERO_IMAGE_START):
            markdown_lines.append(handle_hero_image(line))
        elif line.lstrip().startswith(VUE_COMPONENT_START):
            markdown_lines.append(handle_vue_component(line))
        elif line.lstrip().startswith(IMAGE_START):
            markdown_lines.append(handle_images(line, cell))
        elif line.lstrip().startswith(HEADING_START):
            section = (
                resources["textbook"]["section"]
                if "section" in resources["textbook"]
                else None
            )
            id, level, title, heading_text = handle_heading(
                line, in_block, f"-{cell_number}-{count}", section, is_problem_set
            )
            # Headings inside a block are not collected.
            if not in_block:
                headings.append((id, level, title))
            markdown_lines.append(heading_text)
        else:
            line = handle_inline_latex(line)
            line = handle_inline_code(line)
            line = handle_inline_images(line)
            markdown_lines.append(
                line.replace("\\%", "\\\\%")
            )  # .replace('$$', '$').replace('\\', '\\\\'))
        # NOTE(review): assumed to run after every dispatched line rather than
        # only in the final `else` -- confirm against the upstream source.
        markdown_lines.append("\n")

    markdown_lines.append("\n")
    updated_lines = "".join(markdown_lines)
    return updated_lines, resources, headings
332
333
334
def handle_code_cell_output(cell_output):
    """Render one code-cell output as markup, or return ``None``.

    For outputs with a ``data`` bundle the first ``image/*`` entry wins (SVG
    is emitted as-is, other images as a base64 ``<img>`` tag), followed by
    ``text/html``, ``text/latex`` (stripped, with ``$$`` removed) and
    ``text/plain``. Outputs without ``data`` but with ``text`` are rendered
    like plain text. Plain text is emitted as a ``pre`` block with each line
    prefixed by ``INDENT`` and ``| ``.
    """
    if "data" not in cell_output:
        if "text" in cell_output:
            stream_text = "".join(cell_output["text"])
            return f"pre \n{INDENT}| " + stream_text.replace("\n", f"\n{INDENT}| ")
        return None

    bundle = cell_output["data"]
    for mime_type, payload in bundle.items():
        if "image/svg+xml" in mime_type:
            return "".join(bundle["image/svg+xml"])
        if "image/" in mime_type:
            return f'<img src="data:{mime_type};base64,{payload}"/>'

    if "text/html" in bundle:
        return "".join(bundle["text/html"])
    if "text/latex" in bundle:
        latex_source = "".join(bundle["text/latex"])
        return latex_source.strip().replace("$$", "")
    if "text/plain" in bundle:
        plain_text = "".join(bundle["text/plain"])
        return f"pre \n{INDENT}| " + plain_text.replace("\n", f"\n{INDENT}| ")
    return None
355
356
357
def handle_grader_metadata(cell_metada):
    """Parse grader metadata and return code exercise widget syntax.

    Args:
        cell_metada: The cell metadata mapping. Either the pair
            ``grader_import``/``grader_function`` or the pair
            ``grader_id``/``grader_answer`` selects the grader attributes;
            the first pair wins when both are present.

    Returns:
        ``q-code-exercise(<attributes>)``. The attribute list is empty when no
        grader pair is present. When a grader is configured and the metadata
        has a non-empty ``goals`` list, ``goal="<id of the first goal>"`` is
        appended.
    """
    grader_attr = None

    if "grader_import" in cell_metada and "grader_function" in cell_metada:
        grader_import = cell_metada["grader_import"]
        grader_function = cell_metada["grader_function"]
        grader_attr = f'grader-import="{grader_import}" grader-function="{grader_function}"'
    elif "grader_id" in cell_metada and "grader_answer" in cell_metada:
        grader_id = cell_metada["grader_id"]
        grader_answer = cell_metada["grader_answer"]
        grader_attr = f'grader-id="{grader_id}" grader-answer="{grader_answer}"'

    if grader_attr:
        goals = cell_metada["goals"] if "goals" in cell_metada else None
        # An empty goals list carries no goal to reference, so it is skipped.
        if goals:
            # Key access works for plain dicts as well as notebook nodes.
            goal_id = goals[0]["id"]
            grader_attr = f'{grader_attr} goal="{goal_id}"'

    return f"q-code-exercise({grader_attr or ''})"
378
379
380
def handle_code_cell(cell, resources):
    """Render a code cell as a ``:::`` exercise block.

    The source is indented, ``<`` and double square brackets are escaped,
    pylint directives are stripped, and the result is placed under a ``pre.``
    line inside a block headed by the grader widget syntax. Cell outputs are
    appended unless ``include_output`` is ``False`` in the cell metadata (or,
    when the cell does not say, in ``resources["textbook"]``).

    Returns a tuple ``(text, resources)``; ``resources`` gains an empty
    ``textbook`` entry if it had none.
    """
    escaped_source = cell.source.replace("\n", "\n ")
    escaped_source = escaped_source.replace("<", "&lt;")
    escaped_source = escaped_source.replace("[[", "[ [")
    escaped_source = escaped_source.replace("]]", "] ]")
    escaped_source = re.sub(r'[\^]?\s*# pylint:.*', '', escaped_source)

    widget = handle_grader_metadata(cell.metadata)

    pieces = [f"\n::: {widget}\n", " pre.\n ", escaped_source, "\n\n"]

    if "textbook" not in resources:
        resources["textbook"] = {}
    textbook = resources["textbook"]

    # The cell-level setting wins; the textbook-level one is the fallback.
    show_output = None
    if "include_output" in cell.metadata:
        show_output = cell.metadata["include_output"]
    if show_output is None and "include_output" in textbook:
        show_output = textbook["include_output"]

    if show_output is not False and len(cell.outputs):
        pieces.append('\n output\n')
        outer = INDENT * 2
        inner = INDENT * 3
        for single_output in cell.outputs:
            latex_output = (
                "data" in single_output and "text/latex" in single_output["data"]
            )
            rendered = handle_code_cell_output(single_output) or ""
            if rendered.startswith("pre"):
                rendered = f"{outer}" + rendered.replace("\n", f"\n{outer}")
            elif latex_output:
                body = rendered.replace("\n", f"\n{inner}").strip()
                rendered = f"{outer}div.md.\n{inner}```latex\n{inner}" + body + f"\n{inner}```"
            elif len(rendered):
                rendered = f"{outer}div.\n{inner}" + rendered.replace("\n", f"\n{inner}")
            else:
                # Nothing to show for this output.
                continue
            pieces.append(f"{rendered}\n\n")

    pieces.append(":::\n")
    return "".join(pieces), resources
435
436
437
def handle_cell_glossary(cell, resources=None):
    """Gather 'gloss' data.

    Args:
        cell: Notebook cell; a non-empty ``cell.metadata["gloss"]`` mapping is
            merged into the glossary.
        resources: Resources dict to update. A fresh dict is created when
            omitted, so separate calls never share state.

    Returns:
        ``resources``, with ``resources["textbook"]["glossary"]`` holding the
        previous glossary entries overlaid by the cell's entries. It is
        returned unchanged when the cell has no glossary data.
    """
    if resources is None:
        resources = {}

    glossary = cell.metadata["gloss"] if "gloss" in cell.metadata else None
    if glossary:
        textbook = resources.setdefault("textbook", {})
        existing = textbook.get("glossary", {})
        # Build a new dict so the cell's own metadata is never mutated.
        textbook["glossary"] = {**existing, **glossary}

    return resources
451
452
453
def handle_cell_formulas(cell, resources=None):
    """Gather 'formulas' data.

    Args:
        cell: Notebook cell; a non-empty ``cell.metadata["formulas"]`` mapping
            is merged into the collected formulas.
        resources: Resources dict to update. A fresh dict is created when
            omitted, so separate calls never share state.

    Returns:
        ``resources``, with ``resources["textbook"]["formulas"]`` holding the
        previous formulas overlaid by the cell's formulas. It is returned
        unchanged when the cell has no formula data.
    """
    if resources is None:
        resources = {}

    formulas = cell.metadata["formulas"] if "formulas" in cell.metadata else None
    if formulas:
        textbook = resources.setdefault("textbook", {})
        existing = textbook.get("formulas", {})
        # Build a new dict so the cell's own metadata is never mutated.
        textbook["formulas"] = {**existing, **formulas}

    return resources
467
468
469
def handle_cell_goals(id, cell, resources=None):
    """Convert 'goals' dictionary to javascript function (string).

    Args:
        id: Name of the generated JavaScript function.
        cell: Notebook cell; ``cell.metadata["goals"]`` is a list of goal
            mappings with an ``id`` and, for ``click``/``value`` goals, a
            ``selector`` (plus the expected ``value``).
        resources: Resources dict to update. A fresh dict is created when
            omitted, so separate calls never share state.

    Returns:
        A tuple ``(goal_ids, resources)``. ``goal_ids`` lists each distinct
        goal id once, in the order the goals are declared, so generated output
        is reproducible. When the cell has goals, the generated function is
        appended to ``resources["textbook"]["functions"]``.
    """
    if resources is None:
        resources = {}

    # A dict is used as an insertion-ordered set of goal ids.
    goal_ids = {}

    goals_meta = cell.metadata["goals"] if "goals" in cell.metadata else None
    if goals_meta:
        actions = [f"export function {id}($section: Step) {{ "]
        actions.append(" setTimeout(() => {")

        for count, goal in enumerate(goals_meta):
            # Each goal gets its own JavaScript variable name.
            element_name = "elt" + str(count)
            if "click" in goal:
                actions.append(
                    JS_CLICK_GOAL.format(
                        elt=element_name, selector=goal["selector"], id=goal["id"]
                    )
                )

            if "value" in goal:
                actions.append(
                    JS_VALUE_GOAL.format(
                        elt=element_name,
                        selector=goal["selector"],
                        id=goal["id"],
                        value=goal["value"],
                    )
                )

            goal_ids[goal["id"]] = None

        actions.append(" }, 250);")
        actions.append("}\n")

        textbook = resources.setdefault("textbook", {})
        textbook["functions"] = textbook.get("functions", "") + "\n".join(actions)

    return list(goal_ids), resources
509
510
511
def handle_index(headers, resources=None):
    """Create an index of the subsections (with max depth of 2).

    Args:
        headers: Iterable of ``(id, level, title)`` tuples in document order.
            Headings deeper than level 3 are ignored.
        resources: Resources dict to update. A fresh dict is created when
            omitted, so separate calls never share state.

    Returns:
        A tuple ``(index, resources)``. ``index`` maps the id of the first
        heading to a list of ``{"id", "title", "subsections"}`` entries. The
        index is stored in ``resources["textbook"]["index"]`` only if no index
        is stored there already.
    """
    if resources is None:
        resources = {}

    top_section = ""
    index = []
    last_level = -1

    for heading_id, level, title in headers:
        if level > 3:
            continue
        entry = {"id": heading_id, "title": title, "subsections": []}
        if not top_section:
            # The first heading names the index and is not listed itself.
            top_section = heading_id
        elif level <= last_level or not index:
            index.append(entry)
            last_level = level
        else:
            # A deeper heading nests under the most recent entry.
            index[-1]["subsections"].append(entry)

    index = {top_section: index}

    textbook = resources.setdefault("textbook", {})
    textbook.setdefault("index", index)

    return index, resources
538
539
540
class TextbookExporter(Exporter):
    """Exporter that converts a notebook into textbook markdown.

    Markdown and code cells are rendered with the module-level ``handle_*``
    helpers. Glossary, formulas, goal functions and the heading index are
    collected in ``resources["textbook"]``.
    """

    output_mimetype = "text/markdown"

    def _file_extension_default(self):
        """Return the file extension of the exported document."""
        return ".md"

    def from_notebook_node(self, nb, resources=None, **kw):
        """Convert a notebook node to textbook markdown.

        Reads the optional ``id`` and ``is_problem_set`` entries of
        ``resources["textbook"]``. Cell ids are the textbook id reduced to
        lowercase letters followed by the cell index.

        Returns:
            A tuple ``(text, resources)``.
        """
        nb_copy, resources = super().from_notebook_node(nb, resources)

        markdown_lines = []
        prefix = ""
        is_problem_set = False

        if "textbook" not in resources:
            resources["textbook"] = {}
        if "id" in resources["textbook"]:
            textbook_id = resources["textbook"]["id"]
            prefix = re.compile("[^a-zA-Z]").sub("", textbook_id).lower()
        if "is_problem_set" in resources["textbook"]:
            is_problem_set = resources["textbook"]["is_problem_set"]

        nb_headings = []
        for count, cell in enumerate(nb_copy.cells):
            cell_id = prefix + str(count)
            if cell.cell_type == "markdown":
                resources = handle_cell_glossary(cell, resources)
                resources = handle_cell_formulas(cell, resources)

                # Goals are only read for cells without [[blanks]]. Start from
                # an empty list so the check further down never touches an
                # unbound name when the first cell contains blanks.
                goals = []
                blanks = blank_regex.findall(cell.source)
                if not len(blanks):
                    goals, resources = handle_cell_goals(cell_id, cell, resources)
                    if goals:
                        markdown_lines.append(f"\n---\n> id: {cell_id}")
                        markdown_lines.append(f'\n> goals: {" ".join(goals)}\n\n')
                else:
                    markdown_lines.append(f"\n---\n> id: {cell_id}\n\n")

                markdown_output, resources, headings = handle_markdown_cell(
                    cell, resources, count, is_problem_set=is_problem_set
                )
                markdown_lines.append(markdown_output)

                # Cells that opened a step (goals or blanks) also close it.
                if goals or len(blanks):
                    markdown_lines.append("\n\n---\n")
                if headings:
                    nb_headings += headings
                continue

            if cell.cell_type == "code" and cell.source.strip():
                if 'tags' in cell.metadata and 'sanity-check' in cell.metadata['tags']:
                    # Ignore cell
                    continue
                goals, resources = handle_cell_goals(cell_id, cell, resources)
                if goals:
                    markdown_lines.append(f"\n---\n> id: {cell_id}")
                    markdown_lines.append(f'\n> goals: {" ".join(goals)}\n\n')
                code_output, resources = handle_code_cell(cell, resources)
                markdown_lines.append(code_output)

        if nb_headings:
            _, resources = handle_index(nb_headings, resources)

        markdown_lines.append("\n")

        full_text = "".join(markdown_lines)
        if is_problem_set:
            # Drop the separator in front of the first section of a problem set.
            full_text = full_text.replace("\n---\n\n>", "\n\n>", 1)
        return (full_text, resources)
608
609