CoCalc -- TextbookExporter.py

GitHub Repository: quantum-kittens/platypus
Path: blob/main/converter/textbook-converter/textbook_converter/TextbookExporter.py
³⁸⁵⁵ views
1
import re
2

3
from nbconvert.exporters import Exporter
4

5

6
# Indentation unit used when nesting generated output.
INDENT = "    "

# `![hero:alt](path)` -- group 1 captures the image path.
HERO_IMAGE_START = "![hero:"
hero_regex = re.compile(r"^!\[hero:.*]\((.*)\)")

# `![vue:component-name]()` -- group 1 captures the component name.
VUE_COMPONENT_START = "![vue:"
vue_regex = re.compile(r"^!\[vue:(.*)]\(.*\)")

# Image syntaxes: markdown at line start, HTML `<img>`, Mathigon `x-img`,
# and markdown images appearing anywhere inside a line.
IMAGE_START = "!["
markdown_img_regex = re.compile(r"^!\[.*]\((.*)\)")
html_img_regex = re.compile(r'<img(.+?)src="(.+?)"(.*?)/?>')
mathigon_ximg_regex = re.compile(r'x-img\(src="(.*)"\)')
inline_markdown_img_regex = re.compile(r'!\[(.*?)]\((.+?)\)')

# Headings, and an explicit `id="..."` attribute on a tag inside a heading.
# Group 2 is the id value; it is matched lazily so that attributes following
# the id (e.g. `id="a" class="b"`) are not swallowed into the captured id.
HEADING_START = "#"
tag_id_regex = re.compile(r'(<.*?\sid=["\'])(.*?)(["\'])')

# `<!-- ::: block -->` -- group 1 captures the `::: ...` payload.
COMMENT_START = "<!--"
comment_regex = re.compile(r"^<!--\s+(:::.*)\s+-->")

# `[[blank]]` placeholders.
blank_regex = re.compile(r"\[\[(.+?)]]")

# `inline code` spans.
inline_code_regex = re.compile(r"`(.+?)`")

CODE_BLOCK_START = "```"


# JavaScript template (for `str.format`): score goal `id` when the element
# matching `selector` is clicked.
JS_CLICK_GOAL = """
    const {elt} = $section.$("{selector}");
    if ({elt}) {{
      {elt}.on("click", () => {{
        $section.score("{id}");
      }});
    }}
"""

# JavaScript template (for `str.format`): score goal `id` when the element
# matching `selector` holds `value` (or is checked when value is "checked").
JS_VALUE_GOAL = """
    const {elt} = $section.$("{selector}");
    if ({elt}) {{
      {elt}.on("change keyup input paste", (e) => {{
        if ({elt}.value === "{value}" || ("{value}" === "checked" && {elt}.checked)) {{
          e.preventDefault();
          $section.score("{id}");
        }}
      }});
    }}
"""
53

54

55
def handle_inline_images(line):
    """Rewrite markdown images found anywhere in `line` as HTML `<img>` tags.

    Each occurrence of

        ![alt text](path/image)

    becomes

        <img src="path/image" alt="alt text">
    """
    for alt_text, link in inline_markdown_img_regex.findall(line):
        if not link:
            continue
        markdown_form = f'![{alt_text}]({link})'
        html_form = f'<img src="{link}" alt="{alt_text}">'
        line = line.replace(markdown_form, html_form)
    return line
71

72

73
def handle_inline_code(line):
    """Tag inline code spans with a `{code}` language marker.

    A span such as

    `some text`

    is rewritten to

    `{code} some text`

    Spans whose content already starts with `{` or with a backtick are left
    untouched.
    """
    for snippet in inline_code_regex.findall(line):
        if snippet.startswith(("{", "`")):
            continue
        line = line.replace(f"`{snippet}`", f"`{{code}} {snippet}`")
    return line
86

87
def handle_inline_latex(line):
    r"""Escape `\{` and `\}` inside inline `$...$` equations.

    The line is split on `$`; every second segment (the text between a pair
    of `$` delimiters) has `\{` rewritten to `\\{` and `\}` to `\\}`. Text
    outside the delimiters is returned unchanged.

    Args:
        line: A single line of markdown text.

    Returns:
        The line with braces escaped inside inline equations.
    """
    if "$" not in line:
        return line
    segments = line.split("$")
    # Odd-indexed segments are the ones that follow an opening `$`.
    for position in range(1, len(segments), 2):
        segments[position] = (
            segments[position].replace(r"\{", r"\\{").replace(r"\}", r"\\}")
        )
    return "$".join(segments)
100

101

102
def handle_block_comment(comment_syntax):
    """Unwrap a block directive hidden inside an HTML comment.

    Input of the form

    <!-- ::: block content -->

    yields

    ::: block content

    Anything that does not match is returned unchanged.
    """
    found = comment_regex.search(comment_syntax.lstrip())
    return comment_syntax if found is None else found.group(1)
116

117

118
def handle_vue_component(vue_component_syntax):
    """Convert a vue component placeholder into an indented component line.

    Input of the form

    ![vue:some-component]()

    yields the component name on its own line, indented by four spaces
    (preceded by a newline and followed by a newline plus eight spaces):

        some-component

    Anything that does not match is returned unchanged.
    """
    found = vue_regex.search(vue_component_syntax.lstrip())
    if found is None:
        return vue_component_syntax
    component = found.group(1)
    return f"\n    {component}\n        "
136

137

138
def get_attachment_data(image_source, cell=None):
    """Return the data URI for the given image attachment.

    Args:
        image_source: Image reference; only values starting with
            `attachment:` are looked up.
        cell: Mapping with an `attachments` entry that maps attachment names
            to `{mime type: base64 payload}` mappings.

    Returns:
        `data:<mime>;base64,<payload>` for the first `image/*` entry of the
        attachment. `image_source` is returned unchanged when no cell is
        given, the source is not an attachment reference, the attachment is
        missing or empty, or it carries no `image/*` entry.
    """
    prefix = "attachment:"
    if not cell or not image_source.startswith(prefix):
        return image_source
    try:
        bundle = cell["attachments"][image_source[len(prefix):]]
    except (KeyError, TypeError):
        # No attachments on this cell, or no attachment under that name.
        return image_source
    if not bundle:
        return image_source
    for mime_type, payload in bundle.items():
        if mime_type.startswith("image/"):
            return f"data:{mime_type};base64,{payload}"
    # Attachment exists but holds no image data: keep the reference as a string.
    return image_source
148

149

150
def handle_attachments(line, cell):
    """Inline an attachment referenced by an HTML `<img>` tag.

    A line such as

    <img src="attachment:file.png">

    becomes

     <img src="data:image/png;base64,ajdfjaclencQWInak...">

    Only the first `<img>` tag found on the line is inspected; lines without
    one are returned unchanged.
    """
    found = html_img_regex.search(line)
    if found is None:
        return line
    source = found.group(2)
    return line.replace(source, get_attachment_data(source, cell))
167

168

169
def handle_images(line, cell):
    """Turn a markdown image line into a Mathigon figure.

    A line such as

    ![alt text](path/image)

    becomes (the indentation is required):

        figure: x-img(src="path/image")

    Attachment references are resolved against `cell`. Lines that do not
    match are returned unchanged.
    """
    found = markdown_img_regex.search(line.lstrip())
    if found is None:
        return line
    source = get_attachment_data(found.group(1), cell)
    return f'\n    figure: x-img(src="{source}")\n        '
186

187

188
def handle_hero_image(hero_image_syntax):
    """Turn a hero image line into a `> hero:` directive.

    A line such as

    ![hero:alt text](path/image)

    becomes

    > hero: path/image

    Anything that does not match is returned unchanged.
    """
    found = hero_regex.search(hero_image_syntax.lstrip())
    if found is None:
        return hero_image_syntax
    image_path = found.group(1)
    return f"> hero: {image_path}"
202

203

204
def handle_heading(heading_syntax, in_block, suffix, section, is_problem_set=False):
    """Increase header level and compute level, title, and id.

    Args:
        heading_syntax: The markdown heading line (e.g. `## Title`).
        in_block: True when the heading sits inside a `:::` block; such
            headings are only bumped one level and get no id or level.
        suffix: String appended to generated ids; a suffix containing
            `-0-0` marks the heading as a `##` section start.
        section: Optional id to use instead of one derived from the title.
        is_problem_set: Enables the problem-set specific output.

    Returns:
        A tuple `(id, level, title, text)` where `text` is the markup to
        emit. `id` and `level` are `None` for headings inside a block.
    """
    # `partition` tolerates a heading marker with no following space (e.g. a
    # bare "#"), where unpacking `split(" ", 1)` would raise ValueError.
    header, _, title = heading_syntax.partition(" ")
    title = handle_inline_code(title)
    level = header.count("#")
    if in_block:
        return None, None, title, f"#{heading_syntax}\n"

    # Heading text without the leading `#` marker.
    bare_heading = heading_syntax.split(" ", 1)[-1]
    match = tag_id_regex.search(heading_syntax)

    if match is None:
        heading_id = section if section else re.sub(r"\s", "-", title.strip().lower())
        heading_id = re.sub(r"[^\w-]", "", heading_id)
        if level == 1:
            # Mathigon requires all sections to start with `##`
            text = heading_syntax if is_problem_set else f"#{heading_syntax}\n"
        elif "-0-0" in suffix:
            # Mathigon requires all sections to start with `##`
            text = f'## {bare_heading}\n'
        elif level == 2 and is_problem_set:
            heading_id = re.sub(r"\s", "-", bare_heading.strip().lower())
            text = f'\n---\n\n> section: {heading_id}\n\n## {bare_heading}\n'
        else:
            heading_id = heading_id.split("-", 1)[0][:25] + suffix
            text = f'<h{level}>{title} <a id="{heading_id}"></a>\n</h{level}>\n'
        return heading_id, level, title.strip(), text

    # The heading carries its own tag with an id attribute: the title is
    # whatever precedes that tag, and the id is taken from the attribute.
    title = heading_syntax[0 : match.start()].split(" ", 1)[-1].strip()
    heading_id = match.group(2)
    if level == 1:
        # Mathigon requires all sections to start with `##`
        text = f"#{heading_syntax}\n"
    elif "-0-0" in suffix:
        # Mathigon requires all sections to start with `##`
        text = f'## {bare_heading}\n'
    else:
        text = f"<h{level}>{heading_syntax[level:]}\n</h{level}>\n"
    return heading_id, level, title, text
241

242

243
def handle_markdown_cell(cell, resources, cell_number, is_problem_set=False):
    """Reformat the markdown source of a cell line by line.

    Handles, in order of precedence: `$$` latex blocks (converted to
    ```latex fences), fenced code blocks (copied through, a bare opening
    fence gets the `code` language), block comments, hero images, vue
    components, images, headings, and finally plain text lines.

    Args:
        cell: Notebook cell; its `source` is read and it is passed on to the
            attachment helpers.
        resources: Resources mapping; `resources["textbook"]["section"]`, if
            present, is used as the heading id.
        cell_number: Index of the cell, used to build heading id suffixes.
        is_problem_set: Forwarded to `handle_heading`.

    Returns:
        A tuple `(text, resources, headings)` where `headings` is a list of
        `(id, level, title)` tuples for headings outside `:::` blocks.
    """
    markdown_lines = []
    headings = []
    in_latex = False
    in_block = False
    in_code = False
    latex_indent = ""
    # Looked up once; `.get` avoids a KeyError when "textbook" is absent.
    section = resources.get("textbook", {}).get("section")

    for count, line in enumerate(cell.source.splitlines()):
        if in_latex:
            if line.rstrip(" .").endswith("$$"):
                # Closing line of a multi-line `$$` block.
                body = line.replace("$$", "")
                markdown_lines.append(f"{body}\n" if body else body)
                markdown_lines.append(f"{latex_indent}```\n")
                in_latex = False
            else:
                markdown_lines.append(line)
                markdown_lines.append("\n")
            continue
        if line.lstrip().startswith("$$"):
            # Everything before the first `$$` is the leading whitespace.
            latex_indent, body = line.split("$$", 1)
            markdown_lines.append(f"{latex_indent}```latex\n")
            closes_here = body.rstrip(" .").endswith("$$")
            if closes_here:
                body = body.replace("$$", "")
            markdown_lines.append(f"{latex_indent}{body}\n" if body else body)
            if closes_here:
                markdown_lines.append(f"{latex_indent}```\n")
            in_latex = not closes_here
            continue

        if in_code:
            if line.lstrip().startswith(CODE_BLOCK_START):
                in_code = False
            markdown_lines.append(line + "\n")
            continue
        if line.lstrip().startswith(CODE_BLOCK_START):
            in_code = True
            if line.rstrip().endswith(CODE_BLOCK_START):
                # Opening fence without a language: label it `code`.
                markdown_lines.append(line.rstrip() + "code\n")
            else:
                markdown_lines.append(line + "\n")
            continue

        line = handle_attachments(line, cell)
        stripped = line.lstrip()

        # NOTE(review): the comment and hero branches append their result
        # without a trailing newline, unlike the other branches -- confirm
        # this is intended before changing it.
        if stripped.startswith(COMMENT_START):
            converted = handle_block_comment(line)
            marker = converted.strip()
            if marker.endswith(":::"):
                in_block = False
            elif marker.startswith(":::"):
                in_block = True
            markdown_lines.append(converted)
        elif stripped.startswith(HERO_IMAGE_START):
            markdown_lines.append(handle_hero_image(line))
        elif stripped.startswith(VUE_COMPONENT_START):
            markdown_lines.append(handle_vue_component(line))
        elif stripped.startswith(IMAGE_START):
            markdown_lines.append(handle_images(line, cell))
        elif stripped.startswith(HEADING_START):
            heading_id, level, title, heading_text = handle_heading(
                line, in_block, f"-{cell_number}-{count}", section, is_problem_set
            )
            if not in_block:
                headings.append((heading_id, level, title))
            markdown_lines.append(heading_text)
        else:
            line = handle_inline_latex(line)
            line = handle_inline_code(line)
            line = handle_inline_images(line)
            markdown_lines.append(line.replace("\\%", "\\\\%"))
            markdown_lines.append("\n")

    markdown_lines.append("\n")
    return "".join(markdown_lines), resources, headings
332

333

334
def handle_code_cell_output(cell_output):
    """Render a single code cell output as markup text.

    Image entries win over everything else (SVG is emitted verbatim, other
    images as a base64 `<img>` tag), followed by `text/html`, `text/latex`
    (with `$$` removed) and `text/plain` (as a `pre` block). Outputs without
    `data` fall back to their `text` entry, also as a `pre` block.

    Returns:
        The rendered string, or `None` when nothing could be rendered.
    """
    if "data" in cell_output:
        data = cell_output["data"]
        for mime_type, payload in data.items():
            if "image/svg+xml" in mime_type:
                return "".join(data["image/svg+xml"])
            if "image/" in mime_type:
                return f'<img src="data:{mime_type};base64,{payload}"/>'
        if "text/html" in data:
            return "".join(data["text/html"])
        if "text/latex" in data:
            latex = "".join(data["text/latex"])
            return latex.strip().replace("$$", "")
        if "text/plain" in data:
            plain = "".join(data["text/plain"])
            return f"pre \n{INDENT}| " + plain.replace("\n", f"\n{INDENT}| ")
        return None

    if "text" in cell_output:
        stream = "".join(cell_output["text"])
        return f"pre \n{INDENT}| " + stream.replace("\n", f"\n{INDENT}| ")

    return None
355

356

357
def handle_grader_metadata(cell_metada):
    """Parse grader metadata and return code exercise widget syntax.

    Args:
        cell_metada: Cell metadata mapping. Either the pair
            `grader_import`/`grader_function` or the pair
            `grader_id`/`grader_answer` enables grader attributes; the
            first entry of `goals`, if any, adds a `goal` attribute.

    Returns:
        A string of the form `q-code-exercise(<attributes>)`; the attribute
        list is empty when no grader pair is present.
    """
    grader_attr = None

    if "grader_import" in cell_metada and "grader_function" in cell_metada:
        grader_import = cell_metada["grader_import"]
        grader_function = cell_metada["grader_function"]
        grader_attr = f'grader-import="{grader_import}" grader-function="{grader_function}"'
    elif "grader_id" in cell_metada and "grader_answer" in cell_metada:
        grader_id = cell_metada["grader_id"]
        grader_answer = cell_metada["grader_answer"]
        grader_attr = f'grader-id="{grader_id}" grader-answer="{grader_answer}"'

    if grader_attr:
        goals = cell_metada["goals"] if "goals" in cell_metada else None
        # Skip missing or empty goal lists; key access works for plain dicts
        # as well as for attribute-style mappings.
        if goals:
            goal_id = goals[0]["id"]
            grader_attr = f'{grader_attr} goal="{goal_id}"'

    return f"q-code-exercise({grader_attr or ''})"
378

379

380
def handle_code_cell(cell, resources):
    """Wrap a code cell in a `:::` exercise block.

    The source is indented under a `pre.` element, with `<`, `[[` and `]]`
    escaped and `# pylint:` comments stripped. Cell outputs are appended
    under an `output` element unless output is disabled, either on the cell
    (`include_output` metadata) or globally
    (`resources["textbook"]["include_output"]`).

    Returns:
        A tuple `(text, resources)`.
    """
    escaped = cell.source.replace("\n", "\n      ")
    escaped = escaped.replace("<", "&lt;")
    escaped = escaped.replace("[[", "[ [")
    escaped = escaped.replace("]]", "] ]")
    escaped = re.sub(r'[\^]?\s*# pylint:.*', '', escaped)

    widget = handle_grader_metadata(cell.metadata)
    pieces = [f"\n::: {widget}\n", "    pre.\n      ", escaped, "\n\n"]

    if "textbook" not in resources:
        resources["textbook"] = {}
    textbook = resources["textbook"]

    # The cell-level setting takes priority over the notebook-wide one.
    include_output = None
    if "include_output" in cell.metadata:
        include_output = cell.metadata["include_output"]
    if include_output is None and "include_output" in textbook:
        include_output = textbook["include_output"]

    if include_output is not False and len(cell.outputs):
        pieces.append("\n    output\n")
        deep = INDENT * 2
        deeper = INDENT * 3
        for cell_output in cell.outputs:
            is_latex = "data" in cell_output and "text/latex" in cell_output["data"]
            rendered = handle_code_cell_output(cell_output) or ""
            if rendered.startswith("pre"):
                rendered = deep + rendered.replace("\n", f"\n{deep}")
            elif is_latex:
                body = rendered.replace("\n", f"\n{deeper}").strip()
                rendered = (
                    f"{deep}div.md.\n{deeper}```latex\n{deeper}"
                    + body
                    + f"\n{deeper}```"
                )
            elif len(rendered):
                rendered = f"{deep}div.\n{deeper}" + rendered.replace(
                    "\n", f"\n{deeper}"
                )
            else:
                # Nothing renderable in this output.
                continue
            pieces.append(f"{rendered}\n\n")

    pieces.append(":::\n")
    return "".join(pieces), resources
435

436

437
def handle_cell_glossary(cell, resources=None):
    """Gather 'gloss' data.

    Merges the cell's `gloss` metadata into
    `resources["textbook"]["glossary"]`; entries from the cell override
    existing entries with the same key.

    Args:
        cell: Notebook cell whose `metadata` may contain a `gloss` mapping.
        resources: Resources mapping to update; a fresh dict is created when
            omitted (a shared mutable default would leak between calls).

    Returns:
        The (possibly updated) resources mapping.
    """
    if resources is None:
        resources = {}

    if "gloss" in cell.metadata and cell.metadata["gloss"]:
        glossary = cell.metadata["gloss"]
        textbook = resources.setdefault("textbook", {})
        textbook["glossary"] = {**textbook.get("glossary", {}), **glossary}

    return resources
451

452

453
def handle_cell_formulas(cell, resources=None):
    """Gather 'formulas' data.

    Merges the cell's `formulas` metadata into
    `resources["textbook"]["formulas"]`; entries from the cell override
    existing entries with the same key.

    Args:
        cell: Notebook cell whose `metadata` may contain a `formulas`
            mapping.
        resources: Resources mapping to update; a fresh dict is created when
            omitted (a shared mutable default would leak between calls).

    Returns:
        The (possibly updated) resources mapping.
    """
    if resources is None:
        resources = {}

    if "formulas" in cell.metadata and cell.metadata["formulas"]:
        formulas = cell.metadata["formulas"]
        textbook = resources.setdefault("textbook", {})
        textbook["formulas"] = {**textbook.get("formulas", {}), **formulas}

    return resources
467

468

469
def handle_cell_goals(id, cell, resources=None):
    """Convert 'goals' dictionary to javascript function (string).

    For a cell with `goals` metadata, a JavaScript function named `id` is
    appended to `resources["textbook"]["functions"]`. Goals containing a
    `click` key get a click handler, goals containing a `value` key get a
    value handler.

    Args:
        id: Name of the generated JavaScript function.
        cell: Notebook cell whose `metadata` may contain a `goals` list.
        resources: Resources mapping to update; a fresh dict is created when
            omitted (a shared mutable default would leak between calls).

    Returns:
        A tuple `(goal_ids, resources)`; `goal_ids` lists each distinct goal
        id once, in the order the goals are declared.
    """
    if resources is None:
        resources = {}
    goal_ids = []

    if "goals" in cell.metadata and cell.metadata["goals"]:
        actions = [
            f"export function {id}($section: Step) {{ ",
            "  setTimeout(() => {",
        ]

        for count, goal in enumerate(cell.metadata["goals"]):
            element = "elt" + str(count)
            if "click" in goal:
                actions.append(
                    JS_CLICK_GOAL.format(
                        elt=element, selector=goal["selector"], id=goal["id"]
                    )
                )
            if "value" in goal:
                actions.append(
                    JS_VALUE_GOAL.format(
                        elt=element,
                        selector=goal["selector"],
                        id=goal["id"],
                        value=goal["value"],
                    )
                )
            # Keep declaration order so the generated output is reproducible
            # (iterating a set of strings is not stable between runs).
            if goal["id"] not in goal_ids:
                goal_ids.append(goal["id"])

        actions.append("  }, 250);")
        actions.append("}\n")

        textbook = resources.setdefault("textbook", {})
        textbook["functions"] = textbook.get("functions", "") + "\n".join(actions)

    return goal_ids, resources
509

510

511
def handle_index(headers, resources=None):
    """Create an index of the subsections (with max depth of 2).

    The first heading (of level 3 or less) names the top section. Later
    headings start a new entry when their level is at or above the level of
    the current entry, and otherwise become subsections of the latest
    entry. Headings deeper than level 3 are ignored.

    Args:
        headers: Iterable of `(id, level, title)` tuples.
        resources: Resources mapping to update; a fresh dict is created when
            omitted (a shared mutable default would leak between calls).

    Returns:
        A tuple `(index, resources)`. The index is stored under
        `resources["textbook"]["index"]` unless one is already present.
    """
    if resources is None:
        resources = {}

    top_section = ""
    entries = []
    last_level = -1

    for heading_id, level, title in headers:
        if level > 3:
            continue
        if not top_section:
            top_section = heading_id
            continue
        entry = {"id": heading_id, "title": title, "subsections": []}
        if level <= last_level or not entries:
            entries.append(entry)
            last_level = level
        else:
            entries[-1]["subsections"].append(entry)

    index = {top_section: entries}
    # An index already stored in resources is left in place.
    resources.setdefault("textbook", {}).setdefault("index", index)

    return index, resources
538

539

540
class TextbookExporter(Exporter):
    """Exporter that converts a notebook into textbook markdown."""

    output_mimetype = "text/markdown"

    def _file_extension_default(self):
        return ".md"

    def from_notebook_node(self, nb, resources=None, **kw):
        """Convert a notebook node to a markdown string.

        Markdown cells are reformatted, non-empty code cells are wrapped in
        exercise blocks (cells tagged `sanity-check` are skipped), and cells
        with goals or `[[blanks]]` are delimited as separate steps.

        Reads `id` and `is_problem_set` from `resources["textbook"]` when
        present, and stores the heading index there when headings exist.

        Returns:
            A tuple `(text, resources)`.
        """
        nb_copy, resources = super().from_notebook_node(nb, resources)

        markdown_lines = []
        prefix = ""
        is_problem_set = False

        if "textbook" not in resources:
            resources["textbook"] = {}
        if "id" in resources["textbook"]:
            # Step ids are built from the letters of the textbook id only.
            textbook_id = resources["textbook"]["id"]
            prefix = re.sub("[^a-zA-Z]", "", textbook_id).lower()
        if "is_problem_set" in resources["textbook"]:
            is_problem_set = resources["textbook"]["is_problem_set"]

        nb_headings = []
        for count, cell in enumerate(nb_copy.cells):
            cell_id = prefix + str(count)
            # Reset per cell: a markdown cell with blanks never computes
            # goals, which previously left the name unbound (or stale).
            goals = []

            if cell.cell_type == "markdown":
                resources = handle_cell_glossary(cell, resources)
                resources = handle_cell_formulas(cell, resources)

                blanks = blank_regex.findall(cell.source)
                if not len(blanks):
                    goals, resources = handle_cell_goals(cell_id, cell, resources)
                    if goals:
                        markdown_lines.append(f"\n---\n> id: {cell_id}")
                        markdown_lines.append(f'\n> goals: {" ".join(goals)}\n\n')
                else:
                    markdown_lines.append(f"\n---\n> id: {cell_id}\n\n")

                markdown_output, resources, headings = handle_markdown_cell(
                    cell, resources, count, is_problem_set=is_problem_set
                )
                markdown_lines.append(markdown_output)

                if goals or len(blanks):
                    markdown_lines.append("\n\n---\n")
                if headings:
                    nb_headings += headings
                continue

            if cell.cell_type == "code" and cell.source.strip():
                if 'tags' in cell.metadata and 'sanity-check' in cell.metadata['tags']:
                    # Ignore cell
                    continue
                goals, resources = handle_cell_goals(cell_id, cell, resources)
                if goals:
                    markdown_lines.append(f"\n---\n> id: {cell_id}")
                    markdown_lines.append(f'\n> goals: {" ".join(goals)}\n\n')
                code_output, resources = handle_code_cell(cell, resources)
                markdown_lines.append(code_output)

        if nb_headings:
            _, resources = handle_index(nb_headings, resources)

        markdown_lines.append("\n")

        full_text = "".join(markdown_lines)
        if is_problem_set:
            # Drop the first section separator of a problem set.
            full_text = full_text.replace("\n---\n\n>", "\n\n>", 1)
        return (full_text, resources)
608

609
Product

Resources

Company