CoCalc -- tutobooks.py

GitHub Repository: keras-team/keras-io
Path: blob/master/scripts/tutobooks.py
³²⁷³ views
1
"""Keras tutobooks implementation.
2

3
A tutobook is a tutorial available simultaneously as a notebook,
4
as a Python script, and as a nicely rendered webpage.
5

6
Its source of truth (for manual editing and version control) is
7
its Python script form, but you can also create one by starting
8
from a notebook and converting it with the command `nb2py`.
9

10
Text cells are stored in markdown-formatted comment blocks.
11
The first line of a block (the one starting with three double quotes) may
optionally contain a special
12
annotation, one of:
13

14
- invisible: do not render this block.
15
- shell: execute this block while prefixing each line with `!`.
16

17
The script form should start with a header with the following fields:
18
Title:
19
Author: (could be `Authors`: as well, and may contain markdown links)
20
Date created: (date in yyyy/mm/dd format)
21
Last modified: (date in yyyy/mm/dd format)
22
Description: (one-line text description)
Accelerator: (one of `GPU`, `TPU` or `None`)
23

24
## How to add a new code example to Keras.io
25

26
You would typically start from an existing notebook.
27

28
Save it to disk (let's say as `path_to_your_nb.ipynb`).
29
`cd` to the `keras-io/scripts/` directory.
30

31
Then run:
32

33
```
34
python tutobooks.py nb2py path_to_your_nb.ipynb ../examples/your_example.py
35
```
36

37
This will create the file `examples/your_example.py`. Open it,
38
fill in the headers, and generally edit it so that it looks nice.
39

40
NOTE THAT THE CONVERSION SCRIPT MAY MAKE MISTAKES IN ITS ATTEMPTS
41
TO SHORTEN LINES. MAKE SURE TO PROOFREAD THE GENERATED .py IN FULL.
42
Or alternatively, make sure to keep your lines reasonably-sized (<90 char)
43
to start with, so that the script won't have to shorten them.
44

45
You can then preview what it looks like when converted back again
46
to ipynb by running:
47

48
```
49
python tutobooks.py py2nb ../examples/your_example.py preview.ipynb
50
```
51

52
NOTE THAT THIS COMMAND WILL ERROR OUT IF ANY CELL TAKES TOO LONG
53
TO EXECUTE. In that case, make your code lighter/faster.
54
Remember that examples are meant to demonstrate workflows, not
55
train state-of-the-art models. They should
56
stay very lightweight.
57

58
Open the generated `preview.ipynb` and make sure it looks like what
59
you expect. If not, keep editing `your_example.py` until it does.
60

61
Finally, submit a PR adding `examples/your_example.py`.
62
"""
63

64
import os
65
import re
66
import sys
67
import json
68
import copy
69
import random
70
import shutil
71
import tempfile
72
from pathlib import Path
73

74
# Value passed as `--ExecutePreprocessor.timeout` to `jupyter nbconvert`,
# expressed in seconds (12 h * 60 min * 60 s).
TIMEOUT = 43200
# Upper bound on the lines of code a tutobook may contain; `py_to_nb` raises
# when the count exceeds it.
MAX_LOC = 350
76

77

78
def nb_to_py(nb_path, py_path):
    """Convert a Jupyter notebook into a tutobook Python script.

    The generated script starts with a header whose fields are all set to
    `FILLME`. Each markdown cell becomes a triple-quote-fenced block, each
    code cell whose first line starts with `!` becomes a `shell`-tagged
    block, and every other code cell is copied as plain code. The result is
    formatted with `black` and then passed through `_shorten_lines`.

    Args:
        nb_path: Path of the notebook (`.ipynb`) to read.
        py_path: Path of the Python script to write.
    """
    with open(nb_path, encoding="utf-8") as f:
        nb = json.load(f)

    # The header lists every field that `validate` requires, including
    # `Accelerator:`, so that filling in the placeholders is enough.
    parts = [
        '"""\n',
        "Title: FILLME\n",
        "Author: FILLME\n",
        "Date created: FILLME\n",
        "Last modified: FILLME\n",
        "Description: FILLME\n",
        "Accelerator: FILLME\n",
        '"""\n',
    ]
    for cell in nb["cells"]:
        source = cell["source"]
        body = "".join(source)
        if cell["cell_type"] == "code":
            # A cell is a shell cell when its first line starts with `!`.
            first_line = source[0] if source else ""
            if first_line.startswith("!"):
                parts.append('"""shell\n' + body + "\n" + '"""\n\n')
            else:
                parts.append(body + "\n\n")
        elif cell["cell_type"] == "markdown":
            parts.append('"""\n' + body + "\n" + '"""\n\n')
    py = "".join(parts)

    with open(py_path, "w", encoding="utf-8") as f:
        f.write(py)
    # Format file with Black. `str()` lets callers pass a `pathlib.Path`.
    os.system("black " + str(py_path))
    # Shorten lines
    with open(py_path, encoding="utf-8") as f:
        py = f.read()
    try:
        py = _shorten_lines(py)
    finally:
        # Even if shortening fails, leave the Black-formatted script on disk.
        with open(py_path, "w", encoding="utf-8") as f:
            f.write(py)
119

120

121
def py_to_nb(py_path, nb_path, fill_outputs=False):
    """Convert a tutobook Python script into a Jupyter notebook.

    The script is validated first. Its header becomes the first markdown
    cell, then every following script element becomes a code or markdown
    cell. `shell`-tagged blocks become code cells whose lines are prefixed
    with `!`, and `invisible`-tagged blocks are dropped.

    Args:
        py_path: Path of the tutobook script to read.
        nb_path: Path of the notebook to write.
        fill_outputs: If True, execute the written notebook in place with
            `jupyter nbconvert`, then delete any file or folder that
            appeared next to the notebook during execution.

    Raises:
        ValueError: If the script fails validation or contains more than
            `MAX_LOC` lines of code.
    """
    with open(py_path, encoding="utf-8") as f:
        py = f.read()
    validate(py)

    header, _, py, _ = _get_next_script_element(py)
    attributes = _parse_header(header)
    loc = 0
    # Write first header cell
    cells = [
        {
            "cell_type": "markdown",
            "source": [
                "# " + attributes["title"] + "\n",
                "\n",
                "**"
                + attributes["auth_field"]
                + ":** "
                + attributes["author"]
                + "<br>\n",
                "**Date created:** " + attributes["date_created"] + "<br>\n",
                "**Last modified:** " + attributes["last_modified"] + "<br>\n",
                "**Description:** " + attributes["description"],
            ],
            "metadata": {"colab_type": "text"},
        }
    ]
    while py:
        element, cell_type, py, tag = _get_next_script_element(py)
        lines = element.split("\n")

        if all(line == "" for line in lines):
            continue

        # Drop a single leading empty line.
        if lines and not lines[0]:
            lines = lines[1:]
        source = [line + "\n" for line in lines]
        # Drop a single trailing blank line, then the final newline char.
        if source and not source[-1].strip():
            source = source[:-1]
        if source:
            source[-1] = source[-1].rstrip()
        if tag == "shell":
            source = ["!" + line for line in source]
            cell_type = "code"
        if tag == "invisible" or not source:
            continue

        cell = {"cell_type": cell_type, "source": source}
        if cell_type == "code":
            cell["outputs"] = []
            cell["metadata"] = {"colab_type": "code"}
            cell["execution_count"] = 0
            loc += _count_locs(source)
        else:
            cell["metadata"] = {"colab_type": "text"}
        cells.append(cell)

    # Work on a private copy so the module-level template is never mutated.
    notebook = copy.deepcopy(NB_BASE)
    # Notebook name is the script file name without its `.py` suffix.
    notebook["metadata"]["colab"]["name"] = os.path.basename(str(py_path))[:-3]
    notebook["metadata"]["accelerator"] = attributes["accelerator"]
    notebook["cells"] = cells
    if loc > MAX_LOC:
        raise ValueError(
            f"Found {loc} lines of code, but expected fewer than {MAX_LOC}"
        )

    with open(nb_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(notebook, indent=1, sort_keys=True))

    if fill_outputs:
        print("Generating ipynb")
        parent_dir = Path(nb_path).parent
        # Snapshot of the directory so that execution by-products can be
        # identified and removed afterwards.
        current_files = os.listdir(parent_dir)
        try:
            os.system(
                "jupyter nbconvert --to notebook --execute --debug "
                + str(nb_path)
                + " --inplace"
                + " --ExecutePreprocessor.timeout="
                + str(TIMEOUT)
            )
        finally:
            for fname in os.listdir(parent_dir):
                if fname in current_files:
                    continue
                fpath = parent_dir / fname
                if os.path.isdir(fpath):
                    print("Removing created folder:", fname)
                    shutil.rmtree(fpath)
                else:
                    print("Removing created file:", fname)
                    os.remove(fpath)
210

211

212
def nb_to_md(nb_path, md_path, img_dir, working_dir=None):
    """Execute a notebook and render it as a markdown file.

    The notebook is copied into a working directory and converted there
    with `jupyter nbconvert --to markdown --execute`. Images produced by the
    conversion (`png`, `jpg`, `jpeg`) are copied to `img_dir/<md name>`, and
    image links in the markdown are rewritten to point there. Output blocks
    are post-processed by `_make_output_code_blocks`.

    Args:
        nb_path: Path of the notebook to convert.
        md_path: Path of the markdown file to write; must end with `.md`.
        img_dir: Directory under which a per-document image folder is
            created.
        working_dir: Directory in which the conversion runs. When None, a
            `tmp_<random number>` directory is created in the current
            directory and removed afterwards.

    Raises:
        RuntimeError: If nbconvert did not produce the markdown file.
    """
    img_exts = ("png", "jpg", "jpeg")
    # Assumes an already populated notebook.
    assert str(md_path).endswith(".md")
    current_dir = os.getcwd()
    original_img_dir = str(img_dir)
    if original_img_dir.endswith("/"):
        original_img_dir = original_img_dir[:-1]
    nb_path = os.path.abspath(nb_path)
    nb_fname = os.path.basename(nb_path)

    md_name = os.path.basename(str(md_path))[:-3]
    target_md = md_name + ".md"
    # Absolute, so it stays valid after changing into the working directory.
    img_dir = Path(os.path.abspath(img_dir)) / md_name

    del_working_dir = working_dir is None
    if del_working_dir:
        # `randint` needs integer bounds: float bounds such as 1e6 raise
        # TypeError on Python 3.12+.
        working_dir = "tmp_" + str(random.randint(10**6, 10**7))
    os.makedirs(working_dir, exist_ok=True)
    print("Using working_dir:", working_dir)

    success = False
    try:
        os.chdir(working_dir)
        try:
            shutil.copyfile(nb_path, nb_fname)
            os.makedirs(img_dir, exist_ok=True)
            os.system(
                "jupyter nbconvert --to markdown --execute --debug "
                + nb_fname
                + " --output "
                + target_md
                + " --ExecutePreprocessor.timeout="
                + str(TIMEOUT)
            )
            # The existence of the output file is the only success signal
            # used here.
            success = os.path.exists(target_md)
            tmp_img_dir = md_name + "_files"
            if success and os.path.exists(tmp_img_dir):
                for fname in os.listdir(tmp_img_dir):
                    if fname.endswith(img_exts):
                        src = Path(tmp_img_dir) / fname
                        target = img_dir / fname
                        print("copy", src, "to", target)
                        shutil.copyfile(src, target)
        finally:
            # Always return to the caller's directory, even on failure;
            # `working_dir` and `md_path` may be relative to it.
            os.chdir(current_dir)

        if success:
            with open(Path(working_dir) / target_md, encoding="utf-8") as f:
                md_content = f.read()
            for ext in img_exts:
                md_content = md_content.replace(
                    "![" + ext + "](" + md_name + "_files",
                    "![" + ext + "](" + original_img_dir + "/" + md_name,
                )
            md_content = _make_output_code_blocks(md_content)
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(md_content)
    finally:
        # Only remove the directory if this function created it.
        if del_working_dir:
            shutil.rmtree(working_dir)

    if not success:
        raise RuntimeError(
            "An error was encountered when attempting to run the notebook. "
            "See logs for details."
        )
281

282

283
def py_to_md(py_path, nb_path, md_path, img_dir, working_dir=None):
    """Render a tutobook script as markdown via an intermediate notebook.

    First writes the notebook at `nb_path` from `py_path` without filling
    its outputs, then converts that notebook to `md_path` with `nb_to_md`.

    Args:
        py_path: Path of the tutobook script to read.
        nb_path: Path of the intermediate notebook to write.
        md_path: Path of the markdown file to write.
        img_dir: Image directory forwarded to `nb_to_md`.
        working_dir: Working directory forwarded to `nb_to_md`.
    """
    py_to_nb(py_path=py_path, nb_path=nb_path, fill_outputs=False)
    nb_to_md(
        nb_path=nb_path,
        md_path=md_path,
        img_dir=img_dir,
        working_dir=working_dir,
    )
286

287

288
def validate(py):
    """Validate the format of a tutobook script.

    Specifically:
        - validate headers
        - reject single-line triple-quote-fenced comments and trailing spaces
        - validate style with black
        - reject direct links to arXiv PDFs

    Args:
        py: Full text of the tutobook script.

    Raises:
        ValueError: On the first check that fails. Line numbers in messages
            are 1-based.
    """
    lines = py.split("\n")
    if not lines[0].startswith('"""'):
        raise ValueError('Missing `"""`-fenced header at top of script.')

    # Pad with empty strings so that a truncated header triggers the matching
    # "Missing ..." error instead of an IndexError.
    header = (lines[1:7] + [""] * 6)[:6]
    (
        title_line,
        author_line,
        created_line,
        modified_line,
        description_line,
        accelerator_line,
    ) = header
    if not title_line.startswith("Title: "):
        raise ValueError("Missing `Title:` field.")
    if not author_line.startswith(("Author: ", "Authors: ")):
        raise ValueError("Missing `Author:` field.")
    if not created_line.startswith("Date created: "):
        raise ValueError("Missing `Date created:` field.")
    if not modified_line.startswith("Last modified: "):
        raise ValueError("Missing `Last modified:` field.")
    if not description_line.startswith("Description: "):
        raise ValueError("Missing `Description:` field.")
    if not accelerator_line.startswith("Accelerator: "):
        raise ValueError("Missing `Accelerator:` field.")

    description = description_line[len("Description: ") :]
    if not description:
        raise ValueError("Missing `Description:` field content.")
    if not description[0] == description[0].upper():
        raise ValueError("Description field content must be capitalized.")
    if not description[-1] == ".":
        raise ValueError("Description field content must end with a period.")
    if len(description) > 100:
        raise ValueError("Description field content must be less than 100 chars.")

    accelerator = accelerator_line[len("Accelerator: ") :]
    accelerator_options = ["GPU", "TPU", "None"]
    if accelerator not in accelerator_options:
        raise ValueError(
            f"Accelerator field content must be one of: {accelerator_options}. "
            f"Received: accelerator={accelerator}"
        )

    # Line numbers are reported 1-based, as editors display them.
    for lineno, line in enumerate(lines, start=1):
        if line.startswith('"""') and line.endswith('"""') and len(line) > 3:
            raise ValueError(
                'Do not use single line `"""`-fenced comments. '
                "Encountered at line %d" % (lineno,)
            )
    for lineno, line in enumerate(lines, start=1):
        if line.endswith(" "):
            raise ValueError(
                "Found trailing space on line %d; line: `%s`" % (lineno, line)
            )

    # Validate style with black: format a temporary copy and check that black
    # left it unchanged. `mkstemp` gives a unique file without relying on
    # random numbers.
    fd, fpath = tempfile.mkstemp(suffix=".py")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(py)
        os.system("black " + fpath)
        with open(fpath, encoding="utf-8") as f:
            formatted = f.read()
    finally:
        # Remove the temporary file even if writing or reading fails.
        os.remove(fpath)
    if formatted != py:
        raise ValueError(
            "Your python file did not follow `black` conventions. "
            "Run `black your_file.py` to autoformat it."
        )

    # Extra checks.
    if "//arxiv.org/pdf/" in py:
        raise ValueError(
            "Do not link to arXiv PDFs directly. " "Instead, link to the abstract page."
        )
359

360

361
def count_locs_in_file(py_path):
    """Count the lines of code in a tutobook script file.

    The script is split into elements with `_get_next_script_element`, and
    only `code` elements are counted (using `_count_locs`). The header needs
    no special handling: it is a triple-quote-fenced element, so it is parsed
    as markdown and contributes nothing.

    Args:
        py_path: Path of the tutobook script to read.

    Returns:
        The total number of lines of code found.
    """
    with open(py_path, encoding="utf-8") as f:
        py = f.read()
    loc = 0
    while py:
        element, cell_type, py, _ = _get_next_script_element(py)
        if cell_type == "code":
            loc += _count_locs(element.split("\n"))
    return loc
373

374

375
def _count_locs(lines):
376
    loc = 0
377
    string_open = False
378
    for line in lines:
379
        line = line.strip()
380
        if not line or line.startswith("#"):
381
            continue
382
        if not string_open:
383
            if not line.startswith('"""'):
384
                loc += 1
385
            else:
386
                if not line.endswith('"""'):
387
                    string_open = True
388
        else:
389
            if line.startswith('"""'):
390
                string_open = False
391
    return loc
392

393

394
def _shorten_lines(py):
395
    max_len = 90
396
    lines = []
397
    for line in py.split("\n"):
398
        if len(line) <= max_len:
399
            lines.append(line)
400
            continue
401
        i = 0
402
        while len(line) > max_len:
403
            line = line.lstrip()
404
            if " " not in line[1:]:
405
                lines.append(line)
406
                break
407
            else:
408
                short_line = line[:max_len]
409
                line = line[max_len:]
410
                if " " in short_line:
411
                    reversed_short_line = short_line[::-1]
412
                    index = reversed_short_line.find(" ") + 1
413
                    line = short_line[-index:] + line
414
                    short_line = short_line[:-index]
415

416
                lines.append(short_line.lstrip())
417
            i += 1
418
            if i > 10:
419
                raise
420
        lines.append(line.lstrip())
421
    return "\n".join(lines)
422

423

424
def _get_next_script_element(py):
425
    lines = py.split("\n")
426
    assert lines
427
    elines = []
428
    i = 0
429
    tag = None
430
    if lines[0].startswith('"""'):
431
        assert len(lines) >= 2
432
        etype = "markdown"
433
        if len(lines[0]) > 3:
434
            tag = lines[0][3:]
435
            if tag not in ["shell", "invisible"]:
436
                raise ValueError("Found unknown cell tag:", tag)
437
        lines = lines[1:]
438
    else:
439
        etype = "code"
440

441
    for i, line in enumerate(lines):
442
        if line.startswith('"""'):
443
            break
444
        else:
445
            elines.append(line)
446

447
    if etype == "markdown":
448
        py = "\n".join(lines[i + 1 :])
449
    else:
450
        py = "\n".join(lines[i:])
451
    e = "\n".join(elines)
452

453
    return e, etype, py, tag
454

455

456
def _parse_header(header):
457
    lines = header.split("\n")
458
    if len(lines) not in (6, 7):
459
        raise ValueError("Invalid header, it should be exactly 6 or 7 lines.")
460
    title = lines[0][len("Title: ") :]
461
    author_line = lines[1]
462
    if author_line.startswith("Authors"):
463
        author = author_line[len("Authors: ") :]
464
        auth_field = "Authors"
465
    else:
466
        author = author_line[len("Author: ") :]
467
        auth_field = "Author"
468
    date_created = lines[2][len("Date created: ") :]
469
    last_modified = lines[3][len("Last modified: ") :]
470
    description = lines[4][len("Description: ") :]
471
    accelerator = lines[5][len("Accelerator: ") :]
472
    return {
473
        "title": title,
474
        "author": author,
475
        "auth_field": auth_field,
476
        "date_created": date_created,
477
        "last_modified": last_modified,
478
        "description": description,
479
        "accelerator": accelerator,
480
    }
481

482

483
def _make_output_code_blocks(md):
    """Wrap indented output blocks of a markdown document in code fences.

    Lines inside existing backtick fences are copied unchanged. Outside of
    them, runs of lines recognised as cell output (4-space indented lines,
    blank lines within a run, and progress-bar lines containing ESC or
    backspace characters) are collected and emitted as a fenced code block
    wrapped in a `<div class="k-default-codeblock">`. Within such a block,
    consecutive progress-bar updates are collapsed to their last non-empty
    state and repeated empty lines are deduplicated.

    Args:
        md: Markdown text.

    Returns:
        The transformed markdown text.
    """
    lines = md.split("\n")
    # Output lines collected for the block currently being built.
    output_lines = []
    # Lines of the resulting document.
    final_lines = []
    is_inside_backticks = False

    def is_output_line(line, prev_line, output_lines):
        """Tell whether `line` belongs to an output block.

        `prev_line` is the last line already emitted to the result, and
        `output_lines` is the block collected so far (empty when no block
        is open).
        """
        if (
            output_lines
            and "\x08" in output_lines[-1]
            and not output_lines[-1].replace("\x08", "").strip()
        ):
            # We already started a block and the last line is just deletes, that
            # implies that the current line will be part of the block (progress
            # bar being re-written).
            return True

        if line.startswith("    ") and len(line) >= 5:
            # Non-empty indented line
            if output_lines:
                # Continuation of the output block
                return True
            if not prev_line.strip():
                # Beginning of an output block
                return True
        elif not line.strip():
            # Empty line
            if output_lines:
                # Continuation of the output block
                return True
        elif line.strip()[0] in ("\x1b", "\x08"):
            # Line starts with ESC or delete character, it must be a progress
            # bar, which is often not indented.
            return True
        return False

    def flush(output_lines, final_lines):
        """Append `output_lines` to `final_lines` as one fenced code block."""
        final_lines.append('<div class="k-default-codeblock">')
        final_lines.append("```")

        # When not None, we are in a progress bar and this is its last state.
        progress_bar = None
        # Used to dedupe empty lines. Also used when in a progress bar.
        previous_line_empty = False

        for line in output_lines:
            # Unindent.
            if line.startswith("    "):
                # Normal block is indented by 4 spaces.
                line = line[4:]
            else:
                # Progress bar and empty lines.
                line = line.strip()

            if "\x1b" in line or "\x08" in line:
                # This is a progress bar.
                if "\x1b" in line:
                    # Remove escape sequences.
                    line = re.sub(r"\x1b\[[0-9][0-9]?m", "", line)

                if "\x08" in line:
                    # Delete characters, remove everything up to the last one.
                    line = line[line.rindex("\x08") + 1 :].strip()

                if previous_line_empty and progress_bar is None:
                    # We're starting a progress bar, flush the empty line.
                    final_lines.append("")

                if progress_bar is None or line:
                    # Update latest progress bar content.
                    progress_bar = line

                previous_line_empty = not line
                # When in a progress bar, don't append.
                continue

            if progress_bar is not None and not line:
                # In a progress bar with an empty line.
                previous_line_empty = True
                # We're staying in the progress bar, don't append.
                continue

            # If we get here, we're not / no longer in a progress bar.

            if progress_bar:
                # Flush progress bar content with the last value.
                final_lines.append(progress_bar)
                progress_bar = None

            if line:
                if previous_line_empty:
                    # Flush empty line before appending non-empty line.
                    final_lines.append("")
                final_lines.append(line)
                previous_line_empty = False
            else:
                previous_line_empty = True

        if progress_bar:
            # Flush progress bar content with the last value.
            final_lines.append(progress_bar)

        final_lines.append("```")
        final_lines.append("</div>")

        if previous_line_empty:
            # If the last line in the block was empty, put it after the block.
            final_lines.append("")

    for line in lines:
        if line.startswith("```"):
            # Fence line: toggle the state and copy the line as is.
            is_inside_backticks = not is_inside_backticks
            final_lines.append(line)
            continue

        if is_inside_backticks:
            # Content of an existing fenced block is never rewritten.
            final_lines.append(line)
            continue

        if final_lines and is_output_line(line, final_lines[-1], output_lines):
            output_lines.append(line)
        elif not line:
            if output_lines:
                # Keep at most one trailing empty line in the open block.
                if output_lines[-1]:
                    output_lines.append(line)
            else:
                final_lines.append(line)
        else:
            # Regular markdown line: close the open block, if any, first.
            if output_lines:
                flush(output_lines, final_lines)
                output_lines = []
            final_lines.append(line)
    if output_lines:
        # The document ended while a block was still open.
        flush(output_lines, final_lines)
    return "\n".join(final_lines)
618

619

620
# Colab-specific notebook metadata; `py_to_nb` fills in `name` on its copy.
_NB_COLAB_METADATA = {
    "collapsed_sections": [],
    "name": "",  # FILL ME
    "private_outputs": False,
    "provenance": [],
    "toc_visible": True,
}

# Kernel used by the generated notebooks.
_NB_KERNELSPEC = {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3",
}

# Language description stored in the generated notebooks.
_NB_LANGUAGE_INFO = {
    "codemirror_mode": {"name": "ipython", "version": 3},
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.0",
}

# Template for every generated notebook. `py_to_nb` deep-copies it, then sets
# the accelerator, the Colab name and the cells on the copy.
NB_BASE = {
    "metadata": {
        "accelerator": "GPU",
        "colab": _NB_COLAB_METADATA,
        "kernelspec": _NB_KERNELSPEC,
        "language_info": _NB_LANGUAGE_INFO,
    },
    "nbformat": 4,
    "nbformat_minor": 0,
}
648

649

650
if __name__ == "__main__":
    # Command-line entry point:
    #   nb2py source.ipynb target.py   -> notebook to tutobook script
    #   py2nb source.py target.ipynb   -> tutobook script to notebook
    #   count_loc source.py            -> print the script's lines of code
    usage = (
        "Specify a command: either "
        "`nb2py source_filename.ipynb target_filename.py` or "
        "`py2nb source_filename.py target_filename.ipynb` or "
        "`count_loc source_filename.py`."
    )
    # A missing command gets the same usage error as an unknown one, instead
    # of an IndexError.
    if len(sys.argv) < 2 or sys.argv[1] not in {"nb2py", "py2nb", "count_loc"}:
        raise ValueError(usage)
    cmd = sys.argv[1]

    if cmd == "count_loc":
        if len(sys.argv) < 3:
            raise ValueError("Specify a source filename")
        source = sys.argv[2]
        loc = count_locs_in_file(source)
        print(f"Counted {loc} lines of code in {source}.")
    else:
        if len(sys.argv) < 4:
            raise ValueError("Specify a source filename and a target filename")
        source = sys.argv[2]
        target = sys.argv[3]

        if cmd == "py2nb":
            if not source.endswith(".py"):
                raise ValueError(
                    f"The source filename should be a Python file. Got: {source}"
                )
            if not target.endswith(".ipynb"):
                raise ValueError(
                    f"The target filename should be a notebook file. Got: {target}"
                )
            py_to_nb(source, target)
        if cmd == "nb2py":
            if not source.endswith(".ipynb"):
                raise ValueError(
                    f"The source filename should be a notebook file. Got: {source}"
                )
            if not target.endswith(".py"):
                raise ValueError(
                    f"The target filename should be a Python file. Got: {target}"
                )
            nb_to_py(source, target)
689

690
Product

Resources

Company