GitHub Repository: keras-team/keras-io
Path: blob/master/scripts/tutobooks.py
"""Keras tutobooks implementation.
2
3
A tutobook is a tutorial available simultaneously as a notebook,
4
as a Python script, and as a nicely rendered webpage.
5
6
Its source-of-truth (for manual edition and version control) is
7
its Python script form, but you can also create one by starting
8
from a notebook and converting it with the command `nb2py`.
9
10
Text cells are stored in markdown-formatted comment blocks.
11
the first line (starting with " * 3) may optionally contain a special
12
annotation, one of:
13
14
- invisible: do not render this block.
15
- shell: execute this block while prefixing each line with `!`.
16
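For example, the following two blocks produce a rendered text cell followed by
a code cell that runs `pip install -q keras` via `!` (an illustrative snippet;
the quotes are escaped here only to keep this module docstring valid, while in
an actual tutobook they are plain triple quotes):

```
\"\"\"
## Setup

Install the required dependencies.
\"\"\"

\"\"\"shell
pip install -q keras
\"\"\"
```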
The script form should start with a header with the following fields:
Title:
Author: (could be `Authors:` as well, and may contain markdown links)
Date created: (date in yyyy/mm/dd format)
Last modified: (date in yyyy/mm/dd format)
Description: (one-line text description)
Accelerator: (one of GPU, TPU, None)
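For instance, a filled-in header might look like this (placeholder values only):

```
Title: Your example title
Author: [Your Name](https://github.com/your-username)
Date created: 2023/07/10
Last modified: 2023/07/10
Description: What the example demonstrates, capitalized and ending with a period.
Accelerator: GPU
```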
## How to add a new code example to Keras.io

You would typically start from an existing notebook.

Save it to disk (let's say as `path_to_your_nb.ipynb`).
`cd` to the `keras-io/scripts/` directory.

Then run:

```
python tutobooks.py nb2py path_to_your_nb.ipynb ../examples/your_example.py
```

This will create the file `examples/your_example.py`. Open it,
fill in the headers, and generally edit it so that it looks nice.

NOTE THAT THE CONVERSION SCRIPT MAY MAKE MISTAKES IN ITS ATTEMPTS
TO SHORTEN LINES. MAKE SURE TO PROOFREAD THE GENERATED .py IN FULL.
Alternatively, keep your lines reasonably sized (<90 chars) to start with,
so that the script won't have to shorten them.

You can then preview what it looks like when converted back again
to ipynb by running:

```
python tutobooks.py py2nb ../examples/your_example.py preview.ipynb
```

NOTE THAT THIS COMMAND WILL ERROR OUT IF ANY CELL TAKES TOO LONG
TO EXECUTE. In that case, make your code lighter/faster.
Remember that examples are meant to demonstrate workflows, not
train state-of-the-art models. They should stay very lightweight.

Open the generated `preview.ipynb` and make sure it looks like what
you expect. If not, keep editing `your_example.py` until it does.

Finally, submit a PR adding `examples/your_example.py`.
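For reference, the same conversions can also be driven from Python rather than
the command line. A minimal sketch (the file paths are placeholders):

```
from tutobooks import nb_to_py, py_to_nb, nb_to_md

nb_to_py("path_to_your_nb.ipynb", "../examples/your_example.py")
py_to_nb("../examples/your_example.py", "preview.ipynb", fill_outputs=False)
nb_to_md("preview.ipynb", "your_example.md", "img/", working_dir=None)
```

Note that `py_to_nb` with `fill_outputs=True` and `nb_to_md` both execute the
notebook with `jupyter nbconvert`, so they can take as long as the example itself.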
"""
63
64
import os
65
import re
66
import sys
67
import json
68
import copy
69
import random
70
import shutil
71
import tempfile
72
from pathlib import Path
73
74
TIMEOUT = 12 * 60 * 60 # 12 hours
75
MAX_LOC = 350
76
77
78
def nb_to_py(nb_path, py_path):
79
f = open(nb_path)
80
content = f.read()
81
f.close()
82
nb = json.loads(content)
83
py = '"""\n'
84
py += "Title: FILLME\n"
85
py += "Author: FILLME\n"
86
py += "Date created: FILLME\n"
87
py += "Last modified: FILLME\n"
88
py += "Description: FILLME\n"
89
py += '"""\n'
90
for cell in nb["cells"]:
91
if cell["cell_type"] == "code":
92
# Is it a shell cell?
93
if cell["source"] and cell["source"][0] and cell["source"][0][0] == "!":
94
# It's a shell cell
95
py += '"""shell\n'
96
py += "".join(cell["source"]) + "\n"
97
py += '"""\n\n'
98
else:
99
# It's a Python cell
100
py += "".join(cell["source"]) + "\n\n"
101
elif cell["cell_type"] == "markdown":
102
py += '"""\n'
103
py += "".join(cell["source"]) + "\n"
104
py += '"""\n\n'
105
# Save file
106
f = open(py_path, "w")
107
f.write(py)
108
f.close()
109
# Format file with Black
110
os.system("black " + py_path)
111
# Shorten lines
112
py = open(py_path).read()
113
try:
114
py = _shorten_lines(py)
115
finally:
116
f = open(py_path, "w")
117
f.write(py)
118
f.close()
119
120
121
def py_to_nb(py_path, nb_path, fill_outputs=False):
    """Convert a tutobook Python script into an .ipynb notebook."""
    f = open(py_path)
    py = f.read()
    f.close()
    validate(py)

    header, _, py, tag = _get_next_script_element(py)
    attributes = _parse_header(header)
    cells = []
    loc = 0
    # Write first header cell
    header_cell = {
        "cell_type": "markdown",
        "source": [
            "# " + attributes["title"] + "\n",
            "\n",
            "**" + attributes["auth_field"] + ":** " + attributes["author"] + "<br>\n",
            "**Date created:** " + attributes["date_created"] + "<br>\n",
            "**Last modified:** " + attributes["last_modified"] + "<br>\n",
            "**Description:** " + attributes["description"],
        ],
        "metadata": {"colab_type": "text"},
    }
    cells.append(header_cell)
    while py:
        e, cell_type, py, tag = _get_next_script_element(py)
        lines = e.split("\n")

        if all(l == "" for l in lines):
            continue

        if lines and not lines[0]:
            lines = lines[1:]
        source = [l + "\n" for l in lines]
        # Drop last newline char
        if source and not source[-1].strip():
            source = source[:-1]
        if source:
            source[-1] = source[-1].rstrip()
        if tag == "shell":
            source = ["!" + l for l in source]
            cell_type = "code"
        if tag != "invisible" and source:
            cell = {"cell_type": cell_type, "source": source}
            if cell_type == "code":
                cell["outputs"] = []
                cell["metadata"] = {"colab_type": "code"}
                cell["execution_count"] = 0
                loc += _count_locs(source)
            else:
                cell["metadata"] = {"colab_type": "text"}
            cells.append(cell)
    notebook = {}
    for key in NB_BASE.keys():
        notebook[key] = copy.deepcopy(NB_BASE[key])
    notebook["metadata"]["colab"]["name"] = str(py_path).split("/")[-1][:-3]
    notebook["metadata"]["accelerator"] = attributes["accelerator"]
    notebook["cells"] = cells
    if loc > MAX_LOC:
        raise ValueError(
            f"Found {loc} lines of code, but expected fewer than {MAX_LOC}"
        )

    f = open(nb_path, "w")
    f.write(json.dumps(notebook, indent=1, sort_keys=True))
    f.close()
    if fill_outputs:
        print("Generating ipynb")
        parent_dir = Path(nb_path).parent
        current_files = os.listdir(parent_dir)
        try:
            os.system(
                "jupyter nbconvert --to notebook --execute --debug "
                + str(nb_path)
                + " --inplace"
                + " --ExecutePreprocessor.timeout="
                + str(TIMEOUT)
            )
        finally:
            new_files = os.listdir(parent_dir)
            for fname in new_files:
                if fname not in current_files:
                    fpath = parent_dir / fname
                    if os.path.isdir(fpath):
                        print("Removing created folder:", fname)
                        shutil.rmtree(fpath)
                    else:
                        print("Removing created file:", fname)
                        os.remove(fpath)

def nb_to_md(nb_path, md_path, img_dir, working_dir=None):
    """Execute a notebook and render it to markdown, copying images to `img_dir`."""
    img_exts = ("png", "jpg", "jpeg")
    # Assumes an already populated notebook.
    assert str(md_path).endswith(".md")
    current_dir = os.getcwd()
    original_img_dir = str(img_dir)
    if original_img_dir.endswith("/"):
        original_img_dir = original_img_dir[:-1]
    img_dir = os.path.abspath(img_dir)
    nb_path = os.path.abspath(nb_path)
    nb_fname = str(nb_path).split(os.path.sep)[-1]

    del_working_dir = False
    if working_dir is None:
        del_working_dir = True
        working_dir = "tmp_" + str(random.randint(int(1e6), int(1e7)))
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    print("Using working_dir:", working_dir)

    os.chdir(working_dir)
    shutil.copyfile(nb_path, nb_fname)

    md_name = str(md_path).split("/")[-1][:-3]
    target_md = md_name + ".md"
    img_dir = Path(img_dir) / md_name
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    os.system(
        "jupyter nbconvert --to markdown --execute --debug "
        + nb_fname
        + " --output "
        + target_md
        + " --ExecutePreprocessor.timeout="
        + str(TIMEOUT)
    )
    if os.path.exists(md_name + ".md"):
        success = True
        tmp_img_dir = md_name + "_files"
        if os.path.exists(tmp_img_dir):
            for fname in os.listdir(tmp_img_dir):
                if fname.endswith(img_exts):
                    src = Path(tmp_img_dir) / fname
                    target = Path(img_dir) / fname
                    print("copy", src, "to", target)
                    shutil.copyfile(src, target)

        os.chdir(current_dir)
        md_content = open(Path(working_dir) / (md_name + ".md")).read()
        for ext in img_exts:
            md_content = md_content.replace(
                "![" + ext + "](" + md_name + "_files",
                "![" + ext + "](" + original_img_dir + "/" + md_name,
            )
        md_content = _make_output_code_blocks(md_content)
        open(md_path, "w").write(md_content)
    else:
        success = False
        os.chdir(current_dir)

    if del_working_dir:
        shutil.rmtree(working_dir)

    if not success:
        raise RuntimeError(
            "An error was encountered when attempting to run the notebook. "
            "See logs for details."
        )


def py_to_md(py_path, nb_path, md_path, img_dir, working_dir=None):
    """Convert a tutobook script to markdown: py -> ipynb -> md."""
    py_to_nb(py_path, nb_path, fill_outputs=False)
    nb_to_md(nb_path, md_path, img_dir, working_dir=working_dir)

def validate(py):
    """Validate the format of a tutobook script.

    Specifically:
        - validate headers
        - validate style with black
    """
    lines = py.split("\n")
    if not lines[0].startswith('"""'):
        raise ValueError('Missing `"""`-fenced header at top of script.')
    if not lines[1].startswith("Title: "):
        raise ValueError("Missing `Title:` field.")
    if not lines[2].startswith("Author: ") and not lines[2].startswith("Authors: "):
        raise ValueError("Missing `Author:` field.")
    if not lines[3].startswith("Date created: "):
        raise ValueError("Missing `Date created:` field.")
    if not lines[4].startswith("Last modified: "):
        raise ValueError("Missing `Last modified:` field.")
    if not lines[5].startswith("Description: "):
        raise ValueError("Missing `Description:` field.")
    if not lines[6].startswith("Accelerator: "):
        raise ValueError("Missing `Accelerator:` field.")
    description = lines[5][len("Description: ") :]
    if not description:
        raise ValueError("Missing `Description:` field content.")
    if not description[0] == description[0].upper():
        raise ValueError("Description field content must be capitalized.")
    if not description[-1] == ".":
        raise ValueError("Description field content must end with a period.")
    if len(description) > 100:
        raise ValueError("Description field content must be less than 100 chars.")
    accelerator = lines[6][len("Accelerator: ") :]
    accelerator_options = ["GPU", "TPU", "None"]
    if accelerator not in accelerator_options:
        raise ValueError(
            f"Accelerator field content must be one of: {accelerator_options}. "
            f"Received: accelerator={accelerator}"
        )
    for i, line in enumerate(lines):
        if line.startswith('"""') and line.endswith('"""') and len(line) > 3:
            raise ValueError(
                'Do not use single line `"""`-fenced comments. '
                "Encountered at line %d" % (i,)
            )
    for i, line in enumerate(lines):
        if line.endswith(" "):
            raise ValueError("Found trailing space on line %d; line: `%s`" % (i, line))
    # Validate style with black

    tmp = tempfile.gettempdir()
    fpath = os.path.join(tmp, str(random.randint(int(1e6), int(1e7))) + ".py")
    f = open(fpath, "w")
    pre_formatting = "\n".join(lines)
    f.write(pre_formatting)
    f.close()
    os.system("black " + fpath)
    f = open(fpath)
    formatted = f.read()
    f.close()
    os.remove(fpath)
    if formatted != pre_formatting:
        raise ValueError(
            "Your python file did not follow `black` conventions. "
            "Run `black your_file.py` to autoformat it."
        )

    # Extra checks.
    if "//arxiv.org/pdf/" in py:
        raise ValueError(
            "Do not link to arXiv PDFs directly. " "Instead, link to the abstract page."
        )

def count_locs_in_file(py_path):
    """Count the lines of code in the code cells of a tutobook script."""
    f = open(py_path)
    py = f.read()
    f.close()
    _get_next_script_element(py)  # Header
    loc = 0
    while py:
        e, cell_type, py, _ = _get_next_script_element(py)
        lines = e.split("\n")
        if cell_type == "code":
            loc += _count_locs(lines)
    return loc


def _count_locs(lines):
    loc = 0
    string_open = False
    for line in lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if not string_open:
            if not line.startswith('"""'):
                loc += 1
            else:
                if not line.endswith('"""'):
                    string_open = True
        else:
            if line.startswith('"""'):
                string_open = False
    return loc

def _shorten_lines(py):
    max_len = 90
    lines = []
    for line in py.split("\n"):
        if len(line) <= max_len:
            lines.append(line)
            continue
        i = 0
        while len(line) > max_len:
            line = line.lstrip()
            if " " not in line[1:]:
                lines.append(line)
                break
            else:
                short_line = line[:max_len]
                line = line[max_len:]
                if " " in short_line:
                    reversed_short_line = short_line[::-1]
                    index = reversed_short_line.find(" ") + 1
                    line = short_line[-index:] + line
                    short_line = short_line[:-index]

                lines.append(short_line.lstrip())
                i += 1
                if i > 10:
                    raise RuntimeError("Unable to shorten line: " + line)
        lines.append(line.lstrip())
    return "\n".join(lines)

def _get_next_script_element(py):
    lines = py.split("\n")
    assert lines
    elines = []
    i = 0
    tag = None
    if lines[0].startswith('"""'):
        assert len(lines) >= 2
        etype = "markdown"
        if len(lines[0]) > 3:
            tag = lines[0][3:]
            if tag not in ["shell", "invisible"]:
                raise ValueError("Found unknown cell tag:", tag)
        lines = lines[1:]
    else:
        etype = "code"

    for i, line in enumerate(lines):
        if line.startswith('"""'):
            break
        else:
            elines.append(line)

    if etype == "markdown":
        py = "\n".join(lines[i + 1 :])
    else:
        py = "\n".join(lines[i:])
    e = "\n".join(elines)

    return e, etype, py, tag

def _parse_header(header):
    lines = header.split("\n")
    if len(lines) not in (6, 7):
        raise ValueError("Invalid header, it should be exactly 6 or 7 lines.")
    title = lines[0][len("Title: ") :]
    author_line = lines[1]
    if author_line.startswith("Authors"):
        author = author_line[len("Authors: ") :]
        auth_field = "Authors"
    else:
        author = author_line[len("Author: ") :]
        auth_field = "Author"
    date_created = lines[2][len("Date created: ") :]
    last_modified = lines[3][len("Last modified: ") :]
    description = lines[4][len("Description: ") :]
    accelerator = lines[5][len("Accelerator: ") :]
    return {
        "title": title,
        "author": author,
        "auth_field": auth_field,
        "date_created": date_created,
        "last_modified": last_modified,
        "description": description,
        "accelerator": accelerator,
    }

def _make_output_code_blocks(md):
    lines = md.split("\n")
    output_lines = []
    final_lines = []
    is_inside_backticks = False

    def is_output_line(line, prev_line, output_lines):
        if (
            output_lines
            and "\x08" in output_lines[-1]
            and not output_lines[-1].replace("\x08", "").strip()
        ):
            # We already started a block and the last line is just deletes, that
            # implies that the current line will be part of the block (progress
            # bar being re-written).
            return True

        if line.startswith("    ") and len(line) >= 5:
            # Non-empty indented line
            if output_lines:
                # Continuation of the output block
                return True
            if not prev_line.strip():
                # Beginning of an output block
                return True
        elif not line.strip():
            # Empty line
            if output_lines:
                # Continuation of the output block
                return True
        elif line.strip()[0] in ("\x1b", "\x08"):
            # Line starts with ESC or delete character, it must be a progress
            # bar, which is often not indented.
            return True
        return False

    def flush(output_lines, final_lines):
        final_lines.append('<div class="k-default-codeblock">')
        final_lines.append("```")

        # When not None, we are in a progress bar and this is its last state.
        progress_bar = None
        # Used to dedupe empty lines. Also used when in a progress bar.
        previous_line_empty = False

        for line in output_lines:
            # Unindent.
            if line.startswith("    "):
                # Normal block is indented by 4 spaces.
                line = line[4:]
            else:
                # Progress bar and empty lines.
                line = line.strip()

            if "\x1b" in line or "\x08" in line:
                # This is a progress bar.
                if "\x1b" in line:
                    # Remove escape sequences.
                    line = re.sub(r"\x1b\[[0-9][0-9]?m", "", line)

                if "\x08" in line:
                    # Delete characters, remove everything up to the last one.
                    line = line[line.rindex("\x08") + 1 :].strip()

                if previous_line_empty and progress_bar is None:
                    # We're starting a progress bar, flush the empty line.
                    final_lines.append("")

                if progress_bar is None or line:
                    # Update latest progress bar content.
                    progress_bar = line

                previous_line_empty = not line
                # When in a progress bar, don't append.
                continue

            if progress_bar is not None and not line:
                # In a progress bar with an empty line.
                previous_line_empty = True
                # We're staying in the progress bar, don't append.
                continue

            # If we get here, we're not / no longer in a progress bar.

            if progress_bar:
                # Flush progress bar content with the last value.
                final_lines.append(progress_bar)
                progress_bar = None

            if line:
                if previous_line_empty:
                    # Flush empty line before appending non-empty line.
                    final_lines.append("")
                final_lines.append(line)
                previous_line_empty = False
            else:
                previous_line_empty = True

        if progress_bar:
            # Flush progress bar content with the last value.
            final_lines.append(progress_bar)

        final_lines.append("```")
        final_lines.append("</div>")

        if previous_line_empty:
            # If the last line in the block was empty, put it after the block.
            final_lines.append("")

    for line in lines:
        if line.startswith("```"):
            is_inside_backticks = not is_inside_backticks
            final_lines.append(line)
            continue

        if is_inside_backticks:
            final_lines.append(line)
            continue

        if final_lines and is_output_line(line, final_lines[-1], output_lines):
            output_lines.append(line)
        elif not line:
            if output_lines:
                if output_lines[-1]:
                    output_lines.append(line)
            else:
                final_lines.append(line)
        else:
            if output_lines:
                flush(output_lines, final_lines)
                output_lines = []
            final_lines.append(line)
    if output_lines:
        flush(output_lines, final_lines)
    return "\n".join(final_lines)

NB_BASE = {
    "metadata": {
        "accelerator": "GPU",
        "colab": {
            "collapsed_sections": [],
            "name": "",  # FILL ME
            "private_outputs": False,
            "provenance": [],
            "toc_visible": True,
        },
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {
            "codemirror_mode": {"name": "ipython", "version": 3},
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.7.0",
        },
    },
    "nbformat": 4,
    "nbformat_minor": 0,
}

if __name__ == "__main__":
    cmd = sys.argv[1]
    if cmd not in {"nb2py", "py2nb", "count_loc"}:
        raise ValueError(
            "Specify a command: either "
            "`nb2py source_filename.ipynb target_filename.py` or "
            "`py2nb source_filename.py target_filename.ipynb` or "
            "`count_loc source_filename.py`."
        )
    if cmd == "count_loc":
        source = sys.argv[2]
        loc = count_locs_in_file(source)
        print(f"Counted {loc} lines of code in {source}.")
    else:
        if len(sys.argv) < 4:
            raise ValueError("Specify a source filename and a target filename")
        source = sys.argv[2]
        target = sys.argv[3]

        if cmd == "py2nb":
            if not source.endswith(".py"):
                raise ValueError(
                    f"The source filename should be a Python file. Got: {source}"
                )
            if not target.endswith(".ipynb"):
                raise ValueError(
                    f"The target filename should be a notebook file. Got: {target}"
                )
            py_to_nb(source, target)
        if cmd == "nb2py":
            if not source.endswith(".ipynb"):
                raise ValueError(
                    f"The source filename should be a notebook file. Got: {source}"
                )
            if not target.endswith(".py"):
                raise ValueError(
                    f"The target filename should be a Python file. Got: {target}"
                )
            nb_to_py(source, target)