Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/internal/book2/create_dummy_notebook.py
1192 views
1
"""
2
command usage:
3
python3 internal/book2/create_dummy_notebook.py --lof=internal/book2.lof --book_no=2
4
"""
5
6
import argparse
7
from email.policy import default
8
from random import choices
9
from TexSoup import TexSoup
10
import regex as re
11
import os
12
import nbformat as nbf
13
import pandas as pd
14
from glob import glob
15
from probml_utils.url_utils import (
16
extract_scripts_name_from_caption,
17
make_url_from_fig_no_and_script_name,
18
figure_url_mapping_from_lof,
19
)
20
21
# Command-line interface. Arguments are parsed at module level because the
# functions below read `lof_file`, `book_no` and `nb_path` as module globals.
parser = argparse.ArgumentParser(description="create dummy notebook")
parser.add_argument(
    "-lof",
    "--lof",
    type=str,
    help="path to the LaTeX list-of-figures (.lof) file, e.g. internal/book2.lof",
)
parser.add_argument(
    "-book_no",
    "--book_no",
    type=int,
    default=2,
    choices=[1, 2],
    help="book number; selects the book<N> notebook subdirectory (default: %(default)s)",
)
parser.add_argument(
    "-nb_path",
    "--nb_path",
    type=str,
    default="notebooks/",
    help="root directory the notebooks are written under (default: %(default)s)",
)

args = parser.parse_args()

# NOTE: str() turns a missing --lof (None) into the literal string "None".
lof_file = str(args.lof)
book_no = args.book_no
nb_path = args.nb_path
31
32
33
def convert_to_ipynb(file):
    """
    Map a script name to its notebook name.

    A name ending in ".py" has that suffix swapped for ".ipynb"
    (foo.py -> foo.ipynb); any other name is returned unchanged.
    """
    # Test the suffix rather than mere containment: the slice below drops
    # exactly the last three characters, which is only correct when those
    # characters are ".py".
    if file.endswith(".py"):
        return file[:-3] + ".ipynb"
    return file
37
38
39
def find_multinotebooks():
    """
    Collect the figures whose mapped URL contains "fig_".

    The figure-number -> URL mapping comes from figure_url_mapping_from_lof()
    for the module-level `lof_file` and `book_no`. Each selected URL is
    printed, followed by a summary count.

    Returns a dict {fig_no: url} holding only the selected entries.
    """
    url_by_fig = figure_url_mapping_from_lof(lof_file, "", book_no=book_no)

    multi_notebooks = {}
    for fig_no in url_by_fig:
        url = url_by_fig[fig_no]
        # Only URLs that point at a "fig_" notebook are of interest.
        if "fig_" not in url:
            continue
        print(url)
        multi_notebooks[fig_no] = url

    print(f"{len(multi_notebooks)} notebooks have more than one figure")
    return multi_notebooks
50
51
52
def delete_existing_multinotebooks():
    """
    delete existing notebooks

    Removes every notebook whose file name contains "fig_" among the files
    matching <nb_path>/book<book_no>/*/*.ipynb, printing one line per
    deleted file and a final count.
    """
    # Search under the configured notebook root so deletion targets the same
    # tree that create_multi_notebooks() writes to by default.
    pattern = os.path.join(nb_path, f"book{book_no}", "*", "*.ipynb")

    cnt = 0
    for notebook in glob(pattern):
        # basename() honours the platform's path separator, unlike split("/").
        if "fig_" in os.path.basename(notebook):
            os.remove(notebook)
            print(f"{notebook} deleted!")
            cnt += 1

    print(f"{cnt} notebooks deleted")
65
66
67
def preprocess_caption(captions):
    """
    Build a {fig_no: cleaned caption} dict for the multi-notebook figures.

    captions: iterable of parsed nodes; for each node, str(node.contents[0])
        is taken as the figure number and str(node) as the raw caption text.

    Only figures reported by find_multinotebooks() are kept. For those, a
    fixed set of LaTeX macros is stripped, every {...ipynb} / {...py} link is
    replaced by a markdown hyperlink to the corresponding notebook, and the
    caption text is extracted from the "{4.13}{caption}" form.

    Raises IndexError if a kept caption does not match that form.
    """
    # find {https://<path/to/>foo.ipynb}{foo.ipynb} from caption
    whole_link_ipynb = r"\{\S+\.ipynb\}"
    whole_link_py = r"\{\S+\.py\}"
    # extract caption from {4.13}{caption}
    numbered_caption = r"{\d+.\d+}{(.*)}"

    # LaTeX fragments removed from the raw caption, applied in this order.
    latex_noise = (
        r"\ignorespaces",
        r" \relax",
        r"\href",
        r"\url",
        r'\cc@accent {"705E}',
        r"\numberline",
        r"\bm",
        r"\DOTSB",
        r"\slimits",
        r"\oset",
    )

    multi_notebooks = find_multinotebooks()
    cleaned_caption = {}

    for node in captions:
        fig_no = str(node.contents[0])

        # Skip figures that are not backed by a multi-notebook.
        if fig_no not in multi_notebooks:
            continue

        text = str(node)
        for fragment in latex_noise:
            text = text.replace(fragment, "")

        # Gather all links first, then rewrite each one in the caption text.
        links = re.findall(whole_link_ipynb, text) + re.findall(whole_link_py, text)
        for link in links:
            script_ipynb = convert_to_ipynb(extract_scripts_name_from_caption(link)[0])
            target = make_url_from_fig_no_and_script_name(fig_no, script_ipynb, book_no=book_no)
            # in form of markdown hyperlink
            text = text.replace(link, f"[{script_ipynb}]({target})")

        cleaned_caption[fig_no] = re.findall(numbered_caption, text)[0].strip()

    return cleaned_caption
112
113
114
def parse_lof(lof_file):
    """
    Read the file at `lof_file` and return its entire contents as a string.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(lof_file, encoding="utf-8") as fp:
        return fp.read()
118
119
120
def make_dummy_notebook_name(fig_no):
    """
    Build the notebook file name for a figure number.

    Every "." becomes "_", with a "fig_" prefix and ".ipynb" suffix,
    e.g. "1.11" -> "fig_1_11.ipynb".
    """
    underscored = "_".join(fig_no.split("."))
    return "fig_" + underscored + ".ipynb"
125
126
127
def create_multi_notebooks(cleaned_captions, relative_path=nb_path):
    """
    create new notebook and add caption to it

    cleaned_captions: mapping of figure number (e.g. "4.13") to the markdown
        caption text for that figure.
    relative_path: root directory; each notebook is written to
        <relative_path>/book<book_no>/<chapter, zero-padded to 2 digits>/.

    One notebook holding a single markdown cell is written per figure. The
    notebook name is printed as it is written, followed by a final count.
    """
    # https://stackoverflow.com/questions/38193878/how-to-create-modify-a-jupyter-notebook-from-code-python
    cnt = 0
    for fig_no, caption in cleaned_captions.items():
        # The chapter is the part of the figure number before the first ".".
        chapter_no = int(fig_no.split(".")[0])

        dummy_notebook = make_dummy_notebook_name(fig_no)
        chapter_dir = os.path.join(relative_path, f"book{book_no}/{chapter_no:02d}")
        # Create the chapter directory on demand so the write below does not
        # fail for a chapter that has no notebooks yet.
        os.makedirs(chapter_dir, exist_ok=True)
        fig_path = os.path.join(chapter_dir, dummy_notebook)
        # Print the file name directly instead of splitting the path on "/",
        # which does not isolate the name where the separator is not "/".
        print(dummy_notebook, end="\n")

        nb = nbf.v4.new_notebook()
        nb["cells"] = [nbf.v4.new_markdown_cell(caption)]
        # Explicit UTF-8 keeps the written notebook independent of the locale.
        with open(fig_path, "w", encoding="utf-8") as f:
            nbf.write(nb, f)
        cnt += 1

    print(f"\n{cnt} notebooks written!")
149
150
151
if __name__ == "__main__":
    # Remove the previously generated fig_* notebooks before regenerating.
    delete_existing_multinotebooks()

    # Report the figures that map to a multi-notebook.
    multi_notebooks = find_multinotebooks()
    print(multi_notebooks)

    # Parse the .lof file and collect every \numberline entry.
    lof_contents = parse_lof(lof_file)
    numberline_nodes = TexSoup(lof_contents).find_all("numberline")

    # Clean the captions, then create the multi-notebooks and write the
    # captions into them.
    cleaned_captions = preprocess_caption(numberline_nodes)
    create_multi_notebooks(cleaned_captions)
166
167