CoCalc -- create_dummy

GitHub Repository: probml/pyprobml
Path: blob/master/internal/book2/create_dummy_notebook.py
1192 views
1
"""
2
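Create dummy fig_<chapter>_<number>.ipynb notebooks whose single markdown cell
holds the figure caption taken from the LaTeX list-of-figures (.lof) file.

command usage:
python3 internal/book2/create_dummy_notebook.py --lof=internal/book2.lof --book_no=2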
4
"""
5

6
import argparse
7
from email.policy import default
8
from random import choices
9
from TexSoup import TexSoup
10
import regex as re
11
import os
12
import nbformat as nbf
13
import pandas as pd
14
from glob import glob
15
from probml_utils.url_utils import (
16
    extract_scripts_name_from_caption,
17
    make_url_from_fig_no_and_script_name,
18
    figure_url_mapping_from_lof,
19
)
20

21
# Command-line interface. It is parsed at module level because the functions
# below read `lof_file`, `book_no` and `nb_path` as module globals.
parser = argparse.ArgumentParser(description="create dummy notebook")
# --lof is required: without it `str(args.lof)` would be the literal string
# "None", and the script would delete existing notebooks before failing on
# that bogus path.
parser.add_argument(
    "-lof",
    "--lof",
    type=str,
    required=True,
    help="path to the LaTeX list-of-figures (.lof) file, e.g. internal/book2.lof",
)
parser.add_argument(
    "-book_no",
    "--book_no",
    type=int,
    default=2,
    choices=[1, 2],
    help="book number (1 or 2)",
)
parser.add_argument(
    "-nb_path",
    "--nb_path",
    type=str,
    default="notebooks/",
    help="root directory that contains the book<N>/<chapter>/ notebook folders",
)

args = parser.parse_args()

lof_file = str(args.lof)
book_no = args.book_no
nb_path = args.nb_path
31

32

33
def convert_to_ipynb(file):
    """
    Map a Python script name to the matching notebook name.

    A name ending in ".py" has that suffix replaced by ".ipynb"; any other
    name is returned unchanged.
    """
    # Test the suffix rather than containment: a name such as "foo.pyc" or
    # "my.pytools.txt" contains ".py" but must not lose its last three chars.
    if file.endswith(".py"):
        return file[: -len(".py")] + ".ipynb"
    return file
37

38

39
def find_multinotebooks():
    """
    Collect the figures whose URL entry contains "fig_".

    The figure-number -> URL mapping is built by `figure_url_mapping_from_lof`
    from the module-level `lof_file` and `book_no`. Every selected entry is
    printed, followed by a summary count.

    Returns:
        dict mapping figure number to its URL entry, restricted to the
        entries that contain "fig_".
    """
    url_by_fig = figure_url_mapping_from_lof(lof_file, "", book_no=book_no)
    selected = {}
    for fig_no in url_by_fig:
        entry = url_by_fig[fig_no]
        if "fig_" not in entry:
            continue
        print(entry)
        selected[fig_no] = entry
    print(f"{len(selected)} notebooks have more than one figure")
    return selected
50

51

52
def delete_existing_multinotebooks(relative_path=nb_path):
    """
    Delete existing dummy notebooks.

    Removes every "*.ipynb" file whose file name contains "fig_" from the
    chapter folders under <relative_path>/book<book_no>/, printing each
    deleted path and a final count.

    Args:
        relative_path: root notebook directory. Defaults to the --nb_path
            value so that the tree being cleaned is the same tree that
            `create_multi_notebooks` writes into.
    """
    pattern = os.path.join(relative_path, f"book{book_no}", "*", "*.ipynb")
    cnt = 0
    for notebook in glob(pattern):
        # basename() handles whichever path separator glob returns.
        if "fig_" not in os.path.basename(notebook):
            continue
        os.remove(notebook)
        print(f"{notebook} deleted!")
        cnt += 1

    print(f"{cnt} notebooks deleted")
65

66

67
def preprocess_caption(captions):
    """
    Build the markdown caption for every figure reported by
    `find_multinotebooks`.

    Args:
        captions: iterable of nodes found for "numberline" in the parsed .lof
            file. `str(node.contents[0])` is used as the figure number and
            `str(node)` as the raw caption text.

    Returns:
        dict mapping figure number (str) to the cleaned caption, in which
        every {...ipynb} / {...py} link group is replaced by a markdown
        hyperlink to the corresponding notebook.

    Raises:
        ValueError: if a selected caption does not contain the
            {<chapter>.<number>}{<caption>} pattern.
    """
    # find {https://<path/to/>foo.ipynb}{foo.ipynb} from caption
    whole_link_ipynb = r"\{\S+\.ipynb\}"
    whole_link_py = r"\{\S+\.py\}"
    # extract caption from {4.13}{caption}; the dot is escaped so that it only
    # matches a literal "." between chapter and figure number.
    caption_body = r"\{\d+\.\d+\}\{(.*)\}"
    # LaTeX control sequences stripped from the raw caption, applied in order.
    latex_noise = (
        r"\ignorespaces",
        r" \relax",
        r"\href",
        r"\url",
        r'\cc@accent {"705E}',
        r"\numberline",
        r"\bm",
        r"\DOTSB",
        r"\slimits",
        r"\oset",
    )

    cleaned_caption = {}

    multi_notebooks = find_multinotebooks()
    for caption in captions:
        fig_no = str(caption.contents[0])

        # only figures listed in multi_notebooks get a dummy notebook
        if fig_no not in multi_notebooks:
            continue

        text = str(caption)
        for macro in latex_noise:
            text = text.replace(macro, "")

        links = re.findall(whole_link_ipynb, text) + re.findall(whole_link_py, text)
        for link in links:
            script = extract_scripts_name_from_caption(link)[0]
            script_ipynb = convert_to_ipynb(script)
            url = make_url_from_fig_no_and_script_name(fig_no, script_ipynb, book_no=book_no)
            # in form of markdown hyperlink
            text = text.replace(link, f"[{script_ipynb}]({url})")

        match = re.search(caption_body, text)
        if match is None:
            # Fail with the offending figure instead of a bare IndexError.
            raise ValueError(f"could not extract caption text for figure {fig_no}: {text!r}")
        cleaned_caption[fig_no] = match.group(1).strip()

    return cleaned_caption
112

113

114
def parse_lof(lof_file):
    """
    Read the list-of-figures file and return its whole content as one string.

    Args:
        lof_file: path of the .lof file to read.

    Returns:
        str with the complete file content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(lof_file, encoding="utf-8") as fp:
        return fp.read()
118

119

120
def make_dummy_notebook_name(fig_no):
    """
    Build the dummy notebook file name for a figure number,
    e.g. "1.11" becomes "fig_1_11.ipynb".
    """
    stem = "_".join(fig_no.split("."))
    return "fig_" + stem + ".ipynb"
125

126

127
def create_multi_notebooks(cleaned_captions, relative_path=nb_path):
    """
    Create one new notebook per figure and put the caption into it.

    Each notebook is written to
    <relative_path>/book<book_no>/<chapter, two digits>/fig_<chapter>_<number>.ipynb
    and contains a single markdown cell holding the caption. The file name of
    every notebook is printed, followed by the total number written.

    Args:
        cleaned_captions: dict mapping figure number (e.g. "4.13") to the
            markdown caption text.
        relative_path: root notebook directory; defaults to --nb_path.
    """
    # https://stackoverflow.com/questions/38193878/how-to-create-modify-a-jupyter-notebook-from-code-python
    cnt = 0
    for fig_no, caption in cleaned_captions.items():
        # the chapter is the part of the figure number before the first dot
        chapter_no = int(fig_no.split(".")[0])

        dummy_notebook = make_dummy_notebook_name(fig_no)
        chapter_dir = os.path.join(relative_path, f"book{book_no}", f"{chapter_no:02d}")
        # A chapter folder that does not exist yet would make open() fail.
        os.makedirs(chapter_dir, exist_ok=True)
        fig_path = os.path.join(chapter_dir, dummy_notebook)
        print(dummy_notebook)

        nb = nbf.v4.new_notebook()
        nb["cells"] = [nbf.v4.new_markdown_cell(caption)]
        # Write as UTF-8 regardless of the locale default encoding.
        with open(fig_path, "w", encoding="utf-8") as f:
            nbf.write(nb, f)
        cnt += 1

    print(f"\n{cnt} notebooks written!")
149

150

151
if __name__ == "__main__":
    # remove the dummy notebooks left over from a previous run
    delete_existing_multinotebooks()

    # report the figures selected by find_multinotebooks
    multi_figure_map = find_multinotebooks()
    print(multi_figure_map)

    # parse the .lof file and collect its "numberline" entries
    lof_soup = TexSoup(parse_lof(lof_file))
    numberline_nodes = lof_soup.find_all("numberline")

    # clean the captions, then create the multi-notebooks and write the captions
    create_multi_notebooks(preprocess_caption(numberline_nodes))
166

167
Product

Resources

Company