CoCalc -- duplicate_notebooks.py

GitHub Repository: probml/pyprobml
Path: blob/master/internal/duplicate_notebooks.py
¹¹⁹¹ views
1
from glob import glob
2
import os
3
import shutil
4
import regex as re
5
from pathlib import Path
6

7
# Notebook inventories. The glob patterns are relative, so they are resolved
# against the current working directory at import time.
book1_notebooks = glob("notebooks/book1/*/*.ipynb")
book2_notebooks = glob("notebooks/book2/*/*.ipynb")
misc_notebooks = [
    *glob("notebooks/misc/*.ipynb"),
    *glob("notebooks/misc/*/*.ipynb"),
]
# Prefix used when rewriting README links to open notebooks in Colab.
base_url = "https://colab.research.google.com/github/probml/pyprobml/blob/master/"

print(len(book1_notebooks), len(book2_notebooks), len(misc_notebooks))


def get_notebook_name(notebook):
    """Return the last "/"-separated component of the path ``notebook``."""
    return notebook.rpartition("/")[2]


# File names (without directories) of every notebook found under book1/book2.
book1_notebooks_names = {get_notebook_name(path) for path in book1_notebooks}
book2_notebooks_names = {get_notebook_name(path) for path in book2_notebooks}
notebook_names = book1_notebooks_names | book2_notebooks_names
19

20
def remove_duplicate_nb_by_name():
    """Move misc notebooks that share a file name with a book notebook.

    Every path in ``misc_notebooks`` whose file name also appears in
    ``notebook_names`` (the book1/book2 file names) is reported on stdout and
    moved into the ``deprecated/`` directory, relative to the current working
    directory.

    Raises:
        shutil.Error: propagated from ``shutil.move`` when ``deprecated/``
            already contains a file with the same name.
    """
    for misc_notebook in misc_notebooks:
        if get_notebook_name(misc_notebook) not in notebook_names:
            continue
        print(f"{misc_notebook} is a duplicate")
        # shutil.move only places the file *inside* the destination when that
        # directory already exists; without it the move fails, so make sure
        # the directory is there before the first move.
        os.makedirs("deprecated", exist_ok=True)
        shutil.move(misc_notebook, "deprecated/")
26
        
27
def get_path_nb(nb):
    """Return the first path in ``book1_notebooks`` whose file name equals ``nb``.

    Returns ``None`` when no book1 notebook has that file name.
    """
    matches = (path for path in book1_notebooks if get_notebook_name(path) == nb)
    return next(matches, None)
31

32
def _relink_supplementary_row(line, chapter_dir, chapter_notebooks, copied_nb):
    """Rewrite the notebook link in the last "|"-separated field of ``line``.

    Args:
        line: one line taken from below the "## Supplementary material" heading.
        chapter_dir: directory prefix of the README being processed, including
            the trailing "/".
        chapter_notebooks: ``.ipynb`` paths already present in ``chapter_dir``.
        copied_nb: list that receives the misc path of every notebook copied
            into the chapter (mutated in place).

    Returns:
        ``(new_line, changed)``. ``changed`` is False when the line holds no
        notebook link or the notebook could not be located, in which case
        ``new_line`` is ``line`` unchanged.
    """
    last_field = line.split("|")[-1]
    if "Notebook" not in last_field and "[d2lbook]" not in last_field:
        return line, False

    # Strip the markdown link syntax to leave the bare link target.
    link = last_field
    for token in ("[Notebook]", "[d2lbook]", "(", ")"):
        link = link.replace(token, "")
    # Trailing blanks after the closing ")" would otherwise become part of the
    # file name and prevent every lookup below from matching.
    nb_name = link.split("/")[-1].strip()
    nb_misc_file = f"notebooks/misc/{nb_name}"
    nb_dest = f"{chapter_dir}{nb_name}"

    if nb_misc_file in misc_notebooks:
        # Copy from misc into the current chapter. shutil.copy overwrites a
        # file of the same name that is already in the chapter.
        shutil.copy(nb_misc_file, nb_dest)
        print(f"{nb_misc_file} -> {nb_dest}")
        copied_nb.append(nb_misc_file)  # track which notebooks were copied
    elif nb_dest in chapter_notebooks:
        print(f"{nb_dest} exists in current chapter")
    else:
        # Fall back to a notebook of the same name in another book1 chapter.
        nb_dest = get_path_nb(nb_name)
        if nb_dest is None:
            print(f"{link} not in misc and not in current chapter!!")
            return line, False
        print(f"{nb_dest} exists in different chapter")

    # Replace only the trailing field, so identical text in an earlier cell is
    # left alone. The URL is built by plain concatenation because base_url
    # already ends with "/" and a URL is not a filesystem path.
    row_prefix = line[: len(line) - len(last_field)]
    return f"{row_prefix}[{nb_name}]({base_url}{nb_dest})", True


def copy_referred_nb():
    """Copy notebooks referenced by book1 chapter READMEs and update their links.

    For every ``notebooks/book1/*/README.md`` containing a
    "## Supplementary material" heading, each following line whose last
    "|"-separated field mentions "Notebook" or "[d2lbook]" is examined:

    * if ``notebooks/misc/<name>`` is listed in ``misc_notebooks`` it is copied
      next to the README;
    * otherwise a notebook of that name already in the chapter is used;
    * otherwise a notebook of that name from any book1 chapter is used.

    In each of those cases the field is replaced by a link built from
    ``base_url``. A README is rewritten only when at least one line changed,
    and all other text (including everything before the heading) is written
    back verbatim. Progress is reported on stdout.

    Returns:
        list[str]: misc notebook paths that were copied, in processing order;
        a path appears once per reference, so duplicates are possible.
    """
    heading = "## Supplementary material"
    copied_nb = []
    for readme_file in glob("notebooks/book1/*/README.md"):
        print(f"************* {readme_file} **************")
        with open(readme_file, "r", encoding="utf-8") as f:
            content = f.read()

        # partition keeps everything after the first heading, so text following
        # a repeated heading is not lost when the file is written back.
        before, found, after = content.partition(heading)
        if not found:
            continue

        chapter_dir = readme_file.replace("README.md", "")
        # Looked up once per README: files copied while processing this README
        # are always matched through misc_notebooks first, so a snapshot taken
        # here gives the same answers as re-globbing for every row.
        chapter_notebooks = set(glob(f"{chapter_dir}*.ipynb"))

        updated = False
        new_lines = []
        for line in after.split("\n"):
            line, changed = _relink_supplementary_row(
                line, chapter_dir, chapter_notebooks, copied_nb
            )
            updated = updated or changed
            new_lines.append(line)

        if updated:
            with open(readme_file, "w", encoding="utf-8") as f:
                # Re-joining the split lines reproduces untouched text exactly,
                # so repeated runs do not accumulate blank lines at the heading.
                f.write(before + heading + "\n".join(new_lines))

    return copied_nb
90
    
91

92

93
def delete_nb(notebook_list):
    """Delete every file named in ``notebook_list`` and print how many were removed.

    Args:
        notebook_list: sized collection (list, set, ...) of file paths.

    Raises:
        OSError: propagated from ``os.remove`` (for example
            ``FileNotFoundError`` when a path does not exist); files removed
            before the failure stay removed.
    """
    # A plain loop: the removals are side effects, so there is no result list
    # worth building.
    for nb in notebook_list:
        os.remove(nb)
    print(f"{len(notebook_list)} deleted!")
96

97
def store_copied_nb(notebooks,fname = "internal/ignored_notebooks.txt"):
    """Write each entry of ``notebooks`` to ``fname``, one per line.

    Args:
        notebooks: iterable of path strings.
        fname: output file; it is truncated if it already exists.
    """
    with open(fname, "w") as fp:
        # writelines consumes the generator directly, so no throwaway list of
        # write() return values is built.
        fp.writelines(nb + "\n" for nb in notebooks)
100

101
if __name__ == "__main__":
    # The migration steps are kept commented out, so running the script only
    # prints a marker. Uncomment them to copy referenced notebooks into their
    # chapters, record the copied misc paths, and then delete the misc copies:
    #   copied_nb = copy_referred_nb()
    #   print(len(copied_nb), len(set(copied_nb)))
    #   print(copied_nb[:4])
    #   store_copied_nb(copied_nb)
    #   delete_nb(set(copied_nb))
    print("main")
108

109
'''
110
# some issues
111
1. Links that still point to probml-notebooks/ need to be updated to pyprobml/
112
'''
113

114
Product

Resources

Company