Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/internal/common_notebooks_index.ipynb
1191 views
Kernel: Python [conda env:py3713]
from glob import glob import pandas as pd import os import nbformat import probml_utils.url_utils as url_utils
# Root of the local notebooks tree, relative to this notebook's directory.
root_path = "../notebooks"


def get_notebook_path(book_str, chap_no, nb_name):
    """Join root_path/<book>/<chapter>/<notebook> into a filesystem path."""
    return os.path.join(root_path, book_str, chap_no, nb_name)


# NOTE: name (with its typo) kept for compatibility with later cells.
def seperate_stuffs(nb_path):
    """Return the trailing [book, chapter, notebook-name] components of *nb_path*.

    Splitting on os.sep after normpath (instead of a hard-coded "/")
    keeps this correct on Windows paths as well.
    """
    return os.path.normpath(nb_path).split(os.sep)[-3:]
book2 = glob("../notebooks/book1/*/*.ipynb") #+ glob("../notebooks/book2/*/*.ipynb") book2 len(book2)
316
# One row per discovered notebook: (book, chapter, file name).
nb_list = [seperate_stuffs(path) for path in book2]
df_nb_list = pd.DataFrame(nb_list, columns=["book_no", "chap_no", "nb_name"])
df_nb_list
# Group duplicate file names: each row aggregates, per notebook name, the
# lists of books and chapters in which that name appears.
df_nb_list_grp = df_nb_list.groupby("nb_name").agg(list).reset_index()
df_nb_list_grp
def is_query_in_nb(notebook, query):
    """
    Return 1 if *query* occurs in the source of any cell of the notebook
    file at path *notebook*, else 0.  Ints (not bools) because callers
    count()/index() these flags in plain lists.
    """
    nb = nbformat.read(notebook, as_version=4)
    for cell in nb.cells:
        code = cell["source"]
        if query in code:
            return 1
    return 0


def get_n_cells_nb(notebook):
    """
    Return the number of cells in the notebook file at path *notebook*.
    """
    nb = nbformat.read(notebook, as_version=4)
    return len(nb.cells)


def get_original_nb(df_nb_list_grp_ser):
    """
    For one grouped row (parallel lists of books/chapters sharing a
    notebook name), return a parallel list of 0/1 flags: 1 when that copy
    contains the marker text "Source of this notebook".
    """
    nb_name = df_nb_list_grp_ser["nb_name"]
    books = df_nb_list_grp_ser["book_no"]
    chaps = df_nb_list_grp_ser["chap_no"]
    t = []
    for book, chap in zip(books, chaps):
        nb_path = get_notebook_path(book, chap, nb_name)
        is_source = is_query_in_nb(nb_path, "Source of this notebook")
        t.append(is_source)
    return t
# Flag, per copy of each notebook name, whether the "Source of this
# notebook" marker is present (1 = marker found, 0 = presumably the original).
df_nb_list_grp["is_source_present"] = df_nb_list_grp.apply(get_original_nb, axis=1)
df_nb_list_grp
# Delete duplicate notebooks from book2 def del_duplicate_notebook(df_root_ser): is_source = df_root_ser["is_source_present"] nb_name = df_root_ser["nb_name"] # print(is_source) for i in range(len(is_source)): if is_source[i] == 1 and df_root_ser["book_no"][i] == "book1": # delete only book2's duplicate notebook: nb_path = get_notebook_path(df_root_ser["book_no"][i], df_root_ser["chap_no"][i], nb_name) if is_query_in_nb(nb_path, "/pyprobml/"): # delete notebooks who have pyprobml links os.remove(nb_path) print("Deleted: ", nb_path) def del_fig_notebook(df_root_ser): is_source = df_root_ser["is_source_present"] nb_name = df_root_ser["nb_name"] # print(is_source) if "fig_" in nb_name and df_root_ser["book_no"] == "book2": # delete only book2's notebook: nb_path = get_notebook_path(df_root_ser["book_no"], df_root_ser["chap_no"], nb_name) os.remove(nb_path) print("Deleted: ", nb_path) df_root = df_nb_list_grp df_nb_list_grp.apply(del_duplicate_notebook, axis=1) #df_nb_list_grp.apply(del_fig_notebook, axis=1)
Deleted: ../notebooks/book1/13/activation_fun_plot.ipynb Deleted: ../notebooks/book1/04/gauss_infer_1d.ipynb Deleted: ../notebooks/book1/04/gauss_infer_2d.ipynb Deleted: ../notebooks/book1/21/gmm_2d.ipynb Deleted: ../notebooks/book1/17/gprDemoArd.ipynb Deleted: ../notebooks/book1/17/gprDemoChangeHparams.ipynb Deleted: ../notebooks/book1/17/gprDemoNoiseFree.ipynb Deleted: ../notebooks/book1/17/gpr_demo_marglik.ipynb Deleted: ../notebooks/book1/18/hinge_loss_plot.ipynb Deleted: ../notebooks/book1/11/huberLossPlot.ipynb Deleted: ../notebooks/book1/17/huberLossPlot.ipynb Deleted: ../notebooks/book1/10/iris_logreg.ipynb Deleted: ../notebooks/book1/11/linreg_poly_vs_degree.ipynb Deleted: ../notebooks/book1/11/linreg_post_pred_plot.ipynb Deleted: ../notebooks/book1/03/prob.ipynb
0 None 1 None 2 None 3 None 4 None ... 301 None 302 None 303 None 304 None 305 None Length: 306, dtype: object
def get_root_col(df_root_ser, col):
    """
    Resolve column *col* of the single "original" copy — the one whose
    is_source_present flag is 0.  Falls back to the first entry (and
    warns) when no copy has flag 0; returns None implicitly — hence NaN
    after apply — when more than one copy has flag 0 (see the isna()
    check two cells below).
    """
    is_source = df_root_ser["is_source_present"]
    nb_name = df_root_ser["nb_name"]
    if is_source.count(0) == 0:
        print(f"{nb_name} is not in pyprobml!")
        return df_root_ser[col][0]
    elif is_source.count(0) > 1:
        print(f"{nb_name} - multiple copies exist")
    else:
        return df_root_ser[col][is_source.index(0)]


# Collapse the per-name lists down to the original copy's chapter/book.
df_root = df_nb_list_grp
df_root["chap_no"] = df_nb_list_grp.apply(get_root_col, col="chap_no", axis=1)
df_root["book_no"] = df_nb_list_grp.apply(get_root_col, col="book_no", axis=1)
df_root
bootstrap_filter.ipynb is not in pyprobml! bootstrap_filter_maneuver.ipynb is not in pyprobml! ekf_mlp.ipynb is not in pyprobml! ekf_vs_ukf.ipynb is not in pyprobml! gauss-bp-1d-line.ipynb is not in pyprobml! gprDemoArd.ipynb is not in pyprobml! gprDemoChangeHparams.ipynb is not in pyprobml! gprDemoNoiseFree.ipynb is not in pyprobml! pendulum_1d.ipynb is not in pyprobml! rbpf_maneuver.ipynb is not in pyprobml! rbpf_maneuver_demo.ipynb is not in pyprobml! sis_vs_smc.ipynb is not in pyprobml! bootstrap_filter.ipynb is not in pyprobml! bootstrap_filter_maneuver.ipynb is not in pyprobml! ekf_mlp.ipynb is not in pyprobml! ekf_vs_ukf.ipynb is not in pyprobml! gauss-bp-1d-line.ipynb is not in pyprobml! gprDemoArd.ipynb is not in pyprobml! gprDemoChangeHparams.ipynb is not in pyprobml! gprDemoNoiseFree.ipynb is not in pyprobml! pendulum_1d.ipynb is not in pyprobml! rbpf_maneuver.ipynb is not in pyprobml! rbpf_maneuver_demo.ipynb is not in pyprobml! sis_vs_smc.ipynb is not in pyprobml!
# Rows where no unique original could be resolved (get_root_col returned None).
df_root[df_root["book_no"].isna()]
df_root
# Sanity-check the URL converter on one known notebook path.
url_utils.github_to_rawcontent_url("https://github.com/probml/pyprobml/blob/master/notebooks/book2/03/dtheory.ipynb")
'https://raw.githubusercontent.com/probml/pyprobml/master/notebooks/book2/03/dtheory.ipynb'
# check dead urls # df_root["url"] = df_root.apply( # lambda x: url_utils.make_url_from_chapter_no_and_script_name( # chapter_no=int(x["chap_no"]), # script_name=x["nb_name"], # book_no=int(x["book_no"][-1]), # convert_to_which_url="github-raw", # ), # axis=1, # ) # df_root
# url_utils.check_dead_urls(list(df_root["url"]))
# Add colab url df_root["colab_url"] = df_root.apply( lambda x: url_utils.make_url_from_chapter_no_and_script_name( chapter_no=int(x["chap_no"]), script_name=x["nb_name"], book_no=int(x["book_no"][-1]), convert_to_which_url="colab", ), axis=1, ) # Add colab url df_root["github_url"] = df_root.apply( lambda x: url_utils.make_url_from_chapter_no_and_script_name( chapter_no=int(x["chap_no"]), script_name=x["nb_name"], book_no=int(x["book_no"][-1]), convert_to_which_url="github", ), axis=1, ) df_root
# NOTE(review): the "url" column is only created by the commented-out
# "check dead urls" cell above — running this cell without it raises KeyError.
t = df_root["url"][1]
t
'https://raw.githubusercontent.com/probml/pyprobml/master/notebooks/book1/11/LinearRegressionProbML.ipynb'
def enclose_span(text, nb_id):
    """Wrap *text* in an HTML span whose id is the notebook file name."""
    return f"<span id={nb_id}>{text}</span>"


def to_md_url(text, url):
    """Render a markdown link [text](url)."""
    return f"[{text}]({url})"


# to_md_url(enclose_span("GAN_JAX_CelebA_demo.ipynb"), t)


def _colab_md(row):
    # Markdown link labelled "colab", anchored by the notebook name.
    return to_md_url(enclose_span("colab", row["nb_name"]), row["colab_url"])


def _github_md(row):
    # Markdown link labelled "github", anchored by the notebook name.
    return to_md_url(enclose_span("github", row["nb_name"]), row["github_url"])


df_root["md_colab_url"] = df_root.apply(_colab_md, axis=1)
df_root["md_github_url"] = df_root.apply(_github_md, axis=1)
df_root
# Keep only the display columns and give them human-readable headers.
df_final = df_root[["nb_name", "md_colab_url", "md_github_url"]].rename(
    columns={"nb_name": "Notebook", "md_colab_url": "Colab url", "md_github_url": "Github url"}
)
df_final
# Spot-check one rendered markdown colab link.
df_final.iloc[0, 1]
'[<span id=KLfwdReverseMixGauss.ipynb>colab</span>](https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/06/KLfwdReverseMixGauss.ipynb)'

Handle supplementary notebooks

# URL prefixes for notebooks on the master branch of probml/pyprobml.
github_root = "https://github.com/probml/pyprobml/blob/master"
colab_root = "https://colab.research.google.com/github/probml/pyprobml/blob/master"
# Supplementary notebooks live one directory deeper: book/chapter/*/name.ipynb.
# NOTE(review): the name says book2, but the glob covers book1 as well.
supp_book2 = glob("../notebooks/book2/*/*/*.ipynb") + glob("../notebooks/book1/*/*/*.ipynb")
supp_book2
len(supp_book2), supp_book2[0]
(17, '../notebooks/book2/09/supplementary/discretized_ssm.ipynb')
def _relative_repo_path(path):
    """Strip only the leading ".." from a local glob path, giving "/notebooks/...".

    The original used path.replace("..", ""), which would also corrupt any
    ".." appearing elsewhere in the path; slicing the prefix is safe.
    """
    return path[2:] if path.startswith("..") else path


# One [file name, colab URL, github URL] row per supplementary notebook.
nb_github_colab_list = [
    [p.split("/")[-1], colab_root + _relative_repo_path(p), github_root + _relative_repo_path(p)]
    for p in supp_book2
]
df_supp = pd.DataFrame(nb_github_colab_list, columns=df_final.columns)
df_supp
# Replace the raw URLs with markdown links anchored by the notebook name.
df_supp["Colab url"] = df_supp.apply(
    lambda row: to_md_url(enclose_span("colab", row["Notebook"]), row["Colab url"]),
    axis=1,
)
df_supp["Github url"] = df_supp.apply(
    lambda row: to_md_url(enclose_span("github", row["Notebook"]), row["Github url"]),
    axis=1,
)
df_supp
# Spot-check one rendered markdown github link.
df_supp.iloc[0, 2]
'[<span id=discretized_ssm.ipynb>github</span>](https://github.com/probml/pyprobml/blob/master/notebooks/book2/09/supplementary/discretized_ssm.ipynb)'

Combine supplementary + chapters notebooks

# Merge the chapter-level and supplementary tables, sorted case-insensitively
# by notebook name.
df_chap_supp = (
    pd.concat([df_final, df_supp])
    .sort_values(by="Notebook", key=lambda col: col.str.lower())
)
df_chap_supp
# Write the combined index as a markdown table.
# NOTE(review): DataFrame.to_markdown requires the `tabulate` package — confirm installed.
df_chap_supp.to_markdown("../notebooks.md", index=False)
# NOTE(review): stray scratch cell — pd.concat() with no objects raises a
# TypeError/ValueError; safe to delete.
pd.concat()
# Load manually curated external notebook links from a CSV at a path
# relative to the working directory — TODO confirm the file location.
df_external = pd.read_csv("external_links.csv")
df_external
# Scratch demo frames for experimenting with overlapping index labels:
# df1 uses the default RangeIndex [0, 1]; df2 uses explicit labels [1, 2].
rows_a = [[0, 1], [2, 3]]
df1 = pd.DataFrame(rows_a)
df1
rows_b = [[11, 12], [13, 14]]
df2 = pd.DataFrame(rows_b, index=[1, 2])
df2
import numpy as np
array([1])
# Drop from df1 the index labels it shares with df2.  pandas
# Index.intersection gives the same label set as the original
# np.intersect1d round-trip through raw numpy arrays.
overlap = df1.index.intersection(df2.index)
df1.drop(index=overlap)