CoCalc -- fig_height.ipynb

GitHub Repository: probml/pyprobml
Path: blob/master/internal/fig_height/fig_height.ipynb
1192 views

Kernel: Python [conda env:py3713]

In [1]:

import jax
import requests
from typing import Any
from TexSoup import TexSoup
import regex as re
import os
import pandas as pd
from glob import glob
import multiprocessing as mp
from probml_utils.url_utils import extract_scripts_name_from_caption
from IPython.display import clear_output

In [186]:

# Prefix applied to every book path below (this notebook lives two levels
# below the repository root, per its path internal/fig_height/).
root_path = "../../"
# Glob patterns for the chapter .tex sources of book 1 and book 2.
path1, path2 = (
    root_path + pattern
    for pattern in ("../bookv2/book1/*/*.tex", "../bookv2/book2/*/*.tex")
)

In [187]:

# Book-2 matches come first, then book-1 matches, in one flat list.
tex_files = [*glob(path2), *glob(path1)]
len(tex_files)

Out[187]:

605

In [13]:

# def make_soup(tex_file_path):
#     with open(tex_file_path, "r") as fp:
#         contents = fp.read()
#     try:
#         obj = TexSoup(contents)
#     except:
#         print(f"{tex_file_path} failed to read!")
#         return tex_file_path

In [14]:

# pool = mp.Pool(mp.cpu_count() - 2)
# soups_list = pool.map(make_soup, tex_files)

Out[14]:

../bookv2/book2/pf/pf-other.tex failed to read!
../bookv2/book2/opt/bayesopt.tex failed to read!
../bookv2/book2/shift/adversarial.tex failed to read!
../bookv2/book2/lfm/ica.tex failed to read!
../bookv2/book2/vi/recursive-vi.tex failed to read!
../bookv2/book2/pred/testbed.tex failed to read!
../bookv2/book2/mcmc/hmc.tex failed to read!
../bookv2/book2/mcmc/sgmcmc.tex failed to read!
../bookv2/book2/comp/comp-methods.tex failed to read!
../bookv2/book2/ssm/hmm-short.tex failed to read!
../bookv2/book2/vi/intro-vi.tex failed to read!
../bookv2/book2/pf/proposals.tex failed to read!
../bookv2/book2/vi/vb.tex failed to read!
../bookv2/book2/mcmc/bigdata.tex failed to read!
../bookv2/book2/stats/bayes-solns.tex failed to read!
../bookv2/book2/nfm/vae.tex failed to read!
../bookv2/book2/pf/old.tex failed to read!
../bookv2/book2/prob/expfamLong.tex failed to read!
../bookv2/book2/flows/flows.tex failed to read!
../bookv2/book2/stats/josh.tex failed to read!
../bookv2/book2/lfm/topic-inf.tex failed to read!
../bookv2/book2/info/kl.tex failed to read!
../bookv2/book2/stats/noninf.tex failed to read!
../bookv2/book2/mcmc/convergence.tex failed to read!
../bookv2/book2/diffusion/old2.tex failed to read!
../bookv2/book2/vi/lower-bounds.tex failed to read!
../bookv2/book2/npbayes/point_proc.tex failed to read!
../bookv2/book2/bp/jinfer.tex failed to read!
../bookv2/book2/gp/gp-classification.tex failed to read!
../bookv2/book2/rl/policy-rl.tex failed to read!
../bookv2/book2/rl/offpolicy-rl.tex failed to read!
../bookv2/book2/glm/glm-supp.tex failed to read!
../bookv2/book2/gp/kernels.tex failed to read!
../bookv2/book1/Kernels1/svmStruct1.tex failed to read!
../bookv2/book1/Lda1/gda1.tex failed to read!
../bookv2/book2/gan/algorithms_theory.tex failed to read!
../bookv2/book1/Fewer1/active1.tex failed to read!
../bookv2/book2/genmo/evaluation.tex failed to read!
../bookv2/book1/Dimred1/cca2.tex failed to read!
../bookv2/book1/Glm1/glm1.tex failed to read!
../bookv2/book1/Dtheory1/hyptest-freq.tex failed to read!
../bookv2/book1/Linalg1/junk.tex failed to read!
../bookv2/book1/Fewer1/ssl1.tex failed to read!
../bookv2/book1/Glm1/expfamLong.tex failed to read!
../bookv2/book1/Trees1/boosting.tex failed to read!
../bookv2/book1/Trees1/rf.tex failed to read!
../bookv2/book1/Linreg1/sparse1.tex failed to read!
../bookv2/book1/Fewer1/transfer1.tex failed to read!

In [55]:

# # save to txt
# defective_tex = []
# with open("tex_defective.txt", "w") as fp:
#     for each in soups_list:
#         if each != None:
#             defective_tex.append(each)
#             print(each, file=fp)

In [188]:

# Read the saved list of problem .tex paths (one per line) and prefix each
# with root_path so the entries are comparable with the globbed tex_files.
with open("tex_defective.txt", "r") as fp:
    # splitlines() keeps the last entry even when the file lacks a trailing
    # newline; blank lines are skipped so no bare root_path entry is produced.
    defective_tex = [root_path + path for path in fp.read().splitlines() if path.strip()]

In [189]:

# Show the first four entries to check that the root_path prefix was applied.
defective_tex[:4]

Out[189]:

['../../../bookv2/book2/comp/comp-methods.tex',
 '../../../bookv2/book2/mcmc/sgmcmc.tex',
 '../../../bookv2/book2/mcmc/hmc.tex',
 '../../../bookv2/book2/mcmc/bigdata.tex']

In [165]:

# Number of globbed .tex files vs. number of paths listed as defective.
len(tex_files), len(defective_tex)

Out[165]:

(607, 48)

In [19]:

# Parse every .tex file that is not on the defective list; the result maps
# file path -> TexSoup document.
soups = {}
for idx, tex_path in enumerate(tex_files):
    # Overwrite the previous counter instead of printing one line per file.
    clear_output(wait=True)
    print(idx)
    if tex_path in defective_tex:
        continue
    with open(tex_path, "r") as handle:
        soups[tex_path] = TexSoup(handle.read())

Out[19]:

606

In [200]:

def preprocess_incl_graphics_line(line):
    r"""Return ``str(line)`` with three LaTeX macros textually substituted.

    The substitutions are applied in this order:

    * ``\twofigheight`` -> ``1.85in``
    * ``\textwidth``    -> ``*6in``
    * ``\dldir``        -> ``\figdir``

    Parameters
    ----------
    line : object
        Anything convertible with ``str()``.

    Returns
    -------
    str
        The converted text after all substitutions.
    """
    substitutions = (
        ("\\twofigheight", "1.85in"),
        ("\\textwidth", "*6in"),
        ("\\dldir", "\\figdir"),
    )
    text = str(line)
    for macro, replacement in substitutions:
        text = text.replace(macro, replacement)
    return text


def extract_fig_height_from_incl_graphics_line(line):
    r"""Extract the ``height=...in`` option from an ``\includegraphics`` command.

    Parameters
    ----------
    line : str
        Text of the command.

    Returns
    -------
    str
        * ``"Not mentioned "`` when the text contains no ``height`` substring
          (the trailing space is part of the value and is kept as-is).
        * The height value ending in ``in``, e.g. ``"1.5in"`` or ``"0.23*6in"``.
        * ``"-"`` when a ``height`` is present but no inch value can be
          extracted; a diagnostic line is printed in that case.
    """
    text = str(line)
    if "height" not in text:
        return "Not mentioned "

    # - \s* tolerates "height = 2in" as well as "height=2in".
    # - [^,\]]+? keeps the match inside this one option of the [...] list.
    # - (?![A-Za-z]) stops "in" from matching inside a word such as
    #   "\linewidth", which previously produced values like "0.6\lin".
    match = re.search(r"height\s*=\s*([^,\]]+?in)(?![A-Za-z])", text)
    if match is None:
        print(f"Not able to detect height in : {line}")
        return "-"
    return match.group(1)


def extract_fig_name_from_incl_graphics_line(line):
    r"""Return the text between ``figdir/`` and the closing ``}`` of a command.

    Parameters
    ----------
    line : object
        The ``\includegraphics`` command; converted with ``str()`` before
        matching.

    Returns
    -------
    str or bool
        The first captured name, or ``False`` when the text contains no
        ``figdir/...}`` reference at all.
    """
    names = re.findall(r"figdir/(.+)?}", str(line))
    if len(names) == 0:
        return False
    return names[0]

In [201]:

# Per-book mapping: figure file name -> height string.
fig_name_to_height = {"book1": {}, "book2": {}}
# [fig_name, fig_height] pairs for figures seen again with a different height.
repeated_figures = []
c = 0  # number of entries appended to repeated_figures
for tex_file, soup in soups.items():
    # Any path that does not contain "book1" is attributed to book2.
    book_no = "book1" if "book1" in tex_file else "book2"
    heights_for_book = fig_name_to_height[book_no]

    for fig in soup.find_all("figure"):
        for line in fig.find_all("includegraphics"):
            # post process
            line = preprocess_incl_graphics_line(line)

            # extract height
            fig_height = extract_fig_height_from_incl_graphics_line(line)

            # extract fig name
            fig_name = extract_fig_name_from_incl_graphics_line(line)
            if not fig_name:
                print(f"fig_name not detected in {line}")
                continue

            # Look the name up in this book's mapping. The outer dict only has
            # the keys "book1"/"book2", so testing against it never detected a
            # repeated figure.
            if fig_name in heights_for_book and heights_for_book[fig_name] != fig_height:
                repeated_figures.append([fig_name, fig_height])
                c += 1

            # The most recently seen height wins.
            heights_for_book[fig_name] = fig_height

Out[201]:

fig_name not detected in \includegraphics[width=*6in]{figures/graph-embedding-figs/gat.png}
Not able to detect height in : \includegraphics[height = 0.6\linewidth, width=0.9\linewidth]{\figdir/model_view.png}

In [202]:

# handle defective tex: these files were skipped when building the TexSoup
# documents, so their figures are located with plain regular expressions.

for tex_file in defective_tex:
    book_no = "book1" if "book1" in tex_file else "book2"
    heights_for_book = fig_name_to_height[book_no]

    with open(tex_file) as fp:
        tex_source = fp.read()

    # Turn the LaTeX escape "\_" into "_" and drop newlines so that "." in the
    # patterns below can span a whole figure environment. The backslash is
    # written as "\\_" because "\_" is an invalid escape in a normal string.
    tex_source = tex_source.replace("\\_", "_").replace("\n", "")

    for fig_content in re.findall(r"\\begin{figure}.+?\\end{figure}", tex_source):
        for line in re.findall(r"\\includegraphics.*?{.+?}", fig_content):
            # post process
            line = preprocess_incl_graphics_line(line)

            # extract height
            fig_height = extract_fig_height_from_incl_graphics_line(line)

            # extract fig name
            fig_name = extract_fig_name_from_incl_graphics_line(line)
            if not fig_name:
                print(f"fig_name not detected in {line}")
                continue

            # Look the name up in this book's mapping. The outer dict only has
            # the keys "book1"/"book2", so testing against it never detected a
            # repeated figure.
            if fig_name in heights_for_book and heights_for_book[fig_name] != fig_height:
                repeated_figures.append([fig_name, fig_height])
                c += 1

            # The most recently seen height wins.
            heights_for_book[fig_name] = fig_height

In [203]:

# Convenience aliases for the two per-book name -> height dictionaries.
fig_name_to_height_book1, fig_name_to_height_book2 = (
    fig_name_to_height[book] for book in ("book1", "book2")
)

In [204]:

# Number of distinct figure names recorded for book 1.
len(fig_name_to_height_book1)

Out[204]:

713

In [205]:

# Number of distinct figure names recorded for book 2.
len(fig_name_to_height_book2)

Out[205]:

1004

In [206]:

# Two-column table (figure name, height) for book 1, ordered by figure name.
df1 = pd.DataFrame(
    list(fig_name_to_height_book1.items()), columns=["fig_name", "fig_height"]
).sort_values(by="fig_name")
df1

Out[206]:

In [207]:

# Two-column table (figure name, height) for book 2, ordered by figure name.
df2 = pd.DataFrame(
    list(fig_name_to_height_book2.items()), columns=["fig_name", "fig_height"]
).sort_values(by="fig_name")
df2

Out[207]:

In [208]:

# Distinct height strings found for book 2, including the "Not mentioned "
# placeholder used when an \includegraphics line has no height option.
df2["fig_height"].unique()

Out[208]:

array(['2in', '2.5in', '1.5in', '1in', '3in', '4.5in', '0.75in',
       'Not mentioned ', '0.6in', '4in', '1.1in', '1.75in', '1.2in',
       '1.85in', '1.8in', '1.25in', '1.65in', '0.2in', '0.8in', '2.7in',
       '0.5in', '1.525in', '2.25in', '1.3in', '0.23*6in', '3.5in', '.5in',
       '0.195*6in', '.8in', '2.0in', '1.4in', '1.6in', '1.57in',
       '0.181*6in', '0.6*6in', '2.75in', '2.2in'], dtype=object)

In [216]:

# Rebind root_path to the empty string. The later os.path.join(root_path, ...)
# call that writes fig_height_mapping.md therefore resolves to a path relative
# to the current working directory.
# NOTE(review): earlier cells built the bookv2 glob patterns and the defective
# file paths from root_path == "../../"; re-running them after this cell would
# produce different paths. Confirm this is intended.
root_path = ""

In [221]:

# Collapsed <details> section holding the book 1 table rendered as markdown.
book1_section = [
    "<details>\n<summary>Book-1</summary>\n\n# Book1 figures to height mapping\n",
    df1.to_markdown(index=False),
    "\n</details>",
]
md = "".join(book1_section)

In [222]:

# Append an expanded <details> section holding the book 2 table.
md = "".join(
    [
        md,
        "<details open>\n<summary>Book-2</summary>\n\n# Book2 figures to height mapping\n",
        df2.to_markdown(index=False),
        "\n</details>",
    ]
)

In [223]:

# Write the assembled markdown document next to root_path.
mapping_md_path = os.path.join(root_path, "fig_height_mapping.md")
with open(mapping_md_path, "w") as out_file:
    out_file.write(md)

In [232]:

# Distinct figure names of book 1.
book1_fig_names = {name for name in df1["fig_name"]}
len(book1_fig_names)

Out[232]:

713

In [233]:

# Distinct figure names of book 2.
book2_fig_names = {name for name in df2["fig_name"]}
len(book2_fig_names)

Out[233]:

1004

In [235]:

# Write one book-1 figure name per line. The names come from a set, so they are
# sorted to make the file contents deterministic across runs; writelines()
# replaces the list comprehension that was evaluated only for its side effects.
with open("fig_names_book1.txt", "w") as fp:
    fp.writelines(f"{name}\n" for name in sorted(book1_fig_names))

In [236]:

# Write one book-2 figure name per line. The names come from a set, so they are
# sorted to make the file contents deterministic across runs; writelines()
# replaces the list comprehension that was evaluated only for its side effects.
with open("fig_names_book2.txt", "w") as fp:
    fp.writelines(f"{name}\n" for name in sorted(book2_fig_names))

In [ ]:

Product

Resources

Company