Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/internal/fig_height/fig_height.ipynb
1192 views
Kernel: Python [conda env:py3713]
import jax import requests from typing import Any from TexSoup import TexSoup import regex as re import os import pandas as pd from glob import glob import multiprocessing as mp from probml_utils.url_utils import extract_scripts_name_from_caption from IPython.display import clear_output
# Prefix that every relative path in this notebook starts from.
root_path = "../../"

# Glob patterns matching the .tex files one directory level below each book.
path1 = f"{root_path}../bookv2/book1/*/*.tex"
path2 = f"{root_path}../bookv2/book2/*/*.tex"

# Book 2 matches come first, followed by book 1 matches.
tex_files = [*glob(path2), *glob(path1)]
len(tex_files)
605
# def make_soup(tex_file_path): # with open(tex_file_path, "r") as fp: # contents = fp.read() # try: # obj = TexSoup(contents) # except: # print(f"{tex_file_path} failed to read!") # return tex_file_path
# pool = mp.Pool(mp.cpu_count() - 2) # soups_list = pool.map(make_soup, tex_files)
../bookv2/book2/pf/pf-other.tex failed to read! ../bookv2/book2/opt/bayesopt.tex failed to read! ../bookv2/book2/shift/adversarial.tex failed to read! ../bookv2/book2/lfm/ica.tex failed to read! ../bookv2/book2/vi/recursive-vi.tex failed to read! ../bookv2/book2/pred/testbed.tex failed to read! ../bookv2/book2/mcmc/hmc.tex failed to read! ../bookv2/book2/mcmc/sgmcmc.tex failed to read! ../bookv2/book2/comp/comp-methods.tex failed to read! ../bookv2/book2/ssm/hmm-short.tex failed to read! ../bookv2/book2/vi/intro-vi.tex failed to read! ../bookv2/book2/pf/proposals.tex failed to read! ../bookv2/book2/vi/vb.tex failed to read! ../bookv2/book2/mcmc/bigdata.tex failed to read! ../bookv2/book2/stats/bayes-solns.tex failed to read! ../bookv2/book2/nfm/vae.tex failed to read! ../bookv2/book2/pf/old.tex failed to read! ../bookv2/book2/prob/expfamLong.tex failed to read! ../bookv2/book2/flows/flows.tex failed to read! ../bookv2/book2/stats/josh.tex failed to read! ../bookv2/book2/lfm/topic-inf.tex failed to read! ../bookv2/book2/info/kl.tex failed to read! ../bookv2/book2/stats/noninf.tex failed to read! ../bookv2/book2/mcmc/convergence.tex failed to read! ../bookv2/book2/diffusion/old2.tex failed to read! ../bookv2/book2/vi/lower-bounds.tex failed to read! ../bookv2/book2/npbayes/point_proc.tex failed to read! ../bookv2/book2/bp/jinfer.tex failed to read! ../bookv2/book2/gp/gp-classification.tex failed to read! ../bookv2/book2/rl/policy-rl.tex failed to read! ../bookv2/book2/rl/offpolicy-rl.tex failed to read! ../bookv2/book2/glm/glm-supp.tex failed to read! ../bookv2/book2/gp/kernels.tex failed to read! ../bookv2/book1/Kernels1/svmStruct1.tex failed to read! ../bookv2/book1/Lda1/gda1.tex failed to read! ../bookv2/book2/gan/algorithms_theory.tex failed to read! ../bookv2/book1/Fewer1/active1.tex failed to read! ../bookv2/book2/genmo/evaluation.tex failed to read! ../bookv2/book1/Dimred1/cca2.tex failed to read! ../bookv2/book1/Glm1/glm1.tex failed to read! 
../bookv2/book1/Dtheory1/hyptest-freq.tex failed to read! ../bookv2/book1/Linalg1/junk.tex failed to read! ../bookv2/book1/Fewer1/ssl1.tex failed to read! ../bookv2/book1/Glm1/expfamLong.tex failed to read! ../bookv2/book1/Trees1/boosting.tex failed to read! ../bookv2/book1/Trees1/rf.tex failed to read! ../bookv2/book1/Linreg1/sparse1.tex failed to read! ../bookv2/book1/Fewer1/transfer1.tex failed to read!
# # save to txt # defective_tex = [] # with open("tex_defective.txt", "w") as fp: # for each in soups_list: # if each != None: # defective_tex.append(each) # print(each, file=fp)
# Load the list of .tex files flagged as defective (one path per line in
# tex_defective.txt) and prefix each path with `root_path`.
with open("tex_defective.txt", "r") as fp:
    # splitlines() keeps the last entry even when the file has no trailing
    # newline; blank lines are skipped so they never become a bare
    # `root_path` entry.
    defective_tex = [root_path + path for path in fp.read().splitlines() if path.strip()]

defective_tex[:4]
['../../../bookv2/book2/comp/comp-methods.tex', '../../../bookv2/book2/mcmc/sgmcmc.tex', '../../../bookv2/book2/mcmc/hmc.tex', '../../../bookv2/book2/mcmc/bigdata.tex']
# Sanity check: number of globbed .tex files vs. number flagged as defective.
len(tex_files), len(defective_tex)
(607, 48)
# Parse every .tex file that is not on the defective list.
# Keys are file paths, values are the TexSoup objects built from their text.
soups = {}
for index, tex_path in enumerate(tex_files):
    # Progress counter: replace the previous output instead of scrolling.
    clear_output(wait=True)
    print(index)
    if tex_path in defective_tex:
        continue  # defective files are processed later with plain regexes
    with open(tex_path, "r") as handle:
        soups[tex_path] = TexSoup(handle.read())
606
def preprocess_incl_graphics_line(line):
    r"""Return ``line`` as a string with project LaTeX macros substituted.

    Replacements applied, in order:

    * ``\twofigheight`` -> ``1.85in``
    * ``\textwidth``    -> ``*6in`` (so ``0.5\textwidth`` reads ``0.5*6in``)
    * ``\dldir``        -> ``\figdir``

    Args:
        line: an ``\includegraphics`` command; anything accepted by ``str()``.
    """
    return (
        str(line)
        .replace("\\twofigheight", "1.85in")
        .replace("\\textwidth", "*6in")
        .replace("\\dldir", "\\figdir")
    )


def extract_fig_height_from_incl_graphics_line(line):
    r"""Return the ``height=`` value, given in inches, of an ``\includegraphics`` line.

    Args:
        line: a (preprocessed) ``\includegraphics`` command.

    Returns:
        * the height text ending in ``in``, e.g. ``"2in"`` or ``"0.23*6in"``;
        * ``"Not mentioned "`` when the text ``height`` does not occur at all;
        * ``"-"`` (after printing a notice) when ``height`` occurs but no value
          in inches can be read, e.g. ``height=0.6\linewidth``.
    """
    line = str(line)
    if "height" not in line:
        return "Not mentioned "

    # Whitespace around "=" is allowed. The value may not run past the current
    # option ("," or "]"), and the trailing \b stops "in" from matching inside
    # a longer word such as "\linewidth".
    match = re.search(r"height\s*=\s*([^,\]]+?in)\b", line)
    if match is None:
        print(f"Not able to detect height in : {line}")
        return "-"
    return match.group(1)


def extract_fig_name_from_incl_graphics_line(line):
    r"""Return the file path that follows ``figdir/`` in an ``\includegraphics`` line.

    Args:
        line: a (preprocessed) ``\includegraphics`` command.

    Returns:
        The text between ``figdir/`` and the first closing brace, or ``False``
        when the line contains no such path.
    """
    # [^}]+ stops at the first "}" so trailing braces are never captured.
    match = re.search(r"figdir/([^}]+)}", str(line))
    return match.group(1) if match else False
# Collect, per book, the height requested for every figure file that appears
# in an \includegraphics command inside a figure environment.
fig_name_to_height = {"book1": {}, "book2": {}}
# [fig_name, fig_height] pairs seen again with a height that differs from the
# one already recorded for the same book.
repeated_figures = []
c = 0  # number of entries appended to repeated_figures

for tex_file, soup in soups.items():
    book_no = "book1" if "book1" in tex_file else "book2"
    heights = fig_name_to_height[book_no]
    for fig in soup.find_all("figure"):
        for line in fig.find_all("includegraphics"):
            # post process
            line = preprocess_incl_graphics_line(line)
            # extract height
            fig_height = extract_fig_height_from_incl_graphics_line(line)
            # extract fig name
            fig_name = extract_fig_name_from_incl_graphics_line(line)
            if not fig_name:
                print(f"fig_name not detected in {line}")
                continue
            # The lookup has to use this book's mapping: the top-level dict
            # only has the keys "book1" and "book2", so testing it would never
            # detect a repeated figure.
            if fig_name in heights and heights[fig_name] != fig_height:
                repeated_figures.append([fig_name, fig_height])
                c += 1
            heights[fig_name] = fig_height
fig_name not detected in \includegraphics[width=*6in]{figures/graph-embedding-figs/gat.png} Not able to detect height in : \includegraphics[height = 0.6\linewidth, width=0.9\linewidth]{\figdir/model_view.png}
# handle defective tex
# Files on the defective list are scanned with regular expressions instead of
# TexSoup; results go into the same per-book mapping.
for tex_file in defective_tex:
    book_no = "book1" if "book1" in tex_file else "book2"
    heights = fig_name_to_height[book_no]
    with open(tex_file) as fp:
        test = fp.read()
    # "\\_" is the LaTeX-escaped underscore (backslash + "_"), written with an
    # explicit double backslash so the literal is a valid Python escape.
    # Newlines are removed so one figure environment sits on a single line
    # for the non-DOTALL patterns below.
    test = test.replace("\\_", "_").replace("\n", "")
    for fig_content in re.findall(r"\\begin{figure}.+?\\end{figure}", test):
        for line in re.findall(r"\\includegraphics.*?{.+?}", fig_content):
            # post process
            line = preprocess_incl_graphics_line(line)
            # extract height
            fig_height = extract_fig_height_from_incl_graphics_line(line)
            # extract fig name
            fig_name = extract_fig_name_from_incl_graphics_line(line)
            if not fig_name:
                print(f"fig_name not detected in {line}")
                continue
            # Compare against this book's mapping; the top-level dict only has
            # the keys "book1" and "book2", so testing it never finds a repeat.
            if fig_name in heights and heights[fig_name] != fig_height:
                repeated_figures.append([fig_name, fig_height])
                c += 1
            heights[fig_name] = fig_height
# Per-book handles on the collected mapping (the same dict objects, not copies).
fig_name_to_height_book1, fig_name_to_height_book2 = (
    fig_name_to_height[book] for book in ("book1", "book2")
)
len(fig_name_to_height_book1)
713
# Number of distinct figure names collected for book 2.
len(fig_name_to_height_book2)
1004
# Book 1 as a two-column table (figure name, height), ordered by figure name.
book1_rows = list(fig_name_to_height_book1.items())
df1 = pd.DataFrame(book1_rows, columns=["fig_name", "fig_height"])
df1.sort_values(by="fig_name", inplace=True)
df1
# Book 2 as a two-column table (figure name, height), ordered by figure name.
book2_rows = list(fig_name_to_height_book2.items())
df2 = pd.DataFrame(book2_rows, columns=["fig_name", "fig_height"])
df2.sort_values(by="fig_name", inplace=True)
df2

# Every distinct height string that occurs in book 2.
df2["fig_height"].unique()
array(['2in', '2.5in', '1.5in', '1in', '3in', '4.5in', '0.75in', 'Not mentioned ', '0.6in', '4in', '1.1in', '1.75in', '1.2in', '1.85in', '1.8in', '1.25in', '1.65in', '0.2in', '0.8in', '2.7in', '0.5in', '1.525in', '2.25in', '1.3in', '0.23*6in', '3.5in', '.5in', '0.195*6in', '.8in', '2.0in', '1.4in', '1.6in', '1.57in', '0.181*6in', '0.6*6in', '2.75in', '2.2in'], dtype=object)
# With an empty prefix the report path below is just the bare file name.
root_path = ""

# Two collapsible HTML sections (the book 2 one is open by default), each with
# a heading followed by that book's table rendered as Markdown.
md = "".join(
    [
        "<details>\n<summary>Book-1</summary>\n\n# Book1 figures to height mapping\n",
        df1.to_markdown(index=False),
        "\n</details>",
        "<details open>\n<summary>Book-2</summary>\n\n# Book2 figures to height mapping\n",
        df2.to_markdown(index=False),
        "\n</details>",
    ]
)

report_path = os.path.join(root_path, "fig_height_mapping.md")
with open(report_path, "w") as fp:
    fp.write(md)
# Distinct figure names in the book 1 table, and how many there are.
book1_fig_names = set(df1["fig_name"])
len(book1_fig_names)
713
# Distinct figure names in the book 2 table, and how many there are.
book2_fig_names = set(df2["fig_name"])
len(book2_fig_names)
1004
# Save each book's figure names, one per line. The names are sorted because
# set iteration order is arbitrary; sorting makes the files identical on every
# run.
with open("fig_names_book1.txt", "w") as fp:
    fp.writelines(f"{name}\n" for name in sorted(book1_fig_names))

with open("fig_names_book2.txt", "w") as fp:
    fp.writelines(f"{name}\n" for name in sorted(book2_fig_names))