Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book2/30/chow_liu_tree_demo.ipynb
1193 views
Kernel: Unknown Kernel
# Chow-Liu algorithm # Author: Drishtii@ # Based on # https://github.com/probml/pmtk3/blob/master/demos/chowliuTreeDemo.m #!pip install -qq pgmpy import pandas as pd from sklearn.datasets import fetch_20newsgroups try: import networkx as nx except ModuleNotFoundError: %pip install -qq networkx import networkx as nx try: from pgmpy.estimators import TreeSearch except ModuleNotFoundError: %pip install -qq pgmpy from pgmpy.estimators import TreeSearch from sklearn.feature_extraction.text import CountVectorizer try: import pydot except ModuleNotFoundError: %pip install -qq pydot import pydot from networkx.drawing.nx_pydot import graphviz_layout from IPython.display import Image, display try: import probml_utils as pml except ModuleNotFoundError: %pip install -qq git+https://github.com/probml/probml-utils.git import probml_utils as pml newsgroups_train = fetch_20newsgroups(subset="train") list_of_words = [ "email", "disk", "ftp", "files", "format", "image", "display", "phone", "number", "card", "graphics", "windows", "driver", "pc", "drive", "memory", "scsi", "data", "system", "problem", "dos", "program", "space", "version", "win", "team", "won", "car", "video", "software", "bmw", "dealer", "engine", "honda", "mac", "help", "server", "launch", "moon", "nasa", "orbit", "shuttle", "technology", "fans", "games", "hockey", "league", "players", "puck", "season", "oil", "lunar", "bible", "children", "mars", "earth", "god", "satellite", "solar", "mission", "nhl", "war", "world", "science", "computer", "baseball", "hit", "christian", "power", "jesus", "religion", "jews", "government", "israel", "state", "university", "research", "question", "aids", "msg", "food", "water", "health", "insurance", "patients", "medicine", "studies", "case", "president", "human", "fact", "course", "rights", "law", "gun", "evidence", ] count_vect = CountVectorizer(newsgroups_train.data, vocabulary=list_of_words) X_train_counts = count_vect.fit_transform(newsgroups_train.data) df_ = pd.DataFrame.sparse.from_spmatrix(X_train_counts, columns=list_of_words) # Learning graph structure est = TreeSearch(df_, root_node="email") dag = est.estimate(estimator_type="chow-liu") # Plot and display def view_pydot(pdot): plt = Image(pdot.create_png()) display(plt) p = nx.drawing.nx_pydot.to_pydot(dag) view_pydot(p) p.write_png("../figures/tree_structure.png")