Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/deprecated/notebooks/student_pgm.ipynb
1192 views
Kernel: Python 3

Open In Colab

!pip install -q causalgraphicalmodels !pip install -q pgmpy !pip install superimport !wget -q https://raw.githubusercontent.com/probml/pyprobml/master/scripts/pyprobml_utils.py !wget -q https://raw.githubusercontent.com/probml/pyprobml/master/scripts/pgmpy_utils.py # import pyprobml_utils as pml import pgmpy_utils as pgm
|████████████████████████████████| 1.9 MB 4.1 MB/s Collecting superimport Downloading superimport-0.3.4.tar.gz (6.0 kB) Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from superimport) (2.23.0) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->superimport) (2.10) Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->superimport) (3.0.4) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->superimport) (1.24.3) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->superimport) (2021.10.8) Building wheels for collected packages: superimport Building wheel for superimport (setup.py) ... done Created wheel for superimport: filename=superimport-0.3.4-py3-none-any.whl size=5888 sha256=aff8dbabb07f3802e05efca77d01225c61beaf08305bca7ad76b0891f4a287a9 Stored in directory: /root/.cache/pip/wheels/6c/66/dc/337052d868002cf3830606ee34d91d1ceff6a67bf8df982c72 Successfully built superimport Installing collected packages: superimport Successfully installed superimport-0.3.4
INFO:numexpr.utils:NumExpr defaulting to 2 threads.
from causalgraphicalmodels import CausalGraphicalModel import pgmpy import numpy as np import pandas as pd

Model

# BayesianModel has been renamed to BayesianNetwork in pgmpy and the old name
# is scheduled for removal; fall back to it only on releases that predate the
# rename.
try:
    from pgmpy.models import BayesianNetwork
except ImportError:
    from pgmpy.models import BayesianModel as BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

# Define the model structure. The network is specified simply by passing the
# list of directed (parent, child) edges.
model = BayesianNetwork(
    [
        ("Diff", "Grade"),
        ("Intel", "Grade"),
        ("Grade", "Letter"),
        ("Intel", "SAT"),
    ]
)
/usr/local/lib/python3.7/dist-packages/pgmpy/models/BayesianModel.py:10: FutureWarning: BayesianModel has been renamed to BayesianNetwork. Please use BayesianNetwork class, BayesianModel will be removed in future. FutureWarning,

Basic CPDs

# Define the CPDs of the two parentless variables (a single column of priors).
cpd_d = TabularCPD(variable="Diff", variable_card=2, values=[[0.6], [0.4]])
cpd_i = TabularCPD(variable="Intel", variable_card=2, values=[[0.7], [0.3]])

# In pgmpy the columns of a CPD are the evidence combinations and the rows are
# the states of the variable. The evidence is listed as ["Intel", "Diff"], so
# Intel is the outer header row and Diff varies fastest across the columns.
# The grade CPD is therefore represented like this:
#
#    +---------+---------+---------+---------+---------+
#    | intel   | intel_0 | intel_0 | intel_1 | intel_1 |
#    +---------+---------+---------+---------+---------+
#    | diff    | diff_0  | diff_1  | diff_0  | diff_1  |
#    +---------+---------+---------+---------+---------+
#    | grade_0 | 0.3     | 0.05    | 0.9     | 0.5     |
#    +---------+---------+---------+---------+---------+
#    | grade_1 | 0.4     | 0.25    | 0.08    | 0.3     |
#    +---------+---------+---------+---------+---------+
#    | grade_2 | 0.3     | 0.7     | 0.02    | 0.2     |
#    +---------+---------+---------+---------+---------+
cpd_g = TabularCPD(
    variable="Grade",
    variable_card=3,
    values=[[0.3, 0.05, 0.9, 0.5], [0.4, 0.25, 0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    evidence=["Intel", "Diff"],
    evidence_card=[2, 2],
)

cpd_l = TabularCPD(
    variable="Letter",
    variable_card=2,
    values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]],
    evidence=["Grade"],
    evidence_card=[3],
)

cpd_s = TabularCPD(
    variable="SAT", variable_card=2, values=[[0.95, 0.2], [0.05, 0.8]], evidence=["Intel"], evidence_card=[2]
)

# Associate the CPDs with the network.
model.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

# check_model checks the network structure and the CPDs, and verifies that the
# CPDs are correctly defined and sum to 1.
model.check_model()
True
# Show the CPD currently attached to the "Grade" node as a text table.
print(model.get_cpds("Grade"))
+----------+----------+----------+----------+----------+ | Intel | Intel(0) | Intel(0) | Intel(1) | Intel(1) | +----------+----------+----------+----------+----------+ | Diff | Diff(0) | Diff(1) | Diff(0) | Diff(1) | +----------+----------+----------+----------+----------+ | Grade(0) | 0.3 | 0.05 | 0.9 | 0.5 | +----------+----------+----------+----------+----------+ | Grade(1) | 0.4 | 0.25 | 0.08 | 0.3 | +----------+----------+----------+----------+----------+ | Grade(2) | 0.3 | 0.7 | 0.02 | 0.2 | +----------+----------+----------+----------+----------+

CPDs with named states

# A CPD may also carry human-readable state names. When none are supplied, as
# in the previous example, pgmpy labels the states 0, 1, 2, ... automatically.
cpd_d_sn = TabularCPD(
    variable="Diff",
    variable_card=2,
    values=[[0.6], [0.4]],
    state_names={"Diff": ["Easy", "Hard"]},
)
cpd_i_sn = TabularCPD(
    variable="Intel",
    variable_card=2,
    values=[[0.7], [0.3]],
    state_names={"Intel": ["Low", "High"]},
)
cpd_g_sn = TabularCPD(
    variable="Grade",
    variable_card=3,
    values=[[0.3, 0.05, 0.9, 0.5], [0.4, 0.25, 0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    evidence=["Intel", "Diff"],
    evidence_card=[2, 2],
    state_names={"Grade": ["A", "B", "C"], "Intel": ["Low", "High"], "Diff": ["Easy", "Hard"]},
)
cpd_l_sn = TabularCPD(
    variable="Letter",
    variable_card=2,
    values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]],
    evidence=["Grade"],
    evidence_card=[3],
    state_names={"Letter": ["Bad", "Good"], "Grade": ["A", "B", "C"]},
)
cpd_s_sn = TabularCPD(
    variable="SAT",
    variable_card=2,
    values=[[0.95, 0.2], [0.05, 0.8]],
    evidence=["Intel"],
    evidence_card=[2],
    state_names={"SAT": ["Bad", "Good"], "Intel": ["Low", "High"]},
)

# Attach the named-state CPDs. The model already holds a CPD for each of these
# variables, so pgmpy logs that it is replacing the existing ones.
model.add_cpds(cpd_d_sn, cpd_i_sn, cpd_g_sn, cpd_l_sn, cpd_s_sn)
model.check_model()
INFO:root:Replacing existing CPD for Diff INFO:root:Replacing existing CPD for Intel INFO:root:Replacing existing CPD for Grade INFO:root:Replacing existing CPD for Letter INFO:root:Replacing existing CPD for SAT
True
# Print the Grade CPD now that its state names are defined.
print(model.get_cpds("Grade"))
+----------+------------+------------+-------------+-------------+ | Intel | Intel(Low) | Intel(Low) | Intel(High) | Intel(High) | +----------+------------+------------+-------------+-------------+ | Diff | Diff(Easy) | Diff(Hard) | Diff(Easy) | Diff(Hard) | +----------+------------+------------+-------------+-------------+ | Grade(A) | 0.3 | 0.05 | 0.9 | 0.5 | +----------+------------+------------+-------------+-------------+ | Grade(B) | 0.4 | 0.25 | 0.08 | 0.3 | +----------+------------+------------+-------------+-------------+ | Grade(C) | 0.3 | 0.7 | 0.02 | 0.2 | +----------+------------+------------+-------------+-------------+
# Print every CPD attached to the model, one table after another.
for cpd in model.get_cpds(): print(cpd)
+------------+-----+ | Diff(Easy) | 0.6 | +------------+-----+ | Diff(Hard) | 0.4 | +------------+-----+ +-------------+-----+ | Intel(Low) | 0.7 | +-------------+-----+ | Intel(High) | 0.3 | +-------------+-----+ +----------+------------+------------+-------------+-------------+ | Intel | Intel(Low) | Intel(Low) | Intel(High) | Intel(High) | +----------+------------+------------+-------------+-------------+ | Diff | Diff(Easy) | Diff(Hard) | Diff(Easy) | Diff(Hard) | +----------+------------+------------+-------------+-------------+ | Grade(A) | 0.3 | 0.05 | 0.9 | 0.5 | +----------+------------+------------+-------------+-------------+ | Grade(B) | 0.4 | 0.25 | 0.08 | 0.3 | +----------+------------+------------+-------------+-------------+ | Grade(C) | 0.3 | 0.7 | 0.02 | 0.2 | +----------+------------+------------+-------------+-------------+ +--------------+----------+----------+----------+ | Grade | Grade(A) | Grade(B) | Grade(C) | +--------------+----------+----------+----------+ | Letter(Bad) | 0.1 | 0.4 | 0.99 | +--------------+----------+----------+----------+ | Letter(Good) | 0.9 | 0.6 | 0.01 | +--------------+----------+----------+----------+ +-----------+------------+-------------+ | Intel | Intel(Low) | Intel(High) | +-----------+------------+-------------+ | SAT(Bad) | 0.95 | 0.2 | +-----------+------------+-------------+ | SAT(Good) | 0.05 | 0.8 | +-----------+------------+-------------+

Inference

from pgmpy.inference import VariableElimination

# Variable-elimination inference engine over the student network; the cells
# below use it to answer posterior queries.
infer = VariableElimination(model)
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm

Posterior given Grade=C

# Condition on the student having received grade C.
evidence = {"Grade": "C"}

# Posterior distributions of course difficulty and student intelligence.
postD = infer.query(["Diff"], evidence=evidence).values
postI = infer.query(["Intel"], evidence=evidence).values

print("\n")
# Index 1 selects the second state of each variable ("Hard" and "High").
print("Pr(Difficulty=Hard|Grade=C) = {:0.2f}".format(postD[1]))
print("Pr(Intelligence=High|Grade=C) = {:0.2f}".format(postI[1]))
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
Pr(Difficulty=Hard|Grade=C) = 0.63 Pr(Intelligence=High|Grade=C) = 0.08

Posterior given Grade=C, SAT=Good

# Condition on both the grade (C) and a good SAT score.
evidence = {"Grade": "C", "SAT": "Good"}

# Posterior distributions of course difficulty and student intelligence.
postD = infer.query(["Diff"], evidence=evidence).values
postI = infer.query(["Intel"], evidence=evidence).values

print("\n")
# Index 1 selects the second state of each variable ("Hard" and "High").
print("Pr(Difficulty=Hard|Grade=C,SAT=Good) = {:0.2f}".format(postD[1]))
print("Pr(Intelligence=High|Grade=C,SAT=Good) = {:0.2f}".format(postI[1]))
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
Pr(Difficulty=Hard|Grade=C,SAT=Good) = 0.76 Pr(Intelligence=High|Grade=C,SAT=Good) = 0.58

Visualization

DAG

# Rebuild the graph as a CausalGraphicalModel from the same nodes and edges,
# purely so that it can be drawn.
model2 = CausalGraphicalModel(nodes=model.nodes(), edges=model.edges())
dot = model2.draw()
print(type(dot))
display(dot)

# creates student_pgm (a text file of the graph) and student_pgm.pdf
dot.render(filename="student_pgm", format="pdf")
<class 'graphviz.dot.Digraph'>
Image in a Jupyter notebook
'student_pgm.pdf'
from google.colab import files # files.view('student_pgm') # open text file

CPTs

# Draw the network via the pgmpy_utils helper, show it inline, and save the
# figure as student_pgm_with_cpt.pdf.
dot = pgm.visualize_model(model)
display(dot)
dot.render("student_pgm_with_cpt", format="pdf")
Image in a Jupyter notebook
'student_pgm_with_cpt.pdf'

Marginals

# Condition on grade C and collect the marginals through the pgmpy_utils
# helper; the printed result maps each variable name to an array of state
# probabilities.
evidence = {"Grade": "C"}
marginals = pgm.get_marginals(model, evidence)
print(marginals)
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0%| | 0/2 [00:00<?, ?it/s]
0%| | 0/2 [00:00<?, ?it/s]
{'Diff': array([0.37070938, 0.62929062]), 'Grade': array([0., 0., 1.]), 'Intel': array([0.92105263, 0.07894737]), 'Letter': array([0.99, 0.01]), 'SAT': array([0.89078947, 0.10921053])}
# Draw the network together with the evidence and marginals computed in the
# previous cell, show it inline, and save it as a PDF.
dot = pgm.visualize_marginals(model, evidence, marginals)
display(dot)
dot.render("student_pgm_marginals_given_grade", format="pdf")
Image in a Jupyter notebook
'student_pgm_marginals_given_grade.pdf'
# Repeat the marginal computation with both the grade and the SAT score
# observed.
evidence = {"Grade": "C", "SAT": "Good"}
marginals = pgm.get_marginals(model, evidence)
print(marginals)

# Draw the network with these marginals, show it inline, and save it as a PDF.
dot = pgm.visualize_marginals(model, evidence, marginals)
display(dot)
dot.render("student_pgm_marginals_given_grade_sat", format="pdf")
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
{'Diff': array([0.24044002, 0.75955998]), 'Grade': array([0., 0., 1.]), 'Intel': array([0.42168675, 0.57831325]), 'Letter': array([0.99, 0.01]), 'SAT': array([0., 1.])}
Image in a Jupyter notebook
'student_pgm_marginals_given_grade_sat.pdf'
from google.colab import drive

# Mount Google Drive at /content/drive (only available when running in Colab).
drive.mount("/content/drive")