Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/thinkbayes2
Path: blob/master/examples/hospital3.ipynb
1901 views
Kernel: Python 3 (ipykernel)

Grid algorithm for a beta-binomial hierarchical model

Bayesian Inference with PyMC

Copyright 2021 Allen B. Downey

License: Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)

# If we're running on Colab, install libraries import sys IN_COLAB = 'google.colab' in sys.modules if IN_COLAB: !pip install pymc3 !pip install arviz !pip install empiricaldist
# PyMC generates a FutureWarning we don't need to deal with yet import warnings warnings.filterwarnings("ignore", category=FutureWarning)
import matplotlib.pyplot as plt def legend(**options): """Make a legend only if there are labels.""" handles, labels = plt.gca().get_legend_handles_labels() if len(labels): plt.legend(**options)
def decorate(**options): plt.gca().set(**options) legend() plt.tight_layout()
from empiricaldist import Cdf def compare_cdf(pmf, sample): pmf.make_cdf().plot(label='grid') Cdf.from_seq(sample).plot(label='mcmc') print(pmf.mean(), sample.mean()) decorate()
from empiricaldist import Pmf def make_pmf(ps, qs, name): pmf = Pmf(ps, qs) pmf.normalize() pmf.index.name = name return pmf

Heart Attack Data

This example is based on Chapter 10 of Probability and Bayesian Modeling; it uses data on death rates due to heart attack for patients treated at various hospitals in New York City.

We can use Pandas to read the data into a DataFrame.

import os filename = 'DeathHeartAttackManhattan.csv' if not os.path.exists(filename): !wget https://github.com/AllenDowney/BayesianInferencePyMC/raw/main/DeathHeartAttackManhattan.csv
import pandas as pd df = pd.read_csv(filename) df

The columns we need are Cases, which is the number of patients treated at each hospital, and Deaths, which is the number of those patients who died.

data_ns = df['Cases'].values data_ks = df['Deaths'].values
import numpy as np logn = np.log(np.max(data_ns)) logn = np.log(100) logn
4.605170185988092

Hospital Data with PyMC

Here's a hierarchical model that estimates the death rate for each hospital, and simultaneously estimates the distribution of rates across hospitals.

import pymc3 as pm import theano.tensor as tt def make_model(): with pm.Model() as model: mu = pm.Beta('mu', alpha=1, beta=1) logeta = pm.Logistic('logeta', mu=logn, s=1) eta = pm.Deterministic('eta', tt.exp(logeta)) alpha = pm.Deterministic('alpha', mu * eta) beta = pm.Deterministic('beta', (1-mu) * eta) xs = pm.Beta('xs', alpha, beta, shape=len(data_ns)) ks = pm.Binomial('ks', n=data_ns, p=xs, observed=data_ks) return model
%time model = make_model() pm.model_to_graphviz(model)
CPU times: user 869 ms, sys: 2.44 ms, total: 871 ms Wall time: 870 ms
Image in a Jupyter notebook
with model: pred = pm.sample_prior_predictive(1000) %time trace = pm.sample(500)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [xs, logeta, mu]
Sampling 4 chains for 1_000 tune and 500 draw iterations (4_000 + 2_000 draws total) took 2 seconds. There was 1 divergence after tuning. Increase `target_accept` or reparameterize. There were 4 divergences after tuning. Increase `target_accept` or reparameterize. There were 4 divergences after tuning. Increase `target_accept` or reparameterize. There were 2 divergences after tuning. Increase `target_accept` or reparameterize. The number of effective samples is smaller than 25% for some parameters.
CPU times: user 6.33 s, sys: 164 ms, total: 6.49 s Wall time: 7.8 s

Here are the posterior distributions of the hyperparameters

import arviz as az with model: az.plot_posterior(trace, var_names=['mu', 'logeta'])
Image in a Jupyter notebook

And we can extract the posterior distributions of the xs.

trace_xs = trace['xs'].transpose() trace_xs.shape
(13, 2000)

As an example, here's the posterior distribution of x for the first hospital.

with model: az.plot_posterior(trace_xs[0])
Image in a Jupyter notebook

Just one update

i = 3 data_n = data_ns[i] data_k = data_ks[i]
def make_model1(): with pm.Model() as model1: mu = pm.Beta('mu', alpha=1, beta=1) logeta = pm.Logistic('logeta', mu=logn, s=1) eta = pm.Deterministic('eta', tt.exp(logeta)) alpha = pm.Deterministic('alpha', mu * eta) beta = pm.Deterministic('beta', (1-mu) * eta) x = pm.Beta('x', alpha, beta) k = pm.Binomial('k', n=data_n, p=x, observed=data_k) return model1
model1 = make_model1() pm.model_to_graphviz(model1)
Image in a Jupyter notebook
with model1: pred1 = pm.sample_prior_predictive(1000) trace1 = pm.sample(500)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [x, logeta, mu]
Sampling 4 chains for 1_000 tune and 500 draw iterations (4_000 + 2_000 draws total) took 2 seconds. There were 77 divergences after tuning. Increase `target_accept` or reparameterize. The acceptance probability does not match the target. It is 0.5516602192663524, but should be close to 0.8. Try to increase the number of tuning steps. There were 32 divergences after tuning. Increase `target_accept` or reparameterize. There were 33 divergences after tuning. Increase `target_accept` or reparameterize. There were 15 divergences after tuning. Increase `target_accept` or reparameterize. The estimated number of effective samples is smaller than 200 for some parameters.
Cdf.from_seq(pred1['mu']).plot(label='prior', color='gray') Cdf.from_seq(trace1['mu']).plot(label='posterior') decorate(title='Distribution of mu')
Image in a Jupyter notebook
Cdf.from_seq(pred1['logeta']).plot(label='prior', color='gray') Cdf.from_seq(trace1['logeta']).plot(label='posterior') decorate(title='Distribution of logeta')
Image in a Jupyter notebook
Cdf.from_seq(pred1['x']).plot(label='prior', color='gray') Cdf.from_seq(trace1['x']).plot(label='posterior') decorate(title='Distribution of x')
Image in a Jupyter notebook

The grid priors

from scipy.stats import beta as betadist mus = np.linspace(0.01, 0.99, 101) ps = betadist.pdf(mus, 1, 1) prior_mu = make_pmf(ps, mus, 'mu') prior_mu.plot() decorate(title='Prior distribution of mu')
Image in a Jupyter notebook
from scipy.stats import logistic logetas = np.linspace(0.1, 2*logn, 90) ps = logistic.pdf(logetas, loc=logn, scale=1) prior_logeta = make_pmf(ps, logetas, 'logeta') prior_logeta.plot() decorate(title='Prior distribution of logeta')
Image in a Jupyter notebook
compare_cdf(prior_mu, pred1['mu']) decorate(title='Prior distribution of mu')
0.49999999999999994 0.5083188286330487
Image in a Jupyter notebook
compare_cdf(prior_logeta, pred1['logeta']) decorate(title='Prior distribution of logeta')
4.609771582860065 4.562741091382068
Image in a Jupyter notebook

The joint distribution of hyperparameters

# TODO: Change these variable names def make_hyper(prior_alpha, prior_beta): PA, PB = np.meshgrid(prior_alpha.ps, prior_beta.ps, indexing='ij') hyper = PA * PB return hyper
prior_hyper = make_hyper(prior_mu, prior_logeta) prior_hyper.shape
(101, 90)
import pandas as pd from utils import plot_contour plot_contour(pd.DataFrame(prior_hyper, index=mus, columns=logetas)) decorate(title="Joint prior of mu and logeta")
Image in a Jupyter notebook

Joint prior of alpha, beta, and x

M, L = np.meshgrid(mus, logetas, indexing='ij')
import pandas as pd pd.DataFrame(A).describe()
xs = np.linspace(0.01, 0.99, 99) M, L, X = np.meshgrid(mus, logetas, xs, indexing='ij')
E = np.exp(L) A = M * E B = (1-M) * E
B.sum()
513779743.8067446
from scipy.stats import beta as betadist %time betapdf = betadist.pdf(X, A, B) betapdf.sum()
CPU times: user 423 ms, sys: 11.6 ms, total: 434 ms Wall time: 433 ms
881056.3544591597

We can speed this up by computing just xα1(1x)β1x^{\alpha-1} (1-x)^{\beta-1} and skipping the terms that don't depend on x

logx = np.log(xs) logy = np.log(1-xs)
logpdf = (A-1) * logx + (B-1) * logy betapdf = np.exp(logpdf) betapdf.sum()
273720.6027955034
totals = betapdf.sum(axis=2) totals.sum()
273720.6027955031
totals
array([[4.89166505e+002, 4.72178006e+002, 4.55639947e+002, ..., 6.58509827e-197, 2.86364503e-218, 6.20570778e-242], [4.77146830e+002, 4.58877476e+002, 4.40973240e+002, ..., 0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [4.65611809e+002, 4.46159331e+002, 4.27006575e+002, ..., 0.00000000e+000, 0.00000000e+000, 0.00000000e+000], ..., [4.65611809e+002, 4.46159331e+002, 4.27006575e+002, ..., 0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [4.77146830e+002, 4.58877476e+002, 4.40973240e+002, ..., 0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [4.89166505e+002, 4.72178006e+002, 4.55639947e+002, ..., 6.58509827e-197, 2.86364503e-218, 6.20570778e-242]])
(totals==0).sum(), (totals!=0).sum(),
(1826, 7264)
shape = totals.shape + (1,) betapdf /= totals.reshape(shape) betapdf.sum()
/tmp/ipykernel_325165/991674067.py:2: RuntimeWarning: overflow encountered in true_divide betapdf /= totals.reshape(shape) /home/downey/anaconda3/envs/ThinkBayes2/lib/python3.9/site-packages/numpy/core/_methods.py:48: RuntimeWarning: overflow encountered in reduce return umr_sum(a, axis, dtype, out, keepdims, initial, where)
nan
def make_prior(hyper): # reshape hyper so we can multiply along axis 0 shape = hyper.shape + (1,) prior = betapdf * hyper.reshape(shape) return prior
%time prior = make_prior(prior_hyper) prior.sum()

The following function computes the marginal distributions.

def marginal(joint, axis): axes = [i for i in range(3) if i != axis] return joint.sum(axis=tuple(axes))

And let's confirm that the marginal distributions are what they are supposed to be.

prior_mu.plot() marginal_mu = Pmf(marginal(prior, 0), mus) marginal_mu.plot() decorate(title='Checking the marginal distribution of mu')
prior_logeta.plot() marginal_eta = Pmf(marginal(prior, 1), logetas) marginal_eta.plot() decorate(title='Checking the marginal distribution of eta')
prior_x = Pmf(marginal(prior, 2), xs) prior_x.plot() decorate(title='Prior distribution of x')
# TODO: Compare to predictive distribution from PyMC #marginal_x = Pmf(marginal(prior, 2), xs) #compare_cdf(marginal_x, pred1['x']) #decorate(title='Checking the marginal distribution of x')
def get_hyper(joint): return joint.sum(axis=2)
hyper = get_hyper(prior)
plot_contour(pd.DataFrame(hyper, index=mus, columns=logetas)) decorate(title="Joint prior of mu and eta")

The Update

from scipy.stats import binom like_x = binom.pmf(data_k, data_n, xs) like_x.shape
plt.plot(xs, like_x) decorate(title='Likelihood of the data')
def update(prior, data): n, k = data like_x = binom.pmf(k, n, xs) posterior = prior * like_x posterior /= posterior.sum() return posterior
data = data_n, data_k %time posterior = update(prior, data)
marginal_mu = Pmf(marginal(posterior, 0), mus) compare_cdf(marginal_mu, trace1['mu'])
marginal_eta = Pmf(marginal(posterior, 1), prior_logeta.qs) compare_cdf(marginal_eta, trace1['eta'])
marginal_x = Pmf(marginal(posterior, 2), xs) compare_cdf(marginal_x, trace1['x']) marginal_x.mean(), trace1['x'].mean()
posterior_hyper = get_hyper(posterior)
plot_contour(pd.DataFrame(posterior_hyper, index=mus, columns=logetas)) decorate(title="Joint posterior of alpha and beta")
like_hyper = posterior_hyper / prior_hyper
plot_contour(pd.DataFrame(like_hyper, index=mus, columns=logetas)) decorate(title="Likelihood of mu and eta")

Multiple updates

prior = make_prior(prior_hyper) prior.shape
def multiple_updates(prior, ns, ks, xs): for data in zip(ns, ks): print(data) posterior = update(prior, data) hyper = get_hyper(posterior) prior = make_prior(hyper) return posterior
%time posterior = multiple_updates(prior, data_ns, data_ks, xs)
marginal_mu = Pmf(marginal(posterior, 0), mus) compare_cdf(marginal_mu, trace['mu'])
marginal_eta = Pmf(marginal(posterior, 1), logetas) compare_cdf(marginal_eta, trace['eta'])
marginal_x = Pmf(marginal(posterior, 2), prior_x.qs) compare_cdf(marginal_x, trace_xs[-1])
posterior_hyper = get_hyper(posterior)
plot_contour(pd.DataFrame(posterior_hyper, index=mus, columns=logetas)) decorate(title="Joint posterior of mu and eta")
like_hyper = posterior_hyper / prior_hyper
plot_contour(pd.DataFrame(like_hyper, index=mus, columns=logetas)) decorate(title="Likelihood of mu and eta")

One at a time

def compute_likes_hyper(ns, ks): shape = ns.shape + mus.shape + logetas.shape likes_hyper = np.empty(shape) for i, data in enumerate(zip(ns, ks)): print(data) n, k = data like_x = binom.pmf(k, n, xs) posterior = betapdf * like_x likes_hyper[i] = posterior.sum(axis=2) print(likes_hyper[i].sum()) return likes_hyper
%time likes_hyper = compute_likes_hyper(data_ns, data_ks)
likes_hyper.sum()
like_hyper_all = likes_hyper.prod(axis=0) like_hyper_all.sum()
plot_contour(pd.DataFrame(like_hyper_all, index=mus, columns=logetas)) decorate(title="Likelihood of mu and eta")
posterior_hyper_all = prior_hyper * like_hyper_all posterior_hyper_all /= posterior_hyper_all.sum() np.allclose(posterior_hyper_all, posterior_hyper)
marginal_mu2 = Pmf(posterior_hyper_all.sum(axis=1), mus) marginal_mu2.make_cdf().plot() marginal_mu.make_cdf().plot() np.allclose(marginal_mu, marginal_mu2)
marginal_eta2 = Pmf(posterior_hyper_all.sum(axis=0), logetas) marginal_eta2.make_cdf().plot() marginal_eta.make_cdf().plot() np.allclose(marginal_eta, marginal_eta2)
plot_contour(pd.DataFrame(posterior_hyper_all, index=mus, columns=logetas)) decorate(title="Joint posterior of mu and eta")
i = 3 data = data_ns[i], data_ks[i] data
hyper_i = prior_hyper * like_hyper_all / likes_hyper[i] hyper_i.sum()
prior_i = make_prior(hyper_i)
posterior_i = update(prior_i, data)
Pmf(marginal(posterior_i, 0), mus).make_cdf().plot() marginal_mu.make_cdf().plot()
Pmf(marginal(posterior_i, 1), logetas).make_cdf().plot() marginal_eta.make_cdf().plot()
marginal_mu = Pmf(marginal(posterior_i, 0), mus) marginal_eta = Pmf(marginal(posterior_i, 1), logetas) marginal_x = Pmf(marginal(posterior_i, 2), xs)
compare_cdf(marginal_mu, trace['mu'])
compare_cdf(marginal_eta, trace['eta'])
compare_cdf(marginal_x, trace_xs[i])
def compute_all_marginals(ns, ks): prior = prior_hyper * like_hyper_all for i, data in enumerate(zip(ns, ks)): print(data) n, k = data hyper_i = prior / likes_hyper[i] prior_i = make_prior(hyper_i) posterior_i = update(prior_i, data) marginal_x = Pmf(marginal(posterior_i, 2), xs) marginal_x.make_cdf().plot()
%time compute_all_marginals(data_ns, data_ks)